| field | value | |
|---|---|---|
| author | Tim Redfern <tim@eclectronics.org> | 2014-02-17 13:36:38 +0000 |
| committer | Tim Redfern <tim@eclectronics.org> | 2014-02-17 13:36:38 +0000 |
| commit | 22e28216336da876e1fd17f380ce42eaf1446769 (patch) | |
| tree | 444dad3dc7e2656992d29f34f7bce31970c122a5 | /ffmpeg/libavcodec/arm |
| parent | ae5e8541f6e06e64c28719467cdf366ac57aff31 (diff) | |
chasing indexing error
Diffstat (limited to 'ffmpeg/libavcodec/arm')
87 files changed, 0 insertions, 19056 deletions
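Assuming a local clone of this repository in which the commit above is reachable, the same subtree-limited diffstat can be regenerated with git:

```sh
# Diffstat of this commit, limited to the ARM codec subtree shown on this page
git show --stat 22e28216336da876e1fd17f380ce42eaf1446769 -- ffmpeg/libavcodec/arm
```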
```diff
diff --git a/ffmpeg/libavcodec/arm/Makefile b/ffmpeg/libavcodec/arm/Makefile
deleted file mode 100644
index 277abd9..0000000
--- a/ffmpeg/libavcodec/arm/Makefile
+++ /dev/null
@@ -1,98 +0,0 @@
  [98 lines removed: OBJS, ARMV5TE-OBJS, ARMV6-OBJS, VFP-OBJS and NEON-OBJS lists wiring the ARM-optimised objects for AAC, AC-3, DCA, FLAC, H.264, HPELDSP, MPEG audio/video, RV30/40, VC-1, Vorbis, VP3/VP6/VP8, FFT/MDCT and videodsp]

diff --git a/ffmpeg/libavcodec/arm/aac.h b/ffmpeg/libavcodec/arm/aac.h
deleted file mode 100644
index cafa881..0000000
--- a/ffmpeg/libavcodec/arm/aac.h
+++ /dev/null
@@ -1,143 +0,0 @@
  [143 lines removed: LGPL header (c) 2010 Mans Rullgard; HAVE_NEON_INLINE inline-assembly VMUL2, VMUL4, VMUL2S and VMUL4S helpers for the AAC decoder]

diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
deleted file mode 100644
index e04787c..0000000
--- a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
+++ /dev/null
@@ -1,57 +0,0 @@
  [57 lines removed: (c) 2012 Mans Rullgard; NEON prototypes and ff_psdsp_init_arm(), which installs add_squares, mul_pair_single, hybrid_analysis, hybrid_synthesis_deint and stereo_interpolate[0] when NEON is available]

diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
deleted file mode 100644
index a93bbfe..0000000
--- a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
+++ /dev/null
@@ -1,272 +0,0 @@
  [272 lines removed: NEON parametric-stereo routines ff_ps_add_squares_neon, ff_ps_mul_pair_single_neon, ff_ps_hybrid_synthesis_deint_neon, ff_ps_hybrid_analysis_neon and ff_ps_stereo_interpolate_neon]

diff --git a/ffmpeg/libavcodec/arm/ac3dsp_arm.S b/ffmpeg/libavcodec/arm/ac3dsp_arm.S
deleted file mode 100644
index 1aea190..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_arm.S
+++ /dev/null
@@ -1,36 +0,0 @@
  [36 lines removed: (c) 2011 Mans Rullgard; ff_ac3_update_bap_counts_arm]

diff --git a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
deleted file mode 100644
index 1d2563d..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
+++ /dev/null
@@ -1,84 +0,0 @@
  [84 lines removed: ff_ac3_bit_alloc_calc_bap_armv6]

diff --git a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
deleted file mode 100644
index a3c32ff..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
+++ /dev/null
@@ -1,73 +0,0 @@
  [73 lines removed: prototypes for the ARMv6/NEON AC-3 routines and ff_ac3dsp_init_arm(), which wires them into AC3DSPContext]

diff --git a/ffmpeg/libavcodec/arm/ac3dsp_neon.S b/ffmpeg/libavcodec/arm/ac3dsp_neon.S
deleted file mode 100644
index 89d0ae8..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_neon.S
+++ /dev/null
@@ -1,177 +0,0 @@
  [177 lines removed: NEON AC-3 routines ff_ac3_max_msb_abs_int16_neon, ff_ac3_exponent_min_neon, ff_ac3_lshift_int16_neon, ff_ac3_rshift_int32_neon, ff_float_to_fixed24_neon, ff_ac3_extract_exponents_neon, ff_apply_window_int16_neon and the int32/float sum_square_butterfly functions]

diff --git a/ffmpeg/libavcodec/arm/asm-offsets.h b/ffmpeg/libavcodec/arm/asm-offsets.h
deleted file mode 100644
index 5cfc5cb..0000000
--- a/ffmpeg/libavcodec/arm/asm-offsets.h
+++ /dev/null
@@ -1,39 +0,0 @@
  [39 lines removed: CHK_OFFS compile-time offset check and hard-coded MpegEncContext offsets (Y_DC_SCALE 0xa8, C_DC_SCALE 0xac, AC_PRED 0xb0, BLOCK_LAST_INDEX 0xb4, H263_AIC 0xe4, INTER_SCANTAB_RASTER_END 0x12c)]

diff --git a/ffmpeg/libavcodec/arm/dca.h b/ffmpeg/libavcodec/arm/dca.h
deleted file mode 100644
index 35971a8..0000000
--- a/ffmpeg/libavcodec/arm/dca.h
+++ /dev/null
@@ -1,103 +0,0 @@
  [103 lines removed: (c) 2011 Mans Rullgard; ARMv6 inline-assembly decode_blockcodes and NEON int8x8_fmul_int32 helpers for the DCA decoder]

diff --git a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
deleted file mode 100644
index 8893f48..0000000
--- a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
+++ /dev/null
@@ -1,70 +0,0 @@
  [70 lines removed: ff_dcadsp_init_arm() and ff_synth_filter_init_arm(), selecting the VFP or NEON lfe_fir, qmf_32_subbands and synth_filter_float implementations]

diff --git a/ffmpeg/libavcodec/arm/dcadsp_neon.S b/ffmpeg/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 6a6c77a..0000000
--- a/ffmpeg/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,61 +0,0 @@
  [61 lines removed: ff_dca_lfe_fir_neon]

diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.S b/ffmpeg/libavcodec/arm/dsputil_arm.S
deleted file mode 100644
index 586a833..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_arm.S
+++ /dev/null
@@ -1,125 +0,0 @@
  [125 lines removed: ARMv4 optimised DSP utils, (c) 2004 AGAWA Koji; ff_add_pixels_clamped_arm]

diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.h b/ffmpeg/libavcodec/arm/dsputil_arm.h
deleted file mode 100644
index b7b5bdc..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_arm.h
+++ /dev/null
@@ -1,32 +0,0 @@
  [32 lines removed: prototypes for ff_dsputil_init_armv5te, ff_dsputil_init_armv6, ff_dsputil_init_vfp and ff_dsputil_init_neon]

diff --git a/ffmpeg/libavcodec/arm/dsputil_armv6.S b/ffmpeg/libavcodec/arm/dsputil_armv6.S
deleted file mode 100644
index 6ec238b..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_armv6.S
+++ /dev/null
@@ -1,381 +0,0 @@
  [381 lines removed: ARMv6 routines ff_add_pixels_clamped_armv6, ff_get_pixels_armv6, ff_diff_pixels_armv6, ff_pix_abs16_armv6 (plus _x2/_y2 variants), ff_pix_abs8_armv6, ff_sse16_armv6, ff_pix_norm1_armv6 and ff_pix_sum_armv6]

diff --git a/ffmpeg/libavcodec/arm/dsputil_init_arm.c b/ffmpeg/libavcodec/arm/dsputil_init_arm.c
deleted file mode 100644
index 68991fa..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_arm.c
+++ /dev/null
@@ -1,86 +0,0 @@
  [86 lines removed: ARM optimized DSP utils, (c) 2001 Lionel Ulmer; ff_dsputil_init_arm(), which wires the ff_j_rev_dct_arm / ff_simple_idct_arm IDCT wrappers and dispatches to the ARMv5TE, ARMv6 and NEON initialisers]

diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
deleted file mode 100644
index 841fbfa..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
+++ /dev/null
@@ -1,37 +0,0 @@
  [37 lines removed: ff_dsputil_init_armv5te(), selecting the ARMv5TE simple IDCT]

diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
deleted file mode 100644
index 8f38302..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
+++ /dev/null
@@ -1,85 +0,0 @@
  [85 lines removed: ff_dsputil_init_armv6(), selecting the ARMv6 simple IDCT, pixel, SAD/SSE and pix_norm1/pix_sum routines]

diff --git a/ffmpeg/libavcodec/arm/dsputil_init_neon.c b/ffmpeg/libavcodec/arm/dsputil_init_neon.c
deleted file mode 100644
index c1f250a..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_neon.c
+++ /dev/null
@@ -1,76 +0,0 @@
  [76 lines removed: ff_dsputil_init_neon(), selecting the NEON simple IDCT, clear_block(s), pixel-clamp, vector-clip and scalar-product routines]

diff --git a/ffmpeg/libavcodec/arm/dsputil_neon.S b/ffmpeg/libavcodec/arm/dsputil_neon.S
deleted file mode 100644
index 6c8231e..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_neon.S
+++ /dev/null
@@ -1,186 +0,0 @@
  [186 lines removed: NEON DSP routines ff_clear_block_neon, ff_clear_blocks_neon, ff_put_pixels_clamped_neon, ff_put_signed_pixels_clamped_neon, ff_add_pixels_clamped_neon, ... (listing truncated here)]
```
- vaddw.u8 q0, q0, d16 - vst1.8 {d4}, [r3,:64], r2 - vld1.8 {d17}, [r1,:64], r2 - vld1.16 {d2-d3}, [r0,:128]! - vaddw.u8 q1, q1, d17 - vst1.8 {d6}, [r3,:64], r2 - vqmovun.s16 d0, q0 - vld1.8 {d18}, [r1,:64], r2 - vld1.16 {d4-d5}, [r0,:128]! - vaddw.u8 q2, q2, d18 - vst1.8 {d0}, [r3,:64], r2 - vqmovun.s16 d2, q1 - vld1.8 {d19}, [r1,:64], r2 - vqmovun.s16 d4, q2 - vld1.16 {d6-d7}, [r0,:128]! - vaddw.u8 q3, q3, d19 - vst1.8 {d2}, [r3,:64], r2 - vqmovun.s16 d6, q3 - vst1.8 {d4}, [r3,:64], r2 - vst1.8 {d6}, [r3,:64], r2 - bx lr -endfunc - -function ff_vector_clipf_neon, export=1 -VFP vdup.32 q1, d0[1] -VFP vdup.32 q0, d0[0] -NOVFP vdup.32 q0, r2 -NOVFP vdup.32 q1, r3 -NOVFP ldr r2, [sp] - vld1.f32 {q2},[r1,:128]! - vmin.f32 q10, q2, q1 - vld1.f32 {q3},[r1,:128]! - vmin.f32 q11, q3, q1 -1: vmax.f32 q8, q10, q0 - vmax.f32 q9, q11, q0 - subs r2, r2, #8 - beq 2f - vld1.f32 {q2},[r1,:128]! - vmin.f32 q10, q2, q1 - vld1.f32 {q3},[r1,:128]! - vmin.f32 q11, q3, q1 - vst1.f32 {q8},[r0,:128]! - vst1.f32 {q9},[r0,:128]! - b 1b -2: vst1.f32 {q8},[r0,:128]! - vst1.f32 {q9},[r0,:128]! - bx lr -endfunc - -function ff_vector_clip_int32_neon, export=1 - vdup.32 q0, r2 - vdup.32 q1, r3 - ldr r2, [sp] -1: - vld1.32 {q2-q3}, [r1,:128]! - vmin.s32 q2, q2, q1 - vmin.s32 q3, q3, q1 - vmax.s32 q2, q2, q0 - vmax.s32 q3, q3, q0 - vst1.32 {q2-q3}, [r0,:128]! - subs r2, r2, #8 - bgt 1b - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c deleted file mode 100644 index ef098f4..0000000 --- a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/cpu.h" - -#define CONFIG_FFT_FLOAT 0 -#include "libavcodec/fft.h" - -void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z); -void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i); -void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i); - -av_cold void ff_fft_fixed_init_arm(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; -#if CONFIG_FFT - s->fft_calc = ff_fft_fixed_calc_neon; -#endif - -#if CONFIG_MDCT - if (!s->inverse && s->nbits >= 3) { - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; - s->mdct_calc = ff_mdct_fixed_calc_neon; - s->mdct_calcw = ff_mdct_fixed_calcw_neon; - } -#endif - } -} diff --git a/ffmpeg/libavcodec/arm/fft_fixed_neon.S b/ffmpeg/libavcodec/arm/fft_fixed_neon.S deleted file mode 100644 index d4a38a2..0000000 --- a/ffmpeg/libavcodec/arm/fft_fixed_neon.S +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro bflies d0, d1, r0, r1 - vrev64.32 \r0, \d1 @ t5, t6, t1, t2 - vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2 - vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2 - vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5 - vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1 - @ t5, t6, t4, t3 - vhsub.s16 \d1, \d0, \r0 - vhadd.s16 \d0, \d0, \r0 -.endm - -.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1 - vrev32.16 \r0, \d3 - vmull.s16 \w0, \d3, \c0 - vmlal.s16 \w0, \r0, \c1 - vshrn.s32 \d3, \w0, #15 - bflies \q0, \q1, \w0, \w1 -.endm - -.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \ - r0, r1, w0, w1 - vrev32.16 \r0, \d1 - vrev32.16 \r1, \d3 - vmull.s16 \w0, \d1, \c0 - vmlal.s16 \w0, \r0, \c1 - vmull.s16 \w1, \d3, \c2 - vmlal.s16 \w1, \r1, \c3 - vshrn.s32 \d1, \w0, #15 - vshrn.s32 \d3, \w1, #15 - bflies \q0, \q1, \w0, \w1 -.endm - -.macro fft4 d0, d1, r0, r1 - vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7 - vhsub.s16 \r1, \d1, \d0 - vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5 - vmov.i64 \d1, #0xffff00000000 - vbit \r0, \r1, \d1 - vrev64.16 \r1, \r0 @ t7, t8, t4, t3 - vtrn.32 \r0, \r1 @ t3, t4, t7, t8 - vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7 - vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1 - vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3 -.endm - -.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1 - fft4 \d0, \d1, \r0, \r1 - vtrn.32 \d0, \d1 @ z0, z2, z1, z3 - vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4 - vhsub.s16 \d3, \d2, \d3 @ z5, z7 - vmov \d2, \r0 - transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1 -.endm - -function fft4_neon - vld1.16 {d0-d1}, [r0] - fft4 d0, d1, d2, d3 - vst1.16 {d0-d1}, [r0] - bx lr -endfunc - -function fft8_neon - vld1.16 {d0-d3}, [r0,:128] - movrel r1, coefs - vld1.16 {d30}, [r1,:64] - vdup.16 d31, d30[0] - fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9 - vtrn.32 d0, d1 - vtrn.32 d2, d3 - vst1.16 {d0-d3}, [r0,:128] - bx lr -endfunc - -function fft16_neon - vld1.16 {d0-d3}, [r0,:128]! - vld1.16 {d4-d7}, [r0,:128] - movrel r1, coefs - sub r0, r0, #32 - vld1.16 {d28-d31},[r1,:128] - vdup.16 d31, d28[0] - fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9 - vswp d5, d6 - fft4 q2, q3, q8, q9 - vswp d5, d6 - vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7 - vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15 - vswp d1, d2 - vdup.16 d31, d28[0] - transform01 q0, q2, d5, d31, d28, d20, q8, q9 - vdup.16 d26, d29[0] - vdup.16 d27, d30[0] - transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \ - d20, d21, q8, q9 - vtrn.32 q0, q1 - vtrn.32 q2, q3 - vst1.16 {d0-d3}, [r0,:128]! 
- vst1.16 {d4-d7}, [r0,:128] - bx lr -endfunc - -function fft_pass_neon - push {r4,lr} - movrel lr, coefs+24 - vld1.16 {d30}, [lr,:64] - lsl r12, r2, #3 - vmov d31, d30 - add r3, r1, r2, lsl #2 - mov lr, #-8 - sub r3, r3, #2 - mov r4, r0 - vld1.16 {d27[]}, [r3,:16] - sub r3, r3, #6 - vld1.16 {q0}, [r4,:128], r12 - vld1.16 {q1}, [r4,:128], r12 - vld1.16 {q2}, [r4,:128], r12 - vld1.16 {q3}, [r4,:128], r12 - vld1.16 {d28}, [r1,:64]! - vld1.16 {d29}, [r3,:64], lr - vswp d1, d2 - vswp d5, d6 - vtrn.32 d0, d1 - vtrn.32 d4, d5 - vdup.16 d25, d28[1] - vmul.s16 d27, d27, d31 - transform01 q0, q2, d5, d25, d27, d20, q8, q9 - b 2f -1: - mov r4, r0 - vdup.16 d26, d29[0] - vld1.16 {q0}, [r4,:128], r12 - vld1.16 {q1}, [r4,:128], r12 - vld1.16 {q2}, [r4,:128], r12 - vld1.16 {q3}, [r4,:128], r12 - vld1.16 {d28}, [r1,:64]! - vld1.16 {d29}, [r3,:64], lr - vswp d1, d2 - vswp d5, d6 - vtrn.32 d0, d1 - vtrn.32 d4, d5 - vdup.16 d24, d28[0] - vdup.16 d25, d28[1] - vdup.16 d27, d29[3] - vmul.s16 q13, q13, q15 - transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \ - d16, d17, q9, q10 -2: - vtrn.32 d2, d3 - vtrn.32 d6, d7 - vdup.16 d24, d28[2] - vdup.16 d26, d29[2] - vdup.16 d25, d28[3] - vdup.16 d27, d29[1] - vmul.s16 q13, q13, q15 - transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \ - d16, d17, q9, q10 - vtrn.32 d0, d1 - vtrn.32 d2, d3 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - vswp d1, d2 - vswp d5, d6 - mov r4, r0 - vst1.16 {q0}, [r4,:128], r12 - vst1.16 {q1}, [r4,:128], r12 - vst1.16 {q2}, [r4,:128], r12 - vst1.16 {q3}, [r4,:128], r12 - add r0, r0, #16 - subs r2, r2, #2 - bgt 1b - pop {r4,pc} -endfunc - -#define F_SQRT1_2 23170 -#define F_COS_16_1 30274 -#define F_COS_16_3 12540 - -const coefs, align=4 - .short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2 - .short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1 - .short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3 - .short 1, -1, -1, 1 -endconst - -.macro def_fft n, n2, n4 -function fft\n\()_neon - push {r4, lr} - mov r4, r0 - bl fft\n2\()_neon - add r0, r4, #\n4*2*4 - bl fft\n4\()_neon - add r0, r4, #\n4*3*4 - bl fft\n4\()_neon - mov r0, r4 - pop {r4, lr} - movrelx r1, X(ff_cos_\n\()_fixed) - mov r2, #\n4/2 - b fft_pass_neon -endfunc -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 - -function ff_fft_fixed_calc_neon, export=1 - ldr r2, [r0] - sub r2, r2, #2 - movrel r3, fft_fixed_tab_neon - ldr r3, [r3, r2, lsl #2] - mov r0, r1 - bx r3 -endfunc - -const fft_fixed_tab_neon - .word fft4_neon - .word fft8_neon - .word fft16_neon - .word fft32_neon - .word fft64_neon - .word fft128_neon - .word fft256_neon - .word fft512_neon - .word fft1024_neon - .word fft2048_neon - .word fft4096_neon - .word fft8192_neon - .word fft16384_neon - .word fft32768_neon - .word fft65536_neon -endconst diff --git a/ffmpeg/libavcodec/arm/fft_init_arm.c b/ffmpeg/libavcodec/arm/fft_init_arm.c deleted file mode 100644 index 7e49b9c..0000000 --- a/ffmpeg/libavcodec/arm/fft_init_arm.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/cpu.h" -#include "libavcodec/fft.h" -#include "libavcodec/rdft.h" -#include "libavcodec/synth_filter.h" - -void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); -void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); - -void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); -void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); - -void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); - -av_cold void ff_fft_init_arm(FFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp(cpu_flags)) { -#if CONFIG_MDCT - if (!have_vfpv3(cpu_flags)) - s->imdct_half = ff_imdct_half_vfp; -#endif - } - - if (have_neon(cpu_flags)) { -#if CONFIG_FFT - s->fft_permute = ff_fft_permute_neon; - s->fft_calc = ff_fft_calc_neon; -#endif -#if CONFIG_MDCT - s->imdct_calc = ff_imdct_calc_neon; - s->imdct_half = ff_imdct_half_neon; - s->mdct_calc = ff_mdct_calc_neon; - s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE; -#endif - } -} - -#if CONFIG_RDFT -av_cold void ff_rdft_init_arm(RDFTContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - s->rdft_calc = ff_rdft_calc_neon; -} -#endif diff --git a/ffmpeg/libavcodec/arm/fft_neon.S b/ffmpeg/libavcodec/arm/fft_neon.S deleted file mode 100644 index 8b9ae2a..0000000 --- a/ffmpeg/libavcodec/arm/fft_neon.S +++ /dev/null @@ -1,375 +0,0 @@ -/* - * ARM NEON optimised FFT - * - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * Copyright (c) 2009 Naotoshi Nojiri - * - * This algorithm (though not any of the implementation details) is - * based on libdjbfft by D. J. Bernstein. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define M_SQRT1_2 0.70710678118654752440 - - -function fft4_neon - vld1.32 {d0-d3}, [r0,:128] - - vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 - vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 - vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 - vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 - vadd.f32 d1, d6, d7 - vsub.f32 d3, d6, d7 - vadd.f32 d0, d4, d5 - vsub.f32 d2, d4, d5 - - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft8_neon - mov r1, r0 - vld1.32 {d0-d3}, [r1,:128]! - vld1.32 {d16-d19}, [r1,:128] - - movw r2, #0x04f3 @ sqrt(1/2) - movt r2, #0x3f35 - eor r3, r2, #1<<31 - vdup.32 d31, r2 - - vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 - vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 - vmov d28, r3, r2 - vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 - vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 - vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 - vrev64.32 d29, d28 - vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 - vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 - vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w - vext.32 q3, q2, q2, #1 - vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w - vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 - vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 - vmul.f32 d24, d17, d31 @ a2r*w,a2i*w - vmul.f32 d25, d19, d31 @ a3r*w,a3i*w - vadd.f32 d0, d20, d21 - vsub.f32 d2, d20, d21 - vadd.f32 d1, d22, d23 - vrev64.32 q13, q13 - vsub.f32 d3, d22, d23 - vsub.f32 d6, d6, d7 - vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 - vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 - vadd.f32 d7, d4, d5 - vsub.f32 d18, d2, d6 - vext.32 q13, q12, q12, #1 - vadd.f32 d2, d2, d6 - vsub.f32 d16, d0, d7 - vadd.f32 d5, d25, d24 - vsub.f32 d4, d26, d27 - vadd.f32 d0, d0, d7 - vsub.f32 d17, d1, d5 - vsub.f32 d19, d3, d4 - vadd.f32 d3, d3, d4 - vadd.f32 d1, d1, d5 - - vst1.32 {d16-d19}, [r1,:128] - vst1.32 {d0-d3}, [r0,:128] - - bx lr -endfunc - -function fft16_neon - movrel r1, mppm - vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} - pld [r0, #32] - vld1.32 {d2-d3}, [r1,:128] - vext.32 q13, q9, q9, #1 - vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} - vadd.f32 d4, d16, d17 - vsub.f32 d5, d16, d17 - vadd.f32 d18, d18, d19 - vsub.f32 d19, d26, d27 - - vadd.f32 d20, d22, d23 - vsub.f32 d22, d22, d23 - vsub.f32 d23, d24, d25 - vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} - vadd.f32 d21, d24, d25 - vmul.f32 d24, d22, d2 - vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} - vmul.f32 d25, d23, d3 - vuzp.32 d16, d17 @ {r0,r1,i0,i1} - vmul.f32 q1, q11, d2[1] - vuzp.32 d18, d19 @ {r2,r3,i2,i3} - vrev64.32 q12, q12 - vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} - vld1.32 {d24-d27}, [r0,:128]! 
@ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} - vzip.32 q10, q11 - vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - sub r0, r0, #96 - vext.32 q13, q13, q13, #1 - vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vext.32 q15, q15, q15, #1 - vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} - vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} - vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} - vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} - vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} - vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} - movrelx r2, X(ff_cos_16) - vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} - vrev64.32 d1, d1 - vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} - vrev64.32 d3, d3 - movrel r3, pmmp - vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} - vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} - vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} - vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} - vld1.32 {d4-d5}, [r2,:64] - vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} - vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} - vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} - vld1.32 {d6-d7}, [r3,:128] - vrev64.32 q1, q14 - vmul.f32 q14, q14, d4[1] - vmul.f32 q1, q1, q3 - vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} - vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} - vzip.32 q12, q14 - vadd.f32 d0, d28, d24 - vadd.f32 d1, d25, d29 - vsub.f32 d2, d25, d29 - vsub.f32 d3, d28, d24 - vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} - vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} - vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} - mov r1, #32 - vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} - vrev64.32 q0, q13 - vmul.f32 q13, q13, d5[0] - vrev64.32 q1, q15 - vmul.f32 q15, q15, d5[1] - vst2.32 {d16-d17},[r0,:128], r1 - vmul.f32 q0, q0, q3 - vst2.32 {d20-d21},[r0,:128], r1 - vmul.f32 q1, q1, q3 - vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} - vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} - vst2.32 {d24-d25},[r0,:128], r1 - vst2.32 {d28-d29},[r0,:128] - vzip.32 q13, q15 - sub r0, r0, #80 - vadd.f32 d0, d30, d26 - vadd.f32 d1, d27, d31 - vsub.f32 d2, d27, d31 - vsub.f32 d3, d30, d26 - vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} - vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} - vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} - vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} - vst2.32 {d18-d19},[r0,:128], r1 - vst2.32 {d22-d23},[r0,:128], r1 - vst2.32 {d26-d27},[r0,:128], r1 - vst2.32 {d30-d31},[r0,:128] - bx lr -endfunc - -function fft_pass_neon - push {r4-r6,lr} - mov r6, r2 @ n - lsl r5, r2, #3 @ 2 * n * sizeof FFTSample - lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex - lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex - add r3, r2, r4 - add r4, r4, r0 @ &z[o1] - add r2, r2, r0 @ &z[o2] - add r3, r3, r0 @ &z[o3] - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - movrel r12, pmmp - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - add r5, r5, r1 @ wim - vld1.32 {d6-d7}, [r12,:128] @ pmmp - vswp d21, d22 - vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]} - sub r5, r5, #4 @ wim-- - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vmul.f32 q1, q1, q3 - vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - sub r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21},[r2,:128]! 
@ {z[o2],z[o2+1]} - vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} - sub r5, r5, #8 @ wim -= 2 -1: - vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} - vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} - vswp d21, d22 - vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} - vrev64.32 q0, q10 - vmul.f32 q10, q10, d4[0] - vrev64.32 q1, q11 - vmul.f32 q11, q11, d4[1] - vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} - vmul.f32 q0, q0, q3 - sub r5, r5, #8 @ wim -= 2 - vmul.f32 q1, q1, q3 - vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} - vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} - vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} - subs r6, r6, #1 @ n-- - vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} - vzip.32 q10, q11 - vadd.f32 d0, d22, d20 - vadd.f32 d1, d21, d23 - vsub.f32 d2, d21, d23 - vsub.f32 d3, d22, d20 - vsub.f32 q10, q8, q0 - vadd.f32 q8, q8, q0 - vsub.f32 q11, q9, q1 - vadd.f32 q9, q9, q1 - vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} - vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} - vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} - vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]} - bne 1b - - pop {r4-r6,pc} -endfunc - -.macro def_fft n, n2, n4 - .align 6 -function fft\n\()_neon - push {r4, lr} - mov r4, r0 - bl fft\n2\()_neon - add r0, r4, #\n4*2*8 - bl fft\n4\()_neon - add r0, r4, #\n4*3*8 - bl fft\n4\()_neon - mov r0, r4 - pop {r4, lr} - movrelx r1, X(ff_cos_\n) - mov r2, #\n4/2 - b fft_pass_neon -endfunc -.endm - - def_fft 32, 16, 8 - def_fft 64, 32, 16 - def_fft 128, 64, 32 - def_fft 256, 128, 64 - def_fft 512, 256, 128 - def_fft 1024, 512, 256 - def_fft 2048, 1024, 512 - def_fft 4096, 2048, 1024 - def_fft 8192, 4096, 2048 - def_fft 16384, 8192, 4096 - def_fft 32768, 16384, 8192 - def_fft 65536, 32768, 16384 - -function ff_fft_calc_neon, export=1 - ldr r2, [r0] - sub r2, r2, #2 - movrel r3, fft_tab_neon - ldr r3, [r3, r2, lsl #2] - mov r0, r1 - bx r3 -endfunc - -function ff_fft_permute_neon, export=1 - push {r4,lr} - mov r12, #1 - ldr r2, [r0] @ nbits - ldr r3, [r0, #12] @ tmp_buf - ldr r0, [r0, #8] @ revtab - lsl r12, r12, r2 - mov r2, r12 -1: - vld1.32 {d0-d1}, [r1,:128]! - ldr r4, [r0], #4 - uxth lr, r4 - uxth r4, r4, ror #16 - add lr, r3, lr, lsl #3 - add r4, r3, r4, lsl #3 - vst1.32 {d0}, [lr,:64] - vst1.32 {d1}, [r4,:64] - subs r12, r12, #2 - bgt 1b - - sub r1, r1, r2, lsl #3 -1: - vld1.32 {d0-d3}, [r3,:128]! - vst1.32 {d0-d3}, [r1,:128]! - subs r2, r2, #4 - bgt 1b - - pop {r4,pc} -endfunc - -const fft_tab_neon - .word fft4_neon - .word fft8_neon - .word fft16_neon - .word fft32_neon - .word fft64_neon - .word fft128_neon - .word fft256_neon - .word fft512_neon - .word fft1024_neon - .word fft2048_neon - .word fft4096_neon - .word fft8192_neon - .word fft16384_neon - .word fft32768_neon - .word fft65536_neon -endconst - -const pmmp, align=4 - .float +1.0, -1.0, -1.0, +1.0 -endconst - -const mppm, align=4 - .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 -endconst diff --git a/ffmpeg/libavcodec/arm/flacdsp_arm.S b/ffmpeg/libavcodec/arm/flacdsp_arm.S deleted file mode 100644 index f8861c5..0000000 --- a/ffmpeg/libavcodec/arm/flacdsp_arm.S +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function flac_lpc_16_1_arm - ldr r12, [sp] - push {r4, lr} - ldr r1, [r1] - subs r12, r12, #2 - ldr lr, [r0], #4 - beq 2f - it lt - poplt {r4, pc} -1: - mul r4, lr, r1 - ldm r0, {r2, lr} - add_sh r2, r2, r4, asr r3 - mul r4, r2, r1 - subs r12, r12, #2 - add_sh lr, lr, r4, asr r3 - stm r0!, {r2, lr} - bgt 1b - it lt - poplt {r4, pc} -2: - mul r4, lr, r1 - ldr r2, [r0] - add_sh r2, r2, r4, asr r3 - str r2, [r0] - pop {r4, pc} -endfunc - -function flac_lpc_16_2_arm - ldr r12, [sp] - subs r12, r12, r2 - it le - bxle lr - - push {r4-r9, lr} - ldm r0!, {r6, r7} - ldm r1, {r8, r9} - subs r12, r12, #1 - beq 2f -1: - mul r4, r6, r8 - mul r5, r7, r8 - mla r4, r7, r9, r4 - ldm r0, {r6, r7} - add_sh r6, r6, r4, asr r3 - mla r5, r6, r9, r5 - add_sh r7, r7, r5, asr r3 - stm r0!, {r6, r7} - subs r12, r12, #2 - bgt 1b - it lt - poplt {r4-r9, pc} -2: - mul r4, r6, r8 - mla r4, r7, r9, r4 - ldr r5, [r0] - add_sh r5, r5, r4, asr r3 - str r5, [r0] - pop {r4-r9, pc} -endfunc - -function ff_flac_lpc_16_arm, export=1 - cmp r2, #2 - blt flac_lpc_16_1_arm - beq flac_lpc_16_2_arm - - ldr r12, [sp] - subs r12, r12, r2 - it le - bxle lr - - push {r4-r9, lr} - - subs r12, r12, #1 - beq 3f -1: - sub lr, r2, #2 - mov r4, #0 - mov r5, #0 - - ldr r7, [r0], #4 - ldr r9, [r1], #4 -2: - mla r4, r7, r9, r4 - ldm r0!, {r6, r7} - mla r5, r6, r9, r5 - ldm r1!, {r8, r9} - mla r4, r6, r8, r4 - subs lr, lr, #2 - mla r5, r7, r8, r5 - bgt 2b - blt 6f - - mla r4, r7, r9, r4 - ldr r7, [r0], #4 - mla r5, r7, r9, r5 - ldr r9, [r1], #4 -6: - mla r4, r7, r9, r4 - ldm r0, {r6, r7} - add_sh r6, r6, r4, asr r3 - mla r5, r6, r9, r5 - add_sh r7, r7, r5, asr r3 - stm r0!, {r6, r7} - sub r0, r0, r2, lsl #2 - sub r1, r1, r2, lsl #2 - - subs r12, r12, #2 - bgt 1b - it lt - poplt {r4-r9, pc} -3: - mov r4, #0 -4: - ldr r5, [r1], #4 - ldr r6, [r0], #4 - mla r4, r5, r6, r4 - subs r2, r2, #1 - bgt 4b - ldr r5, [r0] - add_sh r5, r5, r4, asr r3 - str r5, [r0] - pop {r4-r9, pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c deleted file mode 100644 index 9b93942..0000000 --- a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2012 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavcodec/flacdsp.h" -#include "config.h" - -void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order, - int qlevel, int len); - -av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt, - int bps) -{ - if (bps <= 16) - c->lpc = ff_flac_lpc_16_arm; -} diff --git a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c deleted file mode 100644 index 37319ed..0000000 --- a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c +++ /dev/null @@ -1,65 +0,0 @@ -/* - * ARM optimized Format Conversion Utils - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/fmtconvert.h" - -void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src, - float mul, int len); - -void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, - float mul, int len); -void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, - const int32_t *src, const float *mul, - int len); - -void ff_float_to_int16_neon(int16_t *dst, const float *src, long len); -void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int); - -void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); - -av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_vfp(cpu_flags)) { - if (!have_vfpv3(cpu_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp; - c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp; - } - - if (have_armv6(cpu_flags)) { - c->float_to_int16 = ff_float_to_int16_vfp; - } - } - - if (have_neon(cpu_flags)) { - c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon; - - if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { - c->float_to_int16 = ff_float_to_int16_neon; - c->float_to_int16_interleave = ff_float_to_int16_interleave_neon; - } - } -} diff --git a/ffmpeg/libavcodec/arm/fmtconvert_neon.S b/ffmpeg/libavcodec/arm/fmtconvert_neon.S deleted file mode 100644 index 55d070e..0000000 --- a/ffmpeg/libavcodec/arm/fmtconvert_neon.S +++ /dev/null @@ -1,392 +0,0 @@ -/* - * ARM NEON optimised Format Conversion Utils - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/asm.S" - -function ff_float_to_int16_neon, export=1 - subs r2, r2, #8 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q9, q1, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vshrn.s32 d4, q8, #16 - vld1.64 {d0-d1}, [r1,:128]! - vcvt.s32.f32 q0, q0, #16 - vshrn.s32 d5, q9, #16 - vld1.64 {d2-d3}, [r1,:128]! - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vld1.64 {d16-d17},[r1,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r1,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6-d7}, [r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vld1.64 {d0-d1}, [r1,:128]! - vshrn.s32 d4, q8, #16 - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r1,:128]! - vshrn.s32 d5, q9, #16 - vcvt.s32.f32 q1, q1, #16 - vshrn.s32 d6, q0, #16 - vst1.64 {d4-d5}, [r0,:128]! - vshrn.s32 d7, q1, #16 - vst1.64 {d6-d7}, [r0,:128]! - bx lr -3: vshrn.s32 d4, q8, #16 - vshrn.s32 d5, q9, #16 - vst1.64 {d4-d5}, [r0,:128]! - bx lr -endfunc - -function ff_float_to_int16_interleave_neon, export=1 - cmp r3, #2 - itt lt - ldrlt r1, [r1] - blt ff_float_to_int16_neon - bne 4f - - ldr r3, [r1] - ldr r1, [r1, #4] - - subs r2, r2, #8 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q8, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q9, q1, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 3f - bics ip, r2, #15 - beq 2f -1: subs ip, ip, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q10, q8, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vld1.64 {d26-d27},[r1,:128]! - vsri.32 q11, q9, #16 - vst1.64 {d20-d21},[r0,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q12, q0, #16 - vld1.64 {d16-d17},[r3,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d25},[r0,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r3,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r1,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r1,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d26-d27},[r0,:128]! - bne 1b - ands r2, r2, #15 - beq 3f -2: vsri.32 q10, q8, #16 - vld1.64 {d0-d1}, [r3,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r3,:128]! - vcvt.s32.f32 q1, q1, #16 - vld1.64 {d24-d25},[r1,:128]! - vcvt.s32.f32 q12, q12, #16 - vsri.32 q11, q9, #16 - vld1.64 {d26-d27},[r1,:128]! - vcvt.s32.f32 q13, q13, #16 - vst1.64 {d20-d21},[r0,:128]! - vsri.32 q12, q0, #16 - vst1.64 {d22-d23},[r0,:128]! - vsri.32 q13, q1, #16 - vst1.64 {d24-d27},[r0,:128]! - bx lr -3: vsri.32 q10, q8, #16 - vsri.32 q11, q9, #16 - vst1.64 {d20-d23},[r0,:128]! 
- bx lr - -4: push {r4-r8,lr} - cmp r3, #4 - lsl ip, r3, #1 - blt 4f - - @ 4 channels -5: ldmia r1!, {r4-r7} - mov lr, r2 - mov r8, r0 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #8 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 q9, q8, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 q11, q10, #16 - vld1.64 {d4-d5}, [r6,:128]! - vcvt.s32.f32 q2, q2, #16 - vzip.32 d18, d22 - vld1.64 {d6-d7}, [r7,:128]! - vcvt.s32.f32 q3, q3, #16 - vzip.32 d19, d23 - vst1.64 {d18}, [r8], ip - vsri.32 q1, q0, #16 - vst1.64 {d22}, [r8], ip - vsri.32 q3, q2, #16 - vst1.64 {d19}, [r8], ip - vzip.32 d2, d6 - vst1.64 {d23}, [r8], ip - vzip.32 d3, d7 - beq 7f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.64 {d2}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.64 {d6}, [r8], ip - vld1.64 {d20-d21},[r6,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.64 {d3}, [r8], ip - vld1.64 {d22-d23},[r7,:128]! - vcvt.s32.f32 q11, q11, #16 - vst1.64 {d7}, [r8], ip - b 6b -7: vst1.64 {d2}, [r8], ip - vst1.64 {d6}, [r8], ip - vst1.64 {d3}, [r8], ip - vst1.64 {d7}, [r8], ip - subs r3, r3, #4 - it eq - popeq {r4-r8,pc} - cmp r3, #4 - add r0, r0, #8 - bge 5b - - @ 2 channels -4: cmp r3, #2 - blt 4f - ldmia r1!, {r4-r5} - mov lr, r2 - mov r8, r0 - tst lr, #8 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 - beq 6f - subs lr, lr, #8 - beq 7f - vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! - vcvt.s32.f32 q11, q11, #16 -6: subs lr, lr, #16 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vsri.32 d18, d16, #16 - vld1.64 {d2-d3}, [r5,:128]! - vcvt.s32.f32 q1, q1, #16 - vsri.32 d19, d17, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r5,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vsri.32 d2, d0, #16 - vst1.32 {d19[1]}, [r8], ip - vsri.32 d3, d1, #16 - vst1.32 {d22[0]}, [r8], ip - vsri.32 d6, d4, #16 - vst1.32 {d22[1]}, [r8], ip - vsri.32 d7, d5, #16 - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip - beq 6f - vld1.64 {d16-d17},[r4,:128]! - vcvt.s32.f32 q8, q8, #16 - vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vld1.64 {d18-d19},[r5,:128]! - vcvt.s32.f32 q9, q9, #16 - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vld1.64 {d20-d21},[r4,:128]! - vcvt.s32.f32 q10, q10, #16 - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vld1.64 {d22-d23},[r5,:128]! 
- vcvt.s32.f32 q11, q11, #16 - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - bgt 6b -6: vst1.32 {d2[0]}, [r8], ip - vst1.32 {d2[1]}, [r8], ip - vst1.32 {d3[0]}, [r8], ip - vst1.32 {d3[1]}, [r8], ip - vst1.32 {d6[0]}, [r8], ip - vst1.32 {d6[1]}, [r8], ip - vst1.32 {d7[0]}, [r8], ip - vst1.32 {d7[1]}, [r8], ip - b 8f -7: vsri.32 d18, d16, #16 - vsri.32 d19, d17, #16 - vst1.32 {d18[0]}, [r8], ip - vsri.32 d22, d20, #16 - vst1.32 {d18[1]}, [r8], ip - vsri.32 d23, d21, #16 - vst1.32 {d19[0]}, [r8], ip - vst1.32 {d19[1]}, [r8], ip - vst1.32 {d22[0]}, [r8], ip - vst1.32 {d22[1]}, [r8], ip - vst1.32 {d23[0]}, [r8], ip - vst1.32 {d23[1]}, [r8], ip -8: subs r3, r3, #2 - add r0, r0, #4 - it eq - popeq {r4-r8,pc} - - @ 1 channel -4: ldr r4, [r1],#4 - tst r2, #8 - mov lr, r2 - mov r5, r0 - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - bne 8f -6: subs lr, lr, #16 - vld1.64 {d4-d5}, [r4,:128]! - vcvt.s32.f32 q2, q2, #16 - vld1.64 {d6-d7}, [r4,:128]! - vcvt.s32.f32 q3, q3, #16 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - beq 7f - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 -7: vst1.16 {d4[1]}, [r5,:16], ip - vst1.16 {d4[3]}, [r5,:16], ip - vst1.16 {d5[1]}, [r5,:16], ip - vst1.16 {d5[3]}, [r5,:16], ip - vst1.16 {d6[1]}, [r5,:16], ip - vst1.16 {d6[3]}, [r5,:16], ip - vst1.16 {d7[1]}, [r5,:16], ip - vst1.16 {d7[3]}, [r5,:16], ip - bgt 6b - pop {r4-r8,pc} -8: subs lr, lr, #8 - vst1.16 {d0[1]}, [r5,:16], ip - vst1.16 {d0[3]}, [r5,:16], ip - vst1.16 {d1[1]}, [r5,:16], ip - vst1.16 {d1[3]}, [r5,:16], ip - vst1.16 {d2[1]}, [r5,:16], ip - vst1.16 {d2[3]}, [r5,:16], ip - vst1.16 {d3[1]}, [r5,:16], ip - vst1.16 {d3[3]}, [r5,:16], ip - it eq - popeq {r4-r8,pc} - vld1.64 {d0-d1}, [r4,:128]! - vcvt.s32.f32 q0, q0, #16 - vld1.64 {d2-d3}, [r4,:128]! - vcvt.s32.f32 q1, q1, #16 - b 6b -endfunc - -function ff_int32_to_float_fmul_scalar_neon, export=1 -VFP vdup.32 q0, d0[0] -VFP len .req r2 -NOVFP vdup.32 q0, r2 -NOVFP len .req r3 - - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 -1: subs len, len, #8 - pld [r1, #16] - vmul.f32 q9, q3, q0 - vmul.f32 q10, q8, q0 - beq 2f - vld1.32 {q1},[r1,:128]! - vcvt.f32.s32 q3, q1 - vld1.32 {q2},[r1,:128]! - vcvt.f32.s32 q8, q2 - vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - b 1b -2: vst1.32 {q9}, [r0,:128]! - vst1.32 {q10},[r0,:128]! - bx lr - .unreq len -endfunc diff --git a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S deleted file mode 100644 index b14af45..0000000 --- a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/asm.S" - -/** - * ARM VFP optimised int32 to float conversion. - * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned - * (16 bytes alignment is best for BCM2835), little-endian. - */ -@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len) -function ff_int32_to_float_fmul_array8_vfp, export=1 - push {lr} - ldr a1, [sp, #4] - subs lr, a1, #3*8 - bcc 50f @ too short to pipeline - @ Now need to find (len / 8) % 3. The approximation - @ x / 24 = (x * 0xAB) >> 12 - @ is good for x < 4096, which is true for both AC3 and DCA. - mov a1, #0xAB - ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1 - mul a1, lr, a1 - vpush {s16-s31} - mov a1, a1, lsr #12 - add a1, a1, a1, lsl #1 - rsb a1, a1, lr, lsr #3 - cmp a1, #1 - fmrx a1, FPSCR - fmxr FPSCR, ip - beq 11f - blo 10f - @ Array is (2 + multiple of 3) x 8 floats long - @ drop through... - vldmia a3!, {s16-s23} - vldmia a4!, {s2,s3} - vldmia a3!, {s24-s31} - vcvt.f32.s32 s16, s16 - vcvt.f32.s32 s17, s17 - vcvt.f32.s32 s18, s18 - vcvt.f32.s32 s19, s19 - vcvt.f32.s32 s20, s20 - vcvt.f32.s32 s21, s21 - vcvt.f32.s32 s22, s22 - vcvt.f32.s32 s23, s23 - vmul.f32 s16, s16, s2 - @ drop through... -3: - vldmia a3!, {s8-s15} - vldmia a4!, {s1} - vcvt.f32.s32 s24, s24 - vcvt.f32.s32 s25, s25 - vcvt.f32.s32 s26, s26 - vcvt.f32.s32 s27, s27 - vcvt.f32.s32 s28, s28 - vcvt.f32.s32 s29, s29 - vcvt.f32.s32 s30, s30 - vcvt.f32.s32 s31, s31 - vmul.f32 s24, s24, s3 - vstmia a2!, {s16-s19} - vstmia a2!, {s20-s23} -2: - vldmia a3!, {s16-s23} - vldmia a4!, {s2} - vcvt.f32.s32 s8, s8 - vcvt.f32.s32 s9, s9 - vcvt.f32.s32 s10, s10 - vcvt.f32.s32 s11, s11 - vcvt.f32.s32 s12, s12 - vcvt.f32.s32 s13, s13 - vcvt.f32.s32 s14, s14 - vcvt.f32.s32 s15, s15 - vmul.f32 s8, s8, s1 - vstmia a2!, {s24-s27} - vstmia a2!, {s28-s31} -1: - vldmia a3!, {s24-s31} - vldmia a4!, {s3} - vcvt.f32.s32 s16, s16 - vcvt.f32.s32 s17, s17 - vcvt.f32.s32 s18, s18 - vcvt.f32.s32 s19, s19 - vcvt.f32.s32 s20, s20 - vcvt.f32.s32 s21, s21 - vcvt.f32.s32 s22, s22 - vcvt.f32.s32 s23, s23 - vmul.f32 s16, s16, s2 - vstmia a2!, {s8-s11} - vstmia a2!, {s12-s15} - - subs lr, lr, #8*3 - bpl 3b - - vcvt.f32.s32 s24, s24 - vcvt.f32.s32 s25, s25 - vcvt.f32.s32 s26, s26 - vcvt.f32.s32 s27, s27 - vcvt.f32.s32 s28, s28 - vcvt.f32.s32 s29, s29 - vcvt.f32.s32 s30, s30 - vcvt.f32.s32 s31, s31 - vmul.f32 s24, s24, s3 - vstmia a2!, {s16-s19} - vstmia a2!, {s20-s23} - vstmia a2!, {s24-s27} - vstmia a2!, {s28-s31} - - fmxr FPSCR, a1 - vpop {s16-s31} - pop {pc} - -10: @ Array is (multiple of 3) x 8 floats long - vldmia a3!, {s8-s15} - vldmia a4!, {s1,s2} - vldmia a3!, {s16-s23} - vcvt.f32.s32 s8, s8 - vcvt.f32.s32 s9, s9 - vcvt.f32.s32 s10, s10 - vcvt.f32.s32 s11, s11 - vcvt.f32.s32 s12, s12 - vcvt.f32.s32 s13, s13 - vcvt.f32.s32 s14, s14 - vcvt.f32.s32 s15, s15 - vmul.f32 s8, s8, s1 - b 1b - -11: @ Array is (1 + multiple of 3) x 8 floats long - vldmia a3!, {s24-s31} - vldmia a4!, {s3} - vldmia a3!, {s8-s15} - vldmia a4!, {s1} - vcvt.f32.s32 s24, s24 - vcvt.f32.s32 s25, s25 - vcvt.f32.s32 s26, s26 - vcvt.f32.s32 s27, s27 - vcvt.f32.s32 s28, s28 - vcvt.f32.s32 s29, s29 - vcvt.f32.s32 s30, s30 
- vcvt.f32.s32 s31, s31 - vmul.f32 s24, s24, s3 - b 2b - -50: - ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1 - fmrx ip, FPSCR - fmxr FPSCR, lr -51: - vldmia a3!, {s8-s15} - vldmia a4!, {s0} - vcvt.f32.s32 s8, s8 - vcvt.f32.s32 s9, s9 - vcvt.f32.s32 s10, s10 - vcvt.f32.s32 s11, s11 - vcvt.f32.s32 s12, s12 - vcvt.f32.s32 s13, s13 - vcvt.f32.s32 s14, s14 - vcvt.f32.s32 s15, s15 - vmul.f32 s8, s8, s0 - subs a1, a1, #8 - vstmia a2!, {s8-s11} - vstmia a2!, {s12-s15} - bne 51b - - fmxr FPSCR, ip - pop {pc} -endfunc - -/** - * ARM VFP optimised int32 to float conversion. - * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned - * (16 bytes alignment is best for BCM2835), little-endian. - * TODO: could be further optimised by unrolling and interleaving, as above - */ -@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len) -function ff_int32_to_float_fmul_scalar_vfp, export=1 -VFP tmp .req a4 -VFP len .req a3 -NOVFP tmp .req a3 -NOVFP len .req a4 -NOVFP vmov s0, a3 - ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1 - fmrx ip, FPSCR - fmxr FPSCR, tmp -1: - vldmia a2!, {s8-s15} - vcvt.f32.s32 s8, s8 - vcvt.f32.s32 s9, s9 - vcvt.f32.s32 s10, s10 - vcvt.f32.s32 s11, s11 - vcvt.f32.s32 s12, s12 - vcvt.f32.s32 s13, s13 - vcvt.f32.s32 s14, s14 - vcvt.f32.s32 s15, s15 - vmul.f32 s8, s8, s0 - subs len, len, #8 - vstmia a1!, {s8-s11} - vstmia a1!, {s12-s15} - bne 1b - - fmxr FPSCR, ip - bx lr -endfunc - .unreq tmp - .unreq len diff --git a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c deleted file mode 100644 index 13f7e0d..0000000 --- a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c +++ /dev/null @@ -1,51 +0,0 @@ -/* - * ARM NEON optimised H.264 chroma functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/h264chroma.h" - -void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int); - -av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth) -{ - const int high_bit_depth = bit_depth > 8; - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags) && !high_bit_depth) { - c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon; - c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon; - c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon; - - c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon; - c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon; - c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/h264cmc_neon.S b/ffmpeg/libavcodec/arm/h264cmc_neon.S deleted file mode 100644 index 0bcae11..0000000 --- a/ffmpeg/libavcodec/arm/h264cmc_neon.S +++ /dev/null @@ -1,411 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ -.macro h264_chroma_mc8 type, codec=h264 -function ff_\type\()_\codec\()_chroma_mc8_neon, export=1 - push {r4-r7, lr} - ldrd r4, r5, [sp, #20] - .ifc \type,avg - mov lr, r0 - .endif - pld [r1] - pld [r1, r2] - - .ifc \codec,rv40 - movrel r6, rv40bias - lsr r7, r5, #1 - add r6, r6, r7, lsl #3 - lsr r7, r4, #1 - add r6, r6, r7, lsl #1 - vld1.16 {d22[],d23[]}, [r6,:16] - .endif - .ifc \codec,vc1 - vmov.u16 q11, #28 - .endif - -A muls r7, r4, r5 -T mul r7, r4, r5 -T cmp r7, #0 - rsb r6, r7, r5, lsl #3 - rsb r12, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - vdup.8 d0, r4 - vdup.8 d1, r12 - vld1.8 {d4, d5}, [r1], r2 - vdup.8 d2, r6 - vdup.8 d3, r7 - vext.8 d5, d4, d5, #1 - -1: vld1.8 {d6, d7}, [r1], r2 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vext.8 d7, d6, d7, #1 - vld1.8 {d4, d5}, [r1], r2 - vmlal.u8 q8, d6, d2 - pld [r1] - vext.8 d5, d4, d5, #1 - vmlal.u8 q8, d7, d3 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vmlal.u8 q9, d7, d1 - vmlal.u8 q9, d4, d2 - vmlal.u8 q9, d5, d3 - pld [r1, r2] - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 - .else - vadd.u16 q8, q8, q11 - vadd.u16 q9, q9, q11 - vshrn.u16 d16, q8, #6 - vshrn.u16 d17, q9, #6 - .endif - .ifc \type,avg - vld1.8 {d20}, [lr,:64], r2 - vld1.8 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 - .endif - vst1.8 {d16}, [r0,:64], r2 - vst1.8 {d17}, [r0,:64], r2 - bgt 1b - - pop {r4-r7, pc} - -2: tst r6, r6 - add r12, r12, r6 - vdup.8 d0, r4 - vdup.8 d1, r12 - - beq 4f - - vld1.8 {d4}, [r1], r2 - -3: vld1.8 {d6}, [r1], r2 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d1 - vld1.8 {d4}, [r1], r2 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d1 - pld [r1] - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 - .else - vadd.u16 q8, q8, q11 - vadd.u16 q9, q9, q11 - vshrn.u16 d16, q8, #6 - vshrn.u16 d17, q9, #6 - .endif - pld [r1, r2] - .ifc \type,avg - vld1.8 {d20}, [lr,:64], r2 - vld1.8 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 - .endif - subs r3, r3, #2 - vst1.8 {d16}, [r0,:64], r2 - vst1.8 {d17}, [r0,:64], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.8 {d4, d5}, [r1], r2 - vld1.8 {d6, d7}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - pld [r1] - subs r3, r3, #2 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d7, d1 - pld [r1, r2] - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - vrshrn.u16 d17, q9, #6 - .else - vadd.u16 q8, q8, q11 - vadd.u16 q9, q9, q11 - vshrn.u16 d16, q8, #6 - vshrn.u16 d17, q9, #6 - .endif - .ifc \type,avg - vld1.8 {d20}, [lr,:64], r2 - vld1.8 {d21}, [lr,:64], r2 - vrhadd.u8 q8, q8, q10 - .endif - vst1.8 {d16}, [r0,:64], r2 - vst1.8 {d17}, [r0,:64], r2 - bgt 4b - - pop {r4-r7, pc} -endfunc -.endm - -/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */ -.macro h264_chroma_mc4 type, codec=h264 -function ff_\type\()_\codec\()_chroma_mc4_neon, export=1 - push {r4-r7, lr} - ldrd r4, r5, [sp, #20] - .ifc \type,avg - mov lr, r0 - .endif - pld [r1] - pld [r1, r2] - - .ifc \codec,rv40 - movrel r6, rv40bias - lsr r7, r5, #1 - add r6, r6, r7, lsl #3 - lsr r7, r4, #1 - add r6, r6, r7, lsl #1 - vld1.16 {d22[],d23[]}, [r6,:16] - .endif - .ifc \codec,vc1 - vmov.u16 
q11, #28 - .endif - -A muls r7, r4, r5 -T mul r7, r4, r5 -T cmp r7, #0 - rsb r6, r7, r5, lsl #3 - rsb r12, r7, r4, lsl #3 - sub r4, r7, r4, lsl #3 - sub r4, r4, r5, lsl #3 - add r4, r4, #64 - - beq 2f - - vdup.8 d0, r4 - vdup.8 d1, r12 - vld1.8 {d4}, [r1], r2 - vdup.8 d2, r6 - vdup.8 d3, r7 - - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - - vtrn.32 d0, d1 - vtrn.32 d2, d3 - -1: vld1.8 {d6}, [r1], r2 - vext.8 d7, d6, d7, #1 - vtrn.32 d6, d7 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d6, d2 - vld1.8 {d4}, [r1], r2 - vext.8 d5, d4, d5, #1 - vtrn.32 d4, d5 - pld [r1] - vmull.u8 q9, d6, d0 - vmlal.u8 q9, d4, d2 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - .else - vadd.u16 q8, q8, q11 - vshrn.u16 d16, q8, #6 - .endif - subs r3, r3, #2 - pld [r1, r2] - .ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 - .endif - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 1b - - pop {r4-r7, pc} - -2: tst r6, r6 - add r12, r12, r6 - vdup.8 d0, r4 - vdup.8 d1, r12 - vtrn.32 d0, d1 - - beq 4f - - vext.32 d1, d0, d1, #1 - vld1.32 {d4[0]}, [r1], r2 - -3: vld1.32 {d4[1]}, [r1], r2 - vmull.u8 q8, d4, d0 - vld1.32 {d4[0]}, [r1], r2 - vmull.u8 q9, d4, d1 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r1] - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - .else - vadd.u16 q8, q8, q11 - vshrn.u16 d16, q8, #6 - .endif - .ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 - .endif - subs r3, r3, #2 - pld [r1, r2] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 3b - - pop {r4-r7, pc} - -4: vld1.8 {d4}, [r1], r2 - vld1.8 {d6}, [r1], r2 - vext.8 d5, d4, d5, #1 - vext.8 d7, d6, d7, #1 - vtrn.32 d4, d5 - vtrn.32 d6, d7 - vmull.u8 q8, d4, d0 - vmull.u8 q9, d6, d0 - subs r3, r3, #2 - vadd.i16 d16, d16, d17 - vadd.i16 d17, d18, d19 - pld [r1] - .ifc \codec,h264 - vrshrn.u16 d16, q8, #6 - .else - vadd.u16 q8, q8, q11 - vshrn.u16 d16, q8, #6 - .endif - .ifc \type,avg - vld1.32 {d20[0]}, [lr,:32], r2 - vld1.32 {d20[1]}, [lr,:32], r2 - vrhadd.u8 d16, d16, d20 - .endif - pld [r1] - vst1.32 {d16[0]}, [r0,:32], r2 - vst1.32 {d16[1]}, [r0,:32], r2 - bgt 4b - - pop {r4-r7, pc} -endfunc -.endm - -.macro h264_chroma_mc2 type -function ff_\type\()_h264_chroma_mc2_neon, export=1 - push {r4-r6, lr} - ldr r4, [sp, #16] - ldr lr, [sp, #20] - pld [r1] - pld [r1, r2] - orrs r5, r4, lr - beq 2f - - mul r5, r4, lr - rsb r6, r5, lr, lsl #3 - rsb r12, r5, r4, lsl #3 - sub r4, r5, r4, lsl #3 - sub r4, r4, lr, lsl #3 - add r4, r4, #64 - vdup.8 d0, r4 - vdup.8 d2, r12 - vdup.8 d1, r6 - vdup.8 d3, r5 - vtrn.16 q0, q1 -1: - vld1.32 {d4[0]}, [r1], r2 - vld1.32 {d4[1]}, [r1], r2 - vrev64.32 d5, d4 - vld1.32 {d5[1]}, [r1] - vext.8 q3, q2, q2, #1 - vtrn.16 q2, q3 - vmull.u8 q8, d4, d0 - vmlal.u8 q8, d5, d1 - .ifc \type,avg - vld1.16 {d18[0]}, [r0,:16], r2 - vld1.16 {d18[1]}, [r0,:16] - sub r0, r0, r2 - .endif - vtrn.32 d16, d17 - vadd.i16 d16, d16, d17 - vrshrn.u16 d16, q8, #6 - .ifc \type,avg - vrhadd.u8 d16, d16, d18 - .endif - vst1.16 {d16[0]}, [r0,:16], r2 - vst1.16 {d16[1]}, [r0,:16], r2 - subs r3, r3, #2 - bgt 1b - pop {r4-r6, pc} -2: - .ifc \type,put - ldrh_post r5, r1, r2 - strh_post r5, r0, r2 - ldrh_post r6, r1, r2 - strh_post r6, r0, r2 - .else - vld1.16 {d16[0]}, [r1], r2 - vld1.16 {d16[1]}, [r1], r2 - vld1.16 {d18[0]}, [r0,:16], r2 - vld1.16 {d18[1]}, [r0,:16] - sub r0, r0, r2 - vrhadd.u8 d16, d16, d18 - vst1.16 {d16[0]}, [r0,:16], r2 - vst1.16 
{d16[1]}, [r0,:16], r2 - .endif - subs r3, r3, #2 - bgt 2b - pop {r4-r6, pc} -endfunc -.endm - - h264_chroma_mc8 put - h264_chroma_mc8 avg - h264_chroma_mc4 put - h264_chroma_mc4 avg - h264_chroma_mc2 put - h264_chroma_mc2 avg - -#if CONFIG_RV40_DECODER -const rv40bias - .short 0, 16, 32, 16 - .short 32, 28, 32, 28 - .short 0, 32, 16, 32 - .short 32, 28, 32, 28 -endconst - - h264_chroma_mc8 put, rv40 - h264_chroma_mc8 avg, rv40 - h264_chroma_mc4 put, rv40 - h264_chroma_mc4 avg, rv40 -#endif - -#if CONFIG_VC1_DECODER - h264_chroma_mc8 put, vc1 - h264_chroma_mc8 avg, vc1 - h264_chroma_mc4 put, vc1 - h264_chroma_mc4 avg, vc1 -#endif diff --git a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c deleted file mode 100644 index 2cafbaf..0000000 --- a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/h264dsp.h" - -int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); - -void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); -void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha, - int beta, int8_t *tc0); - -void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height, - int log2_den, int weight, int offset); -void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height, - int log2_den, int weight, int offset); -void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height, - int log2_den, int weight, int offset); - -void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride, - int height, int log2_den, int weightd, - int weights, int offset); -void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride, - int height, int log2_den, int weightd, - int weights, int offset); -void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride, - int height, int log2_den, int weightd, - int weights, int offset); - -void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset, - int16_t *block, int stride, - const uint8_t nnzc[6*8]); -void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset, - int16_t *block, int stride, - const uint8_t nnzc[6*8]); -void ff_h264_idct_add8_neon(uint8_t **dest, const 
int *block_offset, - int16_t *block, int stride, - const uint8_t nnzc[6*8]); - -void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride); -void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset, - int16_t *block, int stride, - const uint8_t nnzc[6*8]); - -static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth, - const int chroma_format_idc) -{ -#if HAVE_NEON - if (bit_depth == 8) { - c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon; - c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon; - if(chroma_format_idc == 1){ - c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon; - c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon; - } - - c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon; - c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon; - c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon; - - c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon; - c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon; - c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon; - - c->h264_idct_add = ff_h264_idct_add_neon; - c->h264_idct_dc_add = ff_h264_idct_dc_add_neon; - c->h264_idct_add16 = ff_h264_idct_add16_neon; - c->h264_idct_add16intra = ff_h264_idct_add16intra_neon; - if (chroma_format_idc == 1) - c->h264_idct_add8 = ff_h264_idct_add8_neon; - c->h264_idct8_add = ff_h264_idct8_add_neon; - c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon; - c->h264_idct8_add4 = ff_h264_idct8_add4_neon; - } -#endif // HAVE_NEON -} - -av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, - const int chroma_format_idc) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_armv6(cpu_flags)) - c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; - if (have_neon(cpu_flags)) - h264dsp_init_neon(c, bit_depth, chroma_format_idc); -} diff --git a/ffmpeg/libavcodec/arm/h264dsp_neon.S b/ffmpeg/libavcodec/arm/h264dsp_neon.S deleted file mode 100644 index 274a547..0000000 --- a/ffmpeg/libavcodec/arm/h264dsp_neon.S +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
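The h264dsp_neon.S file whose removal follows implements the H.264 deblocking (loop) filter plus weighted and bi-weighted prediction. For orientation only, here is a minimal scalar sketch of the per-pixel decision and p0/q0 update that the h264_loop_filter_luma and h264_loop_filter_chroma macros vectorise; clip3 and filter_edge_pixel are hypothetical helper names, and tc is derived from tc0 as the spec prescribes (the derivation differs between luma and chroma), so this is a sketch of the standard formula rather than the removed code:

#include <stdint.h>
#include <stdlib.h>

/* Clamp x into [lo, hi]; hypothetical helper. */
static inline int clip3(int x, int lo, int hi)
{
    return x < lo ? lo : x > hi ? hi : x;
}

/* Core H.264 edge filter for one pixel position across a block edge:
 * p1,p0 lie on one side, q0,q1 on the other; alpha/beta are the spec
 * thresholds and tc the clipping bound derived from tc0. */
static void filter_edge_pixel(uint8_t *p1, uint8_t *p0,
                              uint8_t *q0, uint8_t *q1,
                              int alpha, int beta, int tc)
{
    if (abs(*p0 - *q0) < alpha &&
        abs(*p1 - *p0) < beta  &&
        abs(*q1 - *q0) < beta) {
        int delta = clip3(((*q0 - *p0) * 4 + (*p1 - *q1) + 4) >> 3, -tc, tc);
        *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);
        *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
    }
}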
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "neon.S" - - /* H.264 loop filter */ - -.macro h264_loop_filter_start - ldr r12, [sp] - tst r2, r2 - ldr r12, [r12] - it ne - tstne r3, r3 - vmov.32 d24[0], r12 - and r12, r12, r12, lsl #16 - it eq - bxeq lr - ands r12, r12, r12, lsl #8 - it lt - bxlt lr -.endm - -.macro h264_loop_filter_luma - vdup.8 q11, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 q6, q8, q0 @ abs(p0 - q0) - vmovl.u16 q12, d24 - vabd.u8 q14, q9, q8 @ abs(p1 - p0) - vsli.16 q12, q12, #8 - vabd.u8 q15, q1, q0 @ abs(q1 - q0) - vsli.32 q12, q12, #16 - vclt.u8 q6, q6, q11 @ < alpha - vdup.8 q11, r3 @ beta - vclt.s8 q7, q12, #0 - vclt.u8 q14, q14, q11 @ < beta - vclt.u8 q15, q15, q11 @ < beta - vbic q6, q6, q7 - vabd.u8 q4, q10, q8 @ abs(p2 - p0) - vand q6, q6, q14 - vabd.u8 q5, q2, q0 @ abs(q2 - q0) - vclt.u8 q4, q4, q11 @ < beta - vand q6, q6, q15 - vclt.u8 q5, q5, q11 @ < beta - vand q4, q4, q6 - vand q5, q5, q6 - vand q12, q12, q6 - vrhadd.u8 q14, q8, q0 - vsub.i8 q6, q12, q4 - vqadd.u8 q7, q9, q12 - vhadd.u8 q10, q10, q14 - vsub.i8 q6, q6, q5 - vhadd.u8 q14, q2, q14 - vmin.u8 q7, q7, q10 - vqsub.u8 q11, q9, q12 - vqadd.u8 q2, q1, q12 - vmax.u8 q7, q7, q11 - vqsub.u8 q11, q1, q12 - vmin.u8 q14, q2, q14 - vmovl.u8 q2, d0 - vmax.u8 q14, q14, q11 - vmovl.u8 q10, d1 - vsubw.u8 q2, q2, d16 - vsubw.u8 q10, q10, d17 - vshl.i16 q2, q2, #2 - vshl.i16 q10, q10, #2 - vaddw.u8 q2, q2, d18 - vaddw.u8 q10, q10, d19 - vsubw.u8 q2, q2, d2 - vsubw.u8 q10, q10, d3 - vrshrn.i16 d4, q2, #3 - vrshrn.i16 d5, q10, #3 - vbsl q4, q7, q9 - vbsl q5, q14, q1 - vneg.s8 q7, q6 - vmovl.u8 q14, d16 - vmin.s8 q2, q2, q6 - vmovl.u8 q6, d17 - vmax.s8 q2, q2, q7 - vmovl.u8 q11, d0 - vmovl.u8 q12, d1 - vaddw.s8 q14, q14, d4 - vaddw.s8 q6, q6, d5 - vsubw.s8 q11, q11, d4 - vsubw.s8 q12, q12, d5 - vqmovun.s16 d16, q14 - vqmovun.s16 d17, q6 - vqmovun.s16 d0, q11 - vqmovun.s16 d1, q12 -.endm - -function ff_h264_v_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - vld1.8 {d0, d1}, [r0,:128], r1 - vld1.8 {d2, d3}, [r0,:128], r1 - vld1.8 {d4, d5}, [r0,:128], r1 - sub r0, r0, r1, lsl #2 - sub r0, r0, r1, lsl #1 - vld1.8 {d20,d21}, [r0,:128], r1 - vld1.8 {d18,d19}, [r0,:128], r1 - vld1.8 {d16,d17}, [r0,:128], r1 - - vpush {d8-d15} - - h264_loop_filter_luma - - sub r0, r0, r1, lsl #1 - vst1.8 {d8, d9}, [r0,:128], r1 - vst1.8 {d16,d17}, [r0,:128], r1 - vst1.8 {d0, d1}, [r0,:128], r1 - vst1.8 {d10,d11}, [r0,:128] - - vpop {d8-d15} - bx lr -endfunc - -function ff_h264_h_loop_filter_luma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #4 - vld1.8 {d6}, [r0], r1 - vld1.8 {d20}, [r0], r1 - vld1.8 {d18}, [r0], r1 - vld1.8 {d16}, [r0], r1 - vld1.8 {d0}, [r0], r1 - vld1.8 {d2}, [r0], r1 - vld1.8 {d4}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d7}, [r0], r1 - vld1.8 {d21}, [r0], r1 - vld1.8 {d19}, [r0], r1 - vld1.8 {d17}, [r0], r1 - vld1.8 {d1}, [r0], r1 - vld1.8 {d3}, [r0], r1 - vld1.8 {d5}, [r0], r1 - vld1.8 {d27}, [r0], r1 - - transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13 - - vpush {d8-d15} - - h264_loop_filter_luma - - transpose_4x4 q4, q8, q0, q5 - - sub r0, r0, r1, lsl #4 - add r0, r0, #2 - vst1.32 {d8[0]}, [r0], r1 - vst1.32 {d16[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d10[0]}, [r0], r1 - vst1.32 {d8[1]}, [r0], r1 - vst1.32 {d16[1]}, [r0], r1 - vst1.32 {d0[1]}, [r0], r1 - 
vst1.32 {d10[1]}, [r0], r1 - vst1.32 {d9[0]}, [r0], r1 - vst1.32 {d17[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - vst1.32 {d11[0]}, [r0], r1 - vst1.32 {d9[1]}, [r0], r1 - vst1.32 {d17[1]}, [r0], r1 - vst1.32 {d1[1]}, [r0], r1 - vst1.32 {d11[1]}, [r0], r1 - - vpop {d8-d15} - bx lr -endfunc - -.macro h264_loop_filter_chroma - vdup.8 d22, r2 @ alpha - vmovl.u8 q12, d24 - vabd.u8 d26, d16, d0 @ abs(p0 - q0) - vmovl.u8 q2, d0 - vabd.u8 d28, d18, d16 @ abs(p1 - p0) - vsubw.u8 q2, q2, d16 - vsli.16 d24, d24, #8 - vshl.i16 q2, q2, #2 - vabd.u8 d30, d2, d0 @ abs(q1 - q0) - vaddw.u8 q2, q2, d18 - vclt.u8 d26, d26, d22 @ < alpha - vsubw.u8 q2, q2, d2 - vdup.8 d22, r3 @ beta - vrshrn.i16 d4, q2, #3 - vclt.u8 d28, d28, d22 @ < beta - vclt.u8 d30, d30, d22 @ < beta - vmin.s8 d4, d4, d24 - vneg.s8 d25, d24 - vand d26, d26, d28 - vmax.s8 d4, d4, d25 - vand d26, d26, d30 - vmovl.u8 q11, d0 - vand d4, d4, d26 - vmovl.u8 q14, d16 - vaddw.s8 q14, q14, d4 - vsubw.s8 q11, q11, d4 - vqmovun.s16 d16, q14 - vqmovun.s16 d0, q11 -.endm - -function ff_h264_v_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, r1, lsl #1 - vld1.8 {d18}, [r0,:64], r1 - vld1.8 {d16}, [r0,:64], r1 - vld1.8 {d0}, [r0,:64], r1 - vld1.8 {d2}, [r0,:64] - - h264_loop_filter_chroma - - sub r0, r0, r1, lsl #1 - vst1.8 {d16}, [r0,:64], r1 - vst1.8 {d0}, [r0,:64], r1 - - bx lr -endfunc - -function ff_h264_h_loop_filter_chroma_neon, export=1 - h264_loop_filter_start - - sub r0, r0, #2 - vld1.32 {d18[0]}, [r0], r1 - vld1.32 {d16[0]}, [r0], r1 - vld1.32 {d0[0]}, [r0], r1 - vld1.32 {d2[0]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d0[1]}, [r0], r1 - vld1.32 {d2[1]}, [r0], r1 - - vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - h264_loop_filter_chroma - - vtrn.16 d18, d0 - vtrn.16 d16, d2 - vtrn.8 d18, d16 - vtrn.8 d0, d2 - - sub r0, r0, r1, lsl #3 - vst1.32 {d18[0]}, [r0], r1 - vst1.32 {d16[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d2[0]}, [r0], r1 - vst1.32 {d18[1]}, [r0], r1 - vst1.32 {d16[1]}, [r0], r1 - vst1.32 {d0[1]}, [r0], r1 - vst1.32 {d2[1]}, [r0], r1 - - bx lr -endfunc - -@ Biweighted prediction - -.macro biweight_16 macs, macd - vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q2, q8 - vmov q3, q8 -1: subs r3, r3, #2 - vld1.8 {d20-d21},[r0,:128], r2 - \macd q2, d0, d20 - pld [r0] - \macd q3, d0, d21 - vld1.8 {d22-d23},[r1,:128], r2 - \macs q2, d1, d22 - pld [r1] - \macs q3, d1, d23 - vmov q12, q8 - vld1.8 {d28-d29},[r0,:128], r2 - vmov q13, q8 - \macd q12, d0, d28 - pld [r0] - \macd q13, d0, d29 - vld1.8 {d30-d31},[r1,:128], r2 - \macs q12, d1, d30 - pld [r1] - \macs q13, d1, d31 - vshl.s16 q2, q2, q9 - vshl.s16 q3, q3, q9 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - vshl.s16 q12, q12, q9 - vshl.s16 q13, q13, q9 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vmov q3, q8 - vst1.8 {d4- d5}, [r6,:128], r2 - vmov q2, q8 - vst1.8 {d24-d25},[r6,:128], r2 - bne 1b - pop {r4-r6, pc} -.endm - -.macro biweight_8 macs, macd - vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q1, q8 - vmov q10, q8 -1: subs r3, r3, #2 - vld1.8 {d4},[r0,:64], r2 - \macd q1, d0, d4 - pld [r0] - vld1.8 {d5},[r1,:64], r2 - \macs q1, d1, d5 - pld [r1] - vld1.8 {d6},[r0,:64], r2 - \macd q10, d0, d6 - pld [r0] - vld1.8 {d7},[r1,:64], r2 - \macs q10, d1, d7 - pld [r1] - vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.8 {d2},[r6,:64], r2 - vmov q1, q8 - vst1.8 {d4},[r6,:64], r2 - bne 1b - pop {r4-r6, pc} -.endm - -.macro biweight_4 macs, macd - 
vdup.8 d0, r4 - vdup.8 d1, r5 - vmov q1, q8 - vmov q10, q8 -1: subs r3, r3, #4 - vld1.32 {d4[0]},[r0,:32], r2 - vld1.32 {d4[1]},[r0,:32], r2 - \macd q1, d0, d4 - pld [r0] - vld1.32 {d5[0]},[r1,:32], r2 - vld1.32 {d5[1]},[r1,:32], r2 - \macs q1, d1, d5 - pld [r1] - blt 2f - vld1.32 {d6[0]},[r0,:32], r2 - vld1.32 {d6[1]},[r0,:32], r2 - \macd q10, d0, d6 - pld [r0] - vld1.32 {d7[0]},[r1,:32], r2 - vld1.32 {d7[1]},[r1,:32], r2 - \macs q10, d1, d7 - pld [r1] - vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.32 {d2[0]},[r6,:32], r2 - vst1.32 {d2[1]},[r6,:32], r2 - vmov q1, q8 - vst1.32 {d4[0]},[r6,:32], r2 - vst1.32 {d4[1]},[r6,:32], r2 - bne 1b - pop {r4-r6, pc} -2: vshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vst1.32 {d2[0]},[r6,:32], r2 - vst1.32 {d2[1]},[r6,:32], r2 - pop {r4-r6, pc} -.endm - -.macro biweight_func w -function ff_biweight_h264_pixels_\w\()_neon, export=1 - push {r4-r6, lr} - ldr r12, [sp, #16] - add r4, sp, #20 - ldm r4, {r4-r6} - lsr lr, r4, #31 - add r6, r6, #1 - eors lr, lr, r5, lsr #30 - orr r6, r6, #1 - vdup.16 q9, r12 - lsl r6, r6, r12 - vmvn q9, q9 - vdup.16 q8, r6 - mov r6, r0 - beq 10f - subs lr, lr, #1 - beq 20f - subs lr, lr, #1 - beq 30f - b 40f -10: biweight_\w vmlal.u8, vmlal.u8 -20: rsb r4, r4, #0 - biweight_\w vmlal.u8, vmlsl.u8 -30: rsb r4, r4, #0 - rsb r5, r5, #0 - biweight_\w vmlsl.u8, vmlsl.u8 -40: rsb r5, r5, #0 - biweight_\w vmlsl.u8, vmlal.u8 -endfunc -.endm - - biweight_func 16 - biweight_func 8 - biweight_func 4 - -@ Weighted prediction - -.macro weight_16 add - vdup.8 d0, r12 -1: subs r2, r2, #2 - vld1.8 {d20-d21},[r0,:128], r1 - vmull.u8 q2, d0, d20 - pld [r0] - vmull.u8 q3, d0, d21 - vld1.8 {d28-d29},[r0,:128], r1 - vmull.u8 q12, d0, d28 - pld [r0] - vmull.u8 q13, d0, d29 - \add q2, q8, q2 - vrshl.s16 q2, q2, q9 - \add q3, q8, q3 - vrshl.s16 q3, q3, q9 - vqmovun.s16 d4, q2 - vqmovun.s16 d5, q3 - \add q12, q8, q12 - vrshl.s16 q12, q12, q9 - \add q13, q8, q13 - vrshl.s16 q13, q13, q9 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vst1.8 {d4- d5}, [r4,:128], r1 - vst1.8 {d24-d25},[r4,:128], r1 - bne 1b - pop {r4, pc} -.endm - -.macro weight_8 add - vdup.8 d0, r12 -1: subs r2, r2, #2 - vld1.8 {d4},[r0,:64], r1 - vmull.u8 q1, d0, d4 - pld [r0] - vld1.8 {d6},[r0,:64], r1 - vmull.u8 q10, d0, d6 - \add q1, q8, q1 - pld [r0] - vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - \add q10, q8, q10 - vrshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vst1.8 {d2},[r4,:64], r1 - vst1.8 {d4},[r4,:64], r1 - bne 1b - pop {r4, pc} -.endm - -.macro weight_4 add - vdup.8 d0, r12 - vmov q1, q8 - vmov q10, q8 -1: subs r2, r2, #4 - vld1.32 {d4[0]},[r0,:32], r1 - vld1.32 {d4[1]},[r0,:32], r1 - vmull.u8 q1, d0, d4 - pld [r0] - blt 2f - vld1.32 {d6[0]},[r0,:32], r1 - vld1.32 {d6[1]},[r0,:32], r1 - vmull.u8 q10, d0, d6 - pld [r0] - \add q1, q8, q1 - vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - \add q10, q8, q10 - vrshl.s16 q10, q10, q9 - vqmovun.s16 d4, q10 - vmov q10, q8 - vst1.32 {d2[0]},[r4,:32], r1 - vst1.32 {d2[1]},[r4,:32], r1 - vmov q1, q8 - vst1.32 {d4[0]},[r4,:32], r1 - vst1.32 {d4[1]},[r4,:32], r1 - bne 1b - pop {r4, pc} -2: \add q1, q8, q1 - vrshl.s16 q1, q1, q9 - vqmovun.s16 d2, q1 - vst1.32 {d2[0]},[r4,:32], r1 - vst1.32 {d2[1]},[r4,:32], r1 - pop {r4, pc} -.endm - -.macro weight_func w -function ff_weight_h264_pixels_\w\()_neon, export=1 - push {r4, lr} - ldr r12, [sp, #8] - ldr r4, [sp, #12] - cmp r3, #1 - lsl r4, r4, r3 - vdup.16 q8, r4 - mov r4, r0 - ble 20f - rsb lr, r3, #1 - vdup.16 q9, lr - cmp r12, #0 - blt 
10f - weight_\w vhadd.s16 -10: rsb r12, r12, #0 - weight_\w vhsub.s16 -20: rsb lr, r3, #0 - vdup.16 q9, lr - cmp r12, #0 - blt 10f - weight_\w vadd.s16 -10: rsb r12, r12, #0 - weight_\w vsub.s16 -endfunc -.endm - - weight_func 16 - weight_func 8 - weight_func 4 diff --git a/ffmpeg/libavcodec/arm/h264idct_neon.S b/ffmpeg/libavcodec/arm/h264idct_neon.S deleted file mode 100644 index 2edeca2..0000000 --- a/ffmpeg/libavcodec/arm/h264idct_neon.S +++ /dev/null @@ -1,413 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_h264_idct_add_neon, export=1 - vld1.64 {d0-d3}, [r1,:128] - vmov.i16 q15, #0 - - vswp d1, d2 - vst1.16 {q15}, [r1,:128]! - vadd.i16 d4, d0, d1 - vst1.16 {q15}, [r1,:128]! - vshr.s16 q8, q1, #1 - vsub.i16 d5, d0, d1 - vadd.i16 d6, d2, d17 - vsub.i16 d7, d16, d3 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vtrn.16 d0, d1 - vtrn.16 d3, d2 - vtrn.32 d0, d3 - vtrn.32 d1, d2 - - vadd.i16 d4, d0, d3 - vld1.32 {d18[0]}, [r0,:32], r2 - vswp d1, d3 - vshr.s16 q8, q1, #1 - vld1.32 {d19[1]}, [r0,:32], r2 - vsub.i16 d5, d0, d1 - vld1.32 {d18[1]}, [r0,:32], r2 - vadd.i16 d6, d16, d3 - vld1.32 {d19[0]}, [r0,:32], r2 - vsub.i16 d7, d2, d17 - sub r0, r0, r2, lsl #2 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vrshr.s16 q0, q0, #6 - vrshr.s16 q1, q1, #6 - - vaddw.u8 q0, q0, d18 - vaddw.u8 q1, q1, d19 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - - sub r1, r1, #32 - bx lr -endfunc - -function ff_h264_idct_dc_add_neon, export=1 - mov r3, #0 - vld1.16 {d2[],d3[]}, [r1,:16] - strh r3, [r1] - vrshr.s16 q1, q1, #6 - vld1.32 {d0[0]}, [r0,:32], r2 - vld1.32 {d0[1]}, [r0,:32], r2 - vaddw.u8 q2, q1, d0 - vld1.32 {d1[0]}, [r0,:32], r2 - vld1.32 {d1[1]}, [r0,:32], r2 - vaddw.u8 q1, q1, d1 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q1 - sub r0, r0, r2, lsl #2 - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - bx lr -endfunc - -function ff_h264_idct_add16_neon, export=1 - push {r4-r8,lr} - mov r4, r0 - mov r5, r1 - mov r1, r2 - mov r2, r3 - ldr r6, [sp, #24] - movrel r7, scan8 - mov ip, #16 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 - ldrb r8, [r6, r8] - subs r8, r8, #1 - blt 2f - ldrsh lr, [r1] - add r0, r0, r4 - it ne - movne lr, #0 - cmp lr, #0 - ite ne - adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB - adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB - blx lr -2: subs ip, ip, #1 - add r1, r1, #32 - bne 1b - pop {r4-r8,pc} -endfunc - -function ff_h264_idct_add16intra_neon, export=1 - push {r4-r8,lr} - mov r4, r0 - mov r5, r1 - mov r1, r2 - mov r2, r3 - ldr r6, [sp, #24] - 
movrel r7, scan8 - mov ip, #16 -1: ldrb r8, [r7], #1 - ldr r0, [r5], #4 - ldrb r8, [r6, r8] - add r0, r0, r4 - cmp r8, #0 - ldrsh r8, [r1] - iteet ne - adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB - adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB - cmpeq r8, #0 - blxne lr - subs ip, ip, #1 - add r1, r1, #32 - bne 1b - pop {r4-r8,pc} -endfunc - -function ff_h264_idct_add8_neon, export=1 - push {r4-r10,lr} - ldm r0, {r4,r9} - add r5, r1, #16*4 - add r1, r2, #16*32 - mov r2, r3 - mov r10, r1 - ldr r6, [sp, #32] - movrel r7, scan8+16 - mov r12, #0 -1: ldrb r8, [r7, r12] - ldr r0, [r5, r12, lsl #2] - ldrb r8, [r6, r8] - add r0, r0, r4 - add r1, r10, r12, lsl #5 - cmp r8, #0 - ldrsh r8, [r1] - iteet ne - adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB - adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB - cmpeq r8, #0 - blxne lr - add r12, r12, #1 - cmp r12, #4 - itt eq - moveq r12, #16 - moveq r4, r9 - cmp r12, #20 - blt 1b - pop {r4-r10,pc} -endfunc - -.macro idct8x8_cols pass - .if \pass == 0 - qa .req q2 - qb .req q14 - vshr.s16 q2, q10, #1 - vadd.i16 q0, q8, q12 - vld1.16 {q14-q15},[r1,:128] - vst1.16 {q3}, [r1,:128]! - vst1.16 {q3}, [r1,:128]! - vsub.i16 q1, q8, q12 - vshr.s16 q3, q14, #1 - vsub.i16 q2, q2, q14 - vadd.i16 q3, q3, q10 - .else - qa .req q14 - qb .req q2 - vtrn.32 q8, q10 - vtrn.16 q12, q13 - vtrn.32 q9, q11 - vtrn.32 q12, q2 - vtrn.32 q13, q15 - vswp d21, d4 - vshr.s16 q14, q10, #1 - vswp d17, d24 - vshr.s16 q3, q2, #1 - vswp d19, d26 - vadd.i16 q0, q8, q12 - vswp d23, d30 - vsub.i16 q1, q8, q12 - vsub.i16 q14, q14, q2 - vadd.i16 q3, q3, q10 - .endif - vadd.i16 q10, q1, qa - vsub.i16 q12, q1, qa - vadd.i16 q8, q0, q3 - vsub.i16 qb, q0, q3 - vsub.i16 q0, q13, q11 - vadd.i16 q1, q15, q9 - vsub.i16 qa, q15, q9 - vadd.i16 q3, q13, q11 - vsub.i16 q0, q0, q15 - vsub.i16 q1, q1, q11 - vadd.i16 qa, qa, q13 - vadd.i16 q3, q3, q9 - vshr.s16 q9, q9, #1 - vshr.s16 q11, q11, #1 - vshr.s16 q13, q13, #1 - vshr.s16 q15, q15, #1 - vsub.i16 q0, q0, q15 - vsub.i16 q1, q1, q11 - vadd.i16 qa, qa, q13 - vadd.i16 q3, q3, q9 - vshr.s16 q9, q0, #2 - vshr.s16 q11, q1, #2 - vshr.s16 q13, qa, #2 - vshr.s16 q15, q3, #2 - vsub.i16 q3, q3, q9 - vsub.i16 qa, q11, qa - vadd.i16 q1, q1, q13 - vadd.i16 q0, q0, q15 - .if \pass == 0 - vsub.i16 q15, q8, q3 - vadd.i16 q8, q8, q3 - vadd.i16 q9, q10, q2 - vsub.i16 q2, q10, q2 - vtrn.16 q8, q9 - vadd.i16 q10, q12, q1 - vtrn.16 q2, q15 - vadd.i16 q11, q14, q0 - vsub.i16 q13, q12, q1 - vtrn.16 q10, q11 - vsub.i16 q12, q14, q0 - .else - vsub.i16 q15, q8, q3 - vadd.i16 q8, q8, q3 - vadd.i16 q9, q10, q14 - vsub.i16 q14, q10, q14 - vadd.i16 q10, q12, q1 - vsub.i16 q13, q12, q1 - vadd.i16 q11, q2, q0 - vsub.i16 q12, q2, q0 - .endif - .unreq qa - .unreq qb -.endm - -function ff_h264_idct8_add_neon, export=1 - vmov.i16 q3, #0 - vld1.16 {q8-q9}, [r1,:128] - vst1.16 {q3}, [r1,:128]! - vst1.16 {q3}, [r1,:128]! - vld1.16 {q10-q11},[r1,:128] - vst1.16 {q3}, [r1,:128]! - vst1.16 {q3}, [r1,:128]! - vld1.16 {q12-q13},[r1,:128] - vst1.16 {q3}, [r1,:128]! - vst1.16 {q3}, [r1,:128]! 
- - idct8x8_cols 0 - idct8x8_cols 1 - - mov r3, r0 - vrshr.s16 q8, q8, #6 - vld1.8 {d0}, [r0,:64], r2 - vrshr.s16 q9, q9, #6 - vld1.8 {d1}, [r0,:64], r2 - vrshr.s16 q10, q10, #6 - vld1.8 {d2}, [r0,:64], r2 - vrshr.s16 q11, q11, #6 - vld1.8 {d3}, [r0,:64], r2 - vrshr.s16 q12, q12, #6 - vld1.8 {d4}, [r0,:64], r2 - vrshr.s16 q13, q13, #6 - vld1.8 {d5}, [r0,:64], r2 - vrshr.s16 q14, q14, #6 - vld1.8 {d6}, [r0,:64], r2 - vrshr.s16 q15, q15, #6 - vld1.8 {d7}, [r0,:64], r2 - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vqmovun.s16 d0, q8 - vaddw.u8 q11, q11, d3 - vqmovun.s16 d1, q9 - vaddw.u8 q12, q12, d4 - vqmovun.s16 d2, q10 - vst1.8 {d0}, [r3,:64], r2 - vaddw.u8 q13, q13, d5 - vqmovun.s16 d3, q11 - vst1.8 {d1}, [r3,:64], r2 - vaddw.u8 q14, q14, d6 - vqmovun.s16 d4, q12 - vst1.8 {d2}, [r3,:64], r2 - vaddw.u8 q15, q15, d7 - vqmovun.s16 d5, q13 - vst1.8 {d3}, [r3,:64], r2 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - vst1.8 {d4}, [r3,:64], r2 - vst1.8 {d5}, [r3,:64], r2 - vst1.8 {d6}, [r3,:64], r2 - vst1.8 {d7}, [r3,:64], r2 - - sub r1, r1, #128 - bx lr -endfunc - -function ff_h264_idct8_dc_add_neon, export=1 - mov r3, #0 - vld1.16 {d30[],d31[]},[r1,:16] - strh r3, [r1] - vld1.32 {d0}, [r0,:64], r2 - vrshr.s16 q15, q15, #6 - vld1.32 {d1}, [r0,:64], r2 - vld1.32 {d2}, [r0,:64], r2 - vaddw.u8 q8, q15, d0 - vld1.32 {d3}, [r0,:64], r2 - vaddw.u8 q9, q15, d1 - vld1.32 {d4}, [r0,:64], r2 - vaddw.u8 q10, q15, d2 - vld1.32 {d5}, [r0,:64], r2 - vaddw.u8 q11, q15, d3 - vld1.32 {d6}, [r0,:64], r2 - vaddw.u8 q12, q15, d4 - vld1.32 {d7}, [r0,:64], r2 - vaddw.u8 q13, q15, d5 - vaddw.u8 q14, q15, d6 - vaddw.u8 q15, q15, d7 - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - sub r0, r0, r2, lsl #3 - vst1.32 {d0}, [r0,:64], r2 - vqmovun.s16 d4, q12 - vst1.32 {d1}, [r0,:64], r2 - vqmovun.s16 d5, q13 - vst1.32 {d2}, [r0,:64], r2 - vqmovun.s16 d6, q14 - vst1.32 {d3}, [r0,:64], r2 - vqmovun.s16 d7, q15 - vst1.32 {d4}, [r0,:64], r2 - vst1.32 {d5}, [r0,:64], r2 - vst1.32 {d6}, [r0,:64], r2 - vst1.32 {d7}, [r0,:64], r2 - bx lr -endfunc - -function ff_h264_idct8_add4_neon, export=1 - push {r4-r8,lr} - mov r4, r0 - mov r5, r1 - mov r1, r2 - mov r2, r3 - ldr r6, [sp, #24] - movrel r7, scan8 - mov r12, #16 -1: ldrb r8, [r7], #4 - ldr r0, [r5], #16 - ldrb r8, [r6, r8] - subs r8, r8, #1 - blt 2f - ldrsh lr, [r1] - add r0, r0, r4 - it ne - movne lr, #0 - cmp lr, #0 - ite ne - adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB - adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB - blx lr -2: subs r12, r12, #4 - add r1, r1, #128 - bne 1b - pop {r4-r8,pc} -endfunc - -const scan8 - .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 - .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 - .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 - .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 - .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 - .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 - .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 - .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 - .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8 - .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8 - .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8 - .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8 -endconst diff --git a/ffmpeg/libavcodec/arm/h264pred_init_arm.c b/ffmpeg/libavcodec/arm/h264pred_init_arm.c deleted file mode 100644 index 1562f0b..0000000 --- a/ffmpeg/libavcodec/arm/h264pred_init_arm.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. 
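The h264pred_init_arm.c file whose removal follows wires up the NEON intra prediction routines. As a reference point, here is a minimal scalar sketch of the simplest of them, 16x16 DC prediction with both neighbours available, which ff_pred16x16_dc_neon computes with vector sums before broadcasting the rounded average; pred16x16_dc_c is a hypothetical name, not part of the removed file:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 16x16 DC intra prediction: average the 16 pixels above and the 16 pixels
 * to the left of the block, round, and fill the block with the result.
 * src points at the top-left sample of the block being predicted. */
static void pred16x16_dc_c(uint8_t *src, ptrdiff_t stride)
{
    int sum = 0;
    for (int i = 0; i < 16; i++)
        sum += src[i - stride] + src[i * stride - 1]; /* top row + left column */
    int dc = (sum + 16) >> 5;
    for (int y = 0; y < 16; y++)
        memset(src + y * stride, dc, 16);
}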
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/h264pred.h" - -void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride); - -void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride); -void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride); - -static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id, - const int bit_depth, - const int chroma_format_idc) -{ -#if HAVE_NEON - const int high_depth = bit_depth > 8; - - if (high_depth) - return; - if(chroma_format_idc == 1){ - h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon; - h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon; - if (codec_id != AV_CODEC_ID_VP8) - h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon; - h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon; - if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) { - h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon; - h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon; - h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon; - h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon; - h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon; - h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon; - h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon; - } - } - - h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon; - h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon; - h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon; - h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon; - h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon; - h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon; - if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) - h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon; -#endif // HAVE_NEON -} - -av_cold void 
ff_h264_pred_init_arm(H264PredContext *h, int codec_id, - int bit_depth, const int chroma_format_idc) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc); -} diff --git a/ffmpeg/libavcodec/arm/h264pred_neon.S b/ffmpeg/libavcodec/arm/h264pred_neon.S deleted file mode 100644 index 4dc47ba..0000000 --- a/ffmpeg/libavcodec/arm/h264pred_neon.S +++ /dev/null @@ -1,359 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - - .macro ldcol.8 rd, rs, rt, n=8, hi=0 -.if \n == 8 || \hi == 0 - vld1.8 {\rd[0]}, [\rs], \rt - vld1.8 {\rd[1]}, [\rs], \rt - vld1.8 {\rd[2]}, [\rs], \rt - vld1.8 {\rd[3]}, [\rs], \rt -.endif -.if \n == 8 || \hi == 1 - vld1.8 {\rd[4]}, [\rs], \rt - vld1.8 {\rd[5]}, [\rs], \rt - vld1.8 {\rd[6]}, [\rs], \rt - vld1.8 {\rd[7]}, [\rs], \rt -.endif - .endm - - .macro add16x8 dq, dl, dh, rl, rh - vaddl.u8 \dq, \rl, \rh - vadd.u16 \dl, \dl, \dh - vpadd.u16 \dl, \dl, \dl - vpadd.u16 \dl, \dl, \dl - .endm - -function ff_pred16x16_128_dc_neon, export=1 - vmov.i8 q0, #128 - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_top_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {q0}, [r2,:128] - add16x8 q0, d0, d1, d0, d1 - vrshrn.u16 d0, q0, #4 - vdup.8 q0, d0[0] - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_left_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1 - ldcol.8 d1, r2, r1 - add16x8 q0, d0, d1, d0, d1 - vrshrn.u16 d0, q0, #4 - vdup.8 q0, d0[0] - b .L_pred16x16_dc_end -endfunc - -function ff_pred16x16_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {q0}, [r2,:128] - sub r2, r0, #1 - ldcol.8 d2, r2, r1 - ldcol.8 d3, r2, r1 - vaddl.u8 q0, d0, d1 - vaddl.u8 q1, d2, d3 - vadd.u16 q0, q0, q1 - vadd.u16 d0, d0, d1 - vpadd.u16 d0, d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #5 - vdup.8 q0, d0[0] -.L_pred16x16_dc_end: - mov r3, #8 -6: vst1.8 {q0}, [r0,:128], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 6b - bx lr -endfunc - -function ff_pred16x16_hor_neon, export=1 - sub r2, r0, #1 - mov r3, #16 -1: vld1.8 {d0[],d1[]},[r2], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred16x16_vert_neon, export=1 - sub r0, r0, r1 - vld1.8 {q0}, [r0,:128], r1 - mov r3, #8 -1: vst1.8 {q0}, [r0,:128], r1 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred16x16_plane_neon, export=1 - sub r3, r0, r1 - add r2, r3, #8 - sub r3, r3, #1 - vld1.8 {d0}, [r3] - vld1.8 {d2}, [r2,:64], r1 - ldcol.8 d1, r3, r1 - add r3, r3, r1 - ldcol.8 d3, r3, r1 - vrev64.8 q0, q0 - vaddl.u8 q8, d2, d3 - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d1 - movrel r3, p16weight - vld1.8 {q0}, [r3,:128] - vmul.s16 q2, q2, q0 - vmul.s16 
q3, q3, q0 - vadd.i16 d4, d4, d5 - vadd.i16 d5, d6, d7 - vpadd.i16 d4, d4, d5 - vpadd.i16 d4, d4, d4 - vshll.s16 q3, d4, #2 - vaddw.s16 q2, q3, d4 - vrshrn.s32 d4, q2, #6 - mov r3, #0 - vtrn.16 d4, d5 - vadd.i16 d2, d4, d5 - vshl.i16 d3, d2, #3 - vrev64.16 d16, d17 - vsub.i16 d3, d3, d2 - vadd.i16 d16, d16, d0 - vshl.i16 d2, d16, #4 - vsub.i16 d2, d2, d3 - vshl.i16 d3, d4, #4 - vext.16 q0, q0, q0, #7 - vsub.i16 d6, d5, d3 - vmov.16 d0[0], r3 - vmul.i16 q0, q0, d4[0] - vdup.16 q1, d2[0] - vdup.16 q2, d4[0] - vdup.16 q3, d6[0] - vshl.i16 q2, q2, #3 - vadd.i16 q1, q1, q0 - vadd.i16 q3, q3, q2 - mov r3, #16 -1: - vqshrun.s16 d0, q1, #5 - vadd.i16 q1, q1, q2 - vqshrun.s16 d1, q1, #5 - vadd.i16 q1, q1, q3 - vst1.8 {q0}, [r0,:128], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -const p16weight, align=4 - .short 1,2,3,4,5,6,7,8 -endconst - -function ff_pred8x8_hor_neon, export=1 - sub r2, r0, #1 - mov r3, #8 -1: vld1.8 {d0[]}, [r2], r1 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_vert_neon, export=1 - sub r0, r0, r1 - vld1.8 {d0}, [r0,:64], r1 - mov r3, #4 -1: vst1.8 {d0}, [r0,:64], r1 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_plane_neon, export=1 - sub r3, r0, r1 - add r2, r3, #4 - sub r3, r3, #1 - vld1.32 {d0[0]}, [r3] - vld1.32 {d2[0]}, [r2,:32], r1 - ldcol.8 d0, r3, r1, 4, hi=1 - add r3, r3, r1 - ldcol.8 d3, r3, r1, 4 - vaddl.u8 q8, d2, d3 - vrev32.8 d0, d0 - vtrn.32 d2, d3 - vsubl.u8 q2, d2, d0 - movrel r3, p16weight - vld1.16 {q0}, [r3,:128] - vmul.s16 d4, d4, d0 - vmul.s16 d5, d5, d0 - vpadd.i16 d4, d4, d5 - vpaddl.s16 d4, d4 - vshl.i32 d5, d4, #4 - vadd.s32 d4, d4, d5 - vrshrn.s32 d4, q2, #5 - mov r3, #0 - vtrn.16 d4, d5 - vadd.i16 d2, d4, d5 - vshl.i16 d3, d2, #2 - vrev64.16 d16, d16 - vsub.i16 d3, d3, d2 - vadd.i16 d16, d16, d0 - vshl.i16 d2, d16, #4 - vsub.i16 d2, d2, d3 - vshl.i16 d3, d4, #3 - vext.16 q0, q0, q0, #7 - vsub.i16 d6, d5, d3 - vmov.16 d0[0], r3 - vmul.i16 q0, q0, d4[0] - vdup.16 q1, d2[0] - vdup.16 q2, d4[0] - vdup.16 q3, d6[0] - vshl.i16 q2, q2, #3 - vadd.i16 q1, q1, q0 - vadd.i16 q3, q3, q2 - mov r3, #8 -1: - vqshrun.s16 d0, q1, #5 - vadd.i16 q1, q1, q3 - vst1.8 {d0}, [r0,:64], r1 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -function ff_pred8x8_128_dc_neon, export=1 - vmov.i8 q0, #128 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_top_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vdup.8 d1, d0[1] - vdup.8 d0, d0[0] - vtrn.32 d0, d1 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_left_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1 - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vdup.8 d1, d0[1] - vdup.8 d0, d0[0] - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - sub r2, r0, #1 - ldcol.8 d1, r2, r1 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - vrshrn.u16 d2, q0, #3 - vrshrn.u16 d3, q0, #2 - vdup.8 d0, d2[4] - vdup.8 d1, d3[3] - vdup.8 d4, d3[2] - vdup.8 d5, d2[5] - vtrn.32 q0, q2 -.L_pred8x8_dc_end: - mov r3, #4 - add r2, r0, r1, lsl #2 -6: vst1.8 {d0}, [r0,:64], r1 - vst1.8 {d1}, [r2,:64], r1 - subs r3, r3, #1 - bne 6b - bx lr -endfunc - -function ff_pred8x8_l0t_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - sub r2, r0, #1 - ldcol.8 d1, r2, r1, 4 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - 
vrshrn.u16 d2, q0, #3 - vrshrn.u16 d3, q0, #2 - vdup.8 d0, d2[4] - vdup.8 d1, d3[0] - vdup.8 q2, d3[2] - vtrn.32 q0, q2 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_l00_dc_neon, export=1 - sub r2, r0, #1 - ldcol.8 d0, r2, r1, 4 - vpaddl.u8 d0, d0 - vpadd.u16 d0, d0, d0 - vrshrn.u16 d0, q0, #2 - vmov.i8 d1, #128 - vdup.8 d0, d0[0] - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_0lt_dc_neon, export=1 - sub r2, r0, r1 - vld1.8 {d0}, [r2,:64] - add r2, r0, r1, lsl #2 - sub r2, r2, #1 - ldcol.8 d1, r2, r1, 4, hi=1 - vtrn.32 d0, d1 - vpaddl.u8 q0, q0 - vpadd.u16 d0, d0, d1 - vpadd.u16 d1, d0, d0 - vrshrn.u16 d3, q0, #2 - vrshrn.u16 d2, q0, #3 - vdup.8 d0, d3[0] - vdup.8 d1, d3[3] - vdup.8 d4, d3[2] - vdup.8 d5, d2[5] - vtrn.32 q0, q2 - b .L_pred8x8_dc_end -endfunc - -function ff_pred8x8_0l0_dc_neon, export=1 - add r2, r0, r1, lsl #2 - sub r2, r2, #1 - ldcol.8 d1, r2, r1, 4 - vpaddl.u8 d2, d1 - vpadd.u16 d2, d2, d2 - vrshrn.u16 d1, q1, #2 - vmov.i8 d0, #128 - vdup.8 d1, d1[0] - b .L_pred8x8_dc_end -endfunc diff --git a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c deleted file mode 100644 index eaa1324..0000000 --- a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c +++ /dev/null @@ -1,171 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
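The h264qpel_init_arm.c file whose removal follows only declares the NEON quarter-pel routines and fills the dispatch tables. Judging by the assignment order further down (mc10 at index 1, mc01 at index 4, and so on), entry dx + 4*dy of each table serves the quarter-pel fractional offset (dx, dy). A hedged sketch of how a caller might select a routine, with mc_luma16 and its arguments purely illustrative:

#include <stddef.h>
#include <stdint.h>

typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, ptrdiff_t stride);

/* Pick the routine matching the quarter-pel fractional parts of a motion
 * vector (mx, my) given in quarter-pel units, then run it on the block. */
static void mc_luma16(qpel_mc_func tab[16], uint8_t *dst, uint8_t *src,
                      ptrdiff_t stride, int mx, int my)
{
    int dx = mx & 3, dy = my & 3;          /* fractional parts */
    src += (my >> 2) * stride + (mx >> 2); /* integer part of the MV */
    tab[dx + 4 * dy](dst, src, stride);
}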
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "config.h" -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/h264qpel.h" - -void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); - -void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); - -void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); -void 
ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); - -void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t); -void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t); - -av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth) -{ - const int high_bit_depth = bit_depth > 8; - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags) && !high_bit_depth) { - c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon; - c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon; - c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon; - c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon; - c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon; - c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon; - c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon; - c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon; - c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon; - c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon; - c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon; - c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon; - c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon; - c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon; - c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon; - c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon; - - c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon; - c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon; - c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon; - c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon; - c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon; - c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon; - c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon; - c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon; - c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon; - c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon; - c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon; - c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon; - c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon; - c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon; - c->put_h264_qpel_pixels_tab[1][14] = 
ff_put_h264_qpel8_mc23_neon; - c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon; - - c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon; - c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon; - c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon; - c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon; - c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon; - c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon; - c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon; - c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon; - c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon; - c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon; - c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon; - c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon; - c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon; - c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon; - c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon; - c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon; - - c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon; - c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon; - c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon; - c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon; - c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon; - c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon; - c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon; - c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon; - c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon; - c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon; - c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon; - c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon; - c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon; - c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon; - c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon; - c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/h264qpel_neon.S b/ffmpeg/libavcodec/arm/h264qpel_neon.S deleted file mode 100644 index 21336c6..0000000 --- a/ffmpeg/libavcodec/arm/h264qpel_neon.S +++ /dev/null @@ -1,955 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
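The h264qpel_neon.S file whose removal follows contains the quarter-pel lowpass macros. What lowpass_8 computes per output sample is the standard H.264 6-tap half-pel filter (1, -5, 20, 20, -5, 1) with rounding and clipping, and the *_l2 variants then take the rounded average of two planes with vrhadd.u8 to reach the quarter-pel positions. A minimal scalar sketch of those two steps, with hypothetical helper names:

#include <stdint.h>

/* Clamp to the 8-bit pixel range; hypothetical helper. */
static inline uint8_t clip_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

/* One row of horizontal half-pel interpolation: the sample halfway between
 * src[x] and src[x + 1] is filtered from the six surrounding full pels.
 * The caller must provide two pixels of margin on the left and three on
 * the right, much as the NEON code loads wider vectors than it stores. */
static void h_lowpass_row(uint8_t *dst, const uint8_t *src, int width)
{
    for (int x = 0; x < width; x++) {
        int b = src[x - 2] + src[x + 3]
              - 5 * (src[x - 1] + src[x + 2])
              + 20 * (src[x] + src[x + 1]);
        dst[x] = clip_u8((b + 16) >> 5);
    }
}

/* Quarter-pel positions: rounded average of the two nearest half/full-pel
 * samples, matching the vrhadd.u8 in the *_l2 functions. */
static inline uint8_t qpel_avg(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}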
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "neon.S" - - /* H.264 qpel MC */ - -.macro lowpass_const r - movw \r, #5 - movt \r, #20 - vmov.32 d6[0], \r -.endm - -.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1 - .if \narrow - t0 .req q0 - t1 .req q8 - .else - t0 .req \d0 - t1 .req \d1 - .endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vext.8 d18, \r2, \r3, #2 - vmla.i16 t0, q1, d6[1] - vext.8 d19, \r2, \r3, #3 - vaddl.u8 q9, d18, d19 - vext.8 d20, \r2, \r3, #1 - vmls.i16 t0, q2, d6[0] - vext.8 d21, \r2, \r3, #4 - vaddl.u8 q10, d20, d21 - vext.8 d31, \r2, \r3, #5 - vaddl.u8 t1, \r2, d31 - vmla.i16 t1, q9, d6[1] - vmls.i16 t1, q10, d6[0] - .if \narrow - vqrshrun.s16 \d0, t0, #5 - vqrshrun.s16 \d1, t1, #5 - .endif - .unreq t0 - .unreq t1 -.endm - -.macro lowpass_8_1 r0, r1, d0, narrow=1 - .if \narrow - t0 .req q0 - .else - t0 .req \d0 - .endif - vext.8 d2, \r0, \r1, #2 - vext.8 d3, \r0, \r1, #3 - vaddl.u8 q1, d2, d3 - vext.8 d4, \r0, \r1, #1 - vext.8 d5, \r0, \r1, #4 - vaddl.u8 q2, d4, d5 - vext.8 d30, \r0, \r1, #5 - vaddl.u8 t0, \r0, d30 - vmla.i16 t0, q1, d6[1] - vmls.i16 t0, q2, d6[0] - .if \narrow - vqrshrun.s16 \d0, t0, #5 - .endif - .unreq t0 -.endm - -.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d - vext.16 q1, \r0, \r1, #2 - vext.16 q0, \r0, \r1, #3 - vaddl.s16 q9, d2, d0 - vext.16 q2, \r0, \r1, #1 - vaddl.s16 q1, d3, d1 - vext.16 q3, \r0, \r1, #4 - vaddl.s16 q10, d4, d6 - vext.16 \r1, \r0, \r1, #5 - vaddl.s16 q2, d5, d7 - vaddl.s16 q0, \h0, \h1 - vaddl.s16 q8, \l0, \l1 - - vshl.i32 q3, q9, #4 - vshl.i32 q9, q9, #2 - vshl.i32 q15, q10, #2 - vadd.i32 q9, q9, q3 - vadd.i32 q10, q10, q15 - - vshl.i32 q3, q1, #4 - vshl.i32 q1, q1, #2 - vshl.i32 q15, q2, #2 - vadd.i32 q1, q1, q3 - vadd.i32 q2, q2, q15 - - vadd.i32 q9, q9, q8 - vsub.i32 q9, q9, q10 - - vadd.i32 q1, q1, q0 - vsub.i32 q1, q1, q2 - - vrshrn.s32 d18, q9, #10 - vrshrn.s32 d19, q1, #10 - - vqmovun.s16 \d, q9 -.endm - -function put_h264_qpel16_h_lowpass_neon_packed - mov r4, lr - mov r12, #16 - mov r3, #8 - bl put_h264_qpel8_h_lowpass_neon - sub r1, r1, r2, lsl #4 - add r1, r1, #8 - mov r12, #16 - mov lr, r4 - b put_h264_qpel8_h_lowpass_neon -endfunc - -.macro h264_qpel_h_lowpass type -function \type\()_h264_qpel16_h_lowpass_neon - push {lr} - mov r12, #16 - bl \type\()_h264_qpel8_h_lowpass_neon - sub r0, r0, r3, lsl #4 - sub r1, r1, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - mov r12, #16 - pop {lr} -endfunc - -function \type\()_h264_qpel8_h_lowpass_neon -1: vld1.8 {d0, d1}, [r1], r2 - vld1.8 {d16,d17}, [r1], r2 - subs r12, r12, #2 - lowpass_8 d0, d1, d16, d17, d0, d16 - .ifc \type,avg - vld1.8 {d2}, [r0,:64], r3 - vrhadd.u8 d0, d0, d2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 d16, d16, d3 - sub r0, r0, r3 - .endif - vst1.8 {d0}, [r0,:64], r3 - vst1.8 {d16}, [r0,:64], r3 - bne 1b - bx lr -endfunc -.endm - - h264_qpel_h_lowpass put - h264_qpel_h_lowpass avg - -.macro h264_qpel_h_lowpass_l2 type -function \type\()_h264_qpel16_h_lowpass_l2_neon - push {lr} - mov r12, #16 - bl \type\()_h264_qpel8_h_lowpass_l2_neon - sub r0, r0, r2, lsl #4 - sub r1, r1, r2, lsl #4 - sub r3, r3, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - add r3, r3, #8 - mov r12, #16 - pop 
{lr} -endfunc - -function \type\()_h264_qpel8_h_lowpass_l2_neon -1: vld1.8 {d0, d1}, [r1], r2 - vld1.8 {d16,d17}, [r1], r2 - vld1.8 {d28}, [r3], r2 - vld1.8 {d29}, [r3], r2 - subs r12, r12, #2 - lowpass_8 d0, d1, d16, d17, d0, d1 - vrhadd.u8 q0, q0, q14 - .ifc \type,avg - vld1.8 {d2}, [r0,:64], r2 - vrhadd.u8 d0, d0, d2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 d1, d1, d3 - sub r0, r0, r2 - .endif - vst1.8 {d0}, [r0,:64], r2 - vst1.8 {d1}, [r0,:64], r2 - bne 1b - bx lr -endfunc -.endm - - h264_qpel_h_lowpass_l2 put - h264_qpel_h_lowpass_l2 avg - -function put_h264_qpel16_v_lowpass_neon_packed - mov r4, lr - mov r2, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl put_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 - b put_h264_qpel8_v_lowpass_neon -endfunc - -.macro h264_qpel_v_lowpass type -function \type\()_h264_qpel16_v_lowpass_neon - mov r4, lr - bl \type\()_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_v_lowpass_neon - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl \type\()_h264_qpel8_v_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 -endfunc - -function \type\()_h264_qpel8_v_lowpass_neon - vld1.8 {d8}, [r1], r3 - vld1.8 {d10}, [r1], r3 - vld1.8 {d12}, [r1], r3 - vld1.8 {d14}, [r1], r3 - vld1.8 {d22}, [r1], r3 - vld1.8 {d24}, [r1], r3 - vld1.8 {d26}, [r1], r3 - vld1.8 {d28}, [r1], r3 - vld1.8 {d9}, [r1], r3 - vld1.8 {d11}, [r1], r3 - vld1.8 {d13}, [r1], r3 - vld1.8 {d15}, [r1], r3 - vld1.8 {d23}, [r1] - - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d10 - lowpass_8 d12, d13, d14, d15, d12, d14 - lowpass_8 d22, d23, d24, d25, d22, d24 - lowpass_8 d26, d27, d28, d29, d26, d28 - transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28 - - .ifc \type,avg - vld1.8 {d9}, [r0,:64], r2 - vrhadd.u8 d8, d8, d9 - vld1.8 {d11}, [r0,:64], r2 - vrhadd.u8 d10, d10, d11 - vld1.8 {d13}, [r0,:64], r2 - vrhadd.u8 d12, d12, d13 - vld1.8 {d15}, [r0,:64], r2 - vrhadd.u8 d14, d14, d15 - vld1.8 {d23}, [r0,:64], r2 - vrhadd.u8 d22, d22, d23 - vld1.8 {d25}, [r0,:64], r2 - vrhadd.u8 d24, d24, d25 - vld1.8 {d27}, [r0,:64], r2 - vrhadd.u8 d26, d26, d27 - vld1.8 {d29}, [r0,:64], r2 - vrhadd.u8 d28, d28, d29 - sub r0, r0, r2, lsl #3 - .endif - - vst1.8 {d8}, [r0,:64], r2 - vst1.8 {d10}, [r0,:64], r2 - vst1.8 {d12}, [r0,:64], r2 - vst1.8 {d14}, [r0,:64], r2 - vst1.8 {d22}, [r0,:64], r2 - vst1.8 {d24}, [r0,:64], r2 - vst1.8 {d26}, [r0,:64], r2 - vst1.8 {d28}, [r0,:64], r2 - - bx lr -endfunc -.endm - - h264_qpel_v_lowpass put - h264_qpel_v_lowpass avg - -.macro h264_qpel_v_lowpass_l2 type -function \type\()_h264_qpel16_v_lowpass_l2_neon - mov r4, lr - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r0, r0, r3, lsl #4 - sub r12, r12, r2, lsl #4 - add r0, r0, #8 - add r12, r12, #8 - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r4 -endfunc - -function \type\()_h264_qpel8_v_lowpass_l2_neon - vld1.8 {d8}, [r1], r3 - vld1.8 {d10}, [r1], r3 - vld1.8 {d12}, [r1], r3 - vld1.8 {d14}, [r1], r3 - vld1.8 {d22}, [r1], r3 - vld1.8 {d24}, [r1], r3 - vld1.8 {d26}, [r1], r3 - vld1.8 {d28}, [r1], r3 - vld1.8 {d9}, [r1], r3 - vld1.8 {d11}, [r1], r3 - vld1.8 {d13}, [r1], r3 - vld1.8 {d15}, [r1], r3 - vld1.8 {d23}, [r1] 
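The lowpass_8 and lowpass_8_1 macros above implement the H.264 half-sample six-tap filter with taps (1, -5, 20, 20, -5, 1): lowpass_const loads the 5 and 20 multipliers into d6, vmla/vmls apply them, and vqrshrun.s16 #5 performs the rounded, saturating narrow back to bytes. A minimal scalar sketch of one output sample, assuming eight-bit input (function names are illustrative only, not FFmpeg API):

    #include <stdint.h>

    static inline int clip_u8(int x)
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    /* One half-sample value from six adjacent source pixels,
     * i.e. what vqrshrun.s16 #5 leaves in each output byte. */
    static uint8_t lowpass6(const uint8_t *s)
    {
        int sum = s[0] - 5 * s[1] + 20 * s[2] + 20 * s[3] - 5 * s[4] + s[5];
        return (uint8_t)clip_u8((sum + 16) >> 5);
    }
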
- - transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14 - lowpass_8 d8, d9, d10, d11, d8, d9 - lowpass_8 d12, d13, d14, d15, d12, d13 - lowpass_8 d22, d23, d24, d25, d22, d23 - lowpass_8 d26, d27, d28, d29, d26, d27 - transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27 - - vld1.8 {d0}, [r12], r2 - vld1.8 {d1}, [r12], r2 - vld1.8 {d2}, [r12], r2 - vld1.8 {d3}, [r12], r2 - vld1.8 {d4}, [r12], r2 - vrhadd.u8 q0, q0, q4 - vld1.8 {d5}, [r12], r2 - vrhadd.u8 q1, q1, q6 - vld1.8 {d10}, [r12], r2 - vrhadd.u8 q2, q2, q11 - vld1.8 {d11}, [r12], r2 - vrhadd.u8 q5, q5, q13 - - .ifc \type,avg - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d0, d0, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d1, d1, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d2, d2, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d3, d3, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d4, d4, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d5, d5, d17 - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d10, d10, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d11, d11, d17 - sub r0, r0, r3, lsl #3 - .endif - - vst1.8 {d0}, [r0,:64], r3 - vst1.8 {d1}, [r0,:64], r3 - vst1.8 {d2}, [r0,:64], r3 - vst1.8 {d3}, [r0,:64], r3 - vst1.8 {d4}, [r0,:64], r3 - vst1.8 {d5}, [r0,:64], r3 - vst1.8 {d10}, [r0,:64], r3 - vst1.8 {d11}, [r0,:64], r3 - - bx lr -endfunc -.endm - - h264_qpel_v_lowpass_l2 put - h264_qpel_v_lowpass_l2 avg - -function put_h264_qpel8_hv_lowpass_neon_top - lowpass_const r12 - mov r12, #12 -1: vld1.8 {d0, d1}, [r1], r3 - vld1.8 {d16,d17}, [r1], r3 - subs r12, r12, #2 - lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0 - vst1.8 {d22-d25}, [r4,:128]! - bne 1b - - vld1.8 {d0, d1}, [r1] - lowpass_8_1 d0, d1, q12, narrow=0 - - mov r12, #-16 - add r4, r4, r12 - vld1.8 {d30,d31}, [r4,:128], r12 - vld1.8 {d20,d21}, [r4,:128], r12 - vld1.8 {d18,d19}, [r4,:128], r12 - vld1.8 {d16,d17}, [r4,:128], r12 - vld1.8 {d14,d15}, [r4,:128], r12 - vld1.8 {d12,d13}, [r4,:128], r12 - vld1.8 {d10,d11}, [r4,:128], r12 - vld1.8 {d8, d9}, [r4,:128], r12 - vld1.8 {d6, d7}, [r4,:128], r12 - vld1.8 {d4, d5}, [r4,:128], r12 - vld1.8 {d2, d3}, [r4,:128], r12 - vld1.8 {d0, d1}, [r4,:128] - - swap4 d1, d3, d5, d7, d8, d10, d12, d14 - transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7 - - swap4 d17, d19, d21, d31, d24, d26, d28, d22 - transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11 - - vst1.8 {d30,d31}, [r4,:128]! - vst1.8 {d6, d7}, [r4,:128]! - vst1.8 {d20,d21}, [r4,:128]! - vst1.8 {d4, d5}, [r4,:128]! - vst1.8 {d18,d19}, [r4,:128]! - vst1.8 {d2, d3}, [r4,:128]! - vst1.8 {d16,d17}, [r4,:128]! 
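put_h264_qpel8_hv_lowpass_neon_top computes the centre (half/half) position: rows are first filtered horizontally with the narrow=0 variant of lowpass_8, the unrounded 16-bit intermediates are spilled to the scratch buffer in r4 and transposed, and lowpass_8.16 (defined earlier) then filters them in the vertical direction, with vrshrn.s32 #10 providing the (x + 512) >> 10 rounding of the doubly filtered value. A scalar sketch of that second pass, assuming tmp points at the 16-bit intermediates of one column (names are illustrative only):

    #include <stdint.h>

    static inline int clip_u8(int x)
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    /* Vertical six-tap pass over unrounded 16-bit intermediates;
     * stride is the distance between vertically adjacent values. */
    static uint8_t lowpass6_wide(const int16_t *tmp, int stride)
    {
        int sum = tmp[0] - 5 * tmp[stride] + 20 * tmp[2 * stride]
                + 20 * tmp[3 * stride] - 5 * tmp[4 * stride] + tmp[5 * stride];
        return (uint8_t)clip_u8((sum + 512) >> 10);
    }
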
- vst1.8 {d0, d1}, [r4,:128] - - lowpass_8.16 q4, q12, d8, d9, d24, d25, d8 - lowpass_8.16 q5, q13, d10, d11, d26, d27, d9 - lowpass_8.16 q6, q14, d12, d13, d28, d29, d10 - lowpass_8.16 q7, q11, d14, d15, d22, d23, d11 - - vld1.8 {d16,d17}, [r4,:128], r12 - vld1.8 {d30,d31}, [r4,:128], r12 - lowpass_8.16 q8, q15, d16, d17, d30, d31, d12 - vld1.8 {d16,d17}, [r4,:128], r12 - vld1.8 {d30,d31}, [r4,:128], r12 - lowpass_8.16 q8, q15, d16, d17, d30, d31, d13 - vld1.8 {d16,d17}, [r4,:128], r12 - vld1.8 {d30,d31}, [r4,:128], r12 - lowpass_8.16 q8, q15, d16, d17, d30, d31, d14 - vld1.8 {d16,d17}, [r4,:128], r12 - vld1.8 {d30,d31}, [r4,:128] - lowpass_8.16 q8, q15, d16, d17, d30, d31, d15 - - transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11 - - bx lr -endfunc - -.macro h264_qpel8_hv_lowpass type -function \type\()_h264_qpel8_hv_lowpass_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top - .ifc \type,avg - vld1.8 {d0}, [r0,:64], r2 - vrhadd.u8 d12, d12, d0 - vld1.8 {d1}, [r0,:64], r2 - vrhadd.u8 d13, d13, d1 - vld1.8 {d2}, [r0,:64], r2 - vrhadd.u8 d14, d14, d2 - vld1.8 {d3}, [r0,:64], r2 - vrhadd.u8 d15, d15, d3 - vld1.8 {d4}, [r0,:64], r2 - vrhadd.u8 d8, d8, d4 - vld1.8 {d5}, [r0,:64], r2 - vrhadd.u8 d9, d9, d5 - vld1.8 {d6}, [r0,:64], r2 - vrhadd.u8 d10, d10, d6 - vld1.8 {d7}, [r0,:64], r2 - vrhadd.u8 d11, d11, d7 - sub r0, r0, r2, lsl #3 - .endif - - vst1.8 {d12}, [r0,:64], r2 - vst1.8 {d13}, [r0,:64], r2 - vst1.8 {d14}, [r0,:64], r2 - vst1.8 {d15}, [r0,:64], r2 - vst1.8 {d8}, [r0,:64], r2 - vst1.8 {d9}, [r0,:64], r2 - vst1.8 {d10}, [r0,:64], r2 - vst1.8 {d11}, [r0,:64], r2 - - mov lr, r10 - bx lr -endfunc -.endm - - h264_qpel8_hv_lowpass put - h264_qpel8_hv_lowpass avg - -.macro h264_qpel8_hv_lowpass_l2 type -function \type\()_h264_qpel8_hv_lowpass_l2_neon - mov r10, lr - bl put_h264_qpel8_hv_lowpass_neon_top - - vld1.8 {d0, d1}, [r2,:128]! - vld1.8 {d2, d3}, [r2,:128]! - vrhadd.u8 q0, q0, q6 - vld1.8 {d4, d5}, [r2,:128]! - vrhadd.u8 q1, q1, q7 - vld1.8 {d6, d7}, [r2,:128]! 
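Throughout these macro pairs the put and avg flavours differ only in the .ifc \type,avg block: avg reloads the bytes already present in the destination and merges them with vrhadd.u8, a per-byte rounding average, while the *_l2 helpers use the same vrhadd.u8 to average two predictions (for example the six-tap half-sample output with a neighbouring integer-sample row), which is how the quarter-sample positions are formed. A scalar sketch of both operations (illustrative names, not FFmpeg API):

    #include <stdint.h>

    /* Per-byte rounding average, as computed by one lane of vrhadd.u8. */
    static inline uint8_t rnd_avg_u8(uint8_t a, uint8_t b)
    {
        return (uint8_t)((a + b + 1) >> 1);
    }

    /* "avg" motion compensation: merge a new prediction into the
     * pixels already stored in the destination block. */
    static void avg_block(uint8_t *dst, const uint8_t *pred, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = rnd_avg_u8(dst[i], pred[i]);
    }
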
- vrhadd.u8 q2, q2, q4 - vrhadd.u8 q3, q3, q5 - .ifc \type,avg - vld1.8 {d16}, [r0,:64], r3 - vrhadd.u8 d0, d0, d16 - vld1.8 {d17}, [r0,:64], r3 - vrhadd.u8 d1, d1, d17 - vld1.8 {d18}, [r0,:64], r3 - vrhadd.u8 d2, d2, d18 - vld1.8 {d19}, [r0,:64], r3 - vrhadd.u8 d3, d3, d19 - vld1.8 {d20}, [r0,:64], r3 - vrhadd.u8 d4, d4, d20 - vld1.8 {d21}, [r0,:64], r3 - vrhadd.u8 d5, d5, d21 - vld1.8 {d22}, [r0,:64], r3 - vrhadd.u8 d6, d6, d22 - vld1.8 {d23}, [r0,:64], r3 - vrhadd.u8 d7, d7, d23 - sub r0, r0, r3, lsl #3 - .endif - vst1.8 {d0}, [r0,:64], r3 - vst1.8 {d1}, [r0,:64], r3 - vst1.8 {d2}, [r0,:64], r3 - vst1.8 {d3}, [r0,:64], r3 - vst1.8 {d4}, [r0,:64], r3 - vst1.8 {d5}, [r0,:64], r3 - vst1.8 {d6}, [r0,:64], r3 - vst1.8 {d7}, [r0,:64], r3 - - mov lr, r10 - bx lr -endfunc -.endm - - h264_qpel8_hv_lowpass_l2 put - h264_qpel8_hv_lowpass_l2 avg - -.macro h264_qpel16_hv type -function \type\()_h264_qpel16_hv_lowpass_neon - mov r9, lr - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - bl \type\()_h264_qpel8_hv_lowpass_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b \type\()_h264_qpel8_hv_lowpass_neon -endfunc - -function \type\()_h264_qpel16_hv_lowpass_l2_neon - mov r9, lr - sub r2, r4, #256 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #4 - sub r1, r1, r3, lsl #2 - add r1, r1, #8 - sub r0, r0, r3, lsl #4 - add r0, r0, #8 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - sub r1, r1, r3, lsl #2 - mov lr, r9 - b \type\()_h264_qpel8_hv_lowpass_l2_neon -endfunc -.endm - - h264_qpel16_hv put - h264_qpel16_hv avg - -.macro h264_qpel8 type -function ff_\type\()_h264_qpel8_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - mov r12, #8 - b \type\()_h264_qpel8_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel8_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - mov r12, #8 - b \type\()_h264_qpel8_h_lowpass_neon -endfunc - -function ff_\type\()_h264_qpel8_mc30_neon, export=1 - lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - mov r12, #8 - b \type\()_h264_qpel8_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel8_mc01_neon, export=1 - push {lr} - mov r12, r1 -\type\()_h264_qpel8_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl \type\()_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - pop {pc} -endfunc - -function ff_\type\()_h264_qpel8_mc11_neon, export=1 - push {r0, r1, r11, lr} -\type\()_h264_qpel8_mc11: - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #64 - mov r0, sp - sub r1, r1, #2 - mov r3, #8 - mov r12, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - ldrd r0, r1, [r11], #8 - mov r3, r2 - add r12, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #8 - bl \type\()_h264_qpel8_v_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc21_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -\type\()_h264_qpel8_mc21: - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #(8*8+16*12) - sub r1, r1, #2 - mov r3, #8 - mov r0, sp - mov r12, #8 - vpush {d8-d15} - bl put_h264_qpel8_h_lowpass_neon - mov r4, r0 - ldrd r0, r1, [r11], #8 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub r2, r4, #64 - bl 
\type\()_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r11, lr} - sub r1, r1, #1 - b \type\()_h264_qpel8_mc11 -endfunc - -function ff_\type\()_h264_qpel8_mc02_neon, export=1 - push {lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl \type\()_h264_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {pc} -endfunc - -function ff_\type\()_h264_qpel8_mc12_neon, export=1 - push {r0, r1, r4, r10, r11, lr} -\type\()_h264_qpel8_mc12: - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #(8*8+16*12) - sub r1, r1, r2, lsl #1 - mov r3, r2 - mov r2, #8 - mov r0, sp - vpush {d8-d15} - bl put_h264_qpel8_v_lowpass_neon - mov r4, r0 - ldrd r0, r1, [r11], #8 - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - sub r2, r4, #64 - bl \type\()_h264_qpel8_hv_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc22_neon, export=1 - push {r4, r10, r11, lr} - mov r11, sp -A bic sp, sp, #15 -T bic r4, r11, #15 -T mov sp, r4 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl \type\()_h264_qpel8_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r10, r11, pc} -endfunc - -function ff_\type\()_h264_qpel8_mc32_neon, export=1 - push {r0, r1, r4, r10, r11, lr} - add r1, r1, #1 - b \type\()_h264_qpel8_mc12 -endfunc - -function ff_\type\()_h264_qpel8_mc03_neon, export=1 - push {lr} - add r12, r1, r2 - b \type\()_h264_qpel8_mc01 -endfunc - -function ff_\type\()_h264_qpel8_mc13_neon, export=1 - push {r0, r1, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel8_mc11 -endfunc - -function ff_\type\()_h264_qpel8_mc23_neon, export=1 - push {r0, r1, r4, r10, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel8_mc21 -endfunc - -function ff_\type\()_h264_qpel8_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r11, lr} - add r1, r1, r2 - sub r1, r1, #1 - b \type\()_h264_qpel8_mc11 -endfunc -.endm - - h264_qpel8 put - h264_qpel8 avg - -.macro h264_qpel16 type -function ff_\type\()_h264_qpel16_mc10_neon, export=1 - lowpass_const r3 - mov r3, r1 - sub r1, r1, #2 - b \type\()_h264_qpel16_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel16_mc20_neon, export=1 - lowpass_const r3 - sub r1, r1, #2 - mov r3, r2 - b \type\()_h264_qpel16_h_lowpass_neon -endfunc - -function ff_\type\()_h264_qpel16_mc30_neon, export=1 - lowpass_const r3 - add r3, r1, #1 - sub r1, r1, #2 - b \type\()_h264_qpel16_h_lowpass_l2_neon -endfunc - -function ff_\type\()_h264_qpel16_mc01_neon, export=1 - push {r4, lr} - mov r12, r1 -\type\()_h264_qpel16_mc01: - lowpass_const r3 - mov r3, r2 - sub r1, r1, r2, lsl #1 - vpush {d8-d15} - bl \type\()_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc11_neon, export=1 - push {r0, r1, r4, r11, lr} -\type\()_h264_qpel16_mc11: - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #256 - mov r0, sp - sub r1, r1, #2 - mov r3, #16 - vpush {d8-d15} - bl put_h264_qpel16_h_lowpass_neon - ldrd r0, r1, [r11], #8 - mov r3, r2 - add r12, sp, #64 - sub r1, r1, r2, lsl #1 - mov r2, #16 - bl \type\()_h264_qpel16_v_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc21_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -\type\()_h264_qpel16_mc21: - 
lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #(16*16+16*12) - sub r1, r1, #2 - mov r0, sp - vpush {d8-d15} - bl put_h264_qpel16_h_lowpass_neon_packed - mov r4, r0 - ldrd r0, r1, [r11], #8 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - bl \type\()_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r4-r5, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc31_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, r11, lr} - sub r1, r1, #1 - b \type\()_h264_qpel16_mc11 -endfunc - -function ff_\type\()_h264_qpel16_mc02_neon, export=1 - push {r4, lr} - lowpass_const r3 - sub r1, r1, r2, lsl #1 - mov r3, r2 - vpush {d8-d15} - bl \type\()_h264_qpel16_v_lowpass_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc12_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} -\type\()_h264_qpel16_mc12: - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r0, r11, #15 -T mov sp, r0 - sub sp, sp, #(16*16+16*12) - sub r1, r1, r2, lsl #1 - mov r0, sp - mov r3, r2 - vpush {d8-d15} - bl put_h264_qpel16_v_lowpass_neon_packed - mov r4, r0 - ldrd r0, r1, [r11], #8 - sub r1, r1, r3, lsl #1 - sub r1, r1, #2 - mov r2, r3 - bl \type\()_h264_qpel16_hv_lowpass_l2_neon - vpop {d8-d15} - mov sp, r11 - pop {r4-r5, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc22_neon, export=1 - push {r4, r9-r11, lr} - lowpass_const r3 - mov r11, sp -A bic sp, sp, #15 -T bic r4, r11, #15 -T mov sp, r4 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, r2 - sub sp, sp, #(16*12) - mov r4, sp - vpush {d8-d15} - bl \type\()_h264_qpel16_hv_lowpass_neon - vpop {d8-d15} - mov sp, r11 - pop {r4, r9-r11, pc} -endfunc - -function ff_\type\()_h264_qpel16_mc32_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, #1 - b \type\()_h264_qpel16_mc12 -endfunc - -function ff_\type\()_h264_qpel16_mc03_neon, export=1 - push {r4, lr} - add r12, r1, r2 - b \type\()_h264_qpel16_mc01 -endfunc - -function ff_\type\()_h264_qpel16_mc13_neon, export=1 - push {r0, r1, r4, r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel16_mc11 -endfunc - -function ff_\type\()_h264_qpel16_mc23_neon, export=1 - push {r0, r1, r4-r5, r9-r11, lr} - add r1, r1, r2 - b \type\()_h264_qpel16_mc21 -endfunc - -function ff_\type\()_h264_qpel16_mc33_neon, export=1 - add r1, r1, #1 - push {r0, r1, r4, r11, lr} - add r1, r1, r2 - sub r1, r1, #1 - b \type\()_h264_qpel16_mc11 -endfunc -.endm - - h264_qpel16 put - h264_qpel16 avg diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.S b/ffmpeg/libavcodec/arm/hpeldsp_arm.S deleted file mode 100644 index 2f3d311..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_arm.S +++ /dev/null @@ -1,611 +0,0 @@ -@ -@ ARMv4 optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> -@ -@ This file is part of FFmpeg. -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. 
-@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "libavutil/arm/asm.S" - -#if !HAVE_ARMV5TE_EXTERNAL -#define pld @ -#endif - -.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro JMP_ALIGN tmp, reg - ands \tmp, \reg, #3 - bic \reg, \reg, #3 - beq 1f - subs \tmp, \tmp, #1 - beq 2f - subs \tmp, \tmp, #1 - beq 3f - b 4f -.endm - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels16_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11, lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r7} - add r1, r1, r2 - stm r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - pop {r4-r11, pc} - .align 5 -2: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 2b - pop {r4-r11, pc} - .align 5 -3: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 3b - pop {r4-r11, pc} - .align 5 -4: - ldm r1, {r4-r8} - add r1, r1, r2 - ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stm r0, {r9-r12} - add r0, r0, r2 - bne 4b - pop {r4-r11,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r5,lr} - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stm r0, {r4-r5} - add r0, r0, r2 - bne 1b - pop {r4-r5,pc} - .align 5 -2: - ldm 
r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r5,pc} - .align 5 -3: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r5,pc} - .align 5 -4: - ldm r1, {r4-r5, r12} - add r1, r1, r2 - ALIGN_DWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 4b - pop {r4-r5,pc} -endfunc - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_x2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r10,lr} - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 1b - pop {r4-r10,pc} - .align 5 -2: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 1, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 2b - pop {r4-r10,pc} - .align 5 -3: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 2, r6, r7, r4, r5, r10 - ALIGN_DWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stm r0, {r4-r5} - add r0, r0, r2 - bne 3b - pop {r4-r10,pc} - .align 5 -4: - ldm r1, {r4-r5, r10} - add r1, r1, r2 - ALIGN_DWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 4b - pop {r4-r10,pc} -endfunc - - -@ ---------------------------------------------------------------- - .align 5 -function ff_put_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, 
{r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_y2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - mov r3, r3, lsr #1 - ldr r12, =0xfefefefe - JMP_ALIGN r5, r1 -1: - ldm r1, {r4-r5} - add r1, r1, r2 -6: ldm r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldm r1, {r4-r5} - add r1, r1, r2 - stm r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stm r0, {r8-r9} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -2: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -3: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} - .align 5 -4: - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 -6: ldm r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stm r0, {r10-r11} - add r0, r0, r2 - ldm r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ALIGN_DWORD 3, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stm r0, {r10-r11} - add r0, r0, r2 - bne 6b - pop {r4-r11,pc} -endfunc - - .ltorg - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT 
align, rnd - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldm r1, {r6-r8} -.elseif \align == 3 - ldm r1, {r5-r7} -.else - ldm r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ALIGN_DWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ALIGN_DWORD_D 1, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ALIGN_DWORD_D 2, r4, r5, r8, r9, r10 - ALIGN_DWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ALIGN_DWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, =0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 - it eq - andeq r14, r14, r14, \rnd #1 - add r8, r8, r10 - add r9, r9, r11 - ldr r12, =0xfcfcfcfc >> 2 - itt eq - addeq r8, r8, r14 - addeq r9, r9, r14 - and r4, r12, r4, lsr #2 - and r5, r12, r5, lsr #2 - and r6, r12, r6, lsr #2 - and r7, r12, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 - subs r3, r3, #1 -.endm - -.macro RND_XY2_EXPAND align, rnd - RND_XY2_IT \align, \rnd -6: push {r8-r11} - RND_XY2_IT \align, \rnd - pop {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - ldr r14, =0x0f0f0f0f - add r6, r6, r10 - add r7, r7, r11 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - stm r0, {r4-r5} - add r0, r0, r2 - bge 6b - pop {r4-r11,pc} -.endm - - .align 5 -function ff_put_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} @ R14 is also called LR - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsl - .align 5 -2: RND_XY2_EXPAND 1, lsl - .align 5 -3: RND_XY2_EXPAND 2, lsl - .align 5 -4: RND_XY2_EXPAND 3, lsl -endfunc - - .align 5 -function ff_put_no_rnd_pixels8_xy2_arm, export=1 - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - push {r4-r11,lr} - JMP_ALIGN r5, r1 -1: RND_XY2_EXPAND 0, lsr - .align 5 -2: RND_XY2_EXPAND 1, lsr - .align 5 -3: RND_XY2_EXPAND 2, lsr - .align 5 -4: RND_XY2_EXPAND 3, lsr -endfunc diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.h b/ffmpeg/libavcodec/arm/hpeldsp_arm.h deleted file mode 100644 index 3f18c62..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_arm.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_HPELDSP_H -#define AVCODEC_ARM_HPELDSP_H - -#include "libavcodec/hpeldsp.h" - -void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags); -void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags); - -#endif /* AVCODEC_ARM_HPELDSP_H */ diff --git a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S deleted file mode 100644 index cd50150..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro call_2x_pixels type, subp -function ff_\type\()_pixels16\subp\()_armv6, export=1 - push {r0-r3, lr} - bl ff_\type\()_pixels8\subp\()_armv6 - pop {r0-r3, lr} - add r0, r0, #8 - add r1, r1, #8 - b ff_\type\()_pixels8\subp\()_armv6 -endfunc -.endm - -call_2x_pixels avg -call_2x_pixels put, _x2 -call_2x_pixels put, _y2 -call_2x_pixels put, _x2_no_rnd -call_2x_pixels put, _y2_no_rnd - -function ff_put_pixels16_armv6, export=1 - push {r4-r11} -1: - ldr r5, [r1, #4] - ldr r6, [r1, #8] - ldr r7, [r1, #12] - ldr_post r4, r1, r2 - strd r6, r7, [r0, #8] - ldr r9, [r1, #4] - strd_post r4, r5, r0, r2 - ldr r10, [r1, #8] - ldr r11, [r1, #12] - ldr_post r8, r1, r2 - strd r10, r11, [r0, #8] - subs r3, r3, #2 - strd_post r8, r9, r0, r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_armv6, export=1 - push {r4-r7} -1: - ldr r5, [r1, #4] - ldr_post r4, r1, r2 - ldr r7, [r1, #4] - strd_post r4, r5, r0, r2 - ldr_post r6, r1, r2 - subs r3, r3, #2 - strd_post r6, r7, r0, r2 - bne 1b - - pop {r4-r7} - bx lr -endfunc - -function ff_put_pixels8_x2_armv6, export=1 - push {r4-r11, lr} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 -1: - ldr r4, [r1] - subs r3, r3, #2 - ldr r5, [r1, #4] - ldr r7, [r1, #5] - lsr r6, r4, #8 - ldr_pre r8, r1, r2 - orr r6, r6, r5, lsl #24 - ldr r9, [r1, #4] - ldr r11, [r1, #5] - lsr r10, r8, #8 - add r1, r1, r2 - orr r10, r10, r9, lsl #24 - eor r14, r4, r6 - uhadd8 r4, r4, r6 - eor r6, r5, r7 - uhadd8 r5, r5, r7 - and r14, r14, r12 - and r6, r6, r12 - uadd8 r4, r4, r14 - eor r14, r8, r10 - uadd8 r5, r5, r6 - eor r6, r9, r11 - uhadd8 r8, r8, r10 - and r14, r14, r12 - uhadd8 r9, r9, r11 - and r6, r6, r12 - uadd8 r8, r8, r14 - strd_post r4, r5, r0, r2 - uadd8 r9, r9, r6 - strd_post r8, r9, r0, r2 - bne 1b - - pop {r4-r11, pc} -endfunc - -function ff_put_pixels8_y2_armv6, export=1 - push {r4-r11} - mov r12, #1 - orr r12, r12, r12, lsl #8 - orr r12, r12, r12, lsl #16 - ldr r4, [r1] - ldr r5, [r1, #4] - 
ldr_pre r6, r1, r2 - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - eor r10, r4, r6 - uhadd8 r9, r5, r7 - eor r11, r5, r7 - and r10, r10, r12 - ldr_pre r4, r1, r2 - uadd8 r8, r8, r10 - and r11, r11, r12 - uadd8 r9, r9, r11 - ldr r5, [r1, #4] - uhadd8 r10, r4, r6 - eor r6, r4, r6 - uhadd8 r11, r5, r7 - and r6, r6, r12 - eor r7, r5, r7 - uadd8 r10, r10, r6 - and r7, r7, r12 - ldr_pre r6, r1, r2 - uadd8 r11, r11, r7 - strd_post r8, r9, r0, r2 - ldr r7, [r1, #4] - strd_post r10, r11, r0, r2 - bne 1b - - pop {r4-r11} - bx lr -endfunc - -function ff_put_pixels8_x2_no_rnd_armv6, export=1 - push {r4-r9, lr} -1: - subs r3, r3, #2 - ldr r4, [r1] - ldr r5, [r1, #4] - ldr r7, [r1, #5] - ldr_pre r8, r1, r2 - ldr r9, [r1, #4] - ldr r14, [r1, #5] - add r1, r1, r2 - lsr r6, r4, #8 - orr r6, r6, r5, lsl #24 - lsr r12, r8, #8 - orr r12, r12, r9, lsl #24 - uhadd8 r4, r4, r6 - uhadd8 r5, r5, r7 - uhadd8 r8, r8, r12 - uhadd8 r9, r9, r14 - stm r0, {r4,r5} - add r0, r0, r2 - stm r0, {r8,r9} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_put_pixels8_y2_no_rnd_armv6, export=1 - push {r4-r9, lr} - ldr r4, [r1] - ldr r5, [r1, #4] - ldr_pre r6, r1, r2 - ldr r7, [r1, #4] -1: - subs r3, r3, #2 - uhadd8 r8, r4, r6 - ldr_pre r4, r1, r2 - uhadd8 r9, r5, r7 - ldr r5, [r1, #4] - uhadd8 r12, r4, r6 - ldr_pre r6, r1, r2 - uhadd8 r14, r5, r7 - ldr r7, [r1, #4] - stm r0, {r8,r9} - add r0, r0, r2 - stm r0, {r12,r14} - add r0, r0, r2 - bne 1b - - pop {r4-r9, pc} -endfunc - -function ff_avg_pixels8_armv6, export=1 - pld [r1, r2] - push {r4-r10, lr} - mov lr, #1 - orr lr, lr, lr, lsl #8 - orr lr, lr, lr, lsl #16 - ldrd r4, r5, [r0] - ldr r10, [r1, #4] - ldr_post r9, r1, r2 - subs r3, r3, #2 -1: - pld [r1, r2] - eor r8, r4, r9 - uhadd8 r4, r4, r9 - eor r12, r5, r10 - ldrd_reg r6, r7, r0, r2 - uhadd8 r5, r5, r10 - and r8, r8, lr - ldr r10, [r1, #4] - and r12, r12, lr - uadd8 r4, r4, r8 - ldr_post r9, r1, r2 - eor r8, r6, r9 - uadd8 r5, r5, r12 - pld [r1, r2, lsl #1] - eor r12, r7, r10 - uhadd8 r6, r6, r9 - strd_post r4, r5, r0, r2 - uhadd8 r7, r7, r10 - beq 2f - and r8, r8, lr - ldrd_reg r4, r5, r0, r2 - uadd8 r6, r6, r8 - ldr r10, [r1, #4] - and r12, r12, lr - subs r3, r3, #2 - uadd8 r7, r7, r12 - ldr_post r9, r1, r2 - strd_post r6, r7, r0, r2 - b 1b -2: - and r8, r8, lr - and r12, r12, lr - uadd8 r6, r6, r8 - uadd8 r7, r7, r12 - strd_post r6, r7, r0, r2 - - pop {r4-r10, pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c deleted file mode 100644 index 2cc2b78..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * ARM optimized DSP utils - * Copyright (c) 2001 Lionel Ulmer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/cpu.h" -#include "libavutil/attributes.h" -#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS -#include "libavcodec/rnd_avg.h" -#include "hpeldsp_arm.h" - -void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); -void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h); - -CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8) -CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm,8) - -av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags) -{ - int cpu_flags = av_get_cpu_flags(); - - c->put_pixels_tab[0][0] = ff_put_pixels16_arm; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm; - c->put_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm; - c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm; - c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm; - c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm; - c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm; - c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm; - c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm; - - if (have_armv6(cpu_flags)) - ff_hpeldsp_init_armv6(c, flags); - if (have_neon(cpu_flags)) - ff_hpeldsp_init_neon(c, flags); -} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c deleted file mode 100644 index 967a8e0..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "hpeldsp_arm.h" - -void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int); - -av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags) -{ - c->put_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6; -/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */ - c->put_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6; -/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */ - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */ - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6; -/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */ - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6; -} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c deleted file mode 100644 index d9feadd..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c +++ /dev/null @@ -1,88 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "hpeldsp_arm.h" - -void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); - -void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); -void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int); - -av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags) -{ - c->put_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon; - c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon; - c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon; - c->put_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon; - c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon; - c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon; - - c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon; - c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon; - c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon; - c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon; - - c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon; - c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon; - c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon; - c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon; - c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon; - c->avg_pixels_tab[1][1] = 
ff_avg_pixels8_x2_neon; - c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon; - c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon; - - c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon; - c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon; - c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon; - c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon; -} diff --git a/ffmpeg/libavcodec/arm/hpeldsp_neon.S b/ffmpeg/libavcodec/arm/hpeldsp_neon.S deleted file mode 100644 index cf4a6cf..0000000 --- a/ffmpeg/libavcodec/arm/hpeldsp_neon.S +++ /dev/null @@ -1,410 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro pixels16 rnd=1, avg=0 - .if \avg - mov r12, r0 - .endif -1: vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 - vld1.8 {q2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.8 {q3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - .if \avg - vld1.8 {q8}, [r12,:128], r2 - vrhadd.u8 q0, q0, q8 - vld1.8 {q9}, [r12,:128], r2 - vrhadd.u8 q1, q1, q9 - vld1.8 {q10}, [r12,:128], r2 - vrhadd.u8 q2, q2, q10 - vld1.8 {q11}, [r12,:128], r2 - vrhadd.u8 q3, q3, q11 - .endif - subs r3, r3, #4 - vst1.64 {q0}, [r0,:128], r2 - vst1.64 {q1}, [r0,:128], r2 - vst1.64 {q2}, [r0,:128], r2 - vst1.64 {q3}, [r0,:128], r2 - bne 1b - bx lr -.endm - -.macro pixels16_x2 rnd=1, avg=0 -1: vld1.8 {d0-d2}, [r1], r2 - vld1.8 {d4-d6}, [r1], r2 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vext.8 q1, q0, q1, #1 - avg q0, q0, q1 - vext.8 q3, q2, q3, #1 - avg q2, q2, q3 - .if \avg - vld1.8 {q1}, [r0,:128], r2 - vld1.8 {q3}, [r0,:128] - vrhadd.u8 q0, q0, q1 - vrhadd.u8 q2, q2, q3 - sub r0, r0, r2 - .endif - vst1.8 {q0}, [r0,:128], r2 - vst1.8 {q2}, [r0,:128], r2 - bne 1b - bx lr -.endm - -.macro pixels16_y2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 -1: subs r3, r3, #2 - avg q2, q0, q1 - vld1.8 {q0}, [r1], r2 - avg q3, q0, q1 - vld1.8 {q1}, [r1], r2 - pld [r1] - pld [r1, r2] - .if \avg - vld1.8 {q8}, [r0,:128], r2 - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q2, q2, q8 - vrhadd.u8 q3, q3, q9 - sub r0, r0, r2 - .endif - vst1.8 {q2}, [r0,:128], r2 - vst1.8 {q3}, [r0,:128], r2 - bne 1b - - avg q2, q0, q1 - vld1.8 {q0}, [r1], r2 - avg q3, q0, q1 - .if \avg - vld1.8 {q8}, [r0,:128], r2 - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q2, q2, q8 - vrhadd.u8 q3, q3, q9 - sub r0, r0, r2 - .endif - vst1.8 {q2}, [r0,:128], r2 - vst1.8 {q3}, [r0,:128], r2 - - bx lr -.endm - -.macro pixels16_xy2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {d0-d2}, [r1], r2 - vld1.8 {d4-d6}, [r1], r2 -NRND vmov.i16 q13, #1 - pld [r1] - pld [r1, r2] - vext.8 q1, q0, q1, #1 - vext.8 q3, q2, q3, #1 - vaddl.u8 q8, d0, d2 - vaddl.u8 q10, d1, d3 - vaddl.u8 q9, d4, d6 - 
vaddl.u8 q11, d5, d7 -1: subs r3, r3, #2 - vld1.8 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 - pld [r1] -NRND vadd.u16 q12, q12, q13 - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - shrn d28, q12, #2 -NRND vadd.u16 q1, q1, q13 - shrn d29, q1, #2 - .if \avg - vld1.8 {q8}, [r0,:128] - vrhadd.u8 q14, q14, q8 - .endif - vaddl.u8 q8, d0, d30 - vld1.8 {d2-d4}, [r1], r2 - vaddl.u8 q10, d1, d31 - vst1.8 {q14}, [r0,:128], r2 - vadd.u16 q12, q8, q9 - pld [r1, r2] -NRND vadd.u16 q12, q12, q13 - vext.8 q2, q1, q2, #1 - vadd.u16 q0, q10, q11 - shrn d30, q12, #2 -NRND vadd.u16 q0, q0, q13 - shrn d31, q0, #2 - .if \avg - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q15, q15, q9 - .endif - vaddl.u8 q9, d2, d4 - vaddl.u8 q11, d3, d5 - vst1.8 {q15}, [r0,:128], r2 - bgt 1b - - vld1.8 {d0-d2}, [r1], r2 - vadd.u16 q12, q8, q9 -NRND vadd.u16 q12, q12, q13 - vext.8 q15, q0, q1, #1 - vadd.u16 q1 , q10, q11 - shrn d28, q12, #2 -NRND vadd.u16 q1, q1, q13 - shrn d29, q1, #2 - .if \avg - vld1.8 {q8}, [r0,:128] - vrhadd.u8 q14, q14, q8 - .endif - vaddl.u8 q8, d0, d30 - vaddl.u8 q10, d1, d31 - vst1.8 {q14}, [r0,:128], r2 - vadd.u16 q12, q8, q9 -NRND vadd.u16 q12, q12, q13 - vadd.u16 q0, q10, q11 - shrn d30, q12, #2 -NRND vadd.u16 q0, q0, q13 - shrn d31, q0, #2 - .if \avg - vld1.8 {q9}, [r0,:128] - vrhadd.u8 q15, q15, q9 - .endif - vst1.8 {q15}, [r0,:128], r2 - - bx lr -.endm - -.macro pixels8 rnd=1, avg=0 -1: vld1.8 {d0}, [r1], r2 - vld1.8 {d1}, [r1], r2 - vld1.8 {d2}, [r1], r2 - pld [r1, r2, lsl #2] - vld1.8 {d3}, [r1], r2 - pld [r1] - pld [r1, r2] - pld [r1, r2, lsl #1] - .if \avg - vld1.8 {d4}, [r0,:64], r2 - vrhadd.u8 d0, d0, d4 - vld1.8 {d5}, [r0,:64], r2 - vrhadd.u8 d1, d1, d5 - vld1.8 {d6}, [r0,:64], r2 - vrhadd.u8 d2, d2, d6 - vld1.8 {d7}, [r0,:64], r2 - vrhadd.u8 d3, d3, d7 - sub r0, r0, r2, lsl #2 - .endif - subs r3, r3, #4 - vst1.8 {d0}, [r0,:64], r2 - vst1.8 {d1}, [r0,:64], r2 - vst1.8 {d2}, [r0,:64], r2 - vst1.8 {d3}, [r0,:64], r2 - bne 1b - bx lr -.endm - -.macro pixels8_x2 rnd=1, avg=0 -1: vld1.8 {q0}, [r1], r2 - vext.8 d1, d0, d1, #1 - vld1.8 {q1}, [r1], r2 - vext.8 d3, d2, d3, #1 - pld [r1] - pld [r1, r2] - subs r3, r3, #2 - vswp d1, d2 - avg q0, q0, q1 - .if \avg - vld1.8 {d4}, [r0,:64], r2 - vld1.8 {d5}, [r0,:64] - vrhadd.u8 q0, q0, q2 - sub r0, r0, r2 - .endif - vst1.8 {d0}, [r0,:64], r2 - vst1.8 {d1}, [r0,:64], r2 - bne 1b - bx lr -.endm - -.macro pixels8_y2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {d0}, [r1], r2 - vld1.8 {d1}, [r1], r2 -1: subs r3, r3, #2 - avg d4, d0, d1 - vld1.8 {d0}, [r1], r2 - avg d5, d0, d1 - vld1.8 {d1}, [r1], r2 - pld [r1] - pld [r1, r2] - .if \avg - vld1.8 {d2}, [r0,:64], r2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 q2, q2, q1 - sub r0, r0, r2 - .endif - vst1.8 {d4}, [r0,:64], r2 - vst1.8 {d5}, [r0,:64], r2 - bne 1b - - avg d4, d0, d1 - vld1.8 {d0}, [r1], r2 - avg d5, d0, d1 - .if \avg - vld1.8 {d2}, [r0,:64], r2 - vld1.8 {d3}, [r0,:64] - vrhadd.u8 q2, q2, q1 - sub r0, r0, r2 - .endif - vst1.8 {d4}, [r0,:64], r2 - vst1.8 {d5}, [r0,:64], r2 - - bx lr -.endm - -.macro pixels8_xy2 rnd=1, avg=0 - sub r3, r3, #2 - vld1.8 {q0}, [r1], r2 - vld1.8 {q1}, [r1], r2 -NRND vmov.i16 q11, #1 - pld [r1] - pld [r1, r2] - vext.8 d4, d0, d1, #1 - vext.8 d6, d2, d3, #1 - vaddl.u8 q8, d0, d4 - vaddl.u8 q9, d2, d6 -1: subs r3, r3, #2 - vld1.8 {q0}, [r1], r2 - pld [r1] - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -NRND vadd.u16 q10, q10, q11 - vaddl.u8 q8, d0, d4 - shrn d5, q10, #2 - vld1.8 {q1}, [r1], r2 - vadd.u16 q10, q8, q9 - pld [r1, r2] - .if \avg - vld1.8 {d7}, [r0,:64] - vrhadd.u8 d5, d5, d7 - .endif 
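The pixels16/pixels8 _x2, _y2 and _xy2 macros compute the three half-pel positions as two- and four-pixel averages; the avg, shrn and NRND helper macros defined further down in pixfunc select between the rounded forms (vrhadd.u8 / vrshrn.u16) and the no_rnd forms (vhadd.u8 / vshrn.u16 plus the explicit #1 bias loaded with vmov.i16). A scalar sketch of the arithmetic, with rnd equal to 1 for the put_ variants and 0 for _no_rnd (names are illustrative only):

    #include <stdint.h>

    /* x2 / y2: average of two horizontally or vertically adjacent pixels. */
    static uint8_t hpel_avg2(uint8_t a, uint8_t b, int rnd)
    {
        return (uint8_t)((a + b + rnd) >> 1);
    }

    /* xy2: average of the 2x2 neighbourhood. */
    static uint8_t hpel_avg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d, int rnd)
    {
        return (uint8_t)((a + b + c + d + 1 + rnd) >> 2);
    }
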
-NRND vadd.u16 q10, q10, q11 - vst1.8 {d5}, [r0,:64], r2 - shrn d7, q10, #2 - .if \avg - vld1.8 {d5}, [r0,:64] - vrhadd.u8 d7, d7, d5 - .endif - vext.8 d6, d2, d3, #1 - vaddl.u8 q9, d2, d6 - vst1.8 {d7}, [r0,:64], r2 - bgt 1b - - vld1.8 {q0}, [r1], r2 - vadd.u16 q10, q8, q9 - vext.8 d4, d0, d1, #1 -NRND vadd.u16 q10, q10, q11 - vaddl.u8 q8, d0, d4 - shrn d5, q10, #2 - vadd.u16 q10, q8, q9 - .if \avg - vld1.8 {d7}, [r0,:64] - vrhadd.u8 d5, d5, d7 - .endif -NRND vadd.u16 q10, q10, q11 - vst1.8 {d5}, [r0,:64], r2 - shrn d7, q10, #2 - .if \avg - vld1.8 {d5}, [r0,:64] - vrhadd.u8 d7, d7, d5 - .endif - vst1.8 {d7}, [r0,:64], r2 - - bx lr -.endm - -.macro pixfunc pfx, name, suf, rnd=1, avg=0 - .if \rnd - .macro avg rd, rn, rm - vrhadd.u8 \rd, \rn, \rm - .endm - .macro shrn rd, rn, rm - vrshrn.u16 \rd, \rn, \rm - .endm - .macro NRND insn:vararg - .endm - .else - .macro avg rd, rn, rm - vhadd.u8 \rd, \rn, \rm - .endm - .macro shrn rd, rn, rm - vshrn.u16 \rd, \rn, \rm - .endm - .macro NRND insn:vararg - \insn - .endm - .endif -function ff_\pfx\name\suf\()_neon, export=1 - \name \rnd, \avg -endfunc - .purgem avg - .purgem shrn - .purgem NRND -.endm - -.macro pixfunc2 pfx, name, avg=0 - pixfunc \pfx, \name, rnd=1, avg=\avg - pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg -.endm - -function ff_put_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc put_, pixels16, avg=0 - pixfunc2 put_, pixels16_x2, avg=0 - pixfunc2 put_, pixels16_y2, avg=0 - pixfunc2 put_, pixels16_xy2, avg=0 - -function ff_avg_h264_qpel16_mc00_neon, export=1 - mov r3, #16 -endfunc - - pixfunc avg_, pixels16, avg=1 - pixfunc2 avg_, pixels16_x2, avg=1 - pixfunc2 avg_, pixels16_y2, avg=1 - pixfunc2 avg_, pixels16_xy2, avg=1 - -function ff_put_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - - pixfunc put_, pixels8, avg=0 - pixfunc2 put_, pixels8_x2, avg=0 - pixfunc2 put_, pixels8_y2, avg=0 - pixfunc2 put_, pixels8_xy2, avg=0 - -function ff_avg_h264_qpel8_mc00_neon, export=1 - mov r3, #8 -endfunc - - pixfunc avg_, pixels8, avg=1 - pixfunc avg_, pixels8_x2, avg=1 - pixfunc avg_, pixels8_y2, avg=1 - pixfunc avg_, pixels8_xy2, avg=1 diff --git a/ffmpeg/libavcodec/arm/int_neon.S b/ffmpeg/libavcodec/arm/int_neon.S deleted file mode 100644 index b3f5a69..0000000 --- a/ffmpeg/libavcodec/arm/int_neon.S +++ /dev/null @@ -1,92 +0,0 @@ -/* - * ARM NEON optimised integer operations - * Copyright (c) 2009 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - - .fpu neon - -function ff_scalarproduct_int16_neon, export=1 - vmov.i16 q0, #0 - vmov.i16 q1, #0 - vmov.i16 q2, #0 - vmov.i16 q3, #0 -1: vld1.16 {d16-d17}, [r0]! - vld1.16 {d20-d21}, [r1,:128]! - vmlal.s16 q0, d16, d20 - vld1.16 {d18-d19}, [r0]! - vmlal.s16 q1, d17, d21 - vld1.16 {d22-d23}, [r1,:128]! 
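ff_scalarproduct_int16_neon accumulates four widening multiply-accumulate chains in q0-q3 and reduces them with vpadd/vpaddl at the end; ff_scalarproduct_and_madd_int16_neon below additionally rewrites v1 in place through r12. A scalar reference of what the two loops compute, assuming order is a multiple of 16 as the subs ..., #16 counters require (names are illustrative only):

    #include <stdint.h>

    static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                         int order)
    {
        int32_t res = 0;
        while (order--)
            res += *v1++ * *v2++;
        return res;
    }

    /* Returns the scalar product of v1 and v2 and, as a side effect,
     * updates v1[i] += mul * v3[i] (using the old v1 for the product). */
    static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                                  const int16_t *v3, int order,
                                                  int mul)
    {
        int32_t res = 0;
        do {
            res   += *v1 * *v2++;
            *v1++ += mul * *v3++;
        } while (--order);
        return res;
    }
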
- vmlal.s16 q2, d18, d22 - vmlal.s16 q3, d19, d23 - subs r2, r2, #16 - bne 1b - - vpadd.s32 d16, d0, d1 - vpadd.s32 d17, d2, d3 - vpadd.s32 d18, d4, d5 - vpadd.s32 d19, d6, d7 - vpadd.s32 d0, d16, d17 - vpadd.s32 d1, d18, d19 - vpadd.s32 d2, d0, d1 - vpaddl.s32 d3, d2 - vmov.32 r0, d3[0] - bx lr -endfunc - -@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul) -function ff_scalarproduct_and_madd_int16_neon, export=1 - vld1.16 {d28[],d29[]}, [sp] - vmov.i16 q0, #0 - vmov.i16 q1, #0 - vmov.i16 q2, #0 - vmov.i16 q3, #0 - mov r12, r0 - -1: vld1.16 {d16-d17}, [r0,:128]! - vld1.16 {d18-d19}, [r1]! - vld1.16 {d20-d21}, [r2]! - vld1.16 {d22-d23}, [r0,:128]! - vld1.16 {d24-d25}, [r1]! - vld1.16 {d26-d27}, [r2]! - vmul.s16 q10, q10, q14 - vmul.s16 q13, q13, q14 - vmlal.s16 q0, d16, d18 - vmlal.s16 q1, d17, d19 - vadd.s16 q10, q8, q10 - vadd.s16 q13, q11, q13 - vmlal.s16 q2, d22, d24 - vmlal.s16 q3, d23, d25 - vst1.16 {q10}, [r12,:128]! - subs r3, r3, #16 - vst1.16 {q13}, [r12,:128]! - bne 1b - - vpadd.s32 d16, d0, d1 - vpadd.s32 d17, d2, d3 - vpadd.s32 d18, d4, d5 - vpadd.s32 d19, d6, d7 - vpadd.s32 d0, d16, d17 - vpadd.s32 d1, d18, d19 - vpadd.s32 d2, d0, d1 - vpaddl.s32 d3, d2 - vmov.32 r0, d3[0] - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/jrevdct_arm.S b/ffmpeg/libavcodec/arm/jrevdct_arm.S deleted file mode 100644 index f951e2a..0000000 --- a/ffmpeg/libavcodec/arm/jrevdct_arm.S +++ /dev/null @@ -1,383 +0,0 @@ -/* - C-like prototype : - void j_rev_dct_arm(DCTBLOCK data) - - With DCTBLOCK being a pointer to an array of 64 'signed shorts' - - Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER - IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
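For reference, a minimal C sketch of the fixed-point convention that the FIX_* table and the DESCALE rounding steps below assume (the same scheme as libjpeg's integer inverse DCT, with CONST_BITS = 13 and PASS1_BITS = 2). The FIX()/DESCALE() helpers and the small demo are illustrative only, not part of the original file.

#include <stdint.h>
#include <stdio.h>

#define CONST_BITS 13
#define PASS1_BITS 2

/* FIX(x): encode x as an integer with a 13-bit fractional part,
   which is how the FIX_* constants below were produced. */
#define FIX(x) ((int32_t)((x) * (1 << CONST_BITS) + 0.5))

/* DESCALE(x, n): divide by 2^n with rounding; the assembly implements
   this as "add #(1 << (n - 1))" followed by an arithmetic shift by n. */
#define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))

int main(void)
{
    printf("%d\n", FIX(0.541196100));  /* 4433  == FIX_0_541196100 */
    printf("%d\n", FIX(0.765366865));  /* 6270  == FIX_0_765366865 */
    /* The row pass descales by CONST_BITS - PASS1_BITS = 11,
       the column pass by CONST_BITS + PASS1_BITS + 3 = 18. */
    printf("%d\n", DESCALE(100 << 11, CONST_BITS - PASS1_BITS));  /* 100 */
    return 0;
}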
- -*/ - -#include "libavutil/arm/asm.S" - -#define FIX_0_298631336 2446 -#define FIX_0_541196100 4433 -#define FIX_0_765366865 6270 -#define FIX_1_175875602 9633 -#define FIX_1_501321110 12299 -#define FIX_2_053119869 16819 -#define FIX_3_072711026 25172 -#define FIX_M_0_390180644 -3196 -#define FIX_M_0_899976223 -7373 -#define FIX_M_1_847759065 -15137 -#define FIX_M_1_961570560 -16069 -#define FIX_M_2_562915447 -20995 -#define FIX_0xFFFF 0xFFFF - -#define FIX_0_298631336_ID 0 -#define FIX_0_541196100_ID 4 -#define FIX_0_765366865_ID 8 -#define FIX_1_175875602_ID 12 -#define FIX_1_501321110_ID 16 -#define FIX_2_053119869_ID 20 -#define FIX_3_072711026_ID 24 -#define FIX_M_0_390180644_ID 28 -#define FIX_M_0_899976223_ID 32 -#define FIX_M_1_847759065_ID 36 -#define FIX_M_1_961570560_ID 40 -#define FIX_M_2_562915447_ID 44 -#define FIX_0xFFFF_ID 48 - -function ff_j_rev_dct_arm, export=1 - push {r0, r4 - r11, lr} - - mov lr, r0 @ lr = pointer to the current row - mov r12, #8 @ r12 = row-counter - movrel r11, const_array @ r11 = base pointer to the constants array -row_loop: - ldrsh r0, [lr, # 0] @ r0 = 'd0' - ldrsh r2, [lr, # 2] @ r2 = 'd2' - - @ Optimization for row that have all items except the first set to 0 - @ (this works as the int16_t are always 4-byte aligned) - ldr r5, [lr, # 0] - ldr r6, [lr, # 4] - ldr r3, [lr, # 8] - ldr r4, [lr, #12] - orr r3, r3, r4 - orr r3, r3, r6 - orrs r5, r3, r5 - beq end_of_row_loop @ nothing to be done as ALL of them are '0' - orrs r3, r3, r2 - beq empty_row - - ldrsh r1, [lr, # 8] @ r1 = 'd1' - ldrsh r4, [lr, # 4] @ r4 = 'd4' - ldrsh r6, [lr, # 6] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r7, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r7, r3, r7 @ r7 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r7 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r7 @ r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r3, r6, r3, lsl #13 @ r3 = tmp12 - - push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11 - - ldrsh r3, [lr, #10] @ r3 = 'd3' - ldrsh r5, [lr, #12] @ r5 = 'd5' - ldrsh r7, [lr, #14] @ r7 = 'd7' - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 @ r8 = z3 + z4 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) - add r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 0] - - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) - sub r8, r0, r1 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - 
strh r8, [lr, #14] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) - add r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 2] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) - sub r8, r6, r3 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #12] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) - add r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 4] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) - sub r8, r4, r5 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, #10] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) - add r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 6] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) - sub r8, r2, r7 - add r8, r8, #(1<<10) - mov r8, r8, asr #11 - strh r8, [lr, # 8] - - @ End of row loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - beq start_column_loop - -empty_row: - ldr r1, [r11, #FIX_0xFFFF_ID] - mov r0, r0, lsl #2 - and r0, r0, r1 - add r0, r0, r0, lsl #16 - str r0, [lr, # 0] - str r0, [lr, # 4] - str r0, [lr, # 8] - str r0, [lr, #12] - -end_of_row_loop: - @ End of loop - add lr, lr, #16 - subs r12, r12, #1 - bne row_loop - -start_column_loop: - @ Start of column loop - pop {lr} - mov r12, #8 -column_loop: - ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' - ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' - ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' - ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' - - ldr r3, [r11, #FIX_0_541196100_ID] - add r1, r2, r6 - ldr r5, [r11, #FIX_M_1_847759065_ID] - mul r1, r3, r1 @ r1 = z1 - ldr r3, [r11, #FIX_0_765366865_ID] - mla r6, r5, r6, r1 @ r6 = tmp2 - add r5, r0, r4 @ r5 = tmp0 - mla r2, r3, r2, r1 @ r2 = tmp3 - sub r3, r0, r4 @ r3 = tmp1 - - add r0, r2, r5, lsl #13 @ r0 = tmp10 - rsb r2, r2, r5, lsl #13 @ r2 = tmp13 - add r4, r6, r3, lsl #13 @ r4 = tmp11 - rsb r6, r6, r3, lsl #13 @ r6 = tmp12 - - ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' - ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' - ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' - ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' - - @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) - orr r9, r1, r3 - orr r10, r5, r7 - orrs r10, r9, r10 - beq empty_odd_column - - push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11 - - add r0, r3, r5 @ r0 = 'z2' - add r2, r1, r7 @ r2 = 'z1' - add r4, r3, r7 @ r4 = 'z3' - add r6, r1, r5 @ r6 = 'z4' - ldr r9, [r11, #FIX_1_175875602_ID] - add r8, r4, r6 - ldr r10, [r11, #FIX_M_0_899976223_ID] - mul r8, r9, r8 @ r8 = 'z5' - ldr r9, [r11, #FIX_M_2_562915447_ID] - mul r2, r10, r2 @ r2 = 'z1' - ldr r10, [r11, #FIX_M_1_961570560_ID] - mul r0, r9, r0 @ r0 = 'z2' - ldr r9, [r11, #FIX_M_0_390180644_ID] - mla r4, r10, r4, r8 @ r4 = 'z3' - ldr r10, [r11, #FIX_0_298631336_ID] - mla r6, r9, r6, r8 @ r6 = 'z4' - ldr r9, [r11, #FIX_2_053119869_ID] - mla r7, r10, r7, r2 @ r7 = tmp0 + z1 - ldr r10, [r11, #FIX_3_072711026_ID] - mla r5, r9, r5, r0 @ r5 = tmp1 + z2 - ldr r9, [r11, #FIX_1_501321110_ID] - mla r3, r10, r3, r0 @ r3 = tmp2 + z2 - add r7, r7, r4 @ r7 = tmp0 - mla r1, r9, r1, r2 @ r1 = tmp3 + z1 - add r5, r5, r6 @ r5 = tmp1 - add r3, r3, r4 @ r3 = tmp2 - add r1, r1, r6 @ r1 = tmp3 - - pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 - @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 - - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - add r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 0*8)] - - @ Compute DESCALE(tmp10 - tmp3, 
CONST_BITS+PASS1_BITS+3) - sub r8, r0, r1 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - add r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 2*8)] - - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - sub r8, r4, r3 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) - add r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 4*8)] - - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - sub r8, r6, r5 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #(10*8)] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - add r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 6*8)] - - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - sub r8, r2, r7 - add r8, r8, #(1<<17) - mov r8, r8, asr #18 - strh r8, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - beq the_end - -empty_odd_column: - @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) - add r0, r0, #(1<<17) - mov r0, r0, asr #18 - strh r0, [lr, #( 0*8)] - strh r0, [lr, #(14*8)] - - @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) - add r4, r4, #(1<<17) - mov r4, r4, asr #18 - strh r4, [lr, #( 2*8)] - strh r4, [lr, #(12*8)] - - @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) - add r6, r6, #(1<<17) - mov r6, r6, asr #18 - strh r6, [lr, #( 4*8)] - strh r6, [lr, #(10*8)] - - @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) - @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) - add r2, r2, #(1<<17) - mov r2, r2, asr #18 - strh r2, [lr, #( 6*8)] - strh r2, [lr, #( 8*8)] - - @ End of row loop - add lr, lr, #2 - subs r12, r12, #1 - bne column_loop - -the_end: - @ The end.... - pop {r4 - r11, pc} -endfunc - -const const_array - .word FIX_0_298631336 - .word FIX_0_541196100 - .word FIX_0_765366865 - .word FIX_1_175875602 - .word FIX_1_501321110 - .word FIX_2_053119869 - .word FIX_3_072711026 - .word FIX_M_0_390180644 - .word FIX_M_0_899976223 - .word FIX_M_1_847759065 - .word FIX_M_1_961570560 - .word FIX_M_2_562915447 - .word FIX_0xFFFF -endconst diff --git a/ffmpeg/libavcodec/arm/mathops.h b/ffmpeg/libavcodec/arm/mathops.h deleted file mode 100644 index dc57c55..0000000 --- a/ffmpeg/libavcodec/arm/mathops.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_MATHOPS_H -#define AVCODEC_ARM_MATHOPS_H - -#include <stdint.h> -#include "config.h" -#include "libavutil/common.h" - -#if HAVE_INLINE_ASM - -#if HAVE_ARMV6_INLINE -#define MULH MULH -static inline av_const int MULH(int a, int b) -{ - int r; - __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b)); - return r; -} - -#define FASTDIV FASTDIV -static av_always_inline av_const int FASTDIV(int a, int b) -{ - int r; - __asm__ ("cmp %2, #2 \n\t" - "ldr %0, [%3, %2, lsl #2] \n\t" - "ite le \n\t" - "lsrle %0, %1, #1 \n\t" - "smmulgt %0, %0, %1 \n\t" - : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc"); - return r; -} - -#else /* HAVE_ARMV6_INLINE */ - -#define FASTDIV FASTDIV -static av_always_inline av_const int FASTDIV(int a, int b) -{ - int r, t; - __asm__ ("umull %1, %0, %2, %3" - : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b])); - return r; -} -#endif - -#define MLS64(d, a, b) MAC64(d, -(a), b) - -#if HAVE_ARMV5TE_INLINE - -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb)); - -/* signed 16x16 -> 32 multiply */ -# define MUL16 MUL16 -static inline av_const int MUL16(int ra, int rb) -{ - int rt; - __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb)); - return rt; -} - -#endif - -#define mid_pred mid_pred -static inline av_const int mid_pred(int a, int b, int c) -{ - int m; - __asm__ ( - "mov %0, %2 \n\t" - "cmp %1, %2 \n\t" - "itt gt \n\t" - "movgt %0, %1 \n\t" - "movgt %1, %2 \n\t" - "cmp %1, %3 \n\t" - "it le \n\t" - "movle %1, %3 \n\t" - "cmp %0, %1 \n\t" - "it gt \n\t" - "movgt %0, %1 \n\t" - : "=&r"(m), "+r"(a) - : "r"(b), "r"(c) - : "cc"); - return m; -} - -#endif /* HAVE_INLINE_ASM */ - -#endif /* AVCODEC_ARM_MATHOPS_H */ diff --git a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S deleted file mode 100644 index 365c5e7..0000000 --- a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro prerot dst, rt - lsr r3, r6, #2 @ n4 - add \rt, r4, r6, lsr #1 @ revtab + n4 - add r9, r3, r3, lsl #1 @ n3 - add r8, r7, r6 @ tcos + n4 - add r3, r2, r6, lsr #1 @ in + n4 - add r9, r2, r9, lsl #1 @ in + n3 - sub r8, r8, #16 - sub r10, r3, #16 - sub r11, r9, #16 - mov r12, #-16 -1: - vld2.16 {d0,d1}, [r9, :128]! - vld2.16 {d2,d3}, [r11,:128], r12 - vld2.16 {d4,d5}, [r3, :128]! - vld2.16 {d6,d7}, [r10,:128], r12 - vld2.16 {d16,d17},[r7, :128]! 
@ cos, sin - vld2.16 {d18,d19},[r8, :128], r12 - vrev64.16 q1, q1 - vrev64.16 q3, q3 - vrev64.16 q9, q9 - vneg.s16 d0, d0 - vneg.s16 d2, d2 - vneg.s16 d16, d16 - vneg.s16 d18, d18 - vhsub.s16 d0, d0, d3 @ re - vhsub.s16 d4, d7, d4 @ im - vhsub.s16 d6, d6, d5 - vhsub.s16 d2, d2, d1 - vmull.s16 q10, d0, d16 - vmlsl.s16 q10, d4, d17 - vmull.s16 q11, d0, d17 - vmlal.s16 q11, d4, d16 - vmull.s16 q12, d6, d18 - vmlsl.s16 q12, d2, d19 - vmull.s16 q13, d6, d19 - vmlal.s16 q13, d2, d18 - vshrn.s32 d0, q10, #15 - vshrn.s32 d1, q11, #15 - vshrn.s32 d2, q12, #15 - vshrn.s32 d3, q13, #15 - vzip.16 d0, d1 - vzip.16 d2, d3 - ldrh lr, [r4], #2 - ldrh r2, [\rt, #-2]! - add lr, \dst, lr, lsl #2 - add r2, \dst, r2, lsl #2 - vst1.32 {d0[0]}, [lr,:32] - vst1.32 {d2[0]}, [r2,:32] - ldrh lr, [r4], #2 - ldrh r2, [\rt, #-2]! - add lr, \dst, lr, lsl #2 - add r2, \dst, r2, lsl #2 - vst1.32 {d0[1]}, [lr,:32] - vst1.32 {d2[1]}, [r2,:32] - ldrh lr, [r4], #2 - ldrh r2, [\rt, #-2]! - add lr, \dst, lr, lsl #2 - add r2, \dst, r2, lsl #2 - vst1.32 {d1[0]}, [lr,:32] - vst1.32 {d3[0]}, [r2,:32] - ldrh lr, [r4], #2 - ldrh r2, [\rt, #-2]! - add lr, \dst, lr, lsl #2 - add r2, \dst, r2, lsl #2 - vst1.32 {d1[1]}, [lr,:32] - vst1.32 {d3[1]}, [r2,:32] - subs r6, r6, #32 - bgt 1b -.endm - -function ff_mdct_fixed_calc_neon, export=1 - push {r1,r4-r11,lr} - - ldr r4, [r0, #8] @ revtab - ldr r6, [r0, #16] @ mdct_size; n - ldr r7, [r0, #24] @ tcos - - prerot r1, r5 - - mov r4, r0 - bl X(ff_fft_fixed_calc_neon) - - pop {r5} - mov r12, #-16 - ldr r6, [r4, #16] @ mdct_size; n - ldr r7, [r4, #24] @ tcos - add r5, r5, r6, lsr #1 - add r7, r7, r6, lsr #1 - sub r1, r5, #16 - sub r2, r7, #16 -1: - vld2.16 {d4,d5}, [r7,:128]! - vld2.16 {d6,d7}, [r2,:128], r12 - vld2.16 {d0,d1}, [r5,:128] - vld2.16 {d2,d3}, [r1,:128] - vrev64.16 q3, q3 - vrev64.16 q1, q1 - vneg.s16 q3, q3 - vneg.s16 q2, q2 - vmull.s16 q11, d2, d6 - vmlal.s16 q11, d3, d7 - vmull.s16 q8, d0, d5 - vmlsl.s16 q8, d1, d4 - vmull.s16 q9, d0, d4 - vmlal.s16 q9, d1, d5 - vmull.s16 q10, d2, d7 - vmlsl.s16 q10, d3, d6 - vshrn.s32 d0, q11, #15 - vshrn.s32 d1, q8, #15 - vshrn.s32 d2, q9, #15 - vshrn.s32 d3, q10, #15 - vrev64.16 q0, q0 - vst2.16 {d2,d3}, [r5,:128]! - vst2.16 {d0,d1}, [r1,:128], r12 - subs r6, r6, #32 - bgt 1b - - pop {r4-r11,pc} -endfunc - -function ff_mdct_fixed_calcw_neon, export=1 - push {r1,r4-r11,lr} - - ldrd r4, r5, [r0, #8] @ revtab, tmp_buf - ldr r6, [r0, #16] @ mdct_size; n - ldr r7, [r0, #24] @ tcos - - prerot r5, r1 - - mov r4, r0 - mov r1, r5 - bl X(ff_fft_fixed_calc_neon) - - pop {r7} - mov r12, #-16 - ldr r6, [r4, #16] @ mdct_size; n - ldr r9, [r4, #24] @ tcos - add r5, r5, r6, lsr #1 - add r7, r7, r6 - add r9, r9, r6, lsr #1 - sub r3, r5, #16 - sub r1, r7, #16 - sub r2, r9, #16 -1: - vld2.16 {d4,d5}, [r9,:128]! - vld2.16 {d6,d7}, [r2,:128], r12 - vld2.16 {d0,d1}, [r5,:128]! - vld2.16 {d2,d3}, [r3,:128], r12 - vrev64.16 q3, q3 - vrev64.16 q1, q1 - vneg.s16 q3, q3 - vneg.s16 q2, q2 - vmull.s16 q8, d2, d6 - vmlal.s16 q8, d3, d7 - vmull.s16 q9, d0, d5 - vmlsl.s16 q9, d1, d4 - vmull.s16 q10, d0, d4 - vmlal.s16 q10, d1, d5 - vmull.s16 q11, d2, d7 - vmlsl.s16 q11, d3, d6 - vrev64.32 q8, q8 - vrev64.32 q9, q9 - vst2.32 {q10,q11},[r7,:128]! 
- vst2.32 {d16,d18},[r1,:128], r12 - vst2.32 {d17,d19},[r1,:128], r12 - subs r6, r6, #32 - bgt 1b - - pop {r4-r11,pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/mdct_neon.S b/ffmpeg/libavcodec/arm/mdct_neon.S deleted file mode 100644 index e481cd1..0000000 --- a/ffmpeg/libavcodec/arm/mdct_neon.S +++ /dev/null @@ -1,301 +0,0 @@ -/* - * ARM NEON optimised MDCT - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define ff_fft_calc_neon X(ff_fft_calc_neon) - -function ff_imdct_half_neon, export=1 - push {r4-r8,lr} - - mov r12, #1 - ldr lr, [r0, #20] @ mdct_bits - ldr r4, [r0, #24] @ tcos - ldr r3, [r0, #8] @ revtab - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #2 @ n4 = n >> 2 - add r7, r2, r12, lsl #1 - mov r12, #-16 - sub r7, r7, #16 - - vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0 - vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 -1: - subs lr, lr, #2 - ldr r6, [r3], #4 - vmul.f32 d4, d0, d3 - vmul.f32 d5, d17, d3 - vsub.f32 d4, d6, d4 - vadd.f32 d5, d5, d7 - uxth r8, r6, ror #16 - uxth r6, r6 - add r8, r1, r8, lsl #3 - add r6, r1, r6, lsl #3 - beq 1f - vld2.32 {d16-d17},[r7,:128],r12 - vld2.32 {d0-d1}, [r2,:128]! - vrev64.32 d17, d17 - vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2 - vmul.f32 d6, d17, d2 - vmul.f32 d7, d0, d2 - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - b 1b -1: - vst2.32 {d4[0],d5[0]}, [r6,:64] - vst2.32 {d4[1],d5[1]}, [r8,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #20] @ mdct_bits - ldr r4, [r4, #24] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0 - vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3 - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 - vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3 - vmul.f32 d4, d1, d18 - vmul.f32 d5, d21, d19 - vmul.f32 d6, d20, d19 - vmul.f32 d22, d1, d16 - vmul.f32 d23, d21, d17 - vmul.f32 d24, d0, d16 - vmul.f32 d25, d20, d17 - vadd.f32 d7, d7, d22 - vadd.f32 d6, d6, d23 - vsub.f32 d4, d4, d24 - vsub.f32 d5, d5, d25 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! 
- b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r8,pc} -endfunc - -function ff_imdct_calc_neon, export=1 - push {r4-r6,lr} - - ldr r3, [r0, #20] - mov r4, #1 - mov r5, r1 - lsl r4, r4, r3 - add r1, r1, r4 - - bl ff_imdct_half_neon - - add r0, r5, r4, lsl #2 - add r1, r5, r4, lsl #1 - sub r0, r0, #8 - sub r2, r1, #16 - mov r3, #-16 - mov r6, #-8 - vmov.i32 d30, #1<<31 -1: - vld1.32 {d0-d1}, [r2,:128], r3 - pld [r0, #-16] - vrev64.32 q0, q0 - vld1.32 {d2-d3}, [r1,:128]! - veor d4, d1, d30 - pld [r2, #-16] - vrev64.32 q1, q1 - veor d5, d0, d30 - vst1.32 {d2}, [r0,:64], r6 - vst1.32 {d3}, [r0,:64], r6 - vst1.32 {d4-d5}, [r5,:128]! - subs r4, r4, #16 - bgt 1b - - pop {r4-r6,pc} -endfunc - -function ff_mdct_calc_neon, export=1 - push {r4-r10,lr} - - mov r12, #1 - ldr lr, [r0, #20] @ mdct_bits - ldr r4, [r0, #24] @ tcos - ldr r3, [r0, #8] @ revtab - lsl lr, r12, lr @ n = 1 << nbits - add r7, r2, lr @ in4u - sub r9, r7, #16 @ in4d - add r2, r7, lr, lsl #1 @ in3u - add r8, r9, lr, lsl #1 @ in3d - add r5, r4, lr, lsl #1 - sub r5, r5, #16 - sub r3, r3, #4 - mov r12, #-16 - - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I -1: - vmul.f32 d7, d0, d21 @ I*s -A ldr r10, [r3, lr, lsr #1] -T lsr r10, lr, #1 -T ldr r10, [r3, r10] - vmul.f32 d6, d1, d20 @ -R*c - ldr r6, [r3, #4]! - vmul.f32 d4, d1, d21 @ -R*s - vmul.f32 d5, d0, d20 @ I*c - vmul.f32 d24, d16, d30 @ R*c - vmul.f32 d25, d17, d31 @ -I*s - vmul.f32 d22, d16, d31 @ R*s - vmul.f32 d23, d17, d30 @ I*c - subs lr, lr, #16 - vsub.f32 d6, d6, d7 @ -R*c-I*s - vadd.f32 d7, d4, d5 @ -R*s+I*c - vsub.f32 d24, d25, d24 @ I*s-R*c - vadd.f32 d25, d22, d23 @ R*s-I*c - beq 1f - mov r12, #-16 - vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0 - vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0 - vneg.f32 d7, d7 @ R*s-I*c - vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0 - vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1 - vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0 - vsub.f32 d0, d18, d0 @ in4d-in4u I - vld2.32 {d20,d21},[r4,:128]! 
@ c0,c1 s0,s1 - vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1 - vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3 - vadd.f32 d1, d1, d19 @ in3u+in3d -R - vsub.f32 d16, d16, d2 @ in0u-in2d R - vadd.f32 d17, d17, d3 @ in2u+in1d -I - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - b 1b -1: - vneg.f32 d7, d7 @ R*s-I*c - uxth r12, r6, ror #16 - uxth r6, r6 - add r12, r1, r12, lsl #3 - add r6, r1, r6, lsl #3 - vst2.32 {d6[0],d7[0]}, [r6,:64] - vst2.32 {d6[1],d7[1]}, [r12,:64] - uxth r6, r10, ror #16 - uxth r10, r10 - add r6 , r1, r6, lsl #3 - add r10, r1, r10, lsl #3 - vst2.32 {d24[0],d25[0]},[r10,:64] - vst2.32 {d24[1],d25[1]},[r6,:64] - - mov r4, r0 - mov r6, r1 - bl ff_fft_calc_neon - - mov r12, #1 - ldr lr, [r4, #20] @ mdct_bits - ldr r4, [r4, #24] @ tcos - lsl r12, r12, lr @ n = 1 << nbits - lsr lr, r12, #3 @ n8 = n >> 3 - - add r4, r4, lr, lsl #3 - add r6, r6, lr, lsl #3 - sub r1, r4, #16 - sub r3, r6, #16 - - mov r7, #-16 - mov r8, r6 - mov r0, r3 - - vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0 - vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3 - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 -1: - subs lr, lr, #2 - vmul.f32 d7, d0, d18 @ r1*s1,r0*s0 - vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3 - vmul.f32 d4, d1, d18 @ i1*s1,i0*s0 - vmul.f32 d5, d21, d19 @ i2*s2,i3*s3 - vmul.f32 d6, d20, d19 @ r2*s2,r3*s3 - vmul.f32 d24, d0, d16 @ r1*c1,r0*c0 - vmul.f32 d25, d20, d17 @ r2*c2,r3*c3 - vmul.f32 d22, d21, d17 @ i2*c2,i3*c3 - vmul.f32 d23, d1, d16 @ i1*c1,i0*c0 - vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0 - vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3 - vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3 - vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0 - vneg.f32 q2, q2 - beq 1f - vld2.32 {d0-d1}, [r3,:128], r7 - vld2.32 {d20-d21},[r6,:128]! - vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0 - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128], r7 - vst2.32 {d5,d7}, [r8,:128]! - b 1b -1: - vrev64.32 q3, q3 - vst2.32 {d4,d6}, [r0,:128] - vst2.32 {d5,d7}, [r8,:128] - - pop {r4-r10,pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S deleted file mode 100644 index 977abb6..0000000 --- a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro skip args:vararg -.endm - -.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0 - ldr \t1, [\w, #4*\offs] - ldr \t2, [\p, #4]! - \rsb \t1, \t1, #0 - .irpc i, 135 - ldr \t3, [\w, #4*64*\i+4*\offs] - ldr \t4, [\p, #4*64*\i] - smlal \lo, \hi, \t1, \t2 - \rsb \t3, \t3, #0 - ldr \t1, [\w, #4*64*(\i+1)+4*\offs] - ldr \t2, [\p, #4*64*(\i+1)] - smlal \lo, \hi, \t3, \t4 - \rsb \t1, \t1, #0 - .endr - ldr \t3, [\w, #4*64*7+4*\offs] - ldr \t4, [\p, #4*64*7] - smlal \lo, \hi, \t1, \t2 - \rsb \t3, \t3, #0 - smlal \lo, \hi, \t3, \t4 -.endm - -.macro round rd, lo, hi - lsr \rd, \lo, #24 - bic \lo, \lo, #0xff000000 - orr \rd, \rd, \hi, lsl #8 - mov \hi, #0 - ssat \rd, #16, \rd -.endm - -function ff_mpadsp_apply_window_fixed_armv6, export=1 - push {r2,r4-r11,lr} - - add r4, r0, #4*512 @ synth_buf + 512 - .rept 4 - ldm r0!, {r5-r12} - stm r4!, {r5-r12} - .endr - - ldr r4, [sp, #40] @ incr - sub r0, r0, #4*17 @ synth_buf + 16 - ldr r8, [r2] @ sum:low - add r2, r0, #4*32 @ synth_buf + 48 - rsb r5, r4, r4, lsl #5 @ 31 * incr - lsl r4, r4, #1 - asr r9, r8, #31 @ sum:high - add r5, r3, r5, lsl #1 @ samples2 - add r6, r1, #4*32 @ w2 - str r4, [sp, #40] - - sum8 r8, r9, r1, r0, r10, r11, r12, lr - sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32 - round r10, r8, r9 - strh_post r10, r3, r4 - - mov lr, #15 -1: - ldr r12, [r0, #4]! - ldr r11, [r6, #-4]! - ldr r10, [r1, #4]! - .irpc i, 0246 - .if \i - ldr r11, [r6, #4*64*\i] - ldr r10, [r1, #4*64*\i] - .endif - rsb r11, r11, #0 - smlal r8, r9, r10, r12 - ldr r10, [r0, #4*64*(\i+1)] - .ifeq \i - smull r4, r7, r11, r12 - .else - smlal r4, r7, r11, r12 - .endif - ldr r11, [r6, #4*64*(\i+1)] - ldr r12, [r1, #4*64*(\i+1)] - rsb r11, r11, #0 - smlal r8, r9, r12, r10 - .iflt \i-6 - ldr r12, [r0, #4*64*(\i+2)] - .else - ldr r12, [r2, #-4]! - .endif - smlal r4, r7, r11, r10 - .endr - .irpc i, 0246 - ldr r10, [r1, #4*64*\i+4*32] - rsb r12, r12, #0 - ldr r11, [r6, #4*64*\i+4*32] - smlal r8, r9, r10, r12 - ldr r10, [r2, #4*64*(\i+1)] - smlal r4, r7, r11, r12 - ldr r12, [r1, #4*64*(\i+1)+4*32] - rsb r10, r10, #0 - ldr r11, [r6, #4*64*(\i+1)+4*32] - smlal r8, r9, r12, r10 - .iflt \i-6 - ldr r12, [r2, #4*64*(\i+2)] - .else - ldr r12, [sp, #40] - .endif - smlal r4, r7, r11, r10 - .endr - round r10, r8, r9 - adds r8, r8, r4 - adc r9, r9, r7 - strh_post r10, r3, r12 - round r11, r8, r9 - subs lr, lr, #1 - strh_dpost r11, r5, r12 - bgt 1b - - sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33 - pop {r4} - round r10, r8, r9 - str r8, [r4] - strh r10, [r3] - - pop {r4-r11,pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c deleted file mode 100644 index 98e0c8a..0000000 --- a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2011 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/mpegaudiodsp.h" -#include "config.h" - -void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window, - int *dither, int16_t *out, int incr); - -av_cold void ff_mpadsp_init_arm(MPADSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_armv6(cpu_flags)) { - s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6; - } -} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.c b/ffmpeg/libavcodec/arm/mpegvideo_arm.c deleted file mode 100644 index 6566798..0000000 --- a/ffmpeg/libavcodec/arm/mpegvideo_arm.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2002 Michael Niedermayer - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/cpu.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/mpegvideo.h" -#include "mpegvideo_arm.h" -#include "asm-offsets.h" - -#if HAVE_NEON -CHK_OFFS(MpegEncContext, y_dc_scale, Y_DC_SCALE); -CHK_OFFS(MpegEncContext, c_dc_scale, C_DC_SCALE); -CHK_OFFS(MpegEncContext, ac_pred, AC_PRED); -CHK_OFFS(MpegEncContext, block_last_index, BLOCK_LAST_INDEX); -CHK_OFFS(MpegEncContext, inter_scantable.raster_end, INTER_SCANTAB_RASTER_END); -CHK_OFFS(MpegEncContext, h263_aic, H263_AIC); -#endif - -void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block, - int n, int qscale); -void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block, - int n, int qscale); - -av_cold void ff_MPV_common_init_arm(MpegEncContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_armv5te(cpu_flags)) - ff_MPV_common_init_armv5te(s); - - if (have_neon(cpu_flags)) { - s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon; - s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.h b/ffmpeg/libavcodec/arm/mpegvideo_arm.h deleted file mode 100644 index 4ff93b7..0000000 --- a/ffmpeg/libavcodec/arm/mpegvideo_arm.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_MPEGVIDEO_H -#define AVCODEC_ARM_MPEGVIDEO_H - -#include "libavcodec/mpegvideo.h" - -void ff_MPV_common_init_armv5te(MpegEncContext *s); - -#endif /* AVCODEC_ARM_MPEGVIDEO_H */ diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c deleted file mode 100644 index a572290..0000000 --- a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/avassert.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/mpegvideo.h" -#include "mpegvideo_arm.h" - -void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count); - -#ifdef ENABLE_ARM_TESTS -/** - * h263 dequantizer supplementary function, it is performance critical and needs to - * have optimized implementations for each architecture. 
Is also used as a reference - * implementation in regression tests - */ -static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count) -{ - int i, level; - for (i = 0; i < count; i++) { - level = block[i]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[i] = level; - } - } -} -#endif - -static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - - av_assert2(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); - block[0] = level; -} - -static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, - int16_t *block, int n, int qscale) -{ - int qmul, qadd; - int nCoeffs; - - av_assert2(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1); -} - -av_cold void ff_MPV_common_init_armv5te(MpegEncContext *s) -{ - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; -} diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S deleted file mode 100644 index 8687d6b..0000000 --- a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/asm.S" - -/* - * Special optimized version of dct_unquantize_h263_helper_c, it - * requires the block to be at least 8 bytes aligned, and may process - * more elements than requested. But it is guaranteed to never - * process more than 64 elements provided that count argument is <= 64, - * so it is safe. This function is optimized for a common distribution - * of values for nCoeffs (they are mostly multiple of 8 plus one or - * two extra elements). So this function processes data as 8 elements - * per loop iteration and contains optional 2 elements processing in - * the end. 
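A plain-C illustration (hypothetical helper, not taken from the file) of what one dequant_t/dequant_b pair below computes: each 32-bit word of the block carries two int16 coefficients, the "b" macro dequantizes the bottom halfword (smlabb) and the "t" macro the top halfword (smlatb), and coefficients that are zero stay zero.

#include <stdint.h>
#include <stdio.h>

static void dequant_pair(uint32_t word, int qmul, int qadd, int16_t out[2])
{
    int lo = (int16_t)(word & 0xffff);  /* bottom halfword: dequant_b / smlabb */
    int hi = (int16_t)(word >> 16);     /* top halfword:    dequant_t / smlatb */

    out[0] = lo ? (lo < 0 ? lo * qmul - qadd : lo * qmul + qadd) : 0;
    out[1] = hi ? (hi < 0 ? hi * qmul - qadd : hi * qmul + qadd) : 0;
}

int main(void)
{
    int16_t out[2];
    /* qscale = 5: qmul = 2*qscale = 10, qadd = (qscale - 1) | 1 = 5;
       the packed word 0xFFFF0003 holds the levels 3 (low) and -1 (high) */
    dequant_pair(0xFFFF0003u, 10, 5, out);
    printf("%d %d\n", out[0], out[1]);  /* 35 -15 */
    return 0;
}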
- * - * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) - */ - -.macro dequant_t dst, src, mul, add, tmp - rsbs \tmp, ip, \src, asr #16 - it gt - addgt \tmp, \add, #0 - it lt - rsblt \tmp, \add, #0 - it ne - smlatbne \dst, \src, \mul, \tmp -.endm - -.macro dequant_b dst, src, mul, add, tmp - rsbs \tmp, ip, \src, lsl #16 - it gt - addgt \tmp, \add, #0 - it lt - rsblt \tmp, \add, #0 - it ne - smlabbne \dst, \src, \mul, \tmp -.endm - -function ff_dct_unquantize_h263_armv5te, export=1 - push {r4-r9,lr} - mov ip, #0 - subs r3, r3, #2 - ble 2f - ldrd r4, r5, [r0, #0] -1: - ldrd r6, r7, [r0, #8] - - dequant_t r9, r4, r1, r2, r9 - dequant_t lr, r5, r1, r2, lr - dequant_b r4, r4, r1, r2, r8 - dequant_b r5, r5, r1, r2, r8 - - strh r4, [r0], #2 - strh r9, [r0], #2 - strh r5, [r0], #2 - strh lr, [r0], #2 - - dequant_t r9, r6, r1, r2, r9 - dequant_t lr, r7, r1, r2, lr - dequant_b r6, r6, r1, r2, r8 - dequant_b r7, r7, r1, r2, r8 - - strh r6, [r0], #2 - strh r9, [r0], #2 - strh r7, [r0], #2 - strh lr, [r0], #2 - - subs r3, r3, #8 - it gt - ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */ - bgt 1b - - adds r3, r3, #2 - it le - pople {r4-r9,pc} -2: - ldrsh r9, [r0, #0] - ldrsh lr, [r0, #2] - mov r8, r2 - cmp r9, #0 - it lt - rsblt r8, r2, #0 - it ne - smlabbne r9, r9, r1, r8 - mov r8, r2 - cmp lr, #0 - it lt - rsblt r8, r2, #0 - it ne - smlabbne lr, lr, r1, r8 - strh r9, [r0], #2 - strh lr, [r0], #2 - pop {r4-r9,pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/mpegvideo_neon.S b/ffmpeg/libavcodec/arm/mpegvideo_neon.S deleted file mode 100644 index e05df8e..0000000 --- a/ffmpeg/libavcodec/arm/mpegvideo_neon.S +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "asm-offsets.h" - -function ff_dct_unquantize_h263_inter_neon, export=1 - add r12, r0, #BLOCK_LAST_INDEX - ldr r12, [r12, r2, lsl #2] - add r0, r0, #INTER_SCANTAB_RASTER_END - ldrb r12, [r0, r12] - sub r2, r3, #1 - lsl r0, r3, #1 - orr r2, r2, #1 - add r3, r12, #1 -endfunc - -function ff_dct_unquantize_h263_neon, export=1 - vdup.16 q15, r0 @ qmul - vdup.16 q14, r2 @ qadd - vneg.s16 q13, q14 - cmp r3, #4 - mov r0, r1 - ble 2f -1: - vld1.16 {q0}, [r0,:128]! - vclt.s16 q3, q0, #0 - vld1.16 {q8}, [r0,:128]! - vceq.s16 q1, q0, #0 - vmul.s16 q2, q0, q15 - vclt.s16 q11, q8, #0 - vmul.s16 q10, q8, q15 - vbsl q3, q13, q14 - vbsl q11, q13, q14 - vadd.s16 q2, q2, q3 - vceq.s16 q9, q8, #0 - vadd.s16 q10, q10, q11 - vbif q0, q2, q1 - vbif q8, q10, q9 - subs r3, r3, #16 - vst1.16 {q0}, [r1,:128]! - vst1.16 {q8}, [r1,:128]! 
- it le - bxle lr - cmp r3, #8 - bgt 1b -2: - vld1.16 {d0}, [r0,:64] - vclt.s16 d3, d0, #0 - vceq.s16 d1, d0, #0 - vmul.s16 d2, d0, d30 - vbsl d3, d26, d28 - vadd.s16 d2, d2, d3 - vbif d0, d2, d1 - vst1.16 {d0}, [r1,:64] - bx lr -endfunc - -function ff_dct_unquantize_h263_intra_neon, export=1 - push {r4-r6,lr} - add r12, r0, #BLOCK_LAST_INDEX - ldr r6, [r0, #AC_PRED] - add lr, r0, #INTER_SCANTAB_RASTER_END - cmp r6, #0 - it ne - movne r12, #63 - bne 1f - ldr r12, [r12, r2, lsl #2] - ldrb r12, [lr, r12] -1: ldr r5, [r0, #H263_AIC] - ldrsh r4, [r1] - cmp r5, #0 - mov r5, r1 - it ne - movne r2, #0 - bne 2f - cmp r2, #4 - it ge - addge r0, r0, #4 - sub r2, r3, #1 - ldr r6, [r0, #Y_DC_SCALE] - orr r2, r2, #1 - smulbb r4, r4, r6 -2: lsl r0, r3, #1 - add r3, r12, #1 - bl ff_dct_unquantize_h263_neon - vmov.16 d0[0], r4 - vst1.16 {d0[0]}, [r5] - pop {r4-r6,pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/neon.S b/ffmpeg/libavcodec/arm/neon.S deleted file mode 100644 index 787bc4b..0000000 --- a/ffmpeg/libavcodec/arm/neon.S +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7 - vtrn.32 \r0, \r4 - vtrn.32 \r1, \r5 - vtrn.32 \r2, \r6 - vtrn.32 \r3, \r7 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.16 \r4, \r6 - vtrn.16 \r5, \r7 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 - vtrn.8 \r4, \r5 - vtrn.8 \r6, \r7 -.endm - -.macro transpose_4x4 r0, r1, r2, r3 - vtrn.16 \r0, \r2 - vtrn.16 \r1, \r3 - vtrn.8 \r0, \r1 - vtrn.8 \r2, \r3 -.endm - -.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7 - vswp \r0, \r4 - vswp \r1, \r5 - vswp \r2, \r6 - vswp \r3, \r7 -.endm - -.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7 - vtrn.32 \r0, \r2 - vtrn.32 \r1, \r3 - vtrn.32 \r4, \r6 - vtrn.32 \r5, \r7 - vtrn.16 \r0, \r1 - vtrn.16 \r2, \r3 - vtrn.16 \r4, \r5 - vtrn.16 \r6, \r7 -.endm diff --git a/ffmpeg/libavcodec/arm/rdft_neon.S b/ffmpeg/libavcodec/arm/rdft_neon.S deleted file mode 100644 index 781d976..0000000 --- a/ffmpeg/libavcodec/arm/rdft_neon.S +++ /dev/null @@ -1,150 +0,0 @@ -/* - * ARM NEON optimised RDFT - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_rdft_calc_neon, export=1 - push {r4-r8,lr} - - ldr r6, [r0, #4] @ inverse - mov r4, r0 - mov r5, r1 - - lsls r6, r6, #31 - bne 1f - add r0, r4, #20 - bl X(ff_fft_permute_neon) - add r0, r4, #20 - mov r1, r5 - bl X(ff_fft_calc_neon) -1: - ldr r12, [r4, #0] @ nbits - mov r2, #1 - lsl r12, r2, r12 - add r0, r5, #8 - add r1, r5, r12, lsl #2 - lsr r12, r12, #2 - ldr r2, [r4, #12] @ tcos - sub r12, r12, #2 - ldr r3, [r4, #16] @ tsin - mov r7, r0 - sub r1, r1, #8 - mov lr, r1 - mov r8, #-8 - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - vld1.32 {d5}, [r3,:64]! @ tsin[i] - vmov.f32 d18, #0.5 @ k1 - vdup.32 d19, r6 - pld [r0, #32] - veor d19, d18, d19 @ k2 - vmov.i32 d16, #0 - vmov.i32 d17, #1<<31 - pld [r1, #-32] - vtrn.32 d16, d17 - pld [r2, #32] - vrev64.32 d16, d16 @ d16=1,0 d17=0,1 - pld [r3, #32] -2: - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vld1.32 {d24}, [r0,:64]! @ d1[0,1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] - pld [r0, #32] - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - pld [r1, #-32] - vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] - vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - veor d2, d3, d16 @ -od.re, od.im - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vld1.32 {d4}, [r2,:64]! @ tcos[i] - veor d7, d23, d16 @ -od.im, od.re - vld1.32 {d5}, [r3,:64]! @ tsin[i] - veor d24, d22, d17 @ ev.re,-ev.im - vrev64.32 d3, d23 @ od.re, od.im - pld [r2, #32] - veor d2, d3, d16 @ -od.re, od.im - pld [r3, #32] - vmla.f32 d22, d3, d4[0] - vmla.f32 d22, d7, d5[0] - vmla.f32 d24, d2, d4[0] - vmla.f32 d24, d23, d5[0] - vld1.32 {d0}, [r0,:64]! @ d1[0,1] - vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] - vst1.32 {d20}, [r7,:64]! - vst1.32 {d6}, [lr,:64], r8 - vst1.32 {d22}, [r7,:64]! 
- vst1.32 {d24}, [lr,:64], r8 - subs r12, r12, #2 - bgt 2b - - veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] - vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] - vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] - ldr r2, [r4, #8] @ sign_convention - vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re - add r0, r0, #4 - bfc r2, #0, #31 - vld1.32 {d0[0]}, [r0,:32] - veor d7, d21, d16 @ -od.im, od.re - vrev64.32 d3, d21 @ od.re, od.im - veor d6, d20, d17 @ ev.re,-ev.im - vld1.32 {d22}, [r5,:64] - vdup.32 d1, r2 - vmov d23, d22 - veor d2, d3, d16 @ -od.re, od.im - vtrn.32 d22, d23 - veor d0, d0, d1 - veor d23, d23, d17 - vmla.f32 d20, d3, d4[1] - vmla.f32 d20, d7, d5[1] - vmla.f32 d6, d2, d4[1] - vmla.f32 d6, d21, d5[1] - vadd.f32 d22, d22, d23 - vst1.32 {d20}, [r7,:64] - vst1.32 {d6}, [lr,:64] - vst1.32 {d0[0]}, [r0,:32] - vst1.32 {d22}, [r5,:64] - - cmp r6, #0 - it eq - popeq {r4-r8,pc} - - vmul.f32 d22, d22, d18 - vst1.32 {d22}, [r5,:64] - add r0, r4, #20 - mov r1, r5 - bl X(ff_fft_permute_neon) - add r0, r4, #20 - mov r1, r5 - pop {r4-r8,lr} - b X(ff_fft_calc_neon) -endfunc diff --git a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c deleted file mode 100644 index 8bfe90b..0000000 --- a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/rv34dsp.h" -#include "libavutil/arm/cpu.h" - -void ff_rv34_inv_transform_noround_neon(int16_t *block); - -void ff_rv34_inv_transform_noround_dc_neon(int16_t *block); - -void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block); -void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc); - -av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon; - c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon; - - c->rv34_idct_add = ff_rv34_idct_add_neon; - c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/rv34dsp_neon.S b/ffmpeg/libavcodec/arm/rv34dsp_neon.S deleted file mode 100644 index 3d4a83d..0000000 --- a/ffmpeg/libavcodec/arm/rv34dsp_neon.S +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "neon.S" - -.macro rv34_inv_transform r0 - vld1.16 {q14-q15}, [\r0,:128] - vmov.s16 d0, #13 - vshll.s16 q12, d29, #3 - vshll.s16 q13, d29, #4 - vshll.s16 q9, d31, #3 - vshll.s16 q1, d31, #4 - vmull.s16 q10, d28, d0 - vmlal.s16 q10, d30, d0 - vmull.s16 q11, d28, d0 - vmlsl.s16 q11, d30, d0 - vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7 - vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17 - vsubw.s16 q9, q9, d31 - vaddw.s16 q1, q1, d31 - vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3] - vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3] - vadd.s32 q1, q10, q13 @ z0 + z3 - vadd.s32 q2, q11, q12 @ z1 + z2 - vsub.s32 q8, q10, q13 @ z0 - z3 - vsub.s32 q3, q11, q12 @ z1 - z2 - vtrn.32 q1, q2 - vtrn.32 q3, q8 - vswp d3, d6 - vswp d5, d16 - vmov.s32 d0, #13 - vadd.s32 q10, q1, q3 - vsub.s32 q11, q1, q3 - vshl.s32 q12, q2, #3 - vshl.s32 q9, q2, #4 - vmul.s32 q13, q11, d0[0] - vshl.s32 q11, q8, #4 - vadd.s32 q9, q9, q2 - vshl.s32 q15, q8, #3 - vsub.s32 q12, q12, q2 - vadd.s32 q11, q11, q8 - vmul.s32 q14, q10, d0[0] - vsub.s32 q8, q15, q8 - vsub.s32 q12, q12, q11 - vadd.s32 q9, q9, q8 - vadd.s32 q2, q13, q12 @ z1 + z2 - vadd.s32 q1, q14, q9 @ z0 + z3 - vsub.s32 q3, q13, q12 @ z1 - z2 - vsub.s32 q15, q14, q9 @ z0 - z3 -.endm - -/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */ -function ff_rv34_idct_add_neon, export=1 - mov r3, r0 - rv34_inv_transform r2 - vmov.i16 q12, #0 - vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10 - vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10 - vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10 - vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10 - vld1.32 {d28[]}, [r0,:32], r1 - vld1.32 {d29[]}, [r0,:32], r1 - vtrn.32 q8, q9 - vld1.32 {d28[1]}, [r0,:32], r1 - vld1.32 {d29[1]}, [r0,:32], r1 - vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16) - vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16) - vtrn.16 d16, d17 - vtrn.32 d28, d29 - vtrn.16 d18, d19 - vaddw.u8 q0, q8, d28 - vaddw.u8 q1, q9, d29 - vqmovun.s16 d28, q0 - vqmovun.s16 d29, q1 - vst1.32 {d28[0]}, [r3,:32], r1 - vst1.32 {d28[1]}, [r3,:32], r1 - vst1.32 {d29[0]}, [r3,:32], r1 - vst1.32 {d29[1]}, [r3,:32], r1 - bx lr -endfunc - -/* void rv34_inv_transform_noround_neon(int16_t *block); */ -function ff_rv34_inv_transform_noround_neon, export=1 - rv34_inv_transform r0 - vshl.s32 q11, q2, #1 - vshl.s32 q10, q1, #1 - vshl.s32 q12, q3, #1 - vshl.s32 q13, q15, #1 - vadd.s32 q11, q11, q2 - vadd.s32 q10, q10, q1 - vadd.s32 q12, q12, q3 - vadd.s32 q13, q13, q15 - vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11 - vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11 - vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11 - vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11 - vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]! - vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]! - vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]! - vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]! 
- bx lr -endfunc - -/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */ -function ff_rv34_idct_dc_add_neon, export=1 - mov r3, r0 - vld1.32 {d28[]}, [r0,:32], r1 - vld1.32 {d29[]}, [r0,:32], r1 - vdup.16 d0, r2 - vmov.s16 d1, #169 - vld1.32 {d28[1]}, [r0,:32], r1 - vmull.s16 q1, d0, d1 @ dc * 13 * 13 - vld1.32 {d29[1]}, [r0,:32], r1 - vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10 - vmov d1, d0 - vaddw.u8 q2, q0, d28 - vaddw.u8 q3, q0, d29 - vqmovun.s16 d28, q2 - vqmovun.s16 d29, q3 - vst1.32 {d28[0]}, [r3,:32], r1 - vst1.32 {d29[0]}, [r3,:32], r1 - vst1.32 {d28[1]}, [r3,:32], r1 - vst1.32 {d29[1]}, [r3,:32], r1 - bx lr -endfunc - -/* void rv34_inv_transform_dc_noround_c(int16_t *block) */ -function ff_rv34_inv_transform_noround_dc_neon, export=1 - vld1.16 {d28[]}, [r0,:16] @ block[0] - vmov.i16 d4, #251 - vorr.s16 d4, #256 @ 13^2 * 3 - vmull.s16 q3, d28, d4 - vshrn.s32 d0, q3, #11 - vmov.i16 d1, d0 - vst1.64 {q0}, [r0,:128]! - vst1.64 {q0}, [r0,:128]! - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c deleted file mode 100644 index 3bf9ac7..0000000 --- a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavcodec/avcodec.h" -#include "libavcodec/rv34dsp.h" -#include "libavutil/arm/cpu.h" - -#define DECL_QPEL3(type, w, pos) \ - void ff_##type##_rv40_qpel##w##_mc##pos##_neon(uint8_t *dst, uint8_t *src,\ - ptrdiff_t stride) -#define DECL_QPEL2(w, pos) \ - DECL_QPEL3(put, w, pos); \ - DECL_QPEL3(avg, w, pos) - -#define DECL_QPEL_XY(x, y) \ - DECL_QPEL2(16, x ## y); \ - DECL_QPEL2(8, x ## y) - -#define DECL_QPEL_Y(y) \ - DECL_QPEL_XY(0, y); \ - DECL_QPEL_XY(1, y); \ - DECL_QPEL_XY(2, y); \ - DECL_QPEL_XY(3, y); \ - -DECL_QPEL_Y(0); -DECL_QPEL_Y(1); -DECL_QPEL_Y(2); -DECL_QPEL_Y(3); - -void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int); -void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int); - -void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t); -void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t); - -int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride, - int beta, int beta2, int edge, - int *p1, int *q1); -int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride, - int beta, int beta2, int edge, - int *p1, int *q1); - -void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1, - int filter_q1, int alpha, int beta, - int lim_p0q0, int lim_q1, int lim_p1); -void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1, - int filter_q1, int alpha, int beta, - int lim_p0q0, int lim_q1, int lim_p1); - -static av_cold void rv40dsp_init_neon(RV34DSPContext *c) -{ - c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon; - c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon; - c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon; - c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon; - c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon; - c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon; - c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon; - c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon; - c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon; - c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon; - c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon; - c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon; - c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon; - c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon; - c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon; - c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon; - c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon; - c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon; - c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon; - c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon; - c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon; - c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon; - c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon; - c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon; - c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon; - c->avg_pixels_tab[0][15] = 
ff_avg_rv40_qpel16_mc33_neon; - c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon; - c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon; - c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon; - c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon; - c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon; - c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon; - c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon; - c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon; - c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon; - c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon; - c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon; - c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon; - c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon; - c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon; - c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon; - c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon; - c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon; - c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon; - c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon; - c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon; - c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon; - c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon; - c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon; - c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon; - c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon; - c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon; - - c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon; - c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon; - c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; - c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; - - c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon; - c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon; - - c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; - c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; - c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon; - c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon; -} - -av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) - rv40dsp_init_neon(c); -} diff --git a/ffmpeg/libavcodec/arm/rv40dsp_neon.S b/ffmpeg/libavcodec/arm/rv40dsp_neon.S deleted file mode 100644 index 099f88c..0000000 --- a/ffmpeg/libavcodec/arm/rv40dsp_neon.S +++ /dev/null @@ -1,920 +0,0 @@ -/* - * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net> - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "neon.S" - -.macro qpel_lowpass r0, r1, rc1, rc2, shift - vext.8 d25, \r0, \r1, #1 @ src[-1] - vext.8 d26, \r0, \r1, #4 @ src[ 2] - vext.8 d24, \r0, \r1, #5 @ src[ 3] - vaddl.u8 q9, d25, d26 - vaddl.u8 q8, \r0, d24 - vext.8 d27, \r0, \r1, #2 @ src[ 0] - vshl.s16 q12, q9, #2 - vsub.s16 q8, q8, q9 - vext.8 d28, \r0, \r1, #3 @ src[ 1] - vsub.s16 q8, q8, q12 - vmlal.u8 q8, d27, \rc1 - vmlal.u8 q8, d28, \rc2 - vqrshrun.s16 \r0, q8, #\shift -.endm - -.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift - vext.8 d25, \r0, \r1, #1 @ src[-1] - vext.8 d26, \r0, \r1, #4 @ src[ 2] - vext.8 d24, \r0, \r1, #5 @ src[ 3] - vaddl.u8 q9, d25, d26 - vaddl.u8 q8, \r0, d24 - vext.8 d29, \r0, \r1, #2 @ src[ 0] - vext.8 d28, \r0, \r1, #3 @ src[ 1] - vshl.s16 q10, q9, #2 - vext.8 \r1, \r2, \r3, #1 @ src[-1] - vsub.s16 q8, q8, q9 - vext.8 d22, \r2, \r3, #4 @ src[ 2] - vext.8 \r0, \r2, \r3, #5 @ src[ 3] - vaddl.u8 q13, \r1, d22 - vaddl.u8 q12, \r2, \r0 - vsub.s16 q8, q8, q10 - vshl.s16 q9, q13, #2 - vsub.s16 q12, q12, q13 - vmlal.u8 q8, d29, \rc1 - vmlal.u8 q8, d28, \rc2 - vsub.s16 q12, q12, q9 - vext.8 d26, \r2, \r3, #2 @ src[ 0] - vext.8 d27, \r2, \r3, #3 @ src[ 1] - vmlal.u8 q12, d26, \rc1 - vmlal.u8 q12, d27, \rc2 - vqrshrun.s16 \r0, q8, #\shift - vqrshrun.s16 \r2, q12, #\shift -.endm - -.macro rv40_qpel8_h shift -function put_rv40_qpel8_h_lp_packed_s\shift\()_neon -1: - vld1.8 {q2}, [r1], r2 - vld1.8 {q3}, [r1], r2 - qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift - vst1.8 {d4}, [r12,:64]! - vst1.8 {d6}, [r12,:64]! - subs r3, r3, #2 - bgt 1b - vld1.8 {q2}, [r1] - qpel_lowpass d4, d5, d0, d1, \shift - vst1.8 {d4}, [r12,:64]! - bx lr -endfunc -.endm - -.macro rv40_qpel8_v shift, type -function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon - vld1.64 {d2}, [r1,:64]! - vld1.64 {d3}, [r1,:64]! - vld1.64 {d4}, [r1,:64]! - vld1.64 {d5}, [r1,:64]! - vld1.64 {d6}, [r1,:64]! - vld1.64 {d7}, [r1,:64]! - vld1.64 {d8}, [r1,:64]! - vld1.64 {d9}, [r1,:64]! - vld1.64 {d10}, [r1,:64]! - vld1.64 {d11}, [r1,:64]! - vld1.64 {d12}, [r1,:64]! - vld1.64 {d13}, [r1,:64]! - vld1.64 {d14}, [r1,:64]! 
- transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 - transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31 - qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift - qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift - qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift - qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift - transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 - .ifc \type,avg - vld1.64 d12, [r0,:64], r2 - vld1.64 d13, [r0,:64], r2 - vld1.64 d14, [r0,:64], r2 - vld1.64 d15, [r0,:64], r2 - vld1.64 d16, [r0,:64], r2 - vld1.64 d17, [r0,:64], r2 - vld1.64 d18, [r0,:64], r2 - vld1.64 d19, [r0,:64], r2 - sub r0, r0, r2, lsl #3 - vrhadd.u8 q1, q1, q6 - vrhadd.u8 q2, q2, q7 - vrhadd.u8 q3, q3, q8 - vrhadd.u8 q4, q4, q9 - .endif - vst1.64 d2, [r0,:64], r2 - vst1.64 d3, [r0,:64], r2 - vst1.64 d4, [r0,:64], r2 - vst1.64 d5, [r0,:64], r2 - vst1.64 d6, [r0,:64], r2 - vst1.64 d7, [r0,:64], r2 - vst1.64 d8, [r0,:64], r2 - vst1.64 d9, [r0,:64], r2 - bx lr -endfunc -.endm - - rv40_qpel8_h 5 - rv40_qpel8_h 6 - -.macro rv40_qpel type -function \type\()_rv40_qpel8_h_lowpass_neon - .ifc \type,avg - mov r12, r0 - .endif -1: - vld1.8 {q2}, [r1], r2 - vld1.8 {q3}, [r1], r2 - qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6 - .ifc \type,avg - vld1.8 {d3}, [r12,:64], r2 - vld1.8 {d16}, [r12,:64], r2 - vrhadd.u8 d4, d4, d3 - vrhadd.u8 d6, d6, d16 - .endif - vst1.8 {d4}, [r0,:64], r2 - vst1.8 {d6}, [r0,:64], r2 - subs r3, r3, #2 - bgt 1b - bx lr -endfunc - -function \type\()_rv40_qpel8_v_lowpass_neon - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d8}, [r1], r2 - vld1.64 {d9}, [r1], r2 - vld1.64 {d10}, [r1], r2 - vld1.64 {d11}, [r1], r2 - vld1.64 {d12}, [r1], r2 - vld1.64 {d13}, [r1], r2 - vld1.64 {d14}, [r1] - transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 - transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31 - qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6 - qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6 - qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6 - qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6 - transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9 - .ifc \type,avg - vld1.64 d12, [r0,:64], r2 - vld1.64 d13, [r0,:64], r2 - vld1.64 d14, [r0,:64], r2 - vld1.64 d15, [r0,:64], r2 - vld1.64 d16, [r0,:64], r2 - vld1.64 d17, [r0,:64], r2 - vld1.64 d18, [r0,:64], r2 - vld1.64 d19, [r0,:64], r2 - sub r0, r0, r2, lsl #3 - vrhadd.u8 q1, q1, q6 - vrhadd.u8 q2, q2, q7 - vrhadd.u8 q3, q3, q8 - vrhadd.u8 q4, q4, q9 - .endif - vst1.64 d2, [r0,:64], r2 - vst1.64 d3, [r0,:64], r2 - vst1.64 d4, [r0,:64], r2 - vst1.64 d5, [r0,:64], r2 - vst1.64 d6, [r0,:64], r2 - vst1.64 d7, [r0,:64], r2 - vst1.64 d8, [r0,:64], r2 - vst1.64 d9, [r0,:64], r2 - bx lr -endfunc - - rv40_qpel8_v 5, \type - rv40_qpel8_v 6, \type - -function ff_\type\()_rv40_qpel8_mc10_neon, export=1 - sub r1, r1, #2 - mov r3, #8 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - b \type\()_rv40_qpel8_h_lowpass_neon -endfunc - -function ff_\type\()_rv40_qpel8_mc30_neon, export=1 - sub r1, r1, #2 - mov r3, #8 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - b \type\()_rv40_qpel8_h_lowpass_neon -endfunc - -function ff_\type\()_rv40_qpel8_mc01_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub r1, r1, r2, lsl #1 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl \type\()_rv40_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc11_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, 
#12 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - add r1, sp, #7 - bic r1, r1, #7 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc21_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - add r1, sp, #7 - bic r1, r1, #7 - vmov.i8 d0, #52 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc31_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - bl put_rv40_qpel8_h_lp_packed_s6_neon - add r1, sp, #7 - bic r1, r1, #7 - vswp d0, d1 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc12_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - add r1, sp, #7 - bic r1, r1, #7 - vmov.i8 d0, #20 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc22_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - add r1, sp, #7 - bic r1, r1, #7 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc32_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - bl put_rv40_qpel8_h_lp_packed_s6_neon - add r1, sp, #7 - bic r1, r1, #7 - vmov.i8 d1, #20 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc03_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub r1, r1, r2, lsl #1 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - bl \type\()_rv40_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc33_neon, export=1 - mov r3, #8 - b X(ff_\type\()_pixels8_xy2_neon) -endfunc - -function ff_\type\()_rv40_qpel8_mc13_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - add r1, sp, #7 - bic r1, r1, #7 - vswp d0, d1 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop {r4, pc} -endfunc - -function ff_\type\()_rv40_qpel8_mc23_neon, export=1 - push {r4, lr} - vpush {d8-d15} - sub sp, sp, #14*8 - add r12, sp, #7 - bic r12, r12, #7 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - mov r3, #12 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - add r1, sp, #7 - bic r1, r1, #7 - vmov.i8 d1, #52 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #14*8 - vpop {d8-d15} - pop 
{r4, pc} -endfunc - -function ff_\type\()_rv40_qpel16_mc10_neon, export=1 - vmov.i8 d0, #52 - vmov.i8 d1, #20 -.L\type\()_rv40_qpel16_h: - push {r1, lr} - sub r1, r1, #2 - mov r3, #16 - bl \type\()_rv40_qpel8_h_lowpass_neon - pop {r1, lr} - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #6 - mov r3, #16 - b \type\()_rv40_qpel8_h_lowpass_neon -endfunc - -function ff_\type\()_rv40_qpel16_mc30_neon, export=1 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - b .L\type\()_rv40_qpel16_h -endfunc - -function ff_\type\()_rv40_qpel16_mc01_neon, export=1 - vmov.i8 d0, #52 - vmov.i8 d1, #20 -.L\type\()_rv40_qpel16_v: - sub r1, r1, r2, lsl #1 - push {r1, lr} - vpush {d8-d15} - bl \type\()_rv40_qpel8_v_lowpass_neon - sub r1, r1, r2, lsl #2 - bl \type\()_rv40_qpel8_v_lowpass_neon - ldr r1, [sp, #64] - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - add r1, r1, #8 - bl \type\()_rv40_qpel8_v_lowpass_neon - sub r1, r1, r2, lsl #2 - bl \type\()_rv40_qpel8_v_lowpass_neon - vpop {d8-d15} - pop {r1, pc} -endfunc - -function ff_\type\()_rv40_qpel16_mc11_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon -.L\type\()_rv40_qpel16_v_s6: - add r1, sp, #7 - bic r1, r1, #7 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - sub r1, r1, #40 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - sub r1, r1, #40 - bl \type\()_rv40_qpel8_v_lp_packed_s6_neon - add sp, sp, #44*8 - vpop {d8-d15} - pop {r1, pc} -endfunc - -function ff_\type\()_rv40_qpel16_mc21_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - vmov.i8 d0, #52 - b .L\type\()_rv40_qpel16_v_s6 -endfunc - -function ff_\type\()_rv40_qpel16_mc31_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - bl put_rv40_qpel8_h_lp_packed_s6_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - vswp d0, d1 - b .L\type\()_rv40_qpel16_v_s6 -endfunc - -function ff_\type\()_rv40_qpel16_mc12_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - vmov.i8 d0, #20 -.L\type\()_rv40_qpel16_v_s5: - add r1, sp, #7 - bic r1, r1, #7 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - sub r1, r1, #40 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - sub r0, r0, r2, lsl #4 - add r0, r0, #8 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - sub r1, r1, #40 - bl \type\()_rv40_qpel8_v_lp_packed_s5_neon - add sp, sp, #44*8 - vpop {d8-d15} - pop {r1, pc} -endfunc - -function ff_\type\()_rv40_qpel16_mc22_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, 
r12, #7 - mov r3, #20 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - b .L\type\()_rv40_qpel16_v_s5 -endfunc - -function ff_\type\()_rv40_qpel16_mc32_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - bl put_rv40_qpel8_h_lp_packed_s6_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - vmov.i8 d1, #20 - b .L\type\()_rv40_qpel16_v_s5 -endfunc - -function ff_\type\()_rv40_qpel16_mc03_neon, export=1 - vmov.i8 d0, #20 - vmov.i8 d1, #52 - b .L\type\()_rv40_qpel16_v -endfunc - -function ff_\type\()_rv40_qpel16_mc13_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #52 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s6_neon - vswp d0, d1 - b .L\type\()_rv40_qpel16_v_s6 -endfunc - -function ff_\type\()_rv40_qpel16_mc23_neon, export=1 - sub r1, r1, r2, lsl #1 - sub r1, r1, #2 - push {r1, lr} - vpush {d8-d15} - sub sp, sp, #44*8 - add r12, sp, #7 - bic r12, r12, #7 - mov r3, #20 - vmov.i8 d0, #20 - vmov.i8 d1, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - ldr r1, [sp, #416] - add r1, r1, #8 - mov r3, #20 - bl put_rv40_qpel8_h_lp_packed_s5_neon - vmov.i8 d1, #52 - b .L\type\()_rv40_qpel16_v_s6 -endfunc - -function ff_\type\()_rv40_qpel16_mc33_neon, export=1 - mov r3, #16 - b X(ff_\type\()_pixels16_xy2_neon) -endfunc -.endm - - rv40_qpel put - rv40_qpel avg - -.macro rv40_weight - vmovl.u8 q8, d2 - vmovl.u8 q9, d3 - vmovl.u8 q10, d4 - vmovl.u8 q11, d5 - vmull.u16 q2, d16, d0[2] - vmull.u16 q3, d17, d0[2] - vmull.u16 q8, d18, d0[2] - vmull.u16 q9, d19, d0[2] - vmull.u16 q12, d20, d0[0] - vmull.u16 q13, d21, d0[0] - vmull.u16 q14, d22, d0[0] - vmull.u16 q15, d23, d0[0] - vshrn.i32 d4, q2, #9 - vshrn.i32 d5, q3, #9 - vshrn.i32 d6, q8, #9 - vshrn.i32 d7, q9, #9 - vshrn.i32 d16, q12, #9 - vshrn.i32 d17, q13, #9 - vshrn.i32 d18, q14, #9 - vshrn.i32 d19, q15, #9 - vadd.u16 q2, q2, q8 - vadd.u16 q3, q3, q9 - vrshrn.i16 d2, q2, #5 - vrshrn.i16 d3, q3, #5 -.endm - -/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int w1, int w2, int stride) */ -function ff_rv40_weight_func_16_neon, export=1 - ldr r12, [sp] - vmov d0, r3, r12 - ldr r12, [sp, #4] - mov r3, #16 -1: - vld1.8 {q1}, [r1,:128], r12 - vld1.8 {q2}, [r2,:128], r12 - rv40_weight - vst1.8 {q1}, [r0,:128], r12 - subs r3, r3, #1 - bne 1b - bx lr -endfunc - -/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, - int w1, int w2, int stride) */ -function ff_rv40_weight_func_8_neon, export=1 - ldr r12, [sp] - vmov d0, r3, r12 - ldr r12, [sp, #4] - mov r3, #8 -1: - vld1.8 {d2}, [r1,:64], r12 - vld1.8 {d3}, [r1,:64], r12 - vld1.8 {d4}, [r2,:64], r12 - vld1.8 {d5}, [r2,:64], r12 - rv40_weight - vst1.8 {d2}, [r0,:64], r12 - vst1.8 {d3}, [r0,:64], r12 - subs r3, r3, #2 - bne 1b - bx lr -endfunc - -function ff_rv40_h_loop_filter_strength_neon, export=1 - pkhbt r2, r3, r2, lsl #18 - - ldr r3, [r0] - ldr_dpre r12, r0, r1 - teq r3, r12 - beq 1f - - sub r0, r0, r1, lsl #1 - - vld1.32 {d4[]}, [r0,:32], r1 @ -3 - vld1.32 {d0[]}, [r0,:32], r1 @ -2 - vld1.32 {d4[1]}, [r0,:32], r1 @ -1 - 
vld1.32 {d5[]}, [r0,:32], r1 @ 0 - vld1.32 {d1[]}, [r0,:32], r1 @ 1 - vld1.32 {d5[0]}, [r0,:32], r1 @ 2 - - vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1 - vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0 - vdup.32 d30, r2 @ beta2, beta << 2 - vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1 - vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0 - vabd.u16 d16, d18, d16 - vclt.u16 d16, d16, d30 - - ldrd r2, r3, [sp, #4] - vmovl.u16 q12, d16 - vtrn.16 d16, d17 - vshr.u32 q12, q12, #15 - ldr r0, [sp] - vst1.32 {d24[1]}, [r2,:32] - vst1.32 {d25[1]}, [r3,:32] - - cmp r0, #0 - it eq - bxeq lr - - vand d18, d16, d17 - vtrn.32 d18, d19 - vand d18, d18, d19 - vmov.u16 r0, d18[0] - bx lr -1: - ldrd r2, r3, [sp, #4] - mov r0, #0 - str r0, [r2] - str r0, [r3] - bx lr -endfunc - -function ff_rv40_v_loop_filter_strength_neon, export=1 - sub r0, r0, #3 - pkhbt r2, r3, r2, lsl #18 - - vld1.8 {d0}, [r0], r1 - vld1.8 {d1}, [r0], r1 - vld1.8 {d2}, [r0], r1 - vld1.8 {d3}, [r0], r1 - - vaddl.u8 q0, d0, d1 - vaddl.u8 q1, d2, d3 - vdup.32 q15, r2 - vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2 - vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2 - vabd.u16 q0, q1, q0 - vclt.u16 q0, q0, q15 - - ldrd r2, r3, [sp, #4] - vmovl.u16 q1, d0 - vext.16 d1, d0, d1, #3 - vshr.u32 q1, q1, #15 - ldr r0, [sp] - vst1.32 {d2[1]}, [r2,:32] - vst1.32 {d3[1]}, [r3,:32] - - cmp r0, #0 - it eq - bxeq lr - - vand d0, d0, d1 - vtrn.16 d0, d1 - vand d0, d0, d1 - vmov.u16 r0, d0[0] - bx lr -endfunc - -.macro rv40_weak_loop_filter - vdup.16 d30, r2 @ filter_p1 - vdup.16 d31, r3 @ filter_q1 - ldrd r2, r3, [sp] - vdup.16 d28, r2 @ alpha - vdup.16 d29, r3 @ beta - ldr r12, [sp, #8] - vdup.16 d25, r12 @ lim_p0q0 - ldrd r2, r3, [sp, #12] - vsubl.u8 q9, d5, d4 @ x, t - vabdl.u8 q8, d5, d4 @ x, abs(t) - vneg.s16 q15, q15 - vceq.i16 d16, d19, #0 @ !t - vshl.s16 d19, d19, #2 @ t << 2 - vmul.u16 d18, d17, d28 @ alpha * abs(t) - vand d24, d30, d31 @ filter_p1 & filter_q1 - vsubl.u8 q1, d0, d4 @ p1p2, p1p0 - vsubl.u8 q3, d1, d5 @ q1q2, q1q0 - vmov.i16 d22, #3 - vshr.u16 d18, d18, #7 - vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1) - vsubl.u8 q10, d0, d1 @ src[-2] - src[1] - vcle.u16 d18, d18, d22 - vand d20, d20, d24 - vneg.s16 d23, d25 @ -lim_p0q0 - vadd.s16 d19, d19, d20 - vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1) - vtrn.32 d4, d5 @ -3, 2, -1, 0 - vrshr.s16 d19, d19, #3 - vmov d28, d29 @ beta - vswp d3, d6 @ q1q2, p1p0 - vmin.s16 d19, d19, d25 - vand d30, d30, d16 - vand d31, d31, d16 - vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0 - vmax.s16 d19, d19, d23 @ diff - vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2) - vand d18, d19, d16 @ diff - vcle.u16 q1, q1, q14 - vneg.s16 d19, d18 @ -diff - vdup.16 d26, r3 @ lim_p1 - vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff - vhsub.s16 q11, q10, q9 - vand q1, q1, q15 - vqmovun.s16 d4, q2 @ -1, 0 - vand q9, q11, q1 - vdup.16 d27, r2 @ lim_q1 - vneg.s16 q9, q9 - vneg.s16 q14, q13 - vmin.s16 q9, q9, q13 - vtrn.32 d0, d1 @ -2, 1, -2, 1 - vmax.s16 q9, q9, q14 - vaddw.u8 q3, q9, d0 - vqmovun.s16 d5, q3 @ -2, 1 -.endm - -function ff_rv40_h_weak_loop_filter_neon, export=1 - sub r0, r0, r1, lsl #1 - sub r0, r0, r1 - - vld1.32 {d4[]}, [r0,:32], r1 - vld1.32 {d0[]}, [r0,:32], r1 - vld1.32 {d4[1]}, [r0,:32], r1 - vld1.32 {d5[]}, [r0,:32], r1 - vld1.32 {d1[]}, [r0,:32], r1 - vld1.32 {d5[0]}, [r0,:32] - - sub r0, r0, r1, lsl #2 - - rv40_weak_loop_filter - - vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r0,:32], r1 - - bx lr -endfunc - -function ff_rv40_v_weak_loop_filter_neon, export=1 - 
sub r12, r0, #3 - sub r0, r0, #2 - - vld1.8 {d4}, [r12], r1 - vld1.8 {d5}, [r12], r1 - vld1.8 {d2}, [r12], r1 - vld1.8 {d3}, [r12], r1 - - vtrn.16 q2, q1 - vtrn.8 d4, d5 - vtrn.8 d2, d3 - - vrev64.32 d5, d5 - vtrn.32 q2, q1 - vdup.32 d0, d3[0] - vdup.32 d1, d2[0] - - rv40_weak_loop_filter - - vtrn.32 q2, q3 - vswp d4, d5 - - vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1 - vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1 - vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1 - vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1 - - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c deleted file mode 100644 index 4fb69f9..0000000 --- a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) 2012 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" -#include "libavutil/arm/cpu.h" -#include "libavutil/attributes.h" -#include "libavcodec/sbrdsp.h" - -void ff_sbr_sum64x5_neon(float *z); -float ff_sbr_sum_square_neon(float (*x)[2], int n); -void ff_sbr_neg_odd_64_neon(float *x); -void ff_sbr_qmf_pre_shuffle_neon(float *z); -void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z); -void ff_sbr_qmf_deint_neg_neon(float *v, const float *src); -void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1); -void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2], - const float *g_filt, int m_max, intptr_t ixh); -void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2], - const float alpha0[2], const float alpha1[2], - float bw, int start, int end); -void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]); - -void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m, - const float *q_filt, int noise, - int kx, int m_max); -void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m, - const float *q_filt, int noise, - int kx, int m_max); -void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m, - const float *q_filt, int noise, - int kx, int m_max); -void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m, - const float *q_filt, int noise, - int kx, int m_max); - -av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - s->sum64x5 = ff_sbr_sum64x5_neon; - s->sum_square = ff_sbr_sum_square_neon; - s->neg_odd_64 = ff_sbr_neg_odd_64_neon; - s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon; - s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon; - s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon; - s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon; - s->hf_g_filt = ff_sbr_hf_g_filt_neon; - s->hf_gen = ff_sbr_hf_gen_neon; - s->autocorrelate = ff_sbr_autocorrelate_neon; - s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon; - 
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon; - s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon; - s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/sbrdsp_neon.S b/ffmpeg/libavcodec/arm/sbrdsp_neon.S deleted file mode 100644 index e66abd6..0000000 --- a/ffmpeg/libavcodec/arm/sbrdsp_neon.S +++ /dev/null @@ -1,411 +0,0 @@ -/* - * Copyright (c) 2012 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_sbr_sum64x5_neon, export=1 - push {lr} - add r1, r0, # 64*4 - add r2, r0, #128*4 - add r3, r0, #192*4 - add lr, r0, #256*4 - mov r12, #64 -1: - vld1.32 {q0}, [r0,:128] - vld1.32 {q1}, [r1,:128]! - vadd.f32 q0, q0, q1 - vld1.32 {q2}, [r2,:128]! - vadd.f32 q0, q0, q2 - vld1.32 {q3}, [r3,:128]! - vadd.f32 q0, q0, q3 - vld1.32 {q8}, [lr,:128]! - vadd.f32 q0, q0, q8 - vst1.32 {q0}, [r0,:128]! - subs r12, #4 - bgt 1b - pop {pc} -endfunc - -function ff_sbr_sum_square_neon, export=1 - vmov.f32 q0, #0.0 -1: - vld1.32 {q1}, [r0,:128]! - vmla.f32 q0, q1, q1 - subs r1, r1, #2 - bgt 1b - vadd.f32 d0, d0, d1 - vpadd.f32 d0, d0, d0 -NOVFP vmov.32 r0, d0[0] - bx lr -endfunc - -function ff_sbr_neg_odd_64_neon, export=1 - mov r1, r0 - vmov.i32 q8, #1<<31 - vld2.32 {q0,q1}, [r0,:128]! - veor q1, q1, q8 - vld2.32 {q2,q3}, [r0,:128]! - .rept 3 - vst2.32 {q0,q1}, [r1,:128]! - veor q3, q3, q8 - vld2.32 {q0,q1}, [r0,:128]! - vst2.32 {q2,q3}, [r1,:128]! - veor q1, q1, q8 - vld2.32 {q2,q3}, [r0,:128]! - .endr - veor q3, q3, q8 - vst2.32 {q0,q1}, [r1,:128]! - vst2.32 {q2,q3}, [r1,:128]! - bx lr -endfunc - -function ff_sbr_qmf_pre_shuffle_neon, export=1 - add r1, r0, #60*4 - add r2, r0, #64*4 - vld1.32 {d0}, [r0,:64]! - vst1.32 {d0}, [r2,:64]! - mov r3, #-16 - mov r12, #24 - vmov.i32 q8, #1<<31 - vld1.32 {q0}, [r1,:128], r3 - vld1.32 {d2}, [r0,:64]! -1: - vld1.32 {d3,d4}, [r0,:128]! - vrev64.32 q0, q0 - vld1.32 {q9}, [r1,:128], r3 - veor q0, q0, q8 - vld1.32 {d5,d6}, [r0,:128]! - vswp d0, d1 - vrev64.32 q9, q9 - vst2.32 {q0,q1}, [r2,:64]! - vmov q10, q2 - veor q9, q9, q8 - vmov d2, d6 - vswp d18, d19 - vld1.32 {q0}, [r1,:128], r3 - vst2.32 {q9,q10}, [r2,:64]! - subs r12, r12, #8 - bgt 1b - vld1.32 {d3,d4}, [r0,:128]! - vrev64.32 q0, q0 - vld1.32 {q9}, [r1,:128], r3 - veor q0, q0, q8 - vld1.32 {d5}, [r0,:64]! - vswp d0, d1 - vrev64.32 q9, q9 - vst2.32 {q0,q1}, [r2,:64]! - vswp d4, d5 - veor q1, q9, q8 - vst2.32 {d3,d5}, [r2,:64]! - vst2.32 {d2[0],d4[0]}, [r2,:64]! - bx lr -endfunc - -function ff_sbr_qmf_post_shuffle_neon, export=1 - add r2, r1, #60*4 - mov r3, #-16 - mov r12, #32 - vmov.i32 q8, #1<<31 - vld1.32 {q0}, [r2,:128], r3 - vld1.32 {q1}, [r1,:128]! -1: - pld [r2, #-32] - vrev64.32 q0, q0 - vswp d2, d3 - veor q0, q0, q8 - vld1.32 {q2}, [r2,:128], r3 - vld1.32 {q3}, [r1,:128]! 
- vst2.32 {d1,d3}, [r0,:128]! - vst2.32 {d0,d2}, [r0,:128]! - pld [r2, #-32] - vrev64.32 q2, q2 - vswp d6, d7 - veor q2, q2, q8 - vld1.32 {q0}, [r2,:128], r3 - vld1.32 {q1}, [r1,:128]! - vst2.32 {d5,d7}, [r0,:128]! - vst2.32 {d4,d6}, [r0,:128]! - subs r12, r12, #8 - bgt 1b - bx lr -endfunc - -function ff_sbr_qmf_deint_neg_neon, export=1 - add r1, r1, #60*4 - add r2, r0, #62*4 - mov r3, #-16 - mov r12, #32 - vmov.i32 d2, #1<<31 -1: - vld2.32 {d0,d1}, [r1,:128], r3 - veor d0, d0, d2 - vrev64.32 d1, d1 - vst1.32 {d0}, [r2,:64] - vst1.32 {d1}, [r0,:64]! - sub r2, r2, #8 - subs r12, r12, #2 - bgt 1b - bx lr -endfunc - -function ff_sbr_qmf_deint_bfly_neon, export=1 - push {lr} - add r2, r2, #60*4 - add r3, r0, #124*4 - mov r12, #64 - mov lr, #-16 -1: - vld1.32 {q0}, [r1,:128]! - vld1.32 {q1}, [r2,:128], lr - vrev64.32 q2, q0 - vrev64.32 q3, q1 - vadd.f32 d3, d4, d3 - vadd.f32 d2, d5, d2 - vsub.f32 d0, d0, d7 - vsub.f32 d1, d1, d6 - vst1.32 {q1}, [r3,:128], lr - vst1.32 {q0}, [r0,:128]! - subs r12, r12, #4 - bgt 1b - pop {pc} -endfunc - -function ff_sbr_hf_g_filt_neon, export=1 - ldr r12, [sp] - add r1, r1, r12, lsl #3 - mov r12, #40*2*4 - sub r3, r3, #1 - vld2.32 {d2[],d3[]},[r2,:64]! - vld1.32 {d0}, [r1,:64], r12 -1: - vld1.32 {d1}, [r1,:64], r12 - vmul.f32 q3, q0, q1 - vld2.32 {d2[],d3[]},[r2,:64]! - vld1.32 {d0}, [r1,:64], r12 - vst1.32 {q3}, [r0,:64]! - subs r3, r3, #2 - bgt 1b - it lt - bxlt lr - vmul.f32 d0, d0, d2 - vst1.32 {d0}, [r0,:64]! - bx lr -endfunc - -function ff_sbr_hf_gen_neon, export=1 -NOVFP vld1.32 {d1[]}, [sp,:32] -VFP vdup.32 d1, d0[0] - vmul.f32 d0, d1, d1 - vld1.32 {d3}, [r2,:64] - vld1.32 {d2}, [r3,:64] - vmul.f32 q0, q0, q1 - ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS] - vtrn.32 d0, d1 - vneg.f32 d18, d1 - vtrn.32 d18, d1 - add r0, r0, r2, lsl #3 - add r1, r1, r2, lsl #3 - sub r1, r1, #2*8 - sub r3, r3, r2 - vld1.32 {q1}, [r1,:128]! -1: - vld1.32 {q3}, [r1,:128]! - vrev64.32 q2, q1 - vmov q8, q3 - vrev64.32 d20, d3 - vrev64.32 d21, d6 - vmla.f32 q3, q1, d0[0] - vmla.f32 d6, d4, d18 - vmla.f32 d7, d20, d18 - vmla.f32 d6, d3, d0[1] - vmla.f32 d7, d16, d0[1] - vmla.f32 d6, d5, d1 - vmla.f32 d7, d21, d1 - vmov q1, q8 - vst1.32 {q3}, [r0,:128]! - subs r3, r3, #2 - bgt 1b - bx lr -endfunc - -function ff_sbr_autocorrelate_neon, export=1 - vld1.32 {q0}, [r0,:128]! - vmov.f32 q1, #0.0 - vmov.f32 q3, #0.0 - vmov.f32 d20, #0.0 - vmul.f32 d21, d1, d1 - vmov q8, q0 - vmov q11, q0 - mov r12, #36 -1: - vld1.32 {q2}, [r0,:128]! - vrev64.32 q12, q2 - vmla.f32 q10, q2, q2 - vmla.f32 d2, d1, d4 - vmla.f32 d3, d1, d24 - vmla.f32 d6, d0, d4 - vmla.f32 d7, d0, d24 - vmla.f32 d2, d4, d5 - vmla.f32 d3, d4, d25 - vmla.f32 d6, d1, d5 - vmla.f32 d7, d1, d25 - vmov q0, q2 - subs r12, r12, #2 - bgt 1b - vld1.32 {q2}, [r0,:128]! - vrev64.32 q12, q2 - vmla.f32 d2, d1, d4 - vmla.f32 d3, d1, d24 - vmla.f32 d6, d0, d4 - vmla.f32 d7, d0, d24 - vadd.f32 d20, d20, d21 - vrev64.32 d18, d17 - vmla.f32 d6, d1, d5 - vmla.f32 d7, d1, d25 - vmov q0, q1 - vmla.f32 d0, d16, d17 - vmla.f32 d1, d16, d18 - vmla.f32 d2, d4, d5 - vmla.f32 d3, d4, d25 - vneg.f32 s15, s15 - vmov d21, d20 - vpadd.f32 d0, d0, d2 - vpadd.f32 d7, d6, d7 - vtrn.32 d1, d3 - vsub.f32 d6, d1, d3 - vmla.f32 d20, d22, d22 - vmla.f32 d21, d4, d4 - vtrn.32 d0, d6 - vpadd.f32 d20, d20, d21 - vst1.32 {q3}, [r1,:128]! 
- vst1.32 {d20[1]}, [r1,:32] - add r1, r1, #2*4 - vst1.32 {d0}, [r1,:64] - add r1, r1, #4*4 - vst1.32 {d20[0]}, [r1,:32] - bx lr -endfunc - -function ff_sbr_hf_apply_noise_0_neon, export=1 - vmov.i32 d3, #0 -.Lhf_apply_noise_0: - push {r4,lr} - movrelx r4, X(ff_sbr_noise_table) - ldr r12, [sp, #12] - add r3, r3, #1 - bfc r3, #9, #23 - sub r12, r12, #1 -1: - add lr, r4, r3, lsl #3 - vld2.32 {q0}, [r0,:64] - vld2.32 {q3}, [lr,:64] - vld1.32 {d2}, [r1,:64]! - vld1.32 {d18}, [r2,:64]! - vceq.f32 d16, d2, #0 - veor d2, d2, d3 - vmov q2, q0 - vmla.f32 d0, d6, d18 - vmla.f32 d1, d7, d18 - vadd.f32 d4, d4, d2 - add r3, r3, #2 - bfc r3, #9, #23 - vbif d0, d4, d16 - vbif d1, d5, d16 - vst2.32 {q0}, [r0,:64]! - subs r12, r12, #2 - bgt 1b - blt 2f - add lr, r4, r3, lsl #3 - vld1.32 {d0}, [r0,:64] - vld1.32 {d6}, [lr,:64] - vld1.32 {d2[]}, [r1,:32]! - vld1.32 {d3[]}, [r2,:32]! - vceq.f32 d4, d2, #0 - veor d2, d2, d3 - vmov d1, d0 - vmla.f32 d0, d6, d3 - vadd.f32 s2, s2, s4 - vbif d0, d1, d4 - vst1.32 {d0}, [r0,:64]! -2: - pop {r4,pc} -endfunc - -function ff_sbr_hf_apply_noise_1_neon, export=1 - ldr r12, [sp] - push {r4,lr} - lsl r12, r12, #31 - eor lr, r12, #1<<31 - vmov d3, r12, lr -.Lhf_apply_noise_1: - movrelx r4, X(ff_sbr_noise_table) - ldr r12, [sp, #12] - add r3, r3, #1 - bfc r3, #9, #23 - sub r12, r12, #1 -1: - add lr, r4, r3, lsl #3 - vld2.32 {q0}, [r0,:64] - vld2.32 {q3}, [lr,:64] - vld1.32 {d2}, [r1,:64]! - vld1.32 {d18}, [r2,:64]! - vceq.f32 d16, d2, #0 - veor d2, d2, d3 - vmov q2, q0 - vmla.f32 d0, d6, d18 - vmla.f32 d1, d7, d18 - vadd.f32 d5, d5, d2 - add r3, r3, #2 - bfc r3, #9, #23 - vbif d0, d4, d16 - vbif d1, d5, d16 - vst2.32 {q0}, [r0,:64]! - subs r12, r12, #2 - bgt 1b - blt 2f - add lr, r4, r3, lsl #3 - vld1.32 {d0}, [r0,:64] - vld1.32 {d6}, [lr,:64] - vld1.32 {d2[]}, [r1,:32]! - vld1.32 {d18[]}, [r2,:32]! - vceq.f32 d4, d2, #0 - veor d2, d2, d3 - vmov d1, d0 - vmla.f32 d0, d6, d18 - vadd.f32 s3, s3, s5 - vbif d0, d1, d4 - vst1.32 {d0}, [r0,:64]! -2: - pop {r4,pc} -endfunc - -function ff_sbr_hf_apply_noise_2_neon, export=1 - vmov.i32 d3, #1<<31 - b .Lhf_apply_noise_0 -endfunc - -function ff_sbr_hf_apply_noise_3_neon, export=1 - ldr r12, [sp] - push {r4,lr} - lsl r12, r12, #31 - eor lr, r12, #1<<31 - vmov d3, lr, r12 - b .Lhf_apply_noise_1 -endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_arm.S b/ffmpeg/libavcodec/arm/simple_idct_arm.S deleted file mode 100644 index 50d20c9..0000000 --- a/ffmpeg/libavcodec/arm/simple_idct_arm.S +++ /dev/null @@ -1,479 +0,0 @@ -/* - * Copyright (C) 2002 Frederic 'dilb' Boulay - * - * Author: Frederic Boulay <dilb@handhelds.org> - * - * The function defined in this file is derived from the simple_idct function - * from the libavcodec library part of the FFmpeg project. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -/* useful constants for the algorithm */ -#define W1 22725 -#define W2 21407 -#define W3 19266 -#define W4 16383 -#define W5 12873 -#define W6 8867 -#define W7 4520 -#define MASK_MSHW 0xFFFF0000 - -#define ROW_SHIFT 11 -#define ROW_SHIFT2MSHW (16-11) -#define COL_SHIFT 20 -#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */ -#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */ - - -function ff_simple_idct_arm, export=1 - @@ void simple_idct_arm(int16_t *block) - @@ save stack for reg needed (take all of them), - @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block - @@ so it must not be overwritten, if it is not saved!! - @@ R12 is another scratch register, so it should not be saved too - @@ save all registers - stmfd sp!, {r4-r11, r14} @ R14 is also called LR - @@ at this point, R0=block, other registers are free. - add r14, r0, #112 @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block. - @@ add 2 temporary variables in the stack: R0 and R14 - sub sp, sp, #8 @ allow 2 local variables - str r0, [sp, #0] @ save block in sp[0] - @@ stack status - @@ sp+4 free - @@ sp+0 R0 (block) - - - @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free - - -__row_loop: - @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :) - ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer) - ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1] - ldr r3, [r14, #8] @ R3=ROWr32[2] - ldr r4, [r14, #12] @ R4=ROWr32[3] - @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop), - @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row) - @@ else follow the complete algorithm. 
- @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free - orr r5, r4, r3 @ R5=R4 | R3 - orr r5, r5, r2 @ R5=R4 | R3 | R2 - orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null) - beq __end_row_loop - mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later) - ldrsh r6, [r14, #0] @ R6=ROWr16[0] - orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7 - beq __almost_empty_row - -@@ __b_evaluation: - @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3], - @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free, - @@ R12=__const_ptr_, R14=&block[n] - @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3 - - @@ MUL16(b0, W1, row[1]); - @@ MUL16(b1, W3, row[1]); - @@ MUL16(b2, W5, row[1]); - @@ MUL16(b3, W7, row[1]); - @@ MAC16(b0, W3, row[3]); - @@ MAC16(b1, -W7, row[3]); - @@ MAC16(b2, -W1, row[3]); - @@ MAC16(b3, -W5, row[3]); - ldr r8, =W1 @ R8=W1 - mov r2, r2, asr #16 @ R2=ROWr16[3] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, =W3 @ R9=W3 - ldr r10, =W5 @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, =W7 @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if null avoid muls - itttt ne - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - it ne - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3] - beq __end_b_evaluation - - @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3], - @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, row[5]); - @@ MAC16(b2, W7, row[5]); - @@ MAC16(b3, W3, row[5]); - @@ MAC16(b1, -W1, row[5]); - @@ MAC16(b0, W7, row[7]); - @@ MAC16(b2, W3, row[7]); - @@ MAC16(b3, -W1, row[7]); - @@ MAC16(b1, -W5, row[7]); - mov r3, r3, asr #16 @ R3=ROWr16[5] - teq r3, #0 @ if null avoid muls - it ne - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0 - mov r4, r4, asr #16 @ R4=ROWr16[7] - itttt ne - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5] - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5]=b1 - @@ R3 is free now - teq r4, #0 @ if null avoid muls - itttt ne - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3 - it ne - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1 - @@ R4 is free now 
-__end_b_evaluation: - @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free), - @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -@@ __a_evaluation: - @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldr r9, =W4 @ R9=W4 - mul r6, r9, r6 @ R6=W4*ROWr16[0] - ldr r10, =W6 @ R10=W6 - ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0) - - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, =W2 @ R8=W2 - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3]; - @@ if (temp != 0) {} - teq r2, #0 - beq __end_bef_a_evaluation - - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - - - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #8] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - it ne - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - ldrsh r9, [r14, #12] @ R9=ROWr16[6] - itttt ne - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - itttt ne - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - itt ne - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) - -__end_a_evaluation: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ row[0] = (a0 + b0) >> ROW_SHIFT; - @@ row[1] = (a1 + b1) >> ROW_SHIFT; - @@ row[2] = (a2 + b2) >> ROW_SHIFT; - @@ row[3] = (a3 + b3) >> ROW_SHIFT; - @@ row[4] = (a3 - b3) >> ROW_SHIFT; - @@ row[5] = (a2 - b2) >> ROW_SHIFT; - @@ row[6] = (a1 - b1) >> ROW_SHIFT; - @@ row[7] = (a0 - b0) >> ROW_SHIFT; - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - @@ put 2 16 bits half-words in a 32bits word - @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!) 
- ldr r10, =MASK_MSHW @ R10=0xFFFF0000 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5) - mvn r11, r10 @ R11= NOT R10= 0x0000FFFF - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11) - orr r8, r8, r9 - str r8, [r14, #0] - - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11) - orr r8, r8, r9 - str r8, [r14, #4] - - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11) - orr r8, r8, r9 - str r8, [r14, #8] - - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5) - and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11) - orr r8, r8, r9 - str r8, [r14, #12] - - bal __end_row_loop - -__almost_empty_row: - @@ the row was empty, except ROWr16[0], now, management of this special case - @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1], - @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1], - @@ R8=0xFFFF (temp), R9-R11 free - mov r8, #0x10000 @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run). - sub r8, r8, #1 @ R8 is now ready. - and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF - orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16) - str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5 - str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5 - str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5 - str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5 - -__end_row_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[8*n] to block, when block is reached, the loop is finished. - sub r14, r14, #16 - bne __row_loop - - - - @@ at this point, R0=block, R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - add r14, r0, #14 @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block. 
-__col_loop: - -@@ __b_evaluation2: - @@ at this point, R0=block (temp), R1-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - @@ proceed with b0-b3 first, followed by a0-a3 - @@ MUL16(b0, W1, col[8x1]); - @@ MUL16(b1, W3, col[8x1]); - @@ MUL16(b2, W5, col[8x1]); - @@ MUL16(b3, W7, col[8x1]); - @@ MAC16(b0, W3, col[8x3]); - @@ MAC16(b1, -W7, col[8x3]); - @@ MAC16(b2, -W1, col[8x3]); - @@ MAC16(b3, -W5, col[8x3]); - ldr r8, =W1 @ R8=W1 - ldrsh r7, [r14, #16] - mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r9, =W3 @ R9=W3 - ldr r10, =W5 @ R10=W5 - mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldr r11, =W7 @ R11=W7 - mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - ldrsh r2, [r14, #48] - mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle) - teq r2, #0 @ if 0, then avoid muls - itttt ne - mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - rsbne r2, r2, #0 @ R2=-ROWr16[3] - mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - it ne - mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle) - - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7, - @@ R12=__const_ptr_, R14=&block[n] - @@ MAC16(b0, W5, col[5x8]); - @@ MAC16(b2, W7, col[5x8]); - @@ MAC16(b3, W3, col[5x8]); - @@ MAC16(b1, -W1, col[5x8]); - @@ MAC16(b0, W7, col[7x8]); - @@ MAC16(b2, W3, col[7x8]); - @@ MAC16(b3, -W1, col[7x8]); - @@ MAC16(b1, -W5, col[7x8]); - ldrsh r3, [r14, #80] @ R3=COLr16[5x8] - teq r3, #0 @ if 0 then avoid muls - itttt ne - mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0 - mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2 - mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3 - rsbne r3, r3, #0 @ R3=-ROWr16[5x8] - ldrsh r4, [r14, #112] @ R4=COLr16[7x8] - it ne - mlane r1, r8, r3, r1 @ R7-=W1*ROWr16[5x8]=b1 - @@ R3 is free now - teq r4, #0 @ if 0 then avoid muls - itttt ne - mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0 - mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2 - rsbne r4, r4, #0 @ R4=-ROWr16[7x8] - mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3 - it ne - mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1 - @@ R4 is free now -@@ __end_b_evaluation2: - @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free), - @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - -@@ __a_evaluation2: - @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1)); - @@ a1 = a0 + W6 * row[2]; - @@ a2 = a0 - W6 * row[2]; - @@ a3 = a0 - W2 * row[2]; - @@ a0 = a0 + W2 * row[2]; - ldrsh r6, [r14, #0] - ldr r9, =W4 @ R9=W4 - mul r6, r9, r6 @ R6=W4*ROWr16[0] - ldr r10, =W6 @ R10=W6 - ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet) - add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0) - mul r11, r10, r4 @ R11=W6*ROWr16[2] - ldr r8, =W2 @ R8=W2 - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 
@ R6=a0+W2*ROWr16[2] (a0) - - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ a0 += W4*row[4] - @@ a1 -= W4*row[4] - @@ a2 -= W4*row[4] - @@ a3 += W4*row[4] - ldrsh r11, [r14, #64] @ R11=ROWr16[4] - teq r11, #0 @ if null avoid muls - itttt ne - mulne r11, r9, r11 @ R11=W4*ROWr16[4] - @@ R9 is free now - addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0) - subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1) - subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2) - ldrsh r9, [r14, #96] @ R9=ROWr16[6] - it ne - addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3) - @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead - teq r9, #0 @ if null avoid muls - itttt ne - mulne r11, r10, r9 @ R11=W6*ROWr16[6] - addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0) - mulne r10, r8, r9 @ R10=W2*ROWr16[6] - @@ a0 += W6*row[6]; - @@ a3 -= W6*row[6]; - @@ a1 -= W2*row[6]; - @@ a2 += W2*row[6]; - subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3) - itt ne - subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1) - addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2) -@@ __end_a_evaluation2: - @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3, - @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free), - @@ R12=__const_ptr_, R14=&block[n] - @@ col[0 ] = ((a0 + b0) >> COL_SHIFT); - @@ col[8 ] = ((a1 + b1) >> COL_SHIFT); - @@ col[16] = ((a2 + b2) >> COL_SHIFT); - @@ col[24] = ((a3 + b3) >> COL_SHIFT); - @@ col[32] = ((a3 - b3) >> COL_SHIFT); - @@ col[40] = ((a2 - b2) >> COL_SHIFT); - @@ col[48] = ((a1 - b1) >> COL_SHIFT); - @@ col[56] = ((a0 - b0) >> COL_SHIFT); - @@@@@ no optimization here @@@@@ - add r8, r6, r0 @ R8=a0+b0 - add r9, r2, r1 @ R9=a1+b1 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #0] - strh r9, [r14, #16] - add r8, r3, r5 @ R8=a2+b2 - add r9, r4, r7 @ R9=a3+b3 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #32] - strh r9, [r14, #48] - sub r8, r4, r7 @ R8=a3-b3 - sub r9, r3, r5 @ R9=a2-b2 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #64] - strh r9, [r14, #80] - sub r8, r2, r1 @ R8=a1-b1 - sub r9, r6, r0 @ R9=a0-b0 - mov r8, r8, asr #COL_SHIFT - mov r9, r9, asr #COL_SHIFT - strh r8, [r14, #96] - strh r9, [r14, #112] - -@@ __end_col_loop: - @@ at this point, R0-R11 (free) - @@ R12=__const_ptr_, R14=&block[n] - ldr r0, [sp, #0] @ R0=block - teq r0, r14 @ compare current &block[n] to block, when block is reached, the loop is finished. - sub r14, r14, #2 - bne __col_loop - - - - -@@ __end_simple_idct_arm: - @@ restore registers to previous status! - add sp, sp, #8 @@ the local variables! - ldmfd sp!, {r4-r11, r15} @@ update PC with LR content. - - - -@@ kind of sub-function, here not to overload the common case. -__end_bef_a_evaluation: - add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1) - mul r11, r8, r4 @ R11=W2*ROWr16[2] - sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3) - add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0) - bal __end_a_evaluation diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S deleted file mode 100644 index d1f10b7..0000000 --- a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S +++ /dev/null @@ -1,620 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> - * Copyright (c) 2006 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - -function idct_row_armv5te - str lr, [sp, #-4]! - - ldrd v1, v2, [a1, #8] - ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */ - orrs v1, v1, v2 - itt eq - cmpeq v1, a4 - cmpeq v1, a3, lsr #16 - beq row_dc_only - - mov v1, #(1<<(ROW_SHIFT-1)) - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ - smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ - ldr ip, =W26 /* ip = W2 | (W6 << 16) */ - smultb a2, ip, a4 - smulbb lr, ip, a4 - add v2, v1, a2 - sub v3, v1, a2 - sub v4, v1, lr - add v1, v1, lr - - ldr ip, =W13 /* ip = W1 | (W3 << 16) */ - ldr lr, =W57 /* lr = W5 | (W7 << 16) */ - smulbt v5, ip, a3 - smultt v6, lr, a4 - smlatt v5, ip, a4, v5 - smultt a2, ip, a3 - smulbt v7, lr, a3 - sub v6, v6, a2 - smulbt a2, ip, a4 - smultt fp, lr, a3 - sub v7, v7, a2 - smulbt a2, lr, a4 - ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ - sub fp, fp, a2 - - orrs a2, a3, a4 - beq 1f - - smlabt v5, lr, a3, v5 - smlabt v6, ip, a3, v6 - smlatt v5, lr, a4, v5 - smlabt v6, lr, a4, v6 - smlatt v7, lr, a3, v7 - smlatt fp, ip, a3, fp - smulbt a2, ip, a4 - smlatt v7, ip, a4, v7 - sub fp, fp, a2 - - ldr ip, =W26 /* ip = W2 | (W6 << 16) */ - mov a2, #16384 - sub a2, a2, #1 /* a2 = W4 */ - smulbb a2, a2, a3 /* a2 = W4*row[4] */ - smultb lr, ip, a4 /* lr = W6*row[6] */ - add v1, v1, a2 /* v1 += W4*row[4] */ - add v1, v1, lr /* v1 += W6*row[6] */ - add v4, v4, a2 /* v4 += W4*row[4] */ - sub v4, v4, lr /* v4 -= W6*row[6] */ - smulbb lr, ip, a4 /* lr = W2*row[6] */ - sub v2, v2, a2 /* v2 -= W4*row[4] */ - sub v2, v2, lr /* v2 -= W2*row[6] */ - sub v3, v3, a2 /* v3 -= W4*row[4] */ - add v3, v3, lr /* v3 += W2*row[6] */ - -1: add a2, v1, v5 - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v2, v6 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v3, v7 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - add a2, v4, fp - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, a4, [a1] - - sub a2, v4, fp - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v3, v7 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v2, v6 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - sub a2, v1, v5 - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, a4, [a1, #8] - - ldr pc, [sp], #4 - 
-row_dc_only: - orr a3, a3, a3, lsl #16 - bic a3, a3, #0xe000 - mov a3, a3, lsl #3 - mov a4, a3 - strd a3, a4, [a1] - strd a3, a4, [a1, #8] - - ldr pc, [sp], #4 -endfunc - - .macro idct_col - ldr a4, [a1] /* a4 = col[1:0] */ - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ -#if 0 - mov v1, #(1<<(COL_SHIFT-1)) - smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ - smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ - ldr a4, [a1, #(16*4)] -#else - mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ - add v2, v1, a4, asr #16 - rsb v2, v2, v2, lsl #14 - mov a4, a4, lsl #16 - add v1, v1, a4, asr #16 - ldr a4, [a1, #(16*4)] - rsb v1, v1, v1, lsl #14 -#endif - - smulbb lr, ip, a4 - smulbt a3, ip, a4 - sub v3, v1, lr - sub v5, v1, lr - add v7, v1, lr - add v1, v1, lr - sub v4, v2, a3 - sub v6, v2, a3 - add fp, v2, a3 - ldr ip, =W26 - ldr a4, [a1, #(16*2)] - add v2, v2, a3 - - smulbb lr, ip, a4 - smultb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - add v3, v3, a3 - sub v5, v5, a3 - smulbt lr, ip, a4 - smultt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - add v4, v4, a3 - ldr a4, [a1, #(16*6)] - sub v6, v6, a3 - - smultb lr, ip, a4 - smulbb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - sub v3, v3, a3 - add v5, v5, a3 - smultt lr, ip, a4 - smulbt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - sub v4, v4, a3 - add v6, v6, a3 - - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} - - ldr ip, =W13 - ldr a4, [a1, #(16*1)] - ldr lr, =W57 - smulbb v1, ip, a4 - smultb v3, ip, a4 - smulbb v5, lr, a4 - smultb v7, lr, a4 - smulbt v2, ip, a4 - smultt v4, ip, a4 - smulbt v6, lr, a4 - smultt fp, lr, a4 - rsb v4, v4, #0 - ldr a4, [a1, #(16*3)] - rsb v3, v3, #0 - - smlatb v1, ip, a4, v1 - smlatb v3, lr, a4, v3 - smulbb a3, ip, a4 - smulbb a2, lr, a4 - sub v5, v5, a3 - sub v7, v7, a2 - smlatt v2, ip, a4, v2 - smlatt v4, lr, a4, v4 - smulbt a3, ip, a4 - smulbt a2, lr, a4 - sub v6, v6, a3 - ldr a4, [a1, #(16*5)] - sub fp, fp, a2 - - smlabb v1, lr, a4, v1 - smlabb v3, ip, a4, v3 - smlatb v5, lr, a4, v5 - smlatb v7, ip, a4, v7 - smlabt v2, lr, a4, v2 - smlabt v4, ip, a4, v4 - smlatt v6, lr, a4, v6 - ldr a3, [a1, #(16*7)] - smlatt fp, ip, a4, fp - - smlatb v1, lr, a3, v1 - smlabb v3, lr, a3, v3 - smlatb v5, ip, a3, v5 - smulbb a4, ip, a3 - smlatt v2, lr, a3, v2 - sub v7, v7, a4 - smlabt v4, lr, a3, v4 - smulbt a4, ip, a3 - smlatt v6, ip, a3, v6 - sub fp, fp, a4 - .endm - -function idct_col_armv5te - str lr, [sp, #-4]! 
- - idct_col - - ldmfd sp!, {a3, a4} - adds a2, a3, v1 - mov a2, a2, lsr #20 - it mi - orrmi a2, a2, #0xf000 - add ip, a4, v2 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1] - subs a3, a3, v1 - mov a2, a3, lsr #20 - it mi - orrmi a2, a2, #0xf000 - sub a4, a4, v2 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*7)] - - subs a2, a3, v3 - mov a2, a2, lsr #20 - it mi - orrmi a2, a2, #0xf000 - sub ip, a4, v4 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*1)] - adds a3, a3, v3 - mov a2, a3, lsr #20 - it mi - orrmi a2, a2, #0xf000 - add a4, a4, v4 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*6)] - - adds a2, a3, v5 - mov a2, a2, lsr #20 - it mi - orrmi a2, a2, #0xf000 - add ip, a4, v6 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*2)] - subs a3, a3, v5 - mov a2, a3, lsr #20 - it mi - orrmi a2, a2, #0xf000 - sub a4, a4, v6 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*5)] - - adds a2, a3, v7 - mov a2, a2, lsr #20 - it mi - orrmi a2, a2, #0xf000 - add ip, a4, fp - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*3)] - subs a3, a3, v7 - mov a2, a3, lsr #20 - it mi - orrmi a2, a2, #0xf000 - sub a4, a4, fp - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - str a2, [a1, #(16*4)] - - ldr pc, [sp], #4 -endfunc - -.macro clip dst, src:vararg - movs \dst, \src - it mi - movmi \dst, #0 - cmp \dst, #255 - it gt - movgt \dst, #255 -.endm - -.macro aclip dst, src:vararg - adds \dst, \src - it mi - movmi \dst, #0 - cmp \dst, #255 - it gt - movgt \dst, #255 -.endm - -function idct_col_put_armv5te - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - ldr lr, [sp, #32] - add a2, a3, v1 - clip a2, a2, asr #20 - add ip, a4, v2 - clip ip, ip, asr #20 - orr a2, a2, ip, lsl #8 - sub a3, a3, v1 - clip a3, a3, asr #20 - sub a4, a4, v2 - clip a4, a4, asr #20 - ldr v1, [sp, #28] - strh a2, [v1] - add a2, v1, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - rsb v2, lr, lr, lsl #3 - ldmfd sp!, {a3, a4} - strh_pre a2, v2, v1 - - sub a2, a3, v3 - clip a2, a2, asr #20 - sub ip, a4, v4 - clip ip, ip, asr #20 - orr a2, a2, ip, lsl #8 - strh_pre a2, v1, lr - add a3, a3, v3 - clip a2, a3, asr #20 - add a4, a4, v4 - clip a4, a4, asr #20 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh_dpre a2, v2, lr - - add a2, a3, v5 - clip a2, a2, asr #20 - add ip, a4, v6 - clip ip, ip, asr #20 - orr a2, a2, ip, lsl #8 - strh_pre a2, v1, lr - sub a3, a3, v5 - clip a2, a3, asr #20 - sub a4, a4, v6 - clip a4, a4, asr #20 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh_dpre a2, v2, lr - - add a2, a3, v7 - clip a2, a2, asr #20 - add ip, a4, fp - clip ip, ip, asr #20 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr] - sub a3, a3, v7 - clip a2, a3, asr #20 - sub a4, a4, fp - clip a4, a4, asr #20 - orr a2, a2, a4, lsl #8 - strh_dpre a2, v2, lr - - ldr pc, [sp], #4 -endfunc - -function idct_col_add_armv5te - str lr, [sp, #-4]! 
- - idct_col - - ldr lr, [sp, #36] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr] - add a2, a3, v1 - sub a3, a3, v1 - and v1, ip, #255 - aclip a2, v1, a2, asr #20 - add v1, a4, v2 - mov v1, v1, asr #20 - aclip v1, v1, ip, lsr #8 - orr a2, a2, v1, lsl #8 - ldr v1, [sp, #32] - sub a4, a4, v2 - rsb v2, v1, v1, lsl #3 - ldrh_pre ip, v2, lr - strh a2, [lr] - and a2, ip, #255 - aclip a3, a2, a3, asr #20 - mov a4, a4, asr #20 - aclip a4, a4, ip, lsr #8 - add a2, lr, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh_pre ip, lr, v1 - sub a2, a3, v3 - add a3, a3, v3 - and v3, ip, #255 - aclip a2, v3, a2, asr #20 - sub v3, a4, v4 - mov v3, v3, asr #20 - aclip v3, v3, ip, lsr #8 - orr a2, a2, v3, lsl #8 - add a4, a4, v4 - ldrh_dpre ip, v2, v1 - strh a2, [lr] - and a2, ip, #255 - aclip a3, a2, a3, asr #20 - mov a4, a4, asr #20 - aclip a4, a4, ip, lsr #8 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh_pre ip, lr, v1 - add a2, a3, v5 - sub a3, a3, v5 - and v3, ip, #255 - aclip a2, v3, a2, asr #20 - add v3, a4, v6 - mov v3, v3, asr #20 - aclip v3, v3, ip, lsr #8 - orr a2, a2, v3, lsl #8 - sub a4, a4, v6 - ldrh_dpre ip, v2, v1 - strh a2, [lr] - and a2, ip, #255 - aclip a3, a2, a3, asr #20 - mov a4, a4, asr #20 - aclip a4, a4, ip, lsr #8 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh_pre ip, lr, v1 - add a2, a3, v7 - sub a3, a3, v7 - and v3, ip, #255 - aclip a2, v3, a2, asr #20 - add v3, a4, fp - mov v3, v3, asr #20 - aclip v3, v3, ip, lsr #8 - orr a2, a2, v3, lsl #8 - sub a4, a4, fp - ldrh_dpre ip, v2, v1 - strh a2, [lr] - and a2, ip, #255 - aclip a3, a2, a3, asr #20 - mov a4, a4, asr #20 - aclip a4, a4, ip, lsr #8 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldr pc, [sp], #4 -endfunc - -function ff_simple_idct_armv5te, export=1 - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc - -function ff_simple_idct_add_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc - -function ff_simple_idct_put_armv5te, export=1 - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl 
idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv6.S b/ffmpeg/libavcodec/arm/simple_idct_armv6.S deleted file mode 100644 index 79cf5d4..0000000 --- a/ffmpeg/libavcodec/arm/simple_idct_armv6.S +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> - * Copyright (c) 2007 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W42 (W4 | (W2 << 16)) -#define W42n (-W4&0xffff | (-W2 << 16)) -#define W46 (W4 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - -/* - Compute partial IDCT of single row. 
- shift = left-shift amount - r0 = source address - r2 = row[2,0] <= 2 cycles - r3 = row[3,1] - ip = w42 <= 2 cycles - - Output in registers r4--r11 -*/ - .macro idct_row shift - ldr lr, =W46 /* lr = W4 | (W6 << 16) */ - mov r1, #(1<<(\shift-1)) - smlad r4, r2, ip, r1 - smlsd r7, r2, ip, r1 - ldr ip, =W13 /* ip = W1 | (W3 << 16) */ - ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ - smlad r5, r2, lr, r1 - smlsd r6, r2, lr, r1 - - smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ - smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ - ldr lr, [r0, #12] /* lr = row[7,5] */ - pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ - pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ - smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ - smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */ - smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ - - ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */ - smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */ - ldr r2, [r0, #4] /* r2 = row[6,4] */ - smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */ - ldr ip, =W46 /* ip = W4 | (W6 << 16) */ - smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */ - - smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */ - smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */ - smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */ - smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */ - .endm - -/* - Compute partial IDCT of half row. - shift = left-shift amount - r2 = row[2,0] - r3 = row[3,1] - ip = w42 - - Output in registers r4--r11 -*/ - .macro idct_row4 shift - ldr lr, =W46 /* lr = W4 | (W6 << 16) */ - ldr r10,=W57 /* r10 = W5 | (W7 << 16) */ - mov r1, #(1<<(\shift-1)) - smlad r4, r2, ip, r1 - smlsd r7, r2, ip, r1 - ldr ip, =W13 /* ip = W1 | (W3 << 16) */ - smlad r5, r2, lr, r1 - smlsd r6, r2, lr, r1 - smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */ - smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */ - pkhtb r2, ip, r10,asr #16 /* r3 = W7 | (W3 << 16) */ - pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */ - smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */ - smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */ - .endm - -/* - Compute final part of IDCT single row without shift. - Input in registers r4--r11 - Output in registers ip, r4--r6, lr, r8--r10 -*/ - .macro idct_finish - add ip, r4, r8 /* r1 = A0 + B0 */ - sub lr, r4, r8 /* r2 = A0 - B0 */ - sub r4, r5, r9 /* r2 = A1 + B1 */ - add r8, r5, r9 /* r2 = A1 - B1 */ - add r5, r6, r10 /* r1 = A2 + B2 */ - sub r9, r6, r10 /* r1 = A2 - B2 */ - add r6, r7, r11 /* r2 = A3 + B3 */ - sub r10,r7, r11 /* r2 = A3 - B3 */ - .endm - -/* - Compute final part of IDCT single row. - shift = right-shift amount - Input/output in registers r4--r11 -*/ - .macro idct_finish_shift shift - add r3, r4, r8 /* r3 = A0 + B0 */ - sub r2, r4, r8 /* r2 = A0 - B0 */ - mov r4, r3, asr #\shift - mov r8, r2, asr #\shift - - sub r3, r5, r9 /* r3 = A1 + B1 */ - add r2, r5, r9 /* r2 = A1 - B1 */ - mov r5, r3, asr #\shift - mov r9, r2, asr #\shift - - add r3, r6, r10 /* r3 = A2 + B2 */ - sub r2, r6, r10 /* r2 = A2 - B2 */ - mov r6, r3, asr #\shift - mov r10,r2, asr #\shift - - add r3, r7, r11 /* r3 = A3 + B3 */ - sub r2, r7, r11 /* r2 = A3 - B3 */ - mov r7, r3, asr #\shift - mov r11,r2, asr #\shift - .endm - -/* - Compute final part of IDCT single row, saturating results at 8 bits. 
- shift = right-shift amount - Input/output in registers r4--r11 -*/ - .macro idct_finish_shift_sat shift - add r3, r4, r8 /* r3 = A0 + B0 */ - sub ip, r4, r8 /* ip = A0 - B0 */ - usat r4, #8, r3, asr #\shift - usat r8, #8, ip, asr #\shift - - sub r3, r5, r9 /* r3 = A1 + B1 */ - add ip, r5, r9 /* ip = A1 - B1 */ - usat r5, #8, r3, asr #\shift - usat r9, #8, ip, asr #\shift - - add r3, r6, r10 /* r3 = A2 + B2 */ - sub ip, r6, r10 /* ip = A2 - B2 */ - usat r6, #8, r3, asr #\shift - usat r10,#8, ip, asr #\shift - - add r3, r7, r11 /* r3 = A3 + B3 */ - sub ip, r7, r11 /* ip = A3 - B3 */ - usat r7, #8, r3, asr #\shift - usat r11,#8, ip, asr #\shift - .endm - -/* - Compute IDCT of single row, storing as column. - r0 = source - r1 = dest -*/ -function idct_row_armv6 - push {lr} - - ldr lr, [r0, #12] /* lr = row[7,5] */ - ldr ip, [r0, #4] /* ip = row[6,4] */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - ldr r2, [r0] /* r2 = row[2,0] */ - orrs lr, lr, ip - itt eq - cmpeq lr, r3 - cmpeq lr, r2, lsr #16 - beq 1f - push {r1} - ldr ip, =W42 /* ip = W4 | (W2 << 16) */ - cmp lr, #0 - beq 2f - - idct_row ROW_SHIFT - b 3f - -2: idct_row4 ROW_SHIFT - -3: pop {r1} - idct_finish_shift ROW_SHIFT - - strh r4, [r1] - strh r5, [r1, #(16*2)] - strh r6, [r1, #(16*4)] - strh r7, [r1, #(16*6)] - strh r11,[r1, #(16*1)] - strh r10,[r1, #(16*3)] - strh r9, [r1, #(16*5)] - strh r8, [r1, #(16*7)] - - pop {pc} - -1: mov r2, r2, lsl #3 - strh r2, [r1] - strh r2, [r1, #(16*2)] - strh r2, [r1, #(16*4)] - strh r2, [r1, #(16*6)] - strh r2, [r1, #(16*1)] - strh r2, [r1, #(16*3)] - strh r2, [r1, #(16*5)] - strh r2, [r1, #(16*7)] - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row. - r0 = source - r1 = dest -*/ -function idct_col_armv6 - push {r1, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, =W42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1} - idct_finish_shift COL_SHIFT - - strh r4, [r1] - strh r5, [r1, #(16*1)] - strh r6, [r1, #(16*2)] - strh r7, [r1, #(16*3)] - strh r11,[r1, #(16*4)] - strh r10,[r1, #(16*5)] - strh r9, [r1, #(16*6)] - strh r8, [r1, #(16*7)] - - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row, store saturated 8-bit. - r0 = source - r1 = dest - r2 = line size -*/ -function idct_col_put_armv6 - push {r1, r2, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, =W42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1, r2} - idct_finish_shift_sat COL_SHIFT - - strb_post r4, r1, r2 - strb_post r5, r1, r2 - strb_post r6, r1, r2 - strb_post r7, r1, r2 - strb_post r11,r1, r2 - strb_post r10,r1, r2 - strb_post r9, r1, r2 - strb_post r8, r1, r2 - - sub r1, r1, r2, lsl #3 - - pop {pc} -endfunc - -/* - Compute IDCT of single column, read as row, add/store saturated 8-bit. 
- r0 = source - r1 = dest - r2 = line size -*/ -function idct_col_add_armv6 - push {r1, r2, lr} - - ldr r2, [r0] /* r2 = row[2,0] */ - ldr ip, =W42 /* ip = W4 | (W2 << 16) */ - ldr r3, [r0, #8] /* r3 = row[3,1] */ - idct_row COL_SHIFT - pop {r1, r2} - idct_finish - - ldrb r3, [r1] - ldrb r7, [r1, r2] - ldrb r11,[r1, r2, lsl #2] - add ip, r3, ip, asr #COL_SHIFT - usat ip, #8, ip - add r4, r7, r4, asr #COL_SHIFT - strb_post ip, r1, r2 - ldrb ip, [r1, r2] - usat r4, #8, r4 - ldrb r11,[r1, r2, lsl #2] - add r5, ip, r5, asr #COL_SHIFT - usat r5, #8, r5 - strb_post r4, r1, r2 - ldrb r3, [r1, r2] - ldrb ip, [r1, r2, lsl #2] - strb_post r5, r1, r2 - ldrb r7, [r1, r2] - ldrb r4, [r1, r2, lsl #2] - add r6, r3, r6, asr #COL_SHIFT - usat r6, #8, r6 - add r10,r7, r10,asr #COL_SHIFT - usat r10,#8, r10 - add r9, r11,r9, asr #COL_SHIFT - usat r9, #8, r9 - add r8, ip, r8, asr #COL_SHIFT - usat r8, #8, r8 - add lr, r4, lr, asr #COL_SHIFT - usat lr, #8, lr - strb_post r6, r1, r2 - strb_post r10,r1, r2 - strb_post r9, r1, r2 - strb_post r8, r1, r2 - strb_post lr, r1, r2 - - sub r1, r1, r2, lsl #3 - - pop {pc} -endfunc - -/* - Compute 8 IDCT row transforms. - func = IDCT row->col function - width = width of columns in bytes -*/ - .macro idct_rows func width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - sub r0, r0, #(16*5) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - add r0, r0, #(16*2) - add r1, r1, #\width - bl \func - - sub r0, r0, #(16*7) - .endm - -/* void ff_simple_idct_armv6(int16_t *data); */ -function ff_simple_idct_armv6, export=1 - push {r4-r11, lr} - sub sp, sp, #128 - - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r1, r0 - mov r0, sp - idct_rows idct_col_armv6, 2 - - add sp, sp, #128 - pop {r4-r11, pc} -endfunc - -/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */ -function ff_simple_idct_add_armv6, export=1 - push {r0, r1, r4-r11, lr} - sub sp, sp, #128 - - mov r0, r2 - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r0, sp - ldr r1, [sp, #128] - ldr r2, [sp, #(128+4)] - idct_rows idct_col_add_armv6, 1 - - add sp, sp, #(128+8) - pop {r4-r11, pc} -endfunc - -/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */ -function ff_simple_idct_put_armv6, export=1 - push {r0, r1, r4-r11, lr} - sub sp, sp, #128 - - mov r0, r2 - mov r1, sp - idct_rows idct_row_armv6, 2 - mov r0, sp - ldr r1, [sp, #128] - ldr r2, [sp, #(128+4)] - idct_rows idct_col_put_armv6, 1 - - add sp, sp, #(128+8) - pop {r4-r11, pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/simple_idct_neon.S b/ffmpeg/libavcodec/arm/simple_idct_neon.S deleted file mode 100644 index c3e573c..0000000 --- a/ffmpeg/libavcodec/arm/simple_idct_neon.S +++ /dev/null @@ -1,375 +0,0 @@ -/* - * ARM NEON IDCT - * - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * Based on Simple IDCT - * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -#define W4c ((1<<(COL_SHIFT-1))/W4) -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define w1 d0[0] -#define w2 d0[1] -#define w3 d0[2] -#define w4 d0[3] -#define w5 d1[0] -#define w6 d1[1] -#define w7 d1[2] -#define w4c d1[3] - - .macro idct_col4_top - vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ - vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ - vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ - vadd.i32 q11, q15, q7 - vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ - vadd.i32 q12, q15, q8 - vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ - vsub.i32 q13, q15, q8 - vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ - vsub.i32 q14, q15, q7 - - vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ - vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ - vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ - vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ - .endm - - .text - .align 6 - -function idct_row4_pld_neon - pld [r0] - add r3, r0, r1, lsl #2 - pld [r0, r1] - pld [r0, r1, lsl #1] -A pld [r3, -r1] - pld [r3] - pld [r3, r1] - add r3, r3, r1, lsl #1 - pld [r3] - pld [r3, r1] -endfunc - -function idct_row4_neon - vmov.i32 q15, #(1<<(ROW_SHIFT-1)) - vld1.64 {d2-d5}, [r2,:128]! - vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ - vld1.64 {d6,d7}, [r2,:128]! - vorr d10, d3, d5 - vld1.64 {d8,d9}, [r2,:128]! - add r2, r2, #-64 - - vorr d11, d7, d9 - vorr d10, d10, d11 - vmov r3, r4, d10 - - idct_col4_top - - orrs r3, r3, r4 - beq 1f - - vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ - vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ - vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ - vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ - vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q7 - vsub.i32 q13, q13, q7 - vadd.i32 q14, q14, q7 - vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ - vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ - vmlal.s16 q9, d9, w7 - vmlsl.s16 q10, d9, w5 - vmlal.s16 q5, d9, w3 - vmlsl.s16 q6, d9, w1 - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q8 - vadd.i32 q13, q13, q8 - vsub.i32 q14, q14, q7 - -1: vadd.i32 q3, q11, q9 - vadd.i32 q4, q12, q10 - vshrn.i32 d2, q3, #ROW_SHIFT - vshrn.i32 d4, q4, #ROW_SHIFT - vadd.i32 q7, q13, q5 - vadd.i32 q8, q14, q6 - vtrn.16 d2, d4 - vshrn.i32 d6, q7, #ROW_SHIFT - vshrn.i32 d8, q8, #ROW_SHIFT - vsub.i32 q14, q14, q6 - vsub.i32 q11, q11, q9 - vtrn.16 d6, d8 - vsub.i32 q13, q13, q5 - vshrn.i32 d3, q14, #ROW_SHIFT - vtrn.32 d2, d6 - vsub.i32 q12, q12, q10 - vtrn.32 d4, d8 - vshrn.i32 d5, q13, #ROW_SHIFT - vshrn.i32 d7, q12, #ROW_SHIFT - vshrn.i32 d9, q11, #ROW_SHIFT - - vtrn.16 d3, d5 - vtrn.16 d7, d9 - vtrn.32 d3, d7 - vtrn.32 d5, d9 - - vst1.64 {d2-d5}, [r2,:128]! 
- vst1.64 {d6-d9}, [r2,:128]! - - bx lr -endfunc - -function idct_col4_neon - mov ip, #16 - vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ - vdup.16 d30, w4c - vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ - vadd.i16 d30, d30, d2 - vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ - vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ - vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */ - - ldrd r4, r5, [r2] - ldrd r6, r7, [r2, #16] - orrs r4, r4, r5 - - idct_col4_top - it eq - addeq r2, r2, #16 - beq 1f - - vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */ - vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ - vadd.i32 q11, q11, q7 - vsub.i32 q12, q12, q7 - vsub.i32 q13, q13, q7 - vadd.i32 q14, q14, q7 - -1: orrs r6, r6, r7 - ldrd r4, r5, [r2, #16] - it eq - addeq r2, r2, #16 - beq 2f - - vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */ - vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ - vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ - vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ - vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ - -2: orrs r4, r4, r5 - ldrd r4, r5, [r2, #16] - it eq - addeq r2, r2, #16 - beq 3f - - vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */ - vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ - vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ - vadd.i32 q11, q11, q7 - vsub.i32 q14, q14, q7 - vsub.i32 q12, q12, q8 - vadd.i32 q13, q13, q8 - -3: orrs r4, r4, r5 - it eq - addeq r2, r2, #16 - beq 4f - - vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ - vmlal.s16 q9, d9, w7 - vmlsl.s16 q10, d9, w5 - vmlal.s16 q5, d9, w3 - vmlsl.s16 q6, d9, w1 - -4: vaddhn.i32 d2, q11, q9 - vaddhn.i32 d3, q12, q10 - vaddhn.i32 d4, q13, q5 - vaddhn.i32 d5, q14, q6 - vsubhn.i32 d9, q11, q9 - vsubhn.i32 d8, q12, q10 - vsubhn.i32 d7, q13, q5 - vsubhn.i32 d6, q14, q6 - - bx lr -endfunc - - .align 6 - -function idct_col4_st8_neon - vqshrun.s16 d2, q1, #COL_SHIFT-16 - vqshrun.s16 d3, q2, #COL_SHIFT-16 - vqshrun.s16 d4, q3, #COL_SHIFT-16 - vqshrun.s16 d5, q4, #COL_SHIFT-16 - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r0,:32], r1 - - bx lr -endfunc - -const idct_coeff_neon, align=4 - .short W1, W2, W3, W4, W5, W6, W7, W4c -endconst - - .macro idct_start data - push {r4-r7, lr} - pld [\data] - pld [\data, #64] - vpush {d8-d15} - movrel r3, idct_coeff_neon - vld1.64 {d0,d1}, [r3,:128] - .endm - - .macro idct_end - vpop {d8-d15} - pop {r4-r7, pc} - .endm - -/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */ -function ff_simple_idct_put_neon, export=1 - idct_start r2 - - bl idct_row4_pld_neon - bl idct_row4_neon - add r2, r2, #-128 - bl idct_col4_neon - bl idct_col4_st8_neon - sub r0, r0, r1, lsl #3 - add r0, r0, #4 - add r2, r2, #-120 - bl idct_col4_neon - bl idct_col4_st8_neon - - idct_end -endfunc - - .align 6 - -function idct_col4_add8_neon - mov ip, r0 - - vld1.32 {d10[0]}, [r0,:32], r1 - vshr.s16 q1, q1, #COL_SHIFT-16 - vld1.32 {d10[1]}, [r0,:32], r1 - vshr.s16 q2, q2, #COL_SHIFT-16 - vld1.32 {d11[0]}, [r0,:32], r1 - vshr.s16 q3, q3, #COL_SHIFT-16 - vld1.32 {d11[1]}, [r0,:32], r1 - vshr.s16 q4, q4, #COL_SHIFT-16 - vld1.32 {d12[0]}, [r0,:32], r1 - vaddw.u8 q1, q1, d10 - vld1.32 {d12[1]}, [r0,:32], r1 - vaddw.u8 q2, q2, d11 - vld1.32 {d13[0]}, [r0,:32], r1 - vqmovun.s16 d2, q1 - vld1.32 {d13[1]}, [r0,:32], r1 - vaddw.u8 q3, q3, d12 - vst1.32 {d2[0]}, [ip,:32], r1 - vqmovun.s16 d3, q2 - vst1.32 {d2[1]}, [ip,:32], r1 
- vaddw.u8 q4, q4, d13 - vst1.32 {d3[0]}, [ip,:32], r1 - vqmovun.s16 d4, q3 - vst1.32 {d3[1]}, [ip,:32], r1 - vqmovun.s16 d5, q4 - vst1.32 {d4[0]}, [ip,:32], r1 - vst1.32 {d4[1]}, [ip,:32], r1 - vst1.32 {d5[0]}, [ip,:32], r1 - vst1.32 {d5[1]}, [ip,:32], r1 - - bx lr -endfunc - -/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */ -function ff_simple_idct_add_neon, export=1 - idct_start r2 - - bl idct_row4_pld_neon - bl idct_row4_neon - add r2, r2, #-128 - bl idct_col4_neon - bl idct_col4_add8_neon - sub r0, r0, r1, lsl #3 - add r0, r0, #4 - add r2, r2, #-120 - bl idct_col4_neon - bl idct_col4_add8_neon - - idct_end -endfunc - - .align 6 - -function idct_col4_st16_neon - mov ip, #16 - - vshr.s16 q1, q1, #COL_SHIFT-16 - vshr.s16 q2, q2, #COL_SHIFT-16 - vst1.64 {d2}, [r2,:64], ip - vshr.s16 q3, q3, #COL_SHIFT-16 - vst1.64 {d3}, [r2,:64], ip - vshr.s16 q4, q4, #COL_SHIFT-16 - vst1.64 {d4}, [r2,:64], ip - vst1.64 {d5}, [r2,:64], ip - vst1.64 {d6}, [r2,:64], ip - vst1.64 {d7}, [r2,:64], ip - vst1.64 {d8}, [r2,:64], ip - vst1.64 {d9}, [r2,:64], ip - - bx lr -endfunc - -/* void ff_simple_idct_neon(int16_t *data); */ -function ff_simple_idct_neon, export=1 - idct_start r0 - - mov r2, r0 - bl idct_row4_neon - bl idct_row4_neon - add r2, r2, #-128 - bl idct_col4_neon - add r2, r2, #-128 - bl idct_col4_st16_neon - add r2, r2, #-120 - bl idct_col4_neon - add r2, r2, #-128 - bl idct_col4_st16_neon - - idct_end -endfunc diff --git a/ffmpeg/libavcodec/arm/synth_filter_neon.S b/ffmpeg/libavcodec/arm/synth_filter_neon.S deleted file mode 100644 index 5417be7..0000000 --- a/ffmpeg/libavcodec/arm/synth_filter_neon.S +++ /dev/null @@ -1,115 +0,0 @@ -/* - * Copyright (c) 2010 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_synth_filter_float_neon, export=1 - push {r3-r11,lr} - - ldr r4, [r2] @ synth_buf_offset - add r1, r1, r4, lsl #2 @ synth_buf - sub r12, r4, #32 - bfc r12, #9, #23 - bic r4, r4, #63 - str r12, [r2] - - ldr r2, [sp, #12*4] @ in - mov r9, r1 @ synth_buf - -VFP vpush {d0} - bl X(ff_imdct_half_neon) -VFP vpop {d0} - pop {r3} - - ldr r5, [sp, #9*4] @ window - ldr r2, [sp, #10*4] @ out -NOVFP vldr s0, [sp, #12*4] @ scale - add r8, r9, #12*4 - - mov lr, #64*4 - mov r1, #4 -1: - add r10, r9, #16*4 @ synth_buf - add r11, r8, #16*4 - add r0, r5, #16*4 @ window - add r6, r5, #32*4 - add r7, r5, #48*4 - - vld1.32 {q10}, [r3,:128] @ a - add r3, r3, #16*4 - vld1.32 {q1}, [r3,:128] @ b - vmov.f32 q2, #0.0 @ c - vmov.f32 q3, #0.0 @ d - - mov r12, #512 -2: - vld1.32 {q9}, [r8, :128], lr - vrev64.32 q9, q9 - vld1.32 {q8}, [r5, :128], lr - vmls.f32 d20, d16, d19 - vld1.32 {q11}, [r0, :128], lr - vmls.f32 d21, d17, d18 - vld1.32 {q12}, [r9, :128], lr - vmla.f32 d2, d22, d24 - vld1.32 {q8}, [r6, :128], lr - vmla.f32 d3, d23, d25 - vld1.32 {q9}, [r10,:128], lr - vmla.f32 d4, d16, d18 - vld1.32 {q12}, [r11,:128], lr - vmla.f32 d5, d17, d19 - vrev64.32 q12, q12 - vld1.32 {q11}, [r7, :128], lr - vmla.f32 d6, d22, d25 - vmla.f32 d7, d23, d24 - subs r12, r12, #64 - beq 3f - cmp r12, r4 - bne 2b - sub r8, r8, #512*4 - sub r9, r9, #512*4 - sub r10, r10, #512*4 - sub r11, r11, #512*4 - b 2b -3: - vmul.f32 q8, q10, d0[0] - vmul.f32 q9, q1, d0[0] - vst1.32 {q3}, [r3,:128] - sub r3, r3, #16*4 - vst1.32 {q2}, [r3,:128] - vst1.32 {q8}, [r2,:128] - add r2, r2, #16*4 - vst1.32 {q9}, [r2,:128] - - subs r1, r1, #1 - it eq - popeq {r4-r11,pc} - - cmp r4, #0 - itt eq - subeq r8, r8, #512*4 - subeq r9, r9, #512*4 - sub r5, r5, #512*4 - sub r2, r2, #12*4 @ out - add r3, r3, #4*4 @ synth_buf2 - add r5, r5, #4*4 @ window - add r9, r9, #4*4 @ synth_buf - sub r8, r8, #4*4 @ synth_buf - b 1b -endfunc diff --git a/ffmpeg/libavcodec/arm/videodsp_arm.h b/ffmpeg/libavcodec/arm/videodsp_arm.h deleted file mode 100644 index 112cbb8..0000000 --- a/ffmpeg/libavcodec/arm/videodsp_arm.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_VIDEODSP_ARM_H -#define AVCODEC_ARM_VIDEODSP_ARM_H - -#include "libavcodec/avcodec.h" -#include "libavcodec/videodsp.h" - -void ff_videodsp_init_armv5te(VideoDSPContext* ctx, int bpc); - -#endif /* AVCODEC_ARM_VIDEODSP_ARM_H */ diff --git a/ffmpeg/libavcodec/arm/videodsp_armv5te.S b/ffmpeg/libavcodec/arm/videodsp_armv5te.S deleted file mode 100644 index 48a6c3b..0000000 --- a/ffmpeg/libavcodec/arm/videodsp_armv5te.S +++ /dev/null @@ -1,31 +0,0 @@ -@ -@ ARMv5te optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> -@ -@ This file is part of FFmpeg -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. -@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -#include "config.h" -#include "libavutil/arm/asm.S" - -function ff_prefetch_arm, export=1 - subs r2, r2, #1 - pld [r0] - add r0, r0, r1 - bne ff_prefetch_arm - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/videodsp_init_arm.c b/ffmpeg/libavcodec/arm/videodsp_init_arm.c deleted file mode 100644 index a89abb2..0000000 --- a/ffmpeg/libavcodec/arm/videodsp_init_arm.c +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) 2012 Ronald S. Bultje - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/videodsp.h" -#include "videodsp_arm.h" - -av_cold void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc) -{ - int cpu_flags = av_get_cpu_flags(); - if (have_armv5te(cpu_flags)) ff_videodsp_init_armv5te(ctx, bpc); -} diff --git a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c deleted file mode 100644 index 1ea1f34..0000000 --- a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 2012 Ronald S. Bultje - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/videodsp.h" -#include "videodsp_arm.h" - -void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h); - -av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc) -{ -#if HAVE_ARMV5TE_EXTERNAL - ctx->prefetch = ff_prefetch_arm; -#endif -} diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c deleted file mode 100644 index f4b3d80..0000000 --- a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c +++ /dev/null @@ -1,37 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/vorbisdsp.h" - -void ff_vorbis_inverse_coupling_neon(float *mag, float *ang, - intptr_t blocksize); - -av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S deleted file mode 100644 index 79ce54f..0000000 --- a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S +++ /dev/null @@ -1,83 +0,0 @@ -/* - * ARM NEON optimised DSP functions - * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -function ff_vorbis_inverse_coupling_neon, export=1 - vmov.i32 q10, #1<<31 - subs r2, r2, #4 - mov r3, r0 - mov r12, r1 - beq 3f - - vld1.32 {d24-d25},[r1,:128]! - vld1.32 {d22-d23},[r0,:128]! - vcle.s32 q8, q12, #0 - vand q9, q11, q10 - veor q12, q12, q9 - vand q2, q12, q8 - vbic q3, q12, q8 - vadd.f32 q12, q11, q2 - vsub.f32 q11, q11, q3 -1: vld1.32 {d2-d3}, [r1,:128]! - vld1.32 {d0-d1}, [r0,:128]! - vcle.s32 q8, q1, #0 - vand q9, q0, q10 - veor q1, q1, q9 - vst1.32 {d24-d25},[r3, :128]! - vst1.32 {d22-d23},[r12,:128]! - vand q2, q1, q8 - vbic q3, q1, q8 - vadd.f32 q1, q0, q2 - vsub.f32 q0, q0, q3 - subs r2, r2, #8 - ble 2f - vld1.32 {d24-d25},[r1,:128]! - vld1.32 {d22-d23},[r0,:128]! - vcle.s32 q8, q12, #0 - vand q9, q11, q10 - veor q12, q12, q9 - vst1.32 {d2-d3}, [r3, :128]! - vst1.32 {d0-d1}, [r12,:128]! - vand q2, q12, q8 - vbic q3, q12, q8 - vadd.f32 q12, q11, q2 - vsub.f32 q11, q11, q3 - b 1b - -2: vst1.32 {d2-d3}, [r3, :128]! - vst1.32 {d0-d1}, [r12,:128]! - it lt - bxlt lr - -3: vld1.32 {d2-d3}, [r1,:128] - vld1.32 {d0-d1}, [r0,:128] - vcle.s32 q8, q1, #0 - vand q9, q0, q10 - veor q1, q1, q9 - vand q2, q1, q8 - vbic q3, q1, q8 - vadd.f32 q1, q0, q2 - vsub.f32 q0, q0, q3 - vst1.32 {d2-d3}, [r0,:128]! - vst1.32 {d0-d1}, [r1,:128]! - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c deleted file mode 100644 index 5af795b..0000000 --- a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/dsputil.h" -#include "libavcodec/vp3dsp.h" - -void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data); -void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data); -void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data); - -void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); -void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); - -av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_neon(cpu_flags)) { - c->idct_put = ff_vp3_idct_put_neon; - c->idct_add = ff_vp3_idct_add_neon; - c->idct_dc_add = ff_vp3_idct_dc_add_neon; - c->v_loop_filter = ff_vp3_v_loop_filter_neon; - c->h_loop_filter = ff_vp3_h_loop_filter_neon; - } -} diff --git a/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/ffmpeg/libavcodec/arm/vp3dsp_neon.S deleted file mode 100644 index f133905..0000000 --- a/ffmpeg/libavcodec/arm/vp3dsp_neon.S +++ /dev/null @@ -1,395 +0,0 @@ -/* - * Copyright (c) 2009 David Conrad - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -const vp3_idct_constants, align=4 -.short 64277, 60547, 54491, 46341, 36410, 25080, 12785 -endconst - -#define xC1S7 d0[0] -#define xC2S6 d0[1] -#define xC3S5 d0[2] -#define xC4S4 d0[3] -#define xC5S3 d1[0] -#define xC6S2 d1[1] -#define xC7S1 d1[2] - -.macro vp3_loop_filter - vsubl.u8 q3, d18, d17 - vsubl.u8 q2, d16, d19 - vadd.i16 q1, q3, q3 - vadd.i16 q2, q2, q3 - vadd.i16 q0, q1, q2 - vrshr.s16 q0, q0, #3 - vmovl.u8 q9, d18 - vdup.u16 q15, r2 - - vabs.s16 q1, q0 - vshr.s16 q0, q0, #15 - vqsub.u16 q2, q15, q1 - vqsub.u16 q3, q2, q1 - vsub.i16 q1, q2, q3 - veor q1, q1, q0 - vsub.i16 q0, q1, q0 - - vaddw.u8 q2, q0, d17 - vsub.i16 q3, q9, q0 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q3 -.endm - -function ff_vp3_v_loop_filter_neon, export=1 - sub ip, r0, r1 - sub r0, r0, r1, lsl #1 - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d17}, [r0,:64], r1 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d19}, [r0,:64], r1 - ldrb r2, [r2, #129*4] - - vp3_loop_filter - - vst1.64 {d0}, [ip,:64], r1 - vst1.64 {d1}, [ip,:64], r1 - bx lr -endfunc - -function ff_vp3_h_loop_filter_neon, export=1 - sub ip, r0, #1 - sub r0, r0, #2 - vld1.32 {d16[]}, [r0], r1 - vld1.32 {d17[]}, [r0], r1 - vld1.32 {d18[]}, [r0], r1 - vld1.32 {d19[]}, [r0], r1 - vld1.32 {d16[1]}, [r0], r1 - vld1.32 {d17[1]}, [r0], r1 - vld1.32 {d18[1]}, [r0], r1 - vld1.32 {d19[1]}, [r0], r1 - ldrb r2, [r2, #129*4] - - vtrn.8 d16, d17 - vtrn.8 d18, d19 - vtrn.16 d16, d18 - vtrn.16 d17, d19 - - vp3_loop_filter - - vtrn.8 d0, d1 - - vst1.16 {d0[0]}, [ip], r1 - vst1.16 {d1[0]}, [ip], r1 - vst1.16 {d0[1]}, [ip], r1 - vst1.16 {d1[1]}, [ip], r1 - vst1.16 {d0[2]}, [ip], r1 - vst1.16 {d1[2]}, [ip], r1 - vst1.16 {d0[3]}, [ip], r1 - vst1.16 {d1[3]}, [ip], r1 - bx lr -endfunc - - -function vp3_idct_start_neon - vpush {d8-d15} - vmov.i16 q4, #0 - vmov.i16 q5, #0 - movrel r3, vp3_idct_constants - vld1.64 {d0-d1}, [r3,:128] - vld1.64 {d16-d19}, [r2,:128] - vst1.64 {q4-q5}, [r2,:128]! - vld1.64 {d20-d23}, [r2,:128] - vst1.64 {q4-q5}, [r2,:128]! - vld1.64 {d24-d27}, [r2,:128] - vst1.64 {q4-q5}, [r2,:128]! - vadd.s16 q1, q8, q12 - vsub.s16 q8, q8, q12 - vld1.64 {d28-d31}, [r2,:128] - vst1.64 {q4-q5}, [r2,:128]! 
- -vp3_idct_core_neon: - vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16 - vmull.s16 q3, d19, xC1S7 - vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16 - vmull.s16 q5, d3, xC4S4 - vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16 - vmull.s16 q7, d17, xC4S4 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 - vshrn.s32 d9, q7, #16 - vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4 - vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4 - vadd.s16 q1, q2, q9 // ip[1] * C1 - - vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16 - vmull.s16 q3, d31, xC1S7 - vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16 - vmull.s16 q5, d31, xC7S1 - vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16 - vmull.s16 q7, d19, xC7S1 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 // ip[7] * C7 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[1] * C7 - vshrn.s32 d9, q7, #16 - vadd.s16 q2, q2, q15 // ip[7] * C1 - vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7 - vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1 - - vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16 - vmull.s16 q3, d23, xC5S3 - vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16 - vmull.s16 q5, d23, xC3S5 - vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16 - vmull.s16 q7, d27, xC5S3 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 - vshrn.s32 d9, q7, #16 - vadd.s16 q3, q3, q11 // ip[3] * C3 - vadd.s16 q4, q4, q13 // ip[5] * C5 - vadd.s16 q1, q2, q11 // ip[3] * C5 - vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5 - - vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16 - vmull.s16 q3, d27, xC3S5 - vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16 - vmull.s16 q5, d21, xC2S6 - vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16 - vmull.s16 q7, d29, xC6S2 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[6] * C6 - vshrn.s32 d9, q7, #16 - vadd.s16 q2, q2, q13 // ip[5] * C3 - vadd.s16 q3, q3, q10 // ip[2] * C2 - vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5 - vsub.s16 q1, q9, q11 // (A - C) - vadd.s16 q11, q9, q11 // Cd = A + C - vsub.s16 q9, q15, q13 // (B - D) - vadd.s16 q13, q15, q13 // Dd = B + D - vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6 - - vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16 - vmull.s16 q3, d3, xC4S4 - vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16 - vmull.s16 q5, d29, xC2S6 - vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16 - vmull.s16 q7, d21, xC6S2 - vshrn.s32 d4, q2, #16 - vshrn.s32 d5, q3, #16 - vshrn.s32 d6, q4, #16 - vshrn.s32 d7, q5, #16 - vshrn.s32 d8, q6, #16 // ip[2] * C6 - vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16 - vmull.s16 q6, d19, xC4S4 - vshrn.s32 d9, q7, #16 - vadd.s16 q3, q3, q14 // ip[6] * C2 - vadd.s16 q10, q1, q2 // Ad = (A - C) * C4 - vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2 - bx lr -endfunc - -.macro VP3_IDCT_END type -function vp3_idct_end_\type\()_neon -.ifc \type, col - vdup.16 q0, r3 - vadd.s16 q12, q12, q0 - vadd.s16 q8, q8, q0 -.endif - - vshrn.s32 d2, q5, #16 - vshrn.s32 d3, q6, #16 - vadd.s16 q2, q12, q15 // Gd = E + G - vadd.s16 q9, q1, q9 // (B - D) * C4 - vsub.s16 q12, q12, q15 // Ed = E - G - vsub.s16 q3, q8, q10 // Fd = F - Ad - vadd.s16 q10, q8, q10 // Add = F + Ad - vadd.s16 q4, q9, q14 // Hd = Bd + H - vsub.s16 q14, q9, q14 // Bdd = Bd - H - vadd.s16 q8, q2, q11 // [0] = Gd + Cd - vsub.s16 q15, q2, q11 // [7] = Gd - Cd - vadd.s16 q9, q10, q4 // [1] 
= Add + Hd - vsub.s16 q10, q10, q4 // [2] = Add - Hd - vadd.s16 q11, q12, q13 // [3] = Ed + Dd - vsub.s16 q12, q12, q13 // [4] = Ed - Dd -.ifc \type, row - vtrn.16 q8, q9 -.endif - vadd.s16 q13, q3, q14 // [5] = Fd + Bdd - vsub.s16 q14, q3, q14 // [6] = Fd - Bdd - -.ifc \type, row - // 8x8 transpose - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vswp d17, d24 - vswp d19, d26 - vadd.s16 q1, q8, q12 - vswp d21, d28 - vsub.s16 q8, q8, q12 - vswp d23, d30 -.endif - bx lr -endfunc -.endm - -VP3_IDCT_END row -VP3_IDCT_END col - -function ff_vp3_idct_put_neon, export=1 - mov ip, lr - bl vp3_idct_start_neon - bl vp3_idct_end_row_neon - mov r3, #8 - add r3, r3, #2048 // convert signed pixel to unsigned - bl vp3_idct_core_neon - bl vp3_idct_end_col_neon - mov lr, ip - vpop {d8-d15} - - vqshrun.s16 d0, q8, #4 - vqshrun.s16 d1, q9, #4 - vqshrun.s16 d2, q10, #4 - vqshrun.s16 d3, q11, #4 - vst1.64 {d0}, [r0,:64], r1 - vqshrun.s16 d4, q12, #4 - vst1.64 {d1}, [r0,:64], r1 - vqshrun.s16 d5, q13, #4 - vst1.64 {d2}, [r0,:64], r1 - vqshrun.s16 d6, q14, #4 - vst1.64 {d3}, [r0,:64], r1 - vqshrun.s16 d7, q15, #4 - vst1.64 {d4}, [r0,:64], r1 - vst1.64 {d5}, [r0,:64], r1 - vst1.64 {d6}, [r0,:64], r1 - vst1.64 {d7}, [r0,:64], r1 - bx lr -endfunc - -function ff_vp3_idct_add_neon, export=1 - mov ip, lr - bl vp3_idct_start_neon - bl vp3_idct_end_row_neon - mov r3, #8 - bl vp3_idct_core_neon - bl vp3_idct_end_col_neon - mov lr, ip - vpop {d8-d15} - mov r2, r0 - - vld1.64 {d0}, [r0,:64], r1 - vshr.s16 q8, q8, #4 - vld1.64 {d1}, [r0,:64], r1 - vshr.s16 q9, q9, #4 - vld1.64 {d2}, [r0,:64], r1 - vaddw.u8 q8, q8, d0 - vld1.64 {d3}, [r0,:64], r1 - vaddw.u8 q9, q9, d1 - vld1.64 {d4}, [r0,:64], r1 - vshr.s16 q10, q10, #4 - vld1.64 {d5}, [r0,:64], r1 - vshr.s16 q11, q11, #4 - vld1.64 {d6}, [r0,:64], r1 - vqmovun.s16 d0, q8 - vld1.64 {d7}, [r0,:64], r1 - vqmovun.s16 d1, q9 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vshr.s16 q12, q12, #4 - vshr.s16 q13, q13, #4 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vshr.s16 q14, q14, #4 - vshr.s16 q15, q15, #4 - vst1.64 {d0}, [r2,:64], r1 - vqmovun.s16 d4, q12 - vst1.64 {d1}, [r2,:64], r1 - vqmovun.s16 d5, q13 - vst1.64 {d2}, [r2,:64], r1 - vaddw.u8 q14, q14, d6 - vst1.64 {d3}, [r2,:64], r1 - vaddw.u8 q15, q15, d7 - vst1.64 {d4}, [r2,:64], r1 - vqmovun.s16 d6, q14 - vst1.64 {d5}, [r2,:64], r1 - vqmovun.s16 d7, q15 - vst1.64 {d6}, [r2,:64], r1 - vst1.64 {d7}, [r2,:64], r1 - bx lr -endfunc - -function ff_vp3_idct_dc_add_neon, export=1 - ldrsh r12, [r2] - mov r3, r0 - add r12, r12, #15 - vdup.16 q15, r12 - mov r12, 0 - strh r12, [r2] - vshr.s16 q15, q15, #5 - - vld1.8 {d0}, [r0,:64], r1 - vld1.8 {d1}, [r0,:64], r1 - vld1.8 {d2}, [r0,:64], r1 - vaddw.u8 q8, q15, d0 - vld1.8 {d3}, [r0,:64], r1 - vaddw.u8 q9, q15, d1 - vld1.8 {d4}, [r0,:64], r1 - vaddw.u8 q10, q15, d2 - vld1.8 {d5}, [r0,:64], r1 - vaddw.u8 q11, q15, d3 - vld1.8 {d6}, [r0,:64], r1 - vaddw.u8 q12, q15, d4 - vld1.8 {d7}, [r0,:64], r1 - vaddw.u8 q13, q15, d5 - vqmovun.s16 d0, q8 - vaddw.u8 q14, q15, d6 - vqmovun.s16 d1, q9 - vaddw.u8 q15, q15, d7 - vqmovun.s16 d2, q10 - vst1.8 {d0}, [r3,:64], r1 - vqmovun.s16 d3, q11 - vst1.8 {d1}, [r3,:64], r1 - vqmovun.s16 d4, q12 - vst1.8 {d2}, [r3,:64], r1 - vqmovun.s16 d5, q13 - vst1.8 {d3}, [r3,:64], r1 - vqmovun.s16 d6, q14 - vst1.8 {d4}, [r3,:64], r1 - vqmovun.s16 d7, q15 - vst1.8 {d5}, [r3,:64], r1 - vst1.8 {d6}, [r3,:64], r1 - vst1.8 {d7}, [r3,:64], 
r1 - bx lr -endfunc diff --git a/ffmpeg/libavcodec/arm/vp56_arith.h b/ffmpeg/libavcodec/arm/vp56_arith.h deleted file mode 100644 index feb1247..0000000 --- a/ffmpeg/libavcodec/arm/vp56_arith.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright (C) 2010 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_VP56_ARITH_H -#define AVCODEC_ARM_VP56_ARITH_H - -#if CONFIG_THUMB -# define A(x) -# define T(x) x -#else -# define A(x) x -# define T(x) -#endif - -#if CONFIG_THUMB || defined __clang__ -# define L(x) -# define U(x) x -#else -# define L(x) x -# define U(x) -#endif - -#if HAVE_ARMV6_INLINE - -#define vp56_rac_get_prob vp56_rac_get_prob_armv6 -static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr) -{ - unsigned shift = ff_vp56_norm_shift[c->high]; - unsigned code_word = c->code_word << shift; - unsigned high = c->high << shift; - unsigned bit; - - __asm__ ("adds %3, %3, %0 \n" - "itt cs \n" - "cmpcs %7, %4 \n" - L("ldrcsh %2, [%4], #2 \n") - U("ldrhcs %2, [%4], #2 \n") - "rsb %0, %6, #256 \n" - "smlabb %0, %5, %6, %0 \n" - T("itttt cs \n") - "rev16cs %2, %2 \n" - T("lslcs %2, %2, %3 \n") - T("orrcs %1, %1, %2 \n") - A("orrcs %1, %1, %2, lsl %3 \n") - "subcs %3, %3, #16 \n" - "lsr %0, %0, #8 \n" - "cmp %1, %0, lsl #16 \n" - "ittte ge \n" - "subge %1, %1, %0, lsl #16 \n" - "subge %0, %5, %0 \n" - "movge %2, #1 \n" - "movlt %2, #0 \n" - : "=&r"(c->high), "=&r"(c->code_word), "=&r"(bit), - "+&r"(c->bits), "+&r"(c->buffer) - : "r"(high), "r"(pr), "r"(c->end - 1), - "0"(shift), "1"(code_word) - : "cc"); - - return bit; -} - -#define vp56_rac_get_prob_branchy vp56_rac_get_prob_branchy_armv6 -static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr) -{ - unsigned shift = ff_vp56_norm_shift[c->high]; - unsigned code_word = c->code_word << shift; - unsigned high = c->high << shift; - unsigned low; - unsigned tmp; - - __asm__ ("adds %3, %3, %0 \n" - "itt cs \n" - "cmpcs %7, %4 \n" - L("ldrcsh %2, [%4], #2 \n") - U("ldrhcs %2, [%4], #2 \n") - "rsb %0, %6, #256 \n" - "smlabb %0, %5, %6, %0 \n" - T("itttt cs \n") - "rev16cs %2, %2 \n" - T("lslcs %2, %2, %3 \n") - T("orrcs %1, %1, %2 \n") - A("orrcs %1, %1, %2, lsl %3 \n") - "subcs %3, %3, #16 \n" - "lsr %0, %0, #8 \n" - "lsl %2, %0, #16 \n" - : "=&r"(low), "+&r"(code_word), "=&r"(tmp), - "+&r"(c->bits), "+&r"(c->buffer) - : "r"(high), "r"(pr), "r"(c->end - 1), "0"(shift) - : "cc"); - - if (code_word >= tmp) { - c->high = high - low; - c->code_word = code_word - tmp; - return 1; - } - - c->high = low; - c->code_word = code_word; - return 0; -} - -#endif - -#endif /* AVCODEC_ARM_VP56_ARITH_H */ diff --git a/ffmpeg/libavcodec/arm/vp8.h b/ffmpeg/libavcodec/arm/vp8.h deleted file mode 100644 index ddaa120..0000000 --- a/ffmpeg/libavcodec/arm/vp8.h +++ /dev/null 
@@ -1,35 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_VP8_H -#define AVCODEC_ARM_VP8_H - -#include <stdint.h> - -#include "config.h" -#include "libavcodec/vp56.h" -#include "libavcodec/vp8.h" - -#if HAVE_ARMV6_EXTERNAL -#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6 -int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, int16_t block[16], - uint8_t probs[8][3][NUM_DCT_TOKENS-1], - int i, uint8_t *token_prob, int16_t qmul[2]); -#endif - -#endif /* AVCODEC_ARM_VP8_H */ diff --git a/ffmpeg/libavcodec/arm/vp8_armv6.S b/ffmpeg/libavcodec/arm/vp8_armv6.S deleted file mode 100644 index e7d25a4..0000000 --- a/ffmpeg/libavcodec/arm/vp8_armv6.S +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright (C) 2010 Mans Rullgard - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" - -.macro rac_get_prob h, bs, buf, cw, pr, t0, t1 - adds \bs, \bs, \t0 - lsl \cw, \cw, \t0 - lsl \t0, \h, \t0 - rsb \h, \pr, #256 - it cs - ldrhcs \t1, [\buf], #2 - smlabb \h, \t0, \pr, \h -T itttt cs - rev16cs \t1, \t1 -A orrcs \cw, \cw, \t1, lsl \bs -T lslcs \t1, \t1, \bs -T orrcs \cw, \cw, \t1 - subcs \bs, \bs, #16 - lsr \h, \h, #8 - cmp \cw, \h, lsl #16 - itt ge - subge \cw, \cw, \h, lsl #16 - subge \h, \t0, \h -.endm - -.macro rac_get_128 h, bs, buf, cw, t0, t1 - adds \bs, \bs, \t0 - lsl \cw, \cw, \t0 - lsl \t0, \h, \t0 - it cs - ldrhcs \t1, [\buf], #2 - mov \h, #128 - it cs - rev16cs \t1, \t1 - add \h, \h, \t0, lsl #7 -A orrcs \cw, \cw, \t1, lsl \bs -T ittt cs -T lslcs \t1, \t1, \bs -T orrcs \cw, \cw, \t1 - subcs \bs, \bs, #16 - lsr \h, \h, #8 - cmp \cw, \h, lsl #16 - itt ge - subge \cw, \cw, \h, lsl #16 - subge \h, \t0, \h -.endm - -function ff_decode_block_coeffs_armv6, export=1 - push {r0,r1,r4-r11,lr} - movrelx lr, X(ff_vp56_norm_shift) - ldrd r4, r5, [sp, #44] @ token_prob, qmul - cmp r3, #0 - ldr r11, [r5] - ldm r0, {r5-r7} @ high, bits, buf - it ne - pkhtbne r11, r11, r11, asr #16 - ldr r8, [r0, #16] @ code_word -0: - ldrb r9, [lr, r5] - add r3, r3, #1 - ldrb r0, [r4, #1] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - blt 2f - - ldrb r9, [lr, r5] - ldrb r0, [r4, #2] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - ldrb r9, [lr, r5] - bge 3f - - add r4, r3, r3, lsl #5 - sxth r12, r11 - add r4, r4, r2 - adds r6, r6, r9 - add r4, r4, #11 - lsl r8, r8, r9 - it cs - ldrhcs r10, [r7], #2 - lsl r9, r5, r9 - mov r5, #128 - it cs - rev16cs r10, r10 - add r5, r5, r9, lsl #7 -T ittt cs -T lslcs r10, r10, r6 -T orrcs r8, r8, r10 -A orrcs r8, r8, r10, lsl r6 - subcs r6, r6, #16 - lsr r5, r5, #8 - cmp r8, r5, lsl #16 - movrel r10, zigzag_scan-1 - itt ge - subge r8, r8, r5, lsl #16 - subge r5, r9, r5 - ldrb r10, [r10, r3] - it ge - rsbge r12, r12, #0 - cmp r3, #16 - strh r12, [r1, r10] - bge 6f -5: - ldrb r9, [lr, r5] - ldrb r0, [r4] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - pkhtb r11, r11, r11, asr #16 - bge 0b - -6: - ldr r0, [sp] - ldr r9, [r0, #12] - cmp r7, r9 - it hi - movhi r7, r9 - stm r0, {r5-r7} @ high, bits, buf - str r8, [r0, #16] @ code_word - - add sp, sp, #8 - mov r0, r3 - pop {r4-r11,pc} -2: - add r4, r3, r3, lsl #5 - cmp r3, #16 - add r4, r4, r2 - pkhtb r11, r11, r11, asr #16 - bne 0b - b 6b -3: - ldrb r0, [r4, #3] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - ldrb r9, [lr, r5] - bge 1f - - mov r12, #2 - ldrb r0, [r4, #4] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - it ge - addge r12, #1 - ldrb r9, [lr, r5] - blt 4f - ldrb r0, [r4, #5] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - it ge - addge r12, #1 - ldrb r9, [lr, r5] - b 4f -1: - ldrb r0, [r4, #6] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - ldrb r9, [lr, r5] - bge 3f - - ldrb r0, [r4, #7] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - ldrb r9, [lr, r5] - bge 2f - - mov r12, #5 - mov r0, #159 - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - it ge - addge r12, r12, #1 - ldrb r9, [lr, r5] - b 4f -2: - mov r12, #7 - mov r0, #165 - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - it ge - addge r12, r12, #2 - ldrb r9, [lr, r5] - mov r0, #145 - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - it ge - addge r12, r12, #1 - ldrb r9, [lr, r5] - b 4f -3: - ldrb r0, [r4, #8] - rac_get_prob r5, r6, 
r7, r8, r0, r9, r10 - it ge - addge r4, r4, #1 - ldrb r9, [lr, r5] - ite ge - movge r12, #2 - movlt r12, #0 - ldrb r0, [r4, #9] - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - mov r9, #8 - it ge - addge r12, r12, #1 - movrelx r4, X(ff_vp8_dct_cat_prob), r1 - lsl r9, r9, r12 - ldr r4, [r4, r12, lsl #2] - add r12, r9, #3 - mov r1, #0 - ldrb r0, [r4], #1 -1: - ldrb r9, [lr, r5] - lsl r1, r1, #1 - rac_get_prob r5, r6, r7, r8, r0, r9, r10 - ldrb r0, [r4], #1 - it ge - addge r1, r1, #1 - cmp r0, #0 - bne 1b - ldrb r9, [lr, r5] - add r12, r12, r1 - ldr r1, [sp, #4] -4: - add r4, r3, r3, lsl #5 - add r4, r4, r2 - add r4, r4, #22 - rac_get_128 r5, r6, r7, r8, r9, r10 - it ge - rsbge r12, r12, #0 - smulbb r12, r12, r11 - movrel r9, zigzag_scan-1 - ldrb r9, [r9, r3] - cmp r3, #16 - strh r12, [r1, r9] - bge 6b - b 5b -endfunc - -const zigzag_scan - .byte 0, 2, 8, 16 - .byte 10, 4, 6, 12 - .byte 18, 24, 26, 20 - .byte 14, 22, 28, 30 -endconst diff --git a/ffmpeg/libavcodec/arm/vp8dsp.h b/ffmpeg/libavcodec/arm/vp8dsp.h deleted file mode 100644 index 6041ef1..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef AVCODEC_ARM_VP8DSP_H -#define AVCODEC_ARM_VP8DSP_H - -#include "libavcodec/vp8dsp.h" - -void ff_vp8dsp_init_armv6(VP8DSPContext *dsp); -void ff_vp8dsp_init_neon(VP8DSPContext *dsp); - -#define VP8_LF_Y(hv, inner, opt) \ - void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim_E, int flim_I, \ - int hev_thresh) - -#define VP8_LF_UV(hv, inner, opt) \ - void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \ - uint8_t *dstV, \ - ptrdiff_t stride, \ - int flim_E, int flim_I, \ - int hev_thresh) - -#define VP8_LF_SIMPLE(hv, opt) \ - void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \ - ptrdiff_t stride, \ - int flim) - -#define VP8_LF_HV(inner, opt) \ - VP8_LF_Y(h, inner, opt); \ - VP8_LF_Y(v, inner, opt); \ - VP8_LF_UV(h, inner, opt); \ - VP8_LF_UV(v, inner, opt) - -#define VP8_LF(opt) \ - VP8_LF_HV(, opt); \ - VP8_LF_HV(_inner, opt); \ - VP8_LF_SIMPLE(h, opt); \ - VP8_LF_SIMPLE(v, opt) - -#define VP8_MC(n, opt) \ - void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \ - uint8_t *src, ptrdiff_t srcstride, \ - int h, int x, int y) - -#define VP8_EPEL(w, opt) \ - VP8_MC(pixels ## w, opt); \ - VP8_MC(epel ## w ## _h4, opt); \ - VP8_MC(epel ## w ## _h6, opt); \ - VP8_MC(epel ## w ## _v4, opt); \ - VP8_MC(epel ## w ## _h4v4, opt); \ - VP8_MC(epel ## w ## _h6v4, opt); \ - VP8_MC(epel ## w ## _v6, opt); \ - VP8_MC(epel ## w ## _h4v6, opt); \ - VP8_MC(epel ## w ## _h6v6, opt) - -#define VP8_BILIN(w, opt) \ - VP8_MC(bilin ## w ## _h, opt); \ - VP8_MC(bilin ## w ## _v, opt); \ - VP8_MC(bilin ## w ## _hv, opt) - -#endif /* AVCODEC_ARM_VP8DSP_H */ 
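The VP8_MC/VP8_EPEL/VP8_BILIN macros in the vp8dsp.h header above declare every motion-compensation variant by token pasting. As a minimal sketch (not part of the original tree, but consistent with the macro definitions shown above and with the names assigned in the vp8dsp_init_*.c files further down), an invocation like VP8_EPEL(16, neon) expands to prototypes such as:

/* illustrative expansion of VP8_EPEL(16, neon); signatures follow VP8_MC */
void ff_put_vp8_pixels16_neon(uint8_t *dst, ptrdiff_t dststride,
                              uint8_t *src, ptrdiff_t srcstride,
                              int h, int x, int y);
void ff_put_vp8_epel16_h6_neon(uint8_t *dst, ptrdiff_t dststride,
                               uint8_t *src, ptrdiff_t srcstride,
                               int h, int x, int y);
void ff_put_vp8_epel16_h6v6_neon(uint8_t *dst, ptrdiff_t dststride,
                                 uint8_t *src, ptrdiff_t srcstride,
                                 int h, int x, int y);

The h4/h6 and v4/v6 suffixes name the horizontal and vertical subpel filter tap counts, and the x/y arguments carry the sub-pixel motion offsets used to select the filter, matching the put_vp8_pixels16 comment in the ARMv6 assembly below.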
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S deleted file mode 100644 index a14b188..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S +++ /dev/null @@ -1,1634 +0,0 @@ -/* - * VP8 ARMv6 optimisations - * - * Copyright (c) 2010 Google Inc. - * Copyright (c) 2010 Rob Clark <rob@ti.com> - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * This code was partially ported from libvpx, which uses this license: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * * Neither the name of Google nor the names of its contributors may - * be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "libavutil/arm/asm.S" - -@ idct - -@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16]) -function ff_vp8_luma_dc_wht_armv6, export=1 - push {r4-r10, lr} - - ldm r1, {r2-r9} - mov r10, #0 - mov lr, #0 - uadd16 r12, r2, r8 @ t0[0,1] - usub16 r2, r2, r8 @ t3[0,1] - stm r1!, {r10, lr} - uadd16 r8, r4, r6 @ t1[0,1] - usub16 r4, r4, r6 @ t2[0,1] - stm r1!, {r10, lr} - uadd16 r6, r12, r8 @ dc0[0,1] - usub16 r12, r12, r8 @ dc2[0,1] - stm r1!, {r10, lr} - uadd16 r8, r2, r4 @ dc1[0,1] - usub16 r2, r2, r4 @ dc3[0,1] - stm r1!, {r10, lr} - - uadd16 lr, r3, r9 @ t0[2,3] - usub16 r3, r3, r9 @ t3[2,3] - uadd16 r9, r5, r7 @ t1[2,3] - usub16 r5, r5, r7 @ t2[2,3] - - uadd16 r7, lr, r9 @ dc0[2,3] - usub16 lr, lr, r9 @ dc2[2,3] - uadd16 r9, r3, r5 @ dc1[2,3] - usub16 r3, r3, r5 @ dc3[2,3] - - mov r1, #3 - orr r1, r1, #0x30000 @ 3 | 3 (round) - - pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0] - pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1] - pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0] - pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1] - pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2] - uadd16 r4, r4, r1 - uadd16 r5, r5, r1 - pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3] - pkhbt r2, lr, r3, lsl #16 @ dc{2,3}[2] - pkhtb lr, r3, lr, asr #16 @ dc{2,3}[3] - - uadd16 r9, r4, r7 @ t0[0,1] - uadd16 r3, r5, lr @ t0[2,3] - usub16 r4, r4, r7 @ t3[0,1] - usub16 r5, r5, lr @ t3[2,3] - uadd16 r7, r6, r8 @ t1[0,1] - uadd16 lr, r12, r2 @ t1[2,3] - usub16 r6, r6, r8 @ t2[0,1] - usub16 r12, r12, r2 @ t2[2,3] - - uadd16 r8, r9, r7 @ block[0,1][0] - uadd16 r2, r3, lr @ block[2,3][0] - usub16 r9, r9, r7 @ block[0,1][2] - usub16 r3, r3, lr @ block[2,3][2] - uadd16 r7, r4, r6 @ block[0,1][1] - uadd16 lr, r5, r12 @ block[2,3][1] - usub16 r4, r4, r6 @ block[0,1][3] - usub16 r5, r5, r12 @ block[2,3][3] - -#if HAVE_ARMV6T2_EXTERNAL - sbfx r6, r8, #3, #13 - sbfx r12, r7, #3, #13 - sbfx r1, r9, #3, #13 - sbfx r10, r4, #3, #13 -#else - sxth r6, r8 - sxth r12, r7 - sxth r1, r9 - sxth r10, r4 - asr r6, #3 @ block[0][0] - asr r12, #3 @ block[0][1] - asr r1, #3 @ block[0][2] - asr r10, #3 @ block[0][3] -#endif - - strh r6, [r0], #32 - asr r8, r8, #19 @ block[1][0] - strh r12, [r0], #32 - asr r7, r7, #19 @ block[1][1] - strh r1, [r0], #32 - asr r9, r9, #19 @ block[1][2] - strh r10, [r0], #32 - asr r4, r4, #19 @ block[1][3] - strh r8, [r0], #32 - asr r6, r2, #19 @ block[3][0] - strh r7, [r0], #32 - asr r12, lr, #19 @ block[3][1] - strh r9, [r0], #32 - asr r1, r3, #19 @ block[3][2] - strh r4, [r0], #32 - asr r10, r5, #19 @ block[3][3] - -#if HAVE_ARMV6T2_EXTERNAL - sbfx r2, r2, #3, #13 - sbfx lr, lr, #3, #13 - sbfx r3, r3, #3, #13 - sbfx r5, r5, #3, #13 -#else - sxth r2, r2 - sxth lr, lr - sxth r3, r3 - sxth r5, r5 - asr r2, #3 @ block[2][0] - asr lr, #3 @ block[2][1] - asr r3, #3 @ block[2][2] - asr r5, #3 @ block[2][3] -#endif - - strh r2, [r0], #32 - strh lr, [r0], #32 - strh r3, [r0], #32 - strh r5, [r0], #32 - strh r6, [r0], #32 - strh r12, [r0], #32 - strh r1, [r0], #32 - strh r10, [r0], #32 - - pop {r4-r10, pc} -endfunc - -@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16]) -function ff_vp8_luma_dc_wht_dc_armv6, export=1 - ldrsh r2, [r1] - mov r3, #0 - add r2, r2, #3 - strh r3, [r1] - asr r2, r2, #3 - .rept 16 - strh r2, [r0], #32 - .endr - bx lr -endfunc - -@ void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride) -function ff_vp8_idct_add_armv6, export=1 - push {r4-r12, lr} - sub sp, sp, #32 - - movw r3, #20091 @ cospi8sqrt2minus1 - movw r4, #35468 @ sinpi8sqrt2 - mov r5, sp -1: - ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0] 
- ldr lr, [r1, #16] @ i9 | i8 = block2[1] | block2[0] - ldr r12, [r1, #24] @ i13 | i12 = block3[1] | block3[0] - - smulwt r9, r3, r6 @ ip[5] * cospi8sqrt2minus1 - smulwb r7, r3, r6 @ ip[4] * cospi8sqrt2minus1 - smulwt r10, r4, r6 @ ip[5] * sinpi8sqrt2 - smulwb r8, r4, r6 @ ip[4] * sinpi8sqrt2 - pkhbt r7, r7, r9, lsl #16 @ 5c | 4c - smulwt r11, r3, r12 @ ip[13] * cospi8sqrt2minus1 - pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half - uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half - smulwb r9, r3, r12 @ ip[12] * cospi8sqrt2minus1 - smulwt r7, r4, r12 @ ip[13] * sinpi8sqrt2 - smulwb r10, r4, r12 @ ip[12] * sinpi8sqrt2 - - pkhbt r9, r9, r11, lsl #16 @ 13c | 12c - ldr r11, [r1] @ i1 | i0 - pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half - uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 2nd half - uadd16 r6, r6, r10 @ d = t3 - uadd16 r10, r11, lr @ a = t0 - usub16 r7, r8, r7 @ c = t2 - usub16 r8, r11, lr @ b = t1 - uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0] - usub16 r10, r10, r6 @ a-d = tmp{0,1}[3] - uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1] - usub16 r7, r8, r7 @ b-c = tmp{0,1}[2] - mov r8, #0 - cmp sp, r5 - str r6, [r5, #8] @ o5 | o4 - str r7, [r5, #16] @ o9 | o8 - str r10, [r5, #24] @ o13 | o12 - str r9, [r5], #4 @ o1 | o0 - str r8, [r1, #8] - str r8, [r1, #16] - str r8, [r1, #24] - str r8, [r1], #4 - beq 1b - - mov r5, #2 -2: - pop {r1, r6, r12, lr} - smulwt r9, r3, r12 @ ip[5] * cospi8sqrt2minus1 - smulwt r7, r3, r1 @ ip[1] * cospi8sqrt2minus1 - smulwt r10, r4, r12 @ ip[5] * sinpi8sqrt2 - smulwt r8, r4, r1 @ ip[1] * sinpi8sqrt2 - pkhbt r11, r1, r12, lsl #16 @ i4 | i0 = t0/t1 first half - pkhtb r1, r12, r1, asr #16 @ i5 | i1 - pkhbt r7, r7, r9, lsl #16 @ 5c | 1c - pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = t2 first half - pkhbt r9, r6, lr, lsl #16 @ i6 | i2 = t0/t1 second half - pkhtb r12, lr, r6, asr #16 @ i7 | i3 - uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = t3 first half - uadd16 r10, r11, r9 @ a = t0 - usub16 r9, r11, r9 @ b = t1 - smulwt r7, r3, r12 @ ip[7] * cospi8sqrt2minus1 - smulwb lr, r3, r12 @ ip[3] * cospi8sqrt2minus1 - smulwt r11, r4, r12 @ ip[7] * sinpi8sqrt2 - smulwb r6, r4, r12 @ ip[3] * sinpi8sqrt2 - subs r5, r5, #1 - pkhbt r7, lr, r7, lsl #16 @ 7c | 3c - pkhbt r11, r6, r11, lsl #16 @ 7s | 3s = t3 second half - mov r6, #0x4 - orr r6, r6, #0x40000 - uadd16 r12, r7, r12 @ 7c+7 | 3c+3 = t2 second half - uadd16 r10, r10, r6 @ t0 + 4 - uadd16 r9, r9, r6 @ t1 + 4 - usub16 lr, r8, r12 @ c (o5 | o1) = t2 - uadd16 r12, r11, r1 @ d (o7 | o3) = t3 - usub16 r1, r9, lr @ b-c = dst{0,1}[2] - uadd16 r7, r10, r12 @ a+d = dst{0,1}[0] - usub16 r12, r10, r12 @ a-d = dst{0,1}[3] - uadd16 r10, r9, lr @ b+c = dst{0,1}[1] - - asr lr, r1, #3 @ o[1][2] - asr r9, r12, #3 @ o[1][3] - pkhtb r8, lr, r7, asr #19 @ o[1][0,2] - pkhtb r11, r9, r10, asr #19 @ o[1][1,3] - ldr lr, [r0] - sxth r12, r12 - ldr r9, [r0, r2] - sxth r1, r1 -#if HAVE_ARMV6T2_EXTERNAL - sbfx r7, r7, #3, #13 - sbfx r10, r10, #3, #13 -#else - sxth r7, r7 - sxth r10, r10 - asr r7, #3 @ o[0][0] - asr r10, #3 @ o[0][1] -#endif - pkhbt r7, r7, r1, lsl #13 @ o[0][0,2] - pkhbt r10, r10, r12, lsl #13 @ o[0][1,3] - - uxtab16 r7, r7, lr - uxtab16 r10, r10, lr, ror #8 - uxtab16 r8, r8, r9 - uxtab16 r11, r11, r9, ror #8 - usat16 r7, #8, r7 - usat16 r10, #8, r10 - usat16 r8, #8, r8 - usat16 r11, #8, r11 - orr r7, r7, r10, lsl #8 - orr r8, r8, r11, lsl #8 - str r8, [r0, r2] - str_post r7, r0, r2, lsl #1 - - bne 2b - - pop {r4-r12, pc} -endfunc - -@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride) -function ff_vp8_idct_dc_add_armv6, export=1 - push 
{r4-r6, lr} - add r6, r0, r2, lsl #1 - ldrsh r3, [r1] - mov r4, #0 - add r3, r3, #4 - strh r4, [r1], #32 - asr r3, #3 - ldr r5, [r0] - ldr r4, [r0, r2] - pkhbt r3, r3, r3, lsl #16 - uxtab16 lr, r3, r5 @ a1+2 | a1+0 - uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1 - uxtab16 r12, r3, r4 - uxtab16 r4, r3, r4, ror #8 - usat16 lr, #8, lr - usat16 r5, #8, r5 - usat16 r12, #8, r12 - usat16 r4, #8, r4 - orr lr, lr, r5, lsl #8 - ldr r5, [r6] - orr r12, r12, r4, lsl #8 - ldr r4, [r6, r2] - str lr, [r0] - uxtab16 lr, r3, r5 - str r12, [r0, r2] - uxtab16 r5, r3, r5, ror #8 - uxtab16 r12, r3, r4 - uxtab16 r4, r3, r4, ror #8 - usat16 lr, #8, lr - usat16 r5, #8, r5 - usat16 r12, #8, r12 - usat16 r4, #8, r4 - orr lr, lr, r5, lsl #8 - orr r12, r12, r4, lsl #8 - str lr, [r6] - str r12, [r6, r2] - pop {r4-r6, pc} -endfunc - -@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride) -function ff_vp8_idct_dc_add4uv_armv6, export=1 - push {r4, lr} - - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, r2, lsl #2 - sub r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - - pop {r4, pc} -endfunc - -@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride) -function ff_vp8_idct_dc_add4y_armv6, export=1 - push {r4, lr} - - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - add r0, r0, #4 - bl ff_vp8_idct_dc_add_armv6 - - pop {r4, pc} -endfunc - -@ loopfilter - -.macro transpose o3, o2, o1, o0, i0, i1, i2, i3 - uxtb16 \o1, \i1 @ xx 12 xx 10 - uxtb16 \o0, \i0 @ xx 02 xx 00 - uxtb16 \o3, \i3 @ xx 32 xx 30 - uxtb16 \o2, \i2 @ xx 22 xx 20 - orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00 - orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20 - - uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11 - uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31 - uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01 - uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21 - orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01 - orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21 - - pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02 - pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00 - pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03 - pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01 -.endm - -.macro simple_filter - uqsub8 r7, r3, r6 @ p1 - q1 - uqsub8 r8, r6, r3 @ q1 - p1 - uqsub8 r10, r4, r5 @ p0 - q0 - uqsub8 r9, r5, r4 @ q0 - p0 - orr r7, r7, r8 @ abs(p1 - q1) - orr r9, r9, r10 @ abs(p0 - q0) - uhadd8 r7, r7, lr @ abs(p1 - q2) >> 1 - uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2 - uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1-q1)/2 - mvn r8, #0 - usub8 r10, r12, r7 @ compare to flimit - sel r10, r8, lr @ filter mask: F or 0 - cmp r10, #0 - beq 2f - - eor r3, r3, r2 @ ps1 - eor r6, r6, r2 @ qs1 - eor r4, r4, r2 @ ps0 - eor r5, r5, r2 @ qs0 - - qsub8 r3, r3, r6 @ vp8_filter = p1 - q1 - qsub8 r6, r5, r4 @ q0 - p0 - qadd8 r3, r3, r6 @ += q0 - p0 - lsr r7, r2, #5 @ 0x04040404 - qadd8 r3, r3, r6 @ += q0 - p0 - sub r9, r7, r2, lsr #7 @ 0x03030303 - qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0) - and r3, r3, r10 @ vp8_filter &= mask - - qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3 - qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4 - - shadd8 r9, r9, lr - shadd8 r3, r3, lr - shadd8 r9, r9, lr - shadd8 r3, r3, lr - shadd8 r9, r9, lr @ Filter2 >>= 3 - shadd8 r3, r3, lr @ Filter1 >>= 3 - - qadd8 r4, r4, r9 @ u = p0 + Filter2 - qsub8 r5, r5, r3 @ u = q0 - Filter1 - eor r4, r4, r2 @ *op0 = u ^ 0x80 - eor r5, r5, r2 @ *oq0 = u ^ 0x80 -.endm - -@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim) 
-function ff_vp8_v_loop_filter16_simple_armv6, export=1 - push {r4-r11, lr} - - orr r2, r2, r2, lsl #16 - mov r11, #4 - mov lr, #0 - orr r12, r2, r2, lsl #8 - mov32 r2, 0x80808080 -1: - ldr_nreg r3, r0, r1, lsl #1 @ p1 - ldr_nreg r4, r0, r1 @ p0 - ldr r5, [r0] @ q0 - ldr r6, [r0, r1] @ q1 - simple_filter -T sub r7, r0, r1 - str r5, [r0] @ oq0 -A str r4, [r0, -r1] @ op0 -T str r4, [r7] -2: - subs r11, r11, #1 - add r0, r0, #4 - bne 1b - - pop {r4-r11, pc} -endfunc - -.macro filter_mask_p - uqsub8 r6, r9, r10 @ p3 - p2 - uqsub8 r7, r10, r9 @ p2 - p3 - uqsub8 r8, r10, r11 @ p2 - p1 - uqsub8 r10, r11, r10 @ p1 - p2 - orr r6, r6, r7 @ abs(p3-p2) - orr r8, r8, r10 @ abs(p2-p1) - uqsub8 lr, r6, r2 @ compare to limit - uqsub8 r8, r8, r2 @ compare to limit - uqsub8 r6, r11, r12 @ p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 @ p0 - p1 - orr r6, r6, r7 @ abs(p1-p0) - uqsub8 r7, r6, r2 @ compare to limit - uqsub8 r8, r6, r3 @ compare to thresh - orr lr, lr, r7 -.endm - -.macro filter_mask_pq - uqsub8 r6, r11, r10 @ p1 - q1 - uqsub8 r7, r10, r11 @ q1 - p1 - uqsub8 r11, r12, r9 @ p0 - q0 - uqsub8 r12, r9, r12 @ q0 - p0 - orr r6, r6, r7 @ abs(p1-q1) - orr r12, r11, r12 @ abs(p0-q0) - mov32 r7, 0x7f7f7f7f - uqadd8 r12, r12, r12 @ abs(p0-q0) * 2 - and r6, r7, r6, lsr #1 @ abs(p1-q1) / 2 - uqadd8 r12, r12, r6 @ abs(p0-q0) * 2 + abs(p1-q1)/2 -.endm - -.macro filter_mask_v - filter_mask_p - - ldr r10, [r0, r1] @ q1 - ldr_post r9, r0, r1, lsl #1 @ q0 - - filter_mask_pq - - ldr r11, [r0] @ q2 - - uqsub8 r7, r9, r10 @ q0 - q1 - uqsub8 r6, r10, r9 @ q1 - q0 - uqsub8 r12, r12, r4 @ compare to flimit - uqsub8 r9, r11, r10 @ q2 - q1 - uqsub8 r10, r10, r11 @ q1 - q2 - orr lr, lr, r12 - ldr r12, [r0, r1] @ q3 - orr r6, r7, r6 @ abs(q1-q0) - orr r10, r9, r10 @ abs(q2-q1) - uqsub8 r9, r12, r11 @ q3 - q2 - uqsub8 r11, r11, r12 @ q2 - q3 - uqsub8 r7, r6, r2 @ compare to limit - uqsub8 r10, r10, r2 @ compare to limit - uqsub8 r6, r6, r3 @ compare to thresh - orr r9, r9, r11 @ abs(q3-q2) - orr lr, lr, r7 - orr lr, lr, r10 - uqsub8 r9, r9, r2 @ compare to limit - orr lr, lr, r9 - - mov r12, #0 - usub8 lr, r12, lr - mvn r11, #0 - sel lr, r11, r12 @ filter mask - sub r0, r0, r1, lsl #1 -.endm - -.macro filter_mask_h - transpose r12, r11, r10, r9, r6, r7, r8, lr - - filter_mask_p - - stm sp, {r8, r11, r12, lr} - sub r0, r0, r1, lsl #2 - add r0, r0, #4 - - ldr r7, [r0, r1] - ldr_post r6, r0, r1, lsl #1 - ldr lr, [r0, r1] - ldr r8, [r0] - - transpose r12, r11, r10, r9, r6, r7, r8, lr - - uqsub8 r8, r12, r11 @ q3 - q2 - uqsub8 lr, r11, r12 @ q2 - q3 - uqsub8 r7, r9, r10 @ q0 - q1 - uqsub8 r6, r10, r9 @ q1 - q0 - uqsub8 r12, r11, r10 @ q2 - q1 - uqsub8 r11, r10, r11 @ q1 - q2 - orr r8, r8, lr @ abs(q3-q2) - orr r6, r7, r6 @ abs(q1-q0) - orr r11, r12, r11 @ abs(q2-q1) - ldr lr, [sp, #12] @ load back (f)limit accumulator - uqsub8 r8, r8, r2 @ compare to limit - uqsub8 r7, r6, r2 @ compare to limit - uqsub8 r11, r11, r2 @ compare to limit - orr lr, lr, r8 - uqsub8 r8, r6, r3 @ compare to thresh - orr lr, lr, r7 - ldr r12, [sp, #8] @ p1 - orr lr, lr, r11 - - ldr r11, [sp, #4] @ p0 - - filter_mask_pq - - mov r10, #0 - uqsub8 r12, r12, r4 @ compare to flimit - mvn r11, #0 - orr lr, lr, r12 - usub8 lr, r10, lr - sel lr, r11, r10 @ filter mask -.endm - -.macro filter inner - mov32 r12, 0x80808080 - eor r11, r7, r12 @ ps1 - eor r8, r8, r12 @ ps0 - eor r9, r9, r12 @ qs0 - eor r10, r10, r12 @ qs1 - - stm sp, {r8-r11} - - qsub8 r7, r11, r10 @ vp8_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - 
ps0)) - .if \inner - and r7, r7, r6 @ vp8_filter &= hev - .endif - qadd8 r7, r7, r8 - lsr r10, r12, #5 @ 0x04040404 - qadd8 r7, r7, r8 - sub r9, r10, r12, lsr #7 @ 0x03030303 - qadd8 r7, r7, r8 - - and r7, r7, lr @ vp8_filter &= mask - .if !\inner - mov r12, r7 @ Filter2 - and r7, r7, r6 @ Filter2 &= hev - .endif - qadd8 lr, r7, r9 @ Filter2 = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r7, r7, r10 @ Filter1 = vp8_signed_char_clamp(vp8_filter+4) - - mov r9, #0 - shadd8 lr, lr, r9 @ Filter2 >>= 3 - shadd8 r7, r7, r9 @ Filter1 >>= 3 - shadd8 lr, lr, r9 - shadd8 r7, r7, r9 - shadd8 lr, lr, r9 @ Filter2 - shadd8 r7, r7, r9 @ Filter1 -.endm - -.macro filter_v inner - orr r10, r6, r8 @ calculate vp8_hevmask - ldr_nreg r7, r0, r1, lsl #1 @ p1 - usub8 r10, r12, r10 - ldr_nreg r8, r0, r1 @ p0 - sel r6, r12, r11 @ obtain vp8_hevmask - ldr r9, [r0] @ q0 - ldr r10, [r0, r1] @ q1 - filter \inner -.endm - -.macro filter_h inner - orr r9, r6, r8 - usub8 r9, r12, r9 - sel r6, r12, r11 @ hev mask - - stm sp, {r6, lr} - - ldr_nreg r12, r0, r1, lsl #1 - ldr_nreg r11, r0, r1 - ldr r6, [r0] - ldr lr, [r0, r1] - - transpose r10, r9, r8, r7, r12, r11, r6, lr - - ldm sp, {r6, lr} - filter \inner -.endm - -.macro filter_inner - ldm sp, {r8, r9} - lsr r10, r10, #2 @ 0x01010101 - qadd8 r8, r8, lr @ u = vp8_signed_char_clamp(ps0 + Filter2) - mov lr, #0 - qsub8 r9, r9, r7 @ u = vp8_signed_char_clamp(qs0 - Filter1) - sadd8 r7, r7, r10 @ vp8_filter += 1 - ldr r10, [sp, #8] @ qs1 - shadd8 r7, r7, lr @ vp8_filter >>= 1 - eor r8, r8, r12 @ *op0 = u ^ 0x80 - bic r7, r7, r6 @ vp8_filter &= ~hev - qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter) - eor r9, r9, r12 @ *oq0 = u ^ 0x80 - qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter) - eor r11, r11, r12 @ *op1 = u ^ 0x80 - eor r10, r10, r12 @ *oq1 = u ^ 0x80 -.endm - -.macro filter_x c0 - mov lr, \c0 - mov r7, #63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - mov32 lr, 0x80808080 - - orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u) - eor r8, r8, lr @ *oq0 = s ^ 0x80 - eor r10, r10, lr @ *op0 = s ^ 0x80 -.endm - -.macro filter_1 - ldm sp, {r8, r9} - qadd8 r11, r8, lr - qsub8 r9, r9, r7 - bic r12, r12, r6 @ vp8_filter &= ~hev - filter_x #27 -.endm - -.macro filter_2 - ldr r9, [sp, #8] @ qs1 - ldr r11, [sp, #12] @ ps1 - filter_x #18 -.endm - -.macro filter_3 - eor r9, r9, lr - eor r11, r11, lr - filter_x #9 -.endm - -function vp8_v_loop_filter_inner_armv6 - mov r5, #4 - sub sp, sp, #16 - - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - orr r6, r6, r6, lsl #16 - orr r4, r2, r2, lsl #8 @ flimE - orr r2, r3, r3, lsl #8 @ flimI - orr r3, r6, r6, lsl #8 @ thresh -1: - sub r0, r0, r1, lsl #2 - ldr r10, [r0, r1] @ p2 - ldr_post r9, r0, r1, lsl #1 @ p3 - ldr r12, [r0, r1] @ p0 - ldr_post r11, r0, r1, lsl #1 @ p1 - - filter_mask_v - cmp lr, #0 - beq 2f - filter_v inner=1 - filter_inner - -A str r11, [r0, -r1, lsl #1] @ op1 -A str r8, [r0, -r1] @ op0 -T sub r0, r0, r1, lsl #1 -T str r8, [r0, r1] -T str_post r11, r0, r1, lsl #1 - str r9, [r0] @ oq0 - str r10, [r0, r1] @ oq1 -2: - add r0, r0, #4 - cmp 
r5, #3 - it eq - ldreq r0, [sp, #16] - subs r5, r5, #1 - bne 1b - - add sp, sp, #16 - pop {r0, r4-r11, pc} -endfunc - -function ff_vp8_v_loop_filter16_inner_armv6, export=1 - push {r4-r11, lr} - add r12, r0, #8 - push {r12} - ldr r6, [sp, #40] - orr r2, r2, r2, lsl #16 - b vp8_v_loop_filter_inner_armv6 -endfunc - -function ff_vp8_v_loop_filter8uv_inner_armv6, export=1 - push {r1, r4-r11, lr} - mov r1, r2 - orr r2, r3, r3, lsl #16 - ldr r3, [sp, #40] - ldr r6, [sp, #44] - b vp8_v_loop_filter_inner_armv6 -endfunc - -function vp8_v_loop_filter_armv6 - mov r5, #4 - sub sp, sp, #16 - - orr r3, r3, r3, lsl #16 - orr r6, r6, r6, lsl #16 - orr r4, r2, r2, lsl #8 @ flimE - orr r2, r3, r3, lsl #8 @ flimI - orr r3, r6, r6, lsl #8 @ thresh -1: - sub r0, r0, r1, lsl #2 - ldr r10, [r0, r1] @ p2 - ldr_post r9, r0, r1, lsl #1 @ p3 - ldr r12, [r0, r1] @ p0 - ldr_post r11, r0, r1, lsl #1 @ p1 - - filter_mask_v - cmp lr, #0 - beq 2f - - filter_v inner=0 - filter_1 - - str r8, [r0] @ *oq0 -A str r10, [r0, -r1] @ *op0 -T sub r0, r0, r1, lsl #1 -T str r10, [r0, r1] - - filter_2 - -A str r10, [r0, -r1, lsl #1] @ *op1 -T str_post r10, r0, r1, lsl #1 - str r8, [r0, r1] @ *oq1 - - ldr r9, [r0, r1, lsl #1] @ q2 - add r0, r0, r1 -A ldr r11, [r0, -r1, lsl #2] @ p2 -T ldr_dpre r11, r0, r1, lsl #2 - - filter_3 - -A str r10, [r0, -r1, lsl #2] @ *op2 -T str_post r10, r0, r1, lsl #2 - str r8, [r0, r1] @ *oq2 - sub r0, r0, r1 -2: - add r0, r0, #4 - cmp r5, #3 - it eq - ldreq r0, [sp, #16] - subs r5, r5, #1 - bne 1b - - add sp, sp, #16 - pop {r0, r4-r11, pc} -endfunc - -function ff_vp8_v_loop_filter16_armv6, export=1 - push {r4-r11, lr} - add r12, r0, #8 - push {r12} - ldr r6, [sp, #40] - orr r2, r2, r2, lsl #16 - b vp8_v_loop_filter_armv6 -endfunc - -function ff_vp8_v_loop_filter8uv_armv6, export=1 - push {r1, r4-r11, lr} - mov r1, r2 - orr r2, r3, r3, lsl #16 - ldr r3, [sp, #40] - ldr r6, [sp, #44] - b vp8_v_loop_filter_armv6 -endfunc - -@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim) -function ff_vp8_h_loop_filter16_simple_armv6, export=1 - push {r4-r11, lr} - orr r12, r2, r2, lsl #16 - mov32 r2, 0x80808080 - orr r12, r12, r12, lsl #8 - - mov lr, #0 - mov r11, #4 -1: - sub r0, r0, #2 - ldr r8, [r0, r1] - ldr_post r7, r0, r1, lsl #1 - ldr r10, [r0, r1] - ldr_post r9, r0, r1, lsl #1 - add r0, r0, #2 - transpose r6, r5, r4, r3, r7, r8, r9, r10 - simple_filter - sub r0, r0, r1, lsl #2 - sub r0, r0, #1 - - uxtb16 r6, r4 - uxtb16 r8, r5 - uxtb16 r7, r4, ror #8 - uxtb16 r9, r5, ror #8 - orr r6, r6, r8, lsl #8 - orr r7, r7, r9, lsl #8 - lsr r4, r6, #16 - lsr r5, r7, #16 - - strh_post r6, r0, r1 - strh_post r7, r0, r1 - strh_post r4, r0, r1 - strh_post r5, r0, r1 - add r0, r0, #1 -2: - subs r11, r11, #1 - bne 1b - - pop {r4-r11, pc} -endfunc - -function vp8_h_loop_filter_inner_armv6 - mov r5, #4 - sub sp, sp, #16 - - orr r3, r3, r3, lsl #16 - orr r9, r9, r9, lsl #16 - orr r4, r2, r2, lsl #8 @ flimE - orr r2, r3, r3, lsl #8 @ flimI - orr r3, r9, r9, lsl #8 @ thresh - sub r0, r0, #4 -1: - ldr r7, [r0, r1] - ldr_post r6, r0, r1, lsl #1 - ldr lr, [r0, r1] - ldr_post r8, r0, r1, lsl #1 - - filter_mask_h - - cmp lr, #0 - sub r0, r0, #2 - beq 2f - - ldr r6, [sp] - - filter_h inner=1 - filter_inner - - transpose lr, r12, r7, r6, r11, r8, r9, r10 - -A str r6, [r0, -r1, lsl #1] -A str r7, [r0, -r1] -T sub r0, r0, r1, lsl #1 -T str r7, [r0, r1] -T str_post r6, r0, r1, lsl #1 - str r12, [r0] - str lr, [r0, r1] -2: - sub r0, r0, #2 - add r0, r0, r1, lsl #1 - cmp r5, #3 - it eq - ldreq r0, [sp, #16] - subs r5, r5, #1 - bne 
1b - - add sp, sp, #16 - pop {r0, r4-r11, pc} -endfunc - -function ff_vp8_h_loop_filter16_inner_armv6, export=1 - push {r4-r11, lr} - add r12, r0, r1, lsl #3 - sub r12, r12, #4 - push {r12} - ldr r9, [sp, #40] - orr r2, r2, r2, lsl #16 - b vp8_h_loop_filter_inner_armv6 -endfunc - -function ff_vp8_h_loop_filter8uv_inner_armv6, export=1 - sub r1, r1, #4 - push {r1, r4-r11, lr} - mov r1, r2 - orr r2, r3, r3, lsl #16 - ldr r3, [sp, #40] - ldr r9, [sp, #44] - b vp8_h_loop_filter_inner_armv6 -endfunc - -function vp8_h_loop_filter_armv6 - mov r5, #4 - sub sp, sp, #16 - - orr r3, r3, r3, lsl #16 - orr r9, r9, r9, lsl #16 - orr r4, r2, r2, lsl #8 @ flimE - orr r2, r3, r3, lsl #8 @ flimI - orr r3, r9, r9, lsl #8 @ thresh -1: - sub r0, r0, #4 - ldr r7, [r0, r1] - ldr_post r6, r0, r1, lsl #1 - ldr lr, [r0, r1] - ldr_post r8, r0, r1, lsl #1 - - filter_mask_h - cmp lr, #0 - it eq - addeq r0, r0, r1, lsl #1 - beq 2f - - ldr r6, [sp] - sub r0, r0, #2 - - filter_h inner=0 - filter_1 - - sub r0, r0, r1, lsl #1 - uxtb16 r6, r10 - uxtb16 r7, r8 - uxtb16 r10, r10, ror #8 - uxtb16 r8, r8, ror #8 - orr r6, r6, r7, lsl #8 - orr r10, r10, r8, lsl #8 - lsr r7, r6, #16 - lsr r8, r10, #16 - - add r0, r0, #1 - strh_post r6, r0, r1 - strh_post r10, r0, r1 - strh_post r7, r0, r1 - strh_post r8, r0, r1 - - filter_2 - - sub r0, r0, r1, lsl #2 - add r0, r0, #3 - - ldrb r11, [r0, #-5] @ p2 for 1/7th difference - strb r10, [r0, #-4] @ op1 - strb r8, [r0, #-1] @ oq1 - ldrb_post r9, r0, r1 @ q2 for 1/7th difference - - lsr r10, r10, #8 - lsr r8, r8, #8 - - ldrb r6, [r0, #-5] - strb r10, [r0, #-4] - strb r8, [r0, #-1] - ldrb_post r7, r0, r1 - - lsr r10, r10, #8 - lsr r8, r8, #8 - orr r11, r11, r6, lsl #8 - orr r9, r9, r7, lsl #8 - - ldrb r6, [r0, #-5] - strb r10, [r0, #-4] - strb r8, [r0, #-1] - ldrb_post r7, r0, r1 - - lsr r10, r10, #8 - lsr r8, r8, #8 - orr r11, r11, r6, lsl #16 - orr r9, r9, r7, lsl #16 - - ldrb r6, [r0, #-5] - strb r10, [r0, #-4] - strb r8, [r0, #-1] - ldrb_post r7, r0, r1 - orr r11, r11, r6, lsl #24 - orr r9, r9, r7, lsl #24 - - filter_3 - - sub r0, r0, r1, lsl #2 - strb r10, [r0, #-5] - strb_post r8, r0, r1 - lsr r10, r10, #8 - lsr r8, r8, #8 - strb r10, [r0, #-5] - strb_post r8, r0, r1 - lsr r10, r10, #8 - lsr r8, r8, #8 - strb r10, [r0, #-5] - strb_post r8, r0, r1 - lsr r10, r10, #8 - lsr r8, r8, #8 - strb r10, [r0, #-5] - strb_post r8, r0, r1 - - sub r0, r0, #2 -2: - cmp r5, #3 - it eq - ldreq r0, [sp, #16] - subs r5, r5, #1 - bne 1b - - add sp, sp, #16 - pop {r0, r4-r11, pc} -endfunc - -function ff_vp8_h_loop_filter16_armv6, export=1 - push {r4-r11, lr} - add r12, r0, r1, lsl #3 - push {r12} - ldr r9, [sp, #40] - orr r2, r2, r2, lsl #16 - b vp8_h_loop_filter_armv6 -endfunc - -function ff_vp8_h_loop_filter8uv_armv6, export=1 - push {r1, r4-r11, lr} - mov r1, r2 - orr r2, r3, r3, lsl #16 - ldr r3, [sp, #40] - ldr r9, [sp, #44] - b vp8_h_loop_filter_armv6 -endfunc - -.ltorg - -@ MC - -@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src, -@ int srcstride, int h, int mx, int my) -function ff_put_vp8_pixels16_armv6, export=1 - push {r4-r11} - ldr r12, [sp, #32] @ h -1: - subs r12, r12, #2 - ldr r5, [r2, #4] - ldr r6, [r2, #8] - ldr r7, [r2, #12] - ldr_post r4, r2, r3 - ldr r9, [r2, #4] - ldr r10, [r2, #8] - ldr r11, [r2, #12] - ldr_post r8, r2, r3 - strd r6, r7, [r0, #8] - strd_post r4, r5, r0, r1 - strd r10, r11, [r0, #8] - strd_post r8, r9, r0, r1 - bgt 1b - pop {r4-r11} - bx lr -endfunc - -@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src, -@ int srcstride, int h, int mx, int 
my) -function ff_put_vp8_pixels8_armv6, export=1 - push {r4-r11} - ldr r12, [sp, #32] @ h -1: - subs r12, r12, #4 - ldr r5, [r2, #4] - ldr_post r4, r2, r3 - ldr r7, [r2, #4] - ldr_post r6, r2, r3 - ldr r9, [r2, #4] - ldr_post r8, r2, r3 - ldr r11, [r2, #4] - ldr_post r10, r2, r3 - strd_post r4, r5, r0, r1 - strd_post r6, r7, r0, r1 - strd_post r8, r9, r0, r1 - strd_post r10, r11, r0, r1 - bgt 1b - pop {r4-r11} - bx lr -endfunc - -@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src, -@ int srcstride, int h, int mx, int my) -function ff_put_vp8_pixels4_armv6, export=1 - ldr r12, [sp, #0] @ h - push {r4-r6,lr} -1: - subs r12, r12, #4 - ldr_post r4, r2, r3 - ldr_post r5, r2, r3 - ldr_post r6, r2, r3 - ldr_post lr, r2, r3 - str_post r4, r0, r1 - str_post r5, r0, r1 - str_post r6, r0, r1 - str_post lr, r0, r1 - bgt 1b - pop {r4-r6,pc} -endfunc - -@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit -@ arithmatic can be used to apply filters -const sixtap_filters_13245600, align=4 - .short 2, 108, -11, 36, -8, 1, 0, 0 - .short 3, 77, -16, 77, -16, 3, 0, 0 - .short 1, 36, -8, 108, -11, 2, 0, 0 -endconst - -const fourtap_filters_1324, align=4 - .short -6, 12, 123, -1 - .short -9, 50, 93, -6 - .short -6, 93, 50, -9 - .short -1, 123, 12, -6 -endconst - -.macro vp8_mc_1 name, size, hv -function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1 - sub r1, r1, #\size - mov r12, sp - push {r1, r4-r11, lr} - ldm r12, {r5-r7} - mov r4, #\size - stm r12, {r4, r5} - orr r12, r6, r7 - b vp8_put_\name\()_\hv\()_armv6 + 4 -endfunc -.endm - -vp8_mc_1 epel, 16, h6 -vp8_mc_1 epel, 16, v6 -vp8_mc_1 epel, 8, h6 -vp8_mc_1 epel, 8, v6 -vp8_mc_1 epel, 8, h4 -vp8_mc_1 epel, 8, v4 -vp8_mc_1 epel, 4, h6 -vp8_mc_1 epel, 4, v6 -vp8_mc_1 epel, 4, h4 -vp8_mc_1 epel, 4, v4 - -vp8_mc_1 bilin, 16, h -vp8_mc_1 bilin, 16, v -vp8_mc_1 bilin, 8, h -vp8_mc_1 bilin, 8, v -vp8_mc_1 bilin, 4, h -vp8_mc_1 bilin, 4, v - -/* True relational expressions have the value -1 in the GNU assembler, - +1 in Apple's. 
*/ -#ifdef __APPLE__ -# define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1) -#else -# define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1) -#endif - -.macro vp8_mc_hv name, size, h, v, ytaps -function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1 - push {r0, r1, r4, lr} - add r0, sp, #16 - sub sp, sp, #TMPSIZE+16 - ldm r0, {r0, r12} - mov r4, #\size - add lr, r0, #\ytaps-1 - .if \ytaps > 2 - sub r2, r2, r3, lsl #\ytaps >> 1 & 1 - .endif - stm sp, {r4, lr} - add r0, sp, #16 - mov r1, #0 - bl vp8_put_\name\()_\h\()_armv6 - add r0, sp, #TMPSIZE+16 - ldr lr, [sp, #TMPSIZE+16+16] - ldm r0, {r0, r1} - mov r3, #\size - ldr r12, [sp, #TMPSIZE+16+16+8] - str lr, [sp, #4] - add r2, sp, #16 + \size * (\ytaps / 2 - 1) - sub r1, r1, #\size - bl vp8_put_\name\()_\v\()_armv6 - add sp, sp, #TMPSIZE+16+8 - pop {r4, pc} -endfunc -.endm - -vp8_mc_hv epel, 16, h6, v6, 6 -vp8_mc_hv epel, 8, h6, v6, 6 -vp8_mc_hv epel, 8, h4, v6, 6 -vp8_mc_hv epel, 8, h6, v4, 4 -vp8_mc_hv epel, 8, h4, v4, 4 -vp8_mc_hv epel, 4, h6, v6, 6 -vp8_mc_hv epel, 4, h4, v6, 6 -vp8_mc_hv epel, 4, h6, v4, 4 -vp8_mc_hv epel, 4, h4, v4, 4 - -vp8_mc_hv bilin, 16, h, v, 2 -vp8_mc_hv bilin, 8, h, v, 2 -vp8_mc_hv bilin, 4, h, v, 2 - -.macro sat4 r0, r1, r2, r3 - asr \r0, \r0, #7 - asr \r1, \r1, #7 - pkhbt \r0, \r0, \r2, lsl #9 - pkhbt \r1, \r1, \r3, lsl #9 - usat16 \r0, #8, \r0 - usat16 \r1, #8, \r1 - orr \r0, \r0, \r1, lsl #8 -.endm - -@ Calling convention for the inner MC functions: -@ r0 dst -@ r1 dst_stride - block_width -@ r2 src -@ r3 src_stride -@ r4 block_width -@ r12 filter_index -@ [sp] block_width -@ [sp+4] height -@ [sp+8] scratch - -function vp8_put_epel_h6_armv6 - push {r1, r4-r11, lr} - sub r2, r2, #2 - movrel lr, sixtap_filters_13245600 - 16 - add lr, lr, r12, lsl #3 - sub r3, r3, r4 - str r3, [sp, #48] - ldm lr, {r1, r3, lr} -1: - ldr r7, [r2, #5] @ src[5-8] - ldr r6, [r2, #2] @ src[2-5] - ldr r5, [r2], #4 @ src[0-3] - - pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6] - uxtb16 r9, r6, ror #8 @ src[5] | src[3] - uxtb16 r6, r6 @ src[4] | src[2] - uxtb16 r8, r5, ror #8 @ src[3] | src[1] - uxtb16 r11, r7, ror #8 @ src[8] | src[7] - uxtb16 r7, r7 @ src[7] | src[6] - uxtb16 r5, r5 @ src[2] | src[0] - - mov r10, #0x40 - smlad r5, r5, r1, r10 @ filter[0][0] - smlad r11, r11, lr, r10 @ filter[3][2] - smlad r12, r7, lr, r10 @ filter[2][2] - smlad r10, r8, r1, r10 @ filter[1][0] - smlad r5, r8, r3, r5 @ filter[0][1] - smlad r11, r9, r1, r11 @ filter[3][0] - smlad r12, r9, r3, r12 @ filter[2][1] - pkhtb r9, r9, r6, asr #16 @ src[5] | src[4] - smlad r10, r6, r3, r10 @ filter[1][1] - pkhbt r7, r9, r7, lsl #16 @ src[6] | src[4] - smlad r5, r9, lr, r5 @ filter[0][2] - pkhtb r8, r7, r9, asr #16 @ src[6] | src[5] - smlad r11, r7, r3, r11 @ filter[3][1] - smlad r9, r8, lr, r10 @ filter[1][2] - smlad r7, r6, r1, r12 @ filter[2][0] - - subs r4, r4, #4 - - sat4 r5, r9, r7, r11 - str r5, [r0], #4 - - bne 1b - - add r4, sp, #40 - ldm r4, {r4, r5, r12} - ldr r6, [sp] - subs r5, r5, #1 - add r2, r2, r12 - str r5, [sp, #44] - add r0, r0, r6 - - bne 1b - - pop {r1, r4-r11, pc} -endfunc - -function vp8_put_epel_v6_armv6 - push {r1, r4-r11, lr} - movrel lr, sixtap_filters_13245600 - 16 - add lr, lr, r12, lsl #3 - str r3, [sp, #48] -1: - add r1, r3, r3, lsl #1 @ stride * 3 - ldr_nreg r5, r2, r3 @ src[0,1,2,3 + stride * 1] - ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3] - ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4] - ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5] - - uxtb16 r9, r5, ror #8 @ src[3 + s*1] | src[1 + s*1] - uxtb16 r10, r6, ror #8 @ src[3 + 
s*3] | src[1 + s*3] - uxtb16 r11, r7, ror #8 @ src[3 + s*4] | src[1 + s*4] - uxtb16 r12, r8, ror #8 @ src[3 + s*5] | src[1 + s*5] - uxtb16 r5, r5 @ src[2 + s*1] | src[0 + s*1] - uxtb16 r6, r6 @ src[2 + s*3] | src[0 + s*3] - uxtb16 r7, r7 @ src[2 + s*4] | src[0 + s*4] - uxtb16 r8, r8 @ src[2 + s*5] | src[0 + s*5] - pkhbt r1, r9, r10, lsl #16 @ src[1 + s*3] | src[1 + s*1] - pkhtb r9, r10, r9, asr #16 @ src[3 + s*3] | src[3 + s*1] - pkhbt r10, r11, r12, lsl #16 @ src[1 + s*5] | src[1 + s*4] - pkhtb r11, r12, r11, asr #16 @ src[3 + s*5] | src[3 + s*4] - pkhbt r12, r5, r6, lsl #16 @ src[0 + s*3] | src[0 + s*1] - pkhtb r5, r6, r5, asr #16 @ src[2 + s*3] | src[2 + s*1] - pkhbt r6, r7, r8, lsl #16 @ src[0 + s*5] | src[0 + s*4] - pkhtb r7, r8, r7, asr #16 @ src[2 + s*5] | src[2 + s*4] - - ldr r8, [lr, #4] - mov r3, #0x40 - smlad r12, r12, r8, r3 @ filter[0][1] - smlad r1, r1, r8, r3 @ filter[1][1] - smlad r5, r5, r8, r3 @ filter[2][1] - smlad r9, r9, r8, r3 @ filter[3][1] - ldr r8, [lr, #8] - ldr r3, [sp, #48] - smlad r12, r6, r8, r12 @ filter[0][2] - smlad r1, r10, r8, r1 @ filter[1][2] - ldr_nreg r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0] - ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2] - smlad r5, r7, r8, r5 @ filter[2][2] - smlad r9, r11, r8, r9 @ filter[3][2] - - uxtb16 r7, r6, ror #8 @ src[3 + s*0] | src[1 + s*0] - uxtb16 r11, r10, ror #8 @ src[3 + s*2] | src[1 + s*2] - uxtb16 r6, r6 @ src[2 + s*0] | src[0 + s*0] - uxtb16 r10, r10 @ src[2 + s*2] | src[0 + s*2] - - pkhbt r8, r7, r11, lsl #16 @ src[1 + s*2] | src[1 + s*0] - pkhtb r7, r11, r7, asr #16 @ src[3 + s*2] | src[3 + s*0] - pkhbt r11, r6, r10, lsl #16 @ src[0 + s*2] | src[0 + s*0] - pkhtb r6, r10, r6, asr #16 @ src[2 + s*2] | src[2 + s*0] - - ldr r10, [lr] - subs r4, r4, #4 - smlad r12, r11, r10, r12 @ filter[0][0] - smlad r1, r8, r10, r1 @ filter[1][0] - smlad r5, r6, r10, r5 @ filter[2][0] - smlad r9, r7, r10, r9 @ filter[3][0] - - sat4 r12, r1, r5, r9 - str r12, [r0], #4 - - bne 1b - - ldrd r4, r5, [sp, #40] - ldr r6, [sp] - subs r5, r5, #1 - sub r2, r2, r4 - str r5, [sp, #44] - add r0, r0, r6 - add r2, r2, r3 - - bne 1b - - pop {r1, r4-r11, pc} -endfunc - -function vp8_put_epel_h4_armv6 - push {r1, r4-r11, lr} - subs r2, r2, #1 - movrel lr, fourtap_filters_1324 - 4 - add lr, lr, r12, lsl #2 - sub r3, r3, r4 - ldm lr, {r5, r6} - ldr lr, [sp, #44] -1: - ldr r9, [r2, #3] - ldr r8, [r2, #2] - ldr r7, [r2], #4 - - uxtb16 r9, r9, ror #8 @ src[6] | src[4] - uxtb16 r10, r8, ror #8 @ src[5] | src[3] - uxtb16 r8, r8 @ src[4] | src[2] - uxtb16 r11, r7, ror #8 @ src[3] | src[1] - uxtb16 r7, r7 @ src[2] | src[0] - - mov r12, #0x40 - smlad r9, r9, r6, r12 @ filter[3][1] - smlad r7, r7, r5, r12 @ filter[0][0] - smlad r9, r10, r5, r9 @ filter[3][0] - smlad r10, r10, r6, r12 @ filter[2][1] - smlad r12, r11, r5, r12 @ filter[1][0] - smlad r7, r11, r6, r7 @ filter[0][1] - smlad r10, r8, r5, r10 @ filter[2][0] - smlad r12, r8, r6, r12 @ filter[1][1] - - subs r4, r4, #4 - - sat4 r7, r12, r10, r9 - str r7, [r0], #4 - - bne 1b - - subs lr, lr, #1 - ldr r4, [sp, #40] - add r2, r2, r3 - add r0, r0, r1 - - bne 1b - - pop {r1, r4-r11, pc} -endfunc - -function vp8_put_epel_v4_armv6 - push {r1, r4-r11, lr} - movrel lr, fourtap_filters_1324 - 4 - add lr, lr, r12, lsl #2 - ldm lr, {r5, r6} - str r3, [sp, #48] -1: - ldr lr, [r2, r3, lsl #1] - ldr r12, [r2, r3] - ldr_nreg r7, r2, r3 - ldr r11, [r2], #4 - - uxtb16 r8, lr, ror #8 @ src[3 + s*3] | src[1 + s*3] - uxtb16 r9, r12, ror #8 @ src[3 + s*2] | src[1 + s*2] - uxtb16 r3, r7, ror #8 @ src[3 + s*0] | src[1 + 
s*0] - uxtb16 r1, r11, ror #8 @ src[3 + s*1] | src[1 + s*1] - uxtb16 lr, lr @ src[2 + s*3] | src[0 + s*3] - uxtb16 r12, r12 @ src[2 + s*2] | src[0 + s*2] - uxtb16 r7, r7 @ src[2 + s*0] | src[0 + s*0] - uxtb16 r11, r11 @ src[2 + s*1] | src[0 + s*1] - pkhbt r10, r1, r8, lsl #16 @ src[1 + s*3] | src[1 + s*1] - pkhtb r1, r8, r1, asr #16 @ src[3 + s*3] | src[3 + s*1] - pkhbt r8, r3, r9, lsl #16 @ src[1 + s*2] | src[1 + s*0] - pkhtb r3, r9, r3, asr #16 @ src[3 + s*2] | src[3 + s*0] - pkhbt r9, r11, lr, lsl #16 @ src[0 + s*3] | src[0 + s*1] - pkhtb r11, lr, r11, asr #16 @ src[2 + s*3] | src[2 + s*1] - pkhbt lr, r7, r12, lsl #16 @ src[0 + s*2] | src[0 + s*0] - pkhtb r7, r12, r7, asr #16 @ src[2 + s*2] | src[2 + s*0] - - mov r12, #0x40 - smlad r9, r9, r6, r12 @ filter[0][1] - smlad r10, r10, r6, r12 @ filter[1][1] - smlad r11, r11, r6, r12 @ filter[2][1] - smlad r1, r1, r6, r12 @ filter[3][1] - smlad r9, lr, r5, r9 @ filter[0][0] - smlad r10, r8, r5, r10 @ filter[1][0] - smlad r11, r7, r5, r11 @ filter[2][0] - smlad r1, r3, r5, r1 @ filter[3][0] - - subs r4, r4, #4 - ldr r3, [sp, #48] - - sat4 r9, r10, r11, r1 - str r9, [r0], #4 - - bne 1b - - ldr r4, [sp, #40] - ldr r12, [sp, #44] - add r2, r2, r3 - ldr r9, [sp, #0] - subs r12, r12, #1 - sub r2, r2, r4 - str r12, [sp, #44] - add r0, r0, r9 - - bne 1b - - pop {r1, r4-r11, pc} -endfunc - -function vp8_put_bilin_h_armv6 - push {r1, r4-r11, lr} - rsb r5, r12, r12, lsl #16 - ldr r12, [sp, #44] - sub r3, r3, r4 - add r5, r5, #8 -1: - ldrb r6, [r2], #1 - ldrb r7, [r2], #1 - ldrb r8, [r2], #1 - ldrb r9, [r2], #1 - ldrb lr, [r2] - - pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1] - pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2] - pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3] - - mov r10, #4 - smlad r6, r6, r5, r10 - smlad r7, r7, r5, r10 - smlad r8, r8, r5, r10 - smlad r9, r9, r5, r10 - - subs r4, r4, #4 - - asr r6, #3 - asr r7, #3 - pkhbt r6, r6, r8, lsl #13 - pkhbt r7, r7, r9, lsl #13 - orr r6, r6, r7, lsl #8 - str r6, [r0], #4 - - bne 1b - - ldr r4, [sp, #40] - subs r12, r12, #1 - add r2, r2, r3 - add r0, r0, r1 - - bne 1b - - pop {r1, r4-r11, pc} -endfunc - -function vp8_put_bilin_v_armv6 - push {r1, r4-r11, lr} - rsb r5, r12, r12, lsl #16 - ldr r12, [sp, #44] - add r5, r5, #8 -1: - ldrb r10, [r2, r3] - ldrb r6, [r2], #1 - ldrb r11, [r2, r3] - ldrb r7, [r2], #1 - ldrb lr, [r2, r3] - ldrb r8, [r2], #1 - ldrb r9, [r2, r3] - pkhbt r6, r6, r10, lsl #16 - ldrb r10, [r2], #1 - pkhbt r7, r7, r11, lsl #16 - pkhbt r8, r8, lr, lsl #16 - pkhbt r9, r10, r9, lsl #16 - - mov r10, #4 - smlad r6, r6, r5, r10 - smlad r7, r7, r5, r10 - smlad r8, r8, r5, r10 - smlad r9, r9, r5, r10 - - subs r4, r4, #4 - - asr r6, #3 - asr r7, #3 - pkhbt r6, r6, r8, lsl #13 - pkhbt r7, r7, r9, lsl #13 - orr r6, r6, r7, lsl #8 - str r6, [r0], #4 - - bne 1b - - ldr r4, [sp, #40] - subs r12, r12, #1 - add r2, r2, r3 - add r0, r0, r1 - sub r2, r2, r4 - - bne 1b - pop {r1, r4-r11, pc} -endfunc diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c deleted file mode 100644 index d360ae3..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/vp8dsp.h" -#include "vp8dsp.h" - -av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp) -{ - int cpu_flags = av_get_cpu_flags(); - - if (have_armv6(cpu_flags)) - ff_vp8dsp_init_armv6(dsp); - if (have_neon(cpu_flags)) - ff_vp8dsp_init_neon(dsp); -} diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c deleted file mode 100644 index 563268e..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavcodec/vp8dsp.h" -#include "vp8dsp.h" - -void ff_vp8_luma_dc_wht_armv6(int16_t block[4][4][16], int16_t dc[16]); -void ff_vp8_luma_dc_wht_dc_armv6(int16_t block[4][4][16], int16_t dc[16]); - -void ff_vp8_idct_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride); -void ff_vp8_idct_dc_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride); -void ff_vp8_idct_dc_add4y_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); -void ff_vp8_idct_dc_add4uv_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); - -VP8_LF(armv6); - -VP8_EPEL(16, armv6); -VP8_EPEL(8, armv6); -VP8_EPEL(4, armv6); - -VP8_BILIN(16, armv6); -VP8_BILIN(8, armv6); -VP8_BILIN(4, armv6); - -av_cold void ff_vp8dsp_init_armv6(VP8DSPContext *dsp) -{ - dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_armv6; - dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6; - - dsp->vp8_idct_add = ff_vp8_idct_add_armv6; - dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_armv6; - dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_armv6; - dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_armv6; - - dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_armv6; - dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_armv6; - dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_armv6; - dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_armv6; - - dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_armv6; - dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_armv6; - dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_armv6; - dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_armv6; - - dsp->vp8_v_loop_filter_simple = 
ff_vp8_v_loop_filter16_simple_armv6; - dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_armv6; - - dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6; - dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_armv6; - dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_armv6; - dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_armv6; - - dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6; - dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_armv6; - dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_armv6; - dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_armv6; - dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_armv6; - dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_armv6; - dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_armv6; - dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_armv6; - dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_armv6; - - dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; - dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_armv6; - dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_armv6; - dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_armv6; - dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_armv6; - dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_armv6; - dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_armv6; - dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_armv6; - dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_armv6; - - dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_armv6; - - dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_armv6; - - dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][2][1] = 
ff_put_vp8_bilin4_hv_armv6; - dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_armv6; -} diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c deleted file mode 100644 index ae045a6..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdint.h> - -#include "libavutil/attributes.h" -#include "libavcodec/vp8dsp.h" -#include "vp8dsp.h" - -void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); - -void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); -void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); -void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); -void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); - -VP8_LF(neon); - -VP8_EPEL(16, neon); -VP8_EPEL(8, neon); -VP8_EPEL(4, neon); - -VP8_BILIN(16, neon); -VP8_BILIN(8, neon); -VP8_BILIN(4, neon); - -av_cold void ff_vp8dsp_init_neon(VP8DSPContext *dsp) -{ - dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon; - - dsp->vp8_idct_add = ff_vp8_idct_add_neon; - dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; - dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; - dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon; - - dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; - dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; - dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; - dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; - - dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; - dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon; - dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon; - dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon; - - dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon; - dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon; - - dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; - dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon; - dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon; - dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; - - dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; - dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon; - dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon; - dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon; - dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; - dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; - dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; - 
dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; - dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; - - dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; - dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; - dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; - dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; - dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; - dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; - dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; - dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; - - dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; - dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; - dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; - dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; - dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; - dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; - - dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; - dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; - dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon; - dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; - dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; - dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; - - dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; - dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; - dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; - dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; - dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; - dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; -} diff --git a/ffmpeg/libavcodec/arm/vp8dsp_neon.S b/ffmpeg/libavcodec/arm/vp8dsp_neon.S deleted file mode 100644 index 436b340..0000000 --- a/ffmpeg/libavcodec/arm/vp8dsp_neon.S +++ /dev/null @@ -1,1876 +0,0 @@ -/* - * VP8 NEON optimisations - * - * Copyright (c) 2010 Rob Clark <rob@ti.com> - * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "libavutil/arm/asm.S" -#include "neon.S" - -function ff_vp8_luma_dc_wht_neon, export=1 - vld1.16 {q0-q1}, [r1,:128] - vmov.i16 q15, #0 - - vadd.i16 d4, d0, d3 - vadd.i16 d6, d1, d2 - vst1.16 {q15}, [r1,:128]! - vsub.i16 d7, d1, d2 - vsub.i16 d5, d0, d3 - vst1.16 {q15}, [r1,:128] - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vmov.i16 q8, #3 - - vtrn.32 d0, d2 - vtrn.32 d1, d3 - vtrn.16 d0, d1 - vtrn.16 d2, d3 - - vadd.i16 d0, d0, d16 - - vadd.i16 d4, d0, d3 - vadd.i16 d6, d1, d2 - vsub.i16 d7, d1, d2 - vsub.i16 d5, d0, d3 - vadd.i16 q0, q2, q3 - vsub.i16 q1, q2, q3 - - vshr.s16 q0, q0, #3 - vshr.s16 q1, q1, #3 - - mov r3, #32 - vst1.16 {d0[0]}, [r0,:16], r3 - vst1.16 {d1[0]}, [r0,:16], r3 - vst1.16 {d2[0]}, [r0,:16], r3 - vst1.16 {d3[0]}, [r0,:16], r3 - vst1.16 {d0[1]}, [r0,:16], r3 - vst1.16 {d1[1]}, [r0,:16], r3 - vst1.16 {d2[1]}, [r0,:16], r3 - vst1.16 {d3[1]}, [r0,:16], r3 - vst1.16 {d0[2]}, [r0,:16], r3 - vst1.16 {d1[2]}, [r0,:16], r3 - vst1.16 {d2[2]}, [r0,:16], r3 - vst1.16 {d3[2]}, [r0,:16], r3 - vst1.16 {d0[3]}, [r0,:16], r3 - vst1.16 {d1[3]}, [r0,:16], r3 - vst1.16 {d2[3]}, [r0,:16], r3 - vst1.16 {d3[3]}, [r0,:16], r3 - - bx lr -endfunc - -function ff_vp8_idct_add_neon, export=1 - vld1.16 {q0-q1}, [r1,:128] - movw r3, #20091 - movt r3, #35468/2 - vdup.32 d4, r3 - - vmull.s16 q12, d1, d4[0] - vmull.s16 q13, d3, d4[0] - vqdmulh.s16 d20, d1, d4[1] - vqdmulh.s16 d23, d3, d4[1] - vshrn.s32 d21, q12, #16 - vshrn.s32 d22, q13, #16 - vadd.s16 d21, d21, d1 - vadd.s16 d22, d22, d3 - - vadd.s16 d16, d0, d2 - vsub.s16 d17, d0, d2 - vadd.s16 d18, d21, d23 - vsub.s16 d19, d20, d22 - vadd.s16 q0, q8, q9 - vsub.s16 q1, q8, q9 - - vtrn.32 d0, d3 - vtrn.32 d1, d2 - vtrn.16 d0, d1 - vtrn.16 d3, d2 - - vmov.i16 q15, #0 - vmull.s16 q12, d1, d4[0] - vst1.16 {q15}, [r1,:128]! 
- vmull.s16 q13, d2, d4[0] - vst1.16 {q15}, [r1,:128] - vqdmulh.s16 d21, d1, d4[1] - vqdmulh.s16 d23, d2, d4[1] - vshrn.s32 d20, q12, #16 - vshrn.s32 d22, q13, #16 - vadd.i16 d20, d20, d1 - vadd.i16 d22, d22, d2 - - vadd.i16 d16, d0, d3 - vsub.i16 d17, d0, d3 - vadd.i16 d18, d20, d23 - vld1.32 {d20[]}, [r0,:32], r2 - vsub.i16 d19, d21, d22 - vld1.32 {d22[]}, [r0,:32], r2 - vadd.s16 q0, q8, q9 - vld1.32 {d23[]}, [r0,:32], r2 - vsub.s16 q1, q8, q9 - vld1.32 {d21[]}, [r0,:32], r2 - vrshr.s16 q0, q0, #3 - vtrn.32 q10, q11 - vrshr.s16 q1, q1, #3 - - sub r0, r0, r2, lsl #2 - - vtrn.32 d0, d3 - vtrn.32 d1, d2 - vtrn.16 d0, d1 - vtrn.16 d3, d2 - - vaddw.u8 q0, q0, d20 - vaddw.u8 q1, q1, d21 - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - - bx lr -endfunc - -function ff_vp8_idct_dc_add_neon, export=1 - mov r3, #0 - ldrsh r12, [r1] - strh r3, [r1] - vdup.16 q1, r12 - vrshr.s16 q1, q1, #3 - vld1.32 {d0[]}, [r0,:32], r2 - vld1.32 {d1[]}, [r0,:32], r2 - vld1.32 {d0[1]}, [r0,:32], r2 - vld1.32 {d1[1]}, [r0,:32], r2 - vaddw.u8 q2, q1, d0 - vaddw.u8 q3, q1, d1 - sub r0, r0, r2, lsl #2 - vqmovun.s16 d0, q2 - vqmovun.s16 d1, q3 - vst1.32 {d0[0]}, [r0,:32], r2 - vst1.32 {d1[0]}, [r0,:32], r2 - vst1.32 {d0[1]}, [r0,:32], r2 - vst1.32 {d1[1]}, [r0,:32], r2 - bx lr -endfunc - -function ff_vp8_idct_dc_add4uv_neon, export=1 - vmov.i16 d0, #0 - mov r3, #32 - vld1.16 {d16[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d17[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d18[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d19[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - mov r3, r0 - vrshr.s16 q8, q8, #3 @ dc >>= 3 - vld1.8 {d0}, [r0,:64], r2 - vrshr.s16 q9, q9, #3 - vld1.8 {d1}, [r0,:64], r2 - vaddw.u8 q10, q8, d0 - vld1.8 {d2}, [r0,:64], r2 - vaddw.u8 q0, q8, d1 - vld1.8 {d3}, [r0,:64], r2 - vaddw.u8 q11, q8, d2 - vld1.8 {d4}, [r0,:64], r2 - vaddw.u8 q1, q8, d3 - vld1.8 {d5}, [r0,:64], r2 - vaddw.u8 q12, q9, d4 - vld1.8 {d6}, [r0,:64], r2 - vaddw.u8 q2, q9, d5 - vld1.8 {d7}, [r0,:64], r2 - vaddw.u8 q13, q9, d6 - vqmovun.s16 d20, q10 - vaddw.u8 q3, q9, d7 - vqmovun.s16 d21, q0 - vqmovun.s16 d22, q11 - vst1.8 {d20}, [r3,:64], r2 - vqmovun.s16 d23, q1 - vst1.8 {d21}, [r3,:64], r2 - vqmovun.s16 d24, q12 - vst1.8 {d22}, [r3,:64], r2 - vqmovun.s16 d25, q2 - vst1.8 {d23}, [r3,:64], r2 - vqmovun.s16 d26, q13 - vst1.8 {d24}, [r3,:64], r2 - vqmovun.s16 d27, q3 - vst1.8 {d25}, [r3,:64], r2 - vst1.8 {d26}, [r3,:64], r2 - vst1.8 {d27}, [r3,:64], r2 - - bx lr -endfunc - -function ff_vp8_idct_dc_add4y_neon, export=1 - vmov.i16 d0, #0 - mov r3, #32 - vld1.16 {d16[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d17[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d18[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vld1.16 {d19[]}, [r1,:16] - vst1.16 {d0[0]}, [r1,:16], r3 - vrshr.s16 q8, q8, #3 @ dc >>= 3 - vld1.8 {q0}, [r0,:128], r2 - vrshr.s16 q9, q9, #3 - vld1.8 {q1}, [r0,:128], r2 - vaddw.u8 q10, q8, d0 - vld1.8 {q2}, [r0,:128], r2 - vaddw.u8 q0, q9, d1 - vld1.8 {q3}, [r0,:128], r2 - vaddw.u8 q11, q8, d2 - vaddw.u8 q1, q9, d3 - vaddw.u8 q12, q8, d4 - vaddw.u8 q2, q9, d5 - vaddw.u8 q13, q8, d6 - vaddw.u8 q3, q9, d7 - sub r0, r0, r2, lsl #2 - vqmovun.s16 d20, q10 - vqmovun.s16 d21, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q1 - vqmovun.s16 d24, q12 - vst1.8 {q10}, [r0,:128], r2 - vqmovun.s16 d25, q2 - vst1.8 {q11}, [r0,:128], r2 - vqmovun.s16 d26, q13 - vst1.8 
{q12}, [r0,:128], r2 - vqmovun.s16 d27, q3 - vst1.8 {q13}, [r0,:128], r2 - - bx lr -endfunc - -@ Register layout: -@ P3..Q3 -> q0..q7 -@ flim_E -> q14 -@ flim_I -> q15 -@ hev_thresh -> r12 -@ -.macro vp8_loop_filter, inner=0, simple=0 - .if \simple - vabd.u8 q9, q3, q4 @ abs(P0-Q0) - vabd.u8 q15, q2, q5 @ abs(P1-Q1) - vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2 - vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2 - vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) - vmov.i8 q13, #0x80 - vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim - .else - @ calculate hev and normal_limit: - vabd.u8 q12, q2, q3 @ abs(P1-P0) - vabd.u8 q13, q5, q4 @ abs(Q1-Q0) - vabd.u8 q10, q0, q1 @ abs(P3-P2) - vabd.u8 q11, q1, q2 @ abs(P2-P1) - vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I - vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I - vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I - vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I - vand q8, q8, q9 - vabd.u8 q9, q7, q6 @ abs(Q3-Q2) - vand q8, q8, q11 - vabd.u8 q11, q6, q5 @ abs(Q2-Q1) - vand q8, q8, q10 - vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I - vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I - vabd.u8 q9, q3, q4 @ abs(P0-Q0) - vabd.u8 q15, q2, q5 @ abs(P1-Q1) - vand q8, q8, q10 - vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2 - vand q8, q8, q11 - vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2 - vdup.8 q15, r12 @ hev_thresh - vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) - vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh - vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E - vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh - vand q8, q8, q11 - vmov.i8 q13, #0x80 - vorr q9, q12, q14 - .endif - - @ at this point: - @ q8: normal_limit - @ q9: hev - - @ convert to signed value: - veor q3, q3, q13 @ PS0 = P0 ^ 0x80 - veor q4, q4, q13 @ QS0 = Q0 ^ 0x80 - - vmov.i16 q12, #3 - vsubl.s8 q10, d8, d6 @ QS0 - PS0 - vsubl.s8 q11, d9, d7 @ (widened to 16bit) - veor q2, q2, q13 @ PS1 = P1 ^ 0x80 - veor q5, q5, q13 @ QS1 = Q1 ^ 0x80 - vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0) - vmul.i16 q11, q11, q12 - - vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1) - vmov.i8 q14, #4 - vmov.i8 q15, #3 - .if \inner - vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1) - .endif - vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1) - vaddw.s8 q11, q11, d25 - vqmovn.s16 d20, q10 @ narrow result back into q10 - vqmovn.s16 d21, q11 - .if !\inner && !\simple - veor q1, q1, q13 @ PS2 = P2 ^ 0x80 - veor q6, q6, q13 @ QS2 = Q2 ^ 0x80 - .endif - vand q10, q10, q8 @ w &= normal_limit - - @ registers used at this point.. 
- @ q0 -> P3 (don't corrupt) - @ q1-q6 -> PS2-QS2 - @ q7 -> Q3 (don't corrupt) - @ q9 -> hev - @ q10 -> w - @ q13 -> #0x80 - @ q14 -> #4 - @ q15 -> #3 - @ q8, q11, q12 -> unused - - @ filter_common: is4tap==1 - @ c1 = clamp(w + 4) >> 3; - @ c2 = clamp(w + 3) >> 3; - @ Q0 = s2u(QS0 - c1); - @ P0 = s2u(PS0 + c2); - - .if \simple - vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4) - vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3) - vshr.s8 q11, q11, #3 @ c1 >>= 3 - vshr.s8 q12, q12, #3 @ c2 >>= 3 - vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) - vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) - veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 - veor q3, q3, q13 @ P0 = PS0 ^ 0x80 - veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 - veor q2, q2, q13 @ P1 = PS1 ^ 0x80 - .elseif \inner - @ the !is4tap case of filter_common, only used for inner blocks - @ c3 = ((c1&~hev) + 1) >> 1; - @ Q1 = s2u(QS1 - c3); - @ P1 = s2u(PS1 + c3); - vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4) - vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3) - vshr.s8 q11, q11, #3 @ c1 >>= 3 - vshr.s8 q12, q12, #3 @ c2 >>= 3 - vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) - vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) - vbic q11, q11, q9 @ c1 & ~hev - veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 - vrshr.s8 q11, q11, #1 @ c3 >>= 1 - veor q3, q3, q13 @ P0 = PS0 ^ 0x80 - vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3) - vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3) - veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 - veor q2, q2, q13 @ P1 = PS1 ^ 0x80 - .else - vand q12, q10, q9 @ w & hev - vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4) - vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3) - vshr.s8 q11, q11, #3 @ c1 >>= 3 - vshr.s8 q12, q12, #3 @ c2 >>= 3 - vbic q10, q10, q9 @ w &= ~hev - vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1) - vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2) - - @ filter_mbedge: - @ a = clamp((27*w + 63) >> 7); - @ Q0 = s2u(QS0 - a); - @ P0 = s2u(PS0 + a); - @ a = clamp((18*w + 63) >> 7); - @ Q1 = s2u(QS1 - a); - @ P1 = s2u(PS1 + a); - @ a = clamp((9*w + 63) >> 7); - @ Q2 = s2u(QS2 - a); - @ P2 = s2u(PS2 + a); - vmov.i16 q9, #63 - vshll.s8 q14, d20, #3 - vshll.s8 q15, d21, #3 - vaddw.s8 q14, q14, d20 - vaddw.s8 q15, q15, d21 - vadd.s16 q8, q9, q14 - vadd.s16 q9, q9, q15 @ 9*w + 63 - vadd.s16 q11, q8, q14 - vadd.s16 q12, q9, q15 @ 18*w + 63 - vadd.s16 q14, q11, q14 - vadd.s16 q15, q12, q15 @ 27*w + 63 - vqshrn.s16 d16, q8, #7 - vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7) - vqshrn.s16 d22, q11, #7 - vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7) - vqshrn.s16 d28, q14, #7 - vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7) - vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a) - vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a) - vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a) - vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a) - vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a) - vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a) - veor q3, q3, q13 @ P0 = PS0 ^ 0x80 - veor q4, q4, q13 @ Q0 = QS0 ^ 0x80 - veor q2, q2, q13 @ P1 = PS1 ^ 0x80 - veor q5, q5, q13 @ Q1 = QS1 ^ 0x80 - veor q1, q1, q13 @ P2 = PS2 ^ 0x80 - veor q6, q6, q13 @ Q2 = QS2 ^ 0x80 - .endif -.endm - -.macro vp8_v_loop_filter16 name, inner=0, simple=0 -function ff_vp8_v_loop_filter16\name\()_neon, export=1 - vpush {q4-q7} - sub r0, r0, r1, lsl #1+!\simple - - @ Load pixels: - .if !\simple - ldr r12, [sp, #64] @ hev_thresh - vld1.8 {q0}, [r0,:128], r1 @ P3 - vld1.8 {q1}, [r0,:128], r1 @ P2 - .endif - vld1.8 {q2}, [r0,:128], r1 @ P1 - vld1.8 {q3}, [r0,:128], r1 @ P0 - vld1.8 {q4}, [r0,:128], r1 @ Q0 - vld1.8 {q5}, [r0,:128], r1 @ Q1 - .if !\simple - vld1.8 {q6}, [r0,:128], r1 @ Q2 - 
vld1.8 {q7}, [r0,:128] @ Q3 - vdup.8 q15, r3 @ flim_I - .endif - vdup.8 q14, r2 @ flim_E - - vp8_loop_filter inner=\inner, simple=\simple - - @ back up to P2: dst -= stride * 6 - sub r0, r0, r1, lsl #2 - .if !\simple - sub r0, r0, r1, lsl #1 - - @ Store pixels: - vst1.8 {q1}, [r0,:128], r1 @ P2 - .endif - vst1.8 {q2}, [r0,:128], r1 @ P1 - vst1.8 {q3}, [r0,:128], r1 @ P0 - vst1.8 {q4}, [r0,:128], r1 @ Q0 - vst1.8 {q5}, [r0,:128], r1 @ Q1 - .if !\simple - vst1.8 {q6}, [r0,:128] @ Q2 - .endif - - vpop {q4-q7} - bx lr -endfunc -.endm - -vp8_v_loop_filter16 -vp8_v_loop_filter16 _inner, inner=1 -vp8_v_loop_filter16 _simple, simple=1 - -.macro vp8_v_loop_filter8uv name, inner=0 -function ff_vp8_v_loop_filter8uv\name\()_neon, export=1 - vpush {q4-q7} - sub r0, r0, r2, lsl #2 - sub r1, r1, r2, lsl #2 - ldr r12, [sp, #64] @ flim_I - - @ Load pixels: - vld1.8 {d0}, [r0,:64], r2 @ P3 - vld1.8 {d1}, [r1,:64], r2 @ P3 - vld1.8 {d2}, [r0,:64], r2 @ P2 - vld1.8 {d3}, [r1,:64], r2 @ P2 - vld1.8 {d4}, [r0,:64], r2 @ P1 - vld1.8 {d5}, [r1,:64], r2 @ P1 - vld1.8 {d6}, [r0,:64], r2 @ P0 - vld1.8 {d7}, [r1,:64], r2 @ P0 - vld1.8 {d8}, [r0,:64], r2 @ Q0 - vld1.8 {d9}, [r1,:64], r2 @ Q0 - vld1.8 {d10}, [r0,:64], r2 @ Q1 - vld1.8 {d11}, [r1,:64], r2 @ Q1 - vld1.8 {d12}, [r0,:64], r2 @ Q2 - vld1.8 {d13}, [r1,:64], r2 @ Q2 - vld1.8 {d14}, [r0,:64] @ Q3 - vld1.8 {d15}, [r1,:64] @ Q3 - - vdup.8 q14, r3 @ flim_E - vdup.8 q15, r12 @ flim_I - ldr r12, [sp, #68] @ hev_thresh - - vp8_loop_filter inner=\inner - - @ back up to P2: u,v -= stride * 6 - sub r0, r0, r2, lsl #2 - sub r1, r1, r2, lsl #2 - sub r0, r0, r2, lsl #1 - sub r1, r1, r2, lsl #1 - - @ Store pixels: - vst1.8 {d2}, [r0,:64], r2 @ P2 - vst1.8 {d3}, [r1,:64], r2 @ P2 - vst1.8 {d4}, [r0,:64], r2 @ P1 - vst1.8 {d5}, [r1,:64], r2 @ P1 - vst1.8 {d6}, [r0,:64], r2 @ P0 - vst1.8 {d7}, [r1,:64], r2 @ P0 - vst1.8 {d8}, [r0,:64], r2 @ Q0 - vst1.8 {d9}, [r1,:64], r2 @ Q0 - vst1.8 {d10}, [r0,:64], r2 @ Q1 - vst1.8 {d11}, [r1,:64], r2 @ Q1 - vst1.8 {d12}, [r0,:64] @ Q2 - vst1.8 {d13}, [r1,:64] @ Q2 - - vpop {q4-q7} - bx lr -endfunc -.endm - -vp8_v_loop_filter8uv -vp8_v_loop_filter8uv _inner, inner=1 - -.macro vp8_h_loop_filter16 name, inner=0, simple=0 -function ff_vp8_h_loop_filter16\name\()_neon, export=1 - vpush {q4-q7} - sub r0, r0, #4 - .if !\simple - ldr r12, [sp, #64] @ hev_thresh - .endif - - @ Load pixels: - vld1.8 {d0}, [r0], r1 @ load first 8-line src data - vld1.8 {d2}, [r0], r1 - vld1.8 {d4}, [r0], r1 - vld1.8 {d6}, [r0], r1 - vld1.8 {d8}, [r0], r1 - vld1.8 {d10}, [r0], r1 - vld1.8 {d12}, [r0], r1 - vld1.8 {d14}, [r0], r1 - vld1.8 {d1}, [r0], r1 @ load second 8-line src data - vld1.8 {d3}, [r0], r1 - vld1.8 {d5}, [r0], r1 - vld1.8 {d7}, [r0], r1 - vld1.8 {d9}, [r0], r1 - vld1.8 {d11}, [r0], r1 - vld1.8 {d13}, [r0], r1 - vld1.8 {d15}, [r0], r1 - - transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 - - vdup.8 q14, r2 @ flim_E - .if !\simple - vdup.8 q15, r3 @ flim_I - .endif - - vp8_loop_filter inner=\inner, simple=\simple - - sub r0, r0, r1, lsl #4 @ backup 16 rows - - transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 - - @ Store pixels: - vst1.8 {d0}, [r0], r1 - vst1.8 {d2}, [r0], r1 - vst1.8 {d4}, [r0], r1 - vst1.8 {d6}, [r0], r1 - vst1.8 {d8}, [r0], r1 - vst1.8 {d10}, [r0], r1 - vst1.8 {d12}, [r0], r1 - vst1.8 {d14}, [r0], r1 - vst1.8 {d1}, [r0], r1 - vst1.8 {d3}, [r0], r1 - vst1.8 {d5}, [r0], r1 - vst1.8 {d7}, [r0], r1 - vst1.8 {d9}, [r0], r1 - vst1.8 {d11}, [r0], r1 - vst1.8 {d13}, [r0], r1 - vst1.8 {d15}, [r0] - - vpop {q4-q7} - bx lr -endfunc -.endm - 
-vp8_h_loop_filter16 -vp8_h_loop_filter16 _inner, inner=1 -vp8_h_loop_filter16 _simple, simple=1 - -.macro vp8_h_loop_filter8uv name, inner=0 -function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 - vpush {q4-q7} - sub r0, r0, #4 - sub r1, r1, #4 - ldr r12, [sp, #64] @ flim_I - - @ Load pixels: - vld1.8 {d0}, [r0], r2 @ load u - vld1.8 {d1}, [r1], r2 @ load v - vld1.8 {d2}, [r0], r2 - vld1.8 {d3}, [r1], r2 - vld1.8 {d4}, [r0], r2 - vld1.8 {d5}, [r1], r2 - vld1.8 {d6}, [r0], r2 - vld1.8 {d7}, [r1], r2 - vld1.8 {d8}, [r0], r2 - vld1.8 {d9}, [r1], r2 - vld1.8 {d10}, [r0], r2 - vld1.8 {d11}, [r1], r2 - vld1.8 {d12}, [r0], r2 - vld1.8 {d13}, [r1], r2 - vld1.8 {d14}, [r0], r2 - vld1.8 {d15}, [r1], r2 - - transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 - - vdup.8 q14, r3 @ flim_E - vdup.8 q15, r12 @ flim_I - ldr r12, [sp, #68] @ hev_thresh - - vp8_loop_filter inner=\inner - - sub r0, r0, r2, lsl #3 @ backup u 8 rows - sub r1, r1, r2, lsl #3 @ backup v 8 rows - - transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7 - - @ Store pixels: - vst1.8 {d0}, [r0], r2 - vst1.8 {d1}, [r1], r2 - vst1.8 {d2}, [r0], r2 - vst1.8 {d3}, [r1], r2 - vst1.8 {d4}, [r0], r2 - vst1.8 {d5}, [r1], r2 - vst1.8 {d6}, [r0], r2 - vst1.8 {d7}, [r1], r2 - vst1.8 {d8}, [r0], r2 - vst1.8 {d9}, [r1], r2 - vst1.8 {d10}, [r0], r2 - vst1.8 {d11}, [r1], r2 - vst1.8 {d12}, [r0], r2 - vst1.8 {d13}, [r1], r2 - vst1.8 {d14}, [r0] - vst1.8 {d15}, [r1] - - vpop {q4-q7} - bx lr -endfunc -.endm - -vp8_h_loop_filter8uv -vp8_h_loop_filter8uv _inner, inner=1 - -function ff_put_vp8_pixels16_neon, export=1 - ldr r12, [sp, #0] @ h -1: - subs r12, r12, #4 - vld1.8 {q0}, [r2], r3 - vld1.8 {q1}, [r2], r3 - vld1.8 {q2}, [r2], r3 - vld1.8 {q3}, [r2], r3 - vst1.8 {q0}, [r0,:128], r1 - vst1.8 {q1}, [r0,:128], r1 - vst1.8 {q2}, [r0,:128], r1 - vst1.8 {q3}, [r0,:128], r1 - bgt 1b - bx lr -endfunc - -function ff_put_vp8_pixels8_neon, export=1 - ldr r12, [sp, #0] @ h -1: - subs r12, r12, #4 - vld1.8 {d0}, [r2], r3 - vld1.8 {d1}, [r2], r3 - vld1.8 {d2}, [r2], r3 - vld1.8 {d3}, [r2], r3 - vst1.8 {d0}, [r0,:64], r1 - vst1.8 {d1}, [r0,:64], r1 - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - bgt 1b - bx lr -endfunc - -/* 4/6-tap 8th-pel MC */ - -.macro vp8_epel8_h6 d, a, b - vext.8 d27, \a, \b, #1 - vmovl.u8 q8, \a - vext.8 d28, \a, \b, #2 - vmovl.u8 q9, d27 - vext.8 d29, \a, \b, #3 - vmovl.u8 q10, d28 - vext.8 d30, \a, \b, #4 - vmovl.u8 q11, d29 - vext.8 d31, \a, \b, #5 - vmovl.u8 q12, d30 - vmul.u16 q10, q10, d0[2] - vmovl.u8 q13, d31 - vmul.u16 q11, q11, d0[3] - vmls.u16 q10, q9, d0[1] - vmls.u16 q11, q12, d1[0] - vmla.u16 q10, q8, d0[0] - vmla.u16 q11, q13, d1[1] - vqadd.s16 q11, q10, q11 - vqrshrun.s16 \d, q11, #7 -.endm - -.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1 - vext.8 q14, \q0, \q1, #3 - vext.8 q15, \q0, \q1, #4 - vmovl.u8 q11, d28 - vmovl.u8 q14, d29 - vext.8 q3, \q0, \q1, #2 - vmovl.u8 q12, d30 - vmovl.u8 q15, d31 - vext.8 q8, \q0, \q1, #1 - vmovl.u8 q10, d6 - vmovl.u8 q3, d7 - vext.8 q2, \q0, \q1, #5 - vmovl.u8 q13, d4 - vmovl.u8 q2, d5 - vmovl.u8 q9, d16 - vmovl.u8 q8, d17 - vmul.u16 q11, q11, d0[3] - vmul.u16 q10, q10, d0[2] - vmul.u16 q3, q3, d0[2] - vmul.u16 q14, q14, d0[3] - vmls.u16 q11, q12, d1[0] - vmovl.u8 q12, \s0 - vmovl.u8 q1, \s1 - vmls.u16 q10, q9, d0[1] - vmls.u16 q3, q8, d0[1] - vmls.u16 q14, q15, d1[0] - vmla.u16 q10, q12, d0[0] - vmla.u16 q11, q13, d1[1] - vmla.u16 q3, q1, d0[0] - vmla.u16 q14, q2, d1[1] - vqadd.s16 q11, q10, q11 - vqadd.s16 q14, q3, q14 - vqrshrun.s16 \d0, q11, #7 - vqrshrun.s16 \d1, q14, #7 -.endm - -.macro 
vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 - vmovl.u8 q10, \s2 - vmovl.u8 q11, \s3 - vmovl.u8 q9, \s1 - vmovl.u8 q12, \s4 - vmovl.u8 q8, \s0 - vmovl.u8 q13, \s5 - vmul.u16 q10, q10, d0[2] - vmul.u16 q11, q11, d0[3] - vmls.u16 q10, q9, d0[1] - vmls.u16 q11, q12, d1[0] - vmla.u16 q10, q8, d0[0] - vmla.u16 q11, q13, d1[1] - vqadd.s16 q11, q10, q11 - vqrshrun.s16 \d0, q11, #7 -.endm - -.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 - vmovl.u8 q10, \s0 - vmovl.u8 q11, \s3 - vmovl.u8 q14, \s6 - vmovl.u8 q9, \s1 - vmovl.u8 q12, \s4 - vmovl.u8 q8, \s2 - vmovl.u8 q13, \s5 - vmul.u16 q10, q10, d0[0] - vmul.u16 q15, q11, d0[3] - vmul.u16 q11, q11, d0[2] - vmul.u16 q14, q14, d1[1] - vmls.u16 q10, q9, d0[1] - vmls.u16 q15, q12, d1[0] - vmls.u16 q11, q8, d0[1] - vmls.u16 q14, q13, d1[0] - vmla.u16 q10, q8, d0[2] - vmla.u16 q15, q13, d1[1] - vmla.u16 q11, q9, d0[0] - vmla.u16 q14, q12, d0[3] - vqadd.s16 q15, q10, q15 - vqadd.s16 q14, q11, q14 - vqrshrun.s16 \d0, q15, #7 - vqrshrun.s16 \d1, q14, #7 -.endm - -.macro vp8_epel8_h4 d, a, b - vext.8 d28, \a, \b, #1 - vmovl.u8 q9, \a - vext.8 d29, \a, \b, #2 - vmovl.u8 q10, d28 - vext.8 d30, \a, \b, #3 - vmovl.u8 q11, d29 - vmovl.u8 q12, d30 - vmul.u16 q10, q10, d0[2] - vmul.u16 q11, q11, d0[3] - vmls.u16 q10, q9, d0[1] - vmls.u16 q11, q12, d1[0] - vqadd.s16 q11, q10, q11 - vqrshrun.s16 \d, q11, #7 -.endm - -.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4 - vmovl.u8 q9, \s0 - vmovl.u8 q10, \s1 - vmovl.u8 q11, \s2 - vmovl.u8 q12, \s3 - vmovl.u8 q13, \s4 - vmul.u16 q8, q10, d0[2] - vmul.u16 q14, q11, d0[3] - vmul.u16 q11, q11, d0[2] - vmul.u16 q15, q12, d0[3] - vmls.u16 q8, q9, d0[1] - vmls.u16 q14, q12, d1[0] - vmls.u16 q11, q10, d0[1] - vmls.u16 q15, q13, d1[0] - vqadd.s16 q8, q8, q14 - vqadd.s16 q11, q11, q15 - vqrshrun.s16 \d0, q8, #7 - vqrshrun.s16 \d1, q11, #7 -.endm - -function ff_put_vp8_epel16_v6_neon, export=1 - sub r2, r2, r3, lsl #1 - push {r4,lr} - vpush {d8-d15} - - ldr r4, [sp, #80] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #72] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2-d3}, [r2], r3 - vld1.8 {d4-d5}, [r2], r3 - vld1.8 {d6-d7}, [r2], r3 - vld1.8 {d8-d9}, [r2], r3 - vld1.8 {d10-d11},[r2], r3 - vld1.8 {d12-d13},[r2], r3 - vld1.8 {d14-d15},[r2] - sub r2, r2, r3, lsl #2 - - vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14 - vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15 - - vst1.8 {d2-d3}, [r0,:128], r1 - vst1.8 {d4-d5}, [r0,:128], r1 - subs r12, r12, #2 - bne 1b - - vpop {d8-d15} - pop {r4,pc} -endfunc - -function ff_put_vp8_epel16_h6_neon, export=1 - sub r2, r2, #2 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2-d4}, [r2], r3 - - vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2 - - vst1.8 {d2-d3}, [r0,:128], r1 - subs r12, r12, #1 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel16_h6v6_neon, export=1 - sub r2, r2, r3, lsl #1 - sub r2, r2, #2 - push {r4,lr} - vpush {d8-d9} - - @ first pass (horizontal): - ldr r4, [sp, #28] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #24] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #336+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #5 - bic lr, lr, #15 -1: - vld1.8 {d2,d3,d4}, [r2], r3 - - vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2 - - vst1.8 {d2-d3}, [lr,:128]! 
- subs r12, r12, #1 - bne 1b - - @ second pass (vertical): - ldr r4, [sp, #336+16+32] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #336+16+24] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d5}, [lr,:128]! - vld1.8 {d6-d9}, [lr,:128]! - vld1.8 {d28-d31},[lr,:128] - sub lr, lr, #48 - - vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30 - vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31 - - vst1.8 {d2-d3}, [r0,:128], r1 - subs r12, r12, #1 - bne 2b - - add sp, sp, #336+16 - vpop {d8-d9} - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_v6_neon, export=1 - sub r2, r2, r3, lsl #1 - push {r4,lr} - - ldr r4, [sp, #16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2}, [r2], r3 - vld1.8 {d3}, [r2], r3 - vld1.8 {d4}, [r2], r3 - vld1.8 {d5}, [r2], r3 - vld1.8 {d6}, [r2], r3 - vld1.8 {d7}, [r2], r3 - vld1.8 {d28}, [r2] - - sub r2, r2, r3, lsl #2 - - vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h6_neon, export=1 - sub r2, r2, #2 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h6 d2, d2, d3 - - vst1.8 {d2}, [r0,:64], r1 - subs r12, r12, #1 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h6v6_neon, export=1 - sub r2, r2, r3, lsl #1 - sub r2, r2, #2 - push {r4,lr} - - @ first pass (horizontal): - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #168+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #5 - bic lr, lr, #15 -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h6 d2, d2, d3 - - vst1.8 {d2}, [lr,:64]! - subs r12, r12, #1 - bne 1b - - @ second pass (vertical): - ldr r4, [sp, #168+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #168+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d5}, [lr,:128]! - vld1.8 {d6-d7}, [lr,:128]! 
- vld1.8 {d30}, [lr,:64] - sub lr, lr, #32 - - vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 2b - - add sp, sp, #168+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_v4_neon, export=1 - sub r2, r2, r3 - push {r4,lr} - - ldr r4, [sp, #16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2}, [r2], r3 - vld1.8 {d3}, [r2], r3 - vld1.8 {d4}, [r2], r3 - vld1.8 {d5}, [r2], r3 - vld1.8 {d6}, [r2] - sub r2, r2, r3, lsl #1 - - vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h4_neon, export=1 - sub r2, r2, #1 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h4 d2, d2, d3 - - vst1.8 {d2}, [r0,:64], r1 - subs r12, r12, #1 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h4v4_neon, export=1 - sub r2, r2, r3 - sub r2, r2, #1 - push {r4,lr} - - @ first pass (horizontal): - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #168+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #3 - bic lr, lr, #15 -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h4 d2, d2, d3 - - vst1.8 {d2}, [lr,:64]! - subs r12, r12, #1 - bne 1b - - @ second pass (vertical): - ldr r4, [sp, #168+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #168+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d5}, [lr,:128]! - vld1.8 {d6}, [lr,:64] - sub lr, lr, #16 - - vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 2b - - add sp, sp, #168+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h6v4_neon, export=1 - sub r2, r2, r3 - sub r2, r2, #2 - push {r4,lr} - - @ first pass (horizontal): - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #168+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #3 - bic lr, lr, #15 -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h6 d2, d2, d3 - - vst1.8 {d2}, [lr,:64]! - subs r12, r12, #1 - bne 1b - - @ second pass (vertical): - ldr r4, [sp, #168+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #168+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d5}, [lr,:128]! - vld1.8 {d6}, [lr,:64] - sub lr, lr, #16 - - vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 2b - - add sp, sp, #168+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel8_h4v6_neon, export=1 - sub r2, r2, r3, lsl #1 - sub r2, r2, #1 - push {r4,lr} - - @ first pass (horizontal): - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #168+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #5 - bic lr, lr, #15 -1: - vld1.8 {d2,d3}, [r2], r3 - - vp8_epel8_h4 d2, d2, d3 - - vst1.8 {d2}, [lr,:64]! 
- subs r12, r12, #1 - bne 1b - - @ second pass (vertical): - ldr r4, [sp, #168+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #168+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d5}, [lr,:128]! - vld1.8 {d6-d7}, [lr,:128]! - vld1.8 {d30}, [lr,:64] - sub lr, lr, #32 - - vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30 - - vst1.8 {d2}, [r0,:64], r1 - vst1.8 {d3}, [r0,:64], r1 - subs r12, r12, #2 - bne 2b - - add sp, sp, #168+16 - pop {r4,pc} -endfunc - -.ltorg - -function ff_put_vp8_epel4_v6_neon, export=1 - sub r2, r2, r3, lsl #1 - push {r4,lr} - - ldr r4, [sp, #16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.32 {d2[]}, [r2], r3 - vld1.32 {d3[]}, [r2], r3 - vld1.32 {d4[]}, [r2], r3 - vld1.32 {d5[]}, [r2], r3 - vld1.32 {d6[]}, [r2], r3 - vld1.32 {d7[]}, [r2], r3 - vld1.32 {d28[]}, [r2] - sub r2, r2, r3, lsl #2 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d3[1]}, [r2], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d5[1]}, [r2], r3 - vld1.32 {d6[1]}, [r2], r3 - vld1.32 {d7[1]}, [r2], r3 - vld1.32 {d28[1]}, [r2] - sub r2, r2, r3, lsl #2 - - vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28 - - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h6_neon, export=1 - sub r2, r2, #2 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {q1}, [r2], r3 - vp8_epel8_h6 d2, d2, d3 - vst1.32 {d2[0]}, [r0,:32], r1 - subs r12, r12, #1 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h6v6_neon, export=1 - sub r2, r2, r3, lsl #1 - sub r2, r2, #2 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #52+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #5 - bic lr, lr, #15 -1: - vld1.8 {q1}, [r2], r3 - vp8_epel8_h6 d2, d2, d3 - vst1.32 {d2[0]}, [lr,:32]! - subs r12, r12, #1 - bne 1b - - ldr r4, [sp, #52+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #52+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d3}, [lr,:128]! - vld1.8 {d6}, [lr,:64]! - vld1.32 {d28[]}, [lr,:32] - sub lr, lr, #16 - vld1.8 {d4-d5}, [lr]! - vld1.8 {d7}, [lr,:64]! - vld1.32 {d28[1]}, [lr,:32] - sub lr, lr, #16 - vtrn.32 q1, q2 - vtrn.32 d6, d7 - vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 2b - - add sp, sp, #52+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h4v6_neon, export=1 - sub r2, r2, r3, lsl #1 - sub r2, r2, #1 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #52+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #5 - bic lr, lr, #15 -1: - vld1.8 {d2}, [r2], r3 - vp8_epel8_h4 d2, d2, d2 - vst1.32 {d2[0]}, [lr,:32]! - subs r12, r12, #1 - bne 1b - - ldr r4, [sp, #52+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #52+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d3}, [lr,:128]! - vld1.8 {d6}, [lr,:64]! 
- vld1.32 {d28[]}, [lr,:32] - sub lr, lr, #16 - vld1.8 {d4-d5}, [lr]! - vld1.8 {d7}, [lr,:64]! - vld1.32 {d28[1]}, [lr,:32] - sub lr, lr, #16 - vtrn.32 q1, q2 - vtrn.32 d6, d7 - vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28 - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 2b - - add sp, sp, #52+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h6v4_neon, export=1 - sub r2, r2, r3 - sub r2, r2, #2 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #44+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #3 - bic lr, lr, #15 -1: - vld1.8 {q1}, [r2], r3 - vp8_epel8_h6 d2, d2, d3 - vst1.32 {d2[0]}, [lr,:32]! - subs r12, r12, #1 - bne 1b - - ldr r4, [sp, #44+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #44+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d3}, [lr,:128]! - vld1.32 {d6[]}, [lr,:32] - sub lr, lr, #8 - vld1.8 {d4-d5}, [lr]! - vld1.32 {d6[1]}, [lr,:32] - sub lr, lr, #8 - vtrn.32 q1, q2 - vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 2b - - add sp, sp, #44+16 - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h4_neon, export=1 - sub r2, r2, #1 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.8 {d2}, [r2], r3 - vp8_epel8_h4 d2, d2, d2 - vst1.32 {d2[0]}, [r0,:32], r1 - subs r12, r12, #1 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_v4_neon, export=1 - sub r2, r2, r3 - push {r4,lr} - - ldr r4, [sp, #16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - vld1.16 {q0}, [r4,:128] -1: - vld1.32 {d2[]}, [r2], r3 - vld1.32 {d3[]}, [r2], r3 - vld1.32 {d4[]}, [r2], r3 - vld1.32 {d5[]}, [r2], r3 - vld1.32 {d6[]}, [r2] - sub r2, r2, r3, lsl #1 - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d3[1]}, [r2], r3 - vld1.32 {d4[1]}, [r2], r3 - vld1.32 {d5[1]}, [r2], r3 - vld1.32 {d6[1]}, [r2] - sub r2, r2, r3, lsl #1 - - vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6 - - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 1b - - pop {r4,pc} -endfunc - -function ff_put_vp8_epel4_h4v4_neon, export=1 - sub r2, r2, r3 - sub r2, r2, #1 - push {r4,lr} - - ldr r4, [sp, #12] @ mx - movrel lr, subpel_filters-16 - ldr r12, [sp, #8] @ h - add r4, lr, r4, lsl #4 - sub sp, sp, #44+16 - vld1.16 {q0}, [r4,:128] - add lr, sp, #15 - add r12, r12, #3 - bic lr, lr, #15 -1: - vld1.8 {d2}, [r2], r3 - vp8_epel8_h4 d2, d2, d3 - vst1.32 {d2[0]}, [lr,:32]! - subs r12, r12, #1 - bne 1b - - ldr r4, [sp, #44+16+16] @ my - movrel lr, subpel_filters-16 - ldr r12, [sp, #44+16+8] @ h - add r4, lr, r4, lsl #4 - add lr, sp, #15 - vld1.16 {q0}, [r4,:128] - bic lr, lr, #15 -2: - vld1.8 {d2-d3}, [lr,:128]! - vld1.32 {d6[]}, [lr,:32] - sub lr, lr, #8 - vld1.8 {d4-d5}, [lr]! 
- vld1.32 {d6[1]}, [lr,:32] - sub lr, lr, #8 - vtrn.32 q1, q2 - vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6 - vst1.32 {d2[0]}, [r0,:32], r1 - vst1.32 {d3[0]}, [r0,:32], r1 - vst1.32 {d2[1]}, [r0,:32], r1 - vst1.32 {d3[1]}, [r0,:32], r1 - subs r12, r12, #4 - bne 2b - - add sp, sp, #44+16 - pop {r4,pc} -endfunc - -@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit -@ arithmatic can be used to apply filters -const subpel_filters, align=4 - .short 0, 6, 123, 12, 1, 0, 0, 0 - .short 2, 11, 108, 36, 8, 1, 0, 0 - .short 0, 9, 93, 50, 6, 0, 0, 0 - .short 3, 16, 77, 77, 16, 3, 0, 0 - .short 0, 6, 50, 93, 9, 0, 0, 0 - .short 1, 8, 36, 108, 11, 2, 0, 0 - .short 0, 1, 12, 123, 6, 0, 0, 0 -endconst - -/* Bilinear MC */ - -function ff_put_vp8_bilin16_h_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h -1: - subs r12, r12, #2 - vld1.8 {d2-d4}, [r2], r3 - vext.8 q2, q1, q2, #1 - vmull.u8 q8, d2, d1 - vmlal.u8 q8, d4, d0 - vld1.8 {d18-d20},[r2], r3 - vmull.u8 q3, d3, d1 - vmlal.u8 q3, d5, d0 - vext.8 q10, q9, q10, #1 - vmull.u8 q11, d18, d1 - vmlal.u8 q11, d20, d0 - vmull.u8 q12, d19, d1 - vmlal.u8 q12, d21, d0 - vrshrn.u16 d4, q8, #3 - vrshrn.u16 d5, q3, #3 - vrshrn.u16 d6, q11, #3 - vrshrn.u16 d7, q12, #3 - vst1.8 {q2}, [r0,:128], r1 - vst1.8 {q3}, [r0,:128], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin16_v_neon, export=1 - push {lr} - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h - vld1.8 {q1}, [r2], r3 -1: - subs r12, r12, #2 - vld1.8 {q2}, [r2], r3 - vmull.u8 q3, d2, d1 - vmlal.u8 q3, d4, d0 - vmull.u8 q8, d3, d1 - vmlal.u8 q8, d5, d0 - vld1.8 {q1}, [r2], r3 - vmull.u8 q9, d4, d1 - vmlal.u8 q9, d2, d0 - vmull.u8 q10, d5, d1 - vmlal.u8 q10, d3, d0 - vrshrn.u16 d4, q3, #3 - vrshrn.u16 d5, q8, #3 - vrshrn.u16 d6, q9, #3 - vrshrn.u16 d7, q10, #3 - vst1.8 {q2}, [r0,:128], r1 - vst1.8 {q3}, [r0,:128], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin16_hv_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d2, lr - vdup.8 d3, r12 - ldr r12, [sp, #4] @ h - - vld1.8 {d4-d6}, [r2], r3 - vext.8 q3, q2, q3, #1 - vmull.u8 q8, d4, d1 - vmlal.u8 q8, d6, d0 - vmull.u8 q9, d5, d1 - vmlal.u8 q9, d7, d0 - vrshrn.u16 d4, q8, #3 - vrshrn.u16 d5, q9, #3 -1: - subs r12, r12, #2 - vld1.8 {d18-d20},[r2], r3 - vext.8 q10, q9, q10, #1 - vmull.u8 q11, d18, d1 - vmlal.u8 q11, d20, d0 - vld1.8 {d26-d28},[r2], r3 - vmull.u8 q12, d19, d1 - vmlal.u8 q12, d21, d0 - vext.8 q14, q13, q14, #1 - vmull.u8 q8, d26, d1 - vmlal.u8 q8, d28, d0 - vmull.u8 q9, d27, d1 - vmlal.u8 q9, d29, d0 - vrshrn.u16 d6, q11, #3 - vrshrn.u16 d7, q12, #3 - vmull.u8 q12, d4, d3 - vmlal.u8 q12, d6, d2 - vmull.u8 q15, d5, d3 - vmlal.u8 q15, d7, d2 - vrshrn.u16 d4, q8, #3 - vrshrn.u16 d5, q9, #3 - vmull.u8 q10, d6, d3 - vmlal.u8 q10, d4, d2 - vmull.u8 q11, d7, d3 - vmlal.u8 q11, d5, d2 - vrshrn.u16 d24, q12, #3 - vrshrn.u16 d25, q15, #3 - vst1.8 {q12}, [r0,:128], r1 - vrshrn.u16 d20, q10, #3 - vrshrn.u16 d21, q11, #3 - vst1.8 {q10}, [r0,:128], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin8_h_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h -1: - subs r12, r12, #2 - vld1.8 {q1}, [r2], r3 - vext.8 d3, d2, d3, #1 - vmull.u8 q2, d2, d1 - vmlal.u8 q2, d3, d0 - vld1.8 {q3}, [r2], r3 - vext.8 d7, 
d6, d7, #1 - vmull.u8 q8, d6, d1 - vmlal.u8 q8, d7, d0 - vrshrn.u16 d4, q2, #3 - vrshrn.u16 d16, q8, #3 - vst1.8 {d4}, [r0,:64], r1 - vst1.8 {d16}, [r0,:64], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin8_v_neon, export=1 - push {lr} - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h - vld1.8 {d2}, [r2], r3 -1: - subs r12, r12, #2 - vld1.8 {d3}, [r2], r3 - vmull.u8 q2, d2, d1 - vmlal.u8 q2, d3, d0 - vld1.8 {d2}, [r2], r3 - vmull.u8 q3, d3, d1 - vmlal.u8 q3, d2, d0 - vrshrn.u16 d4, q2, #3 - vrshrn.u16 d6, q3, #3 - vst1.8 {d4}, [r0,:64], r1 - vst1.8 {d6}, [r0,:64], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin8_hv_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d2, lr - vdup.8 d3, r12 - ldr r12, [sp, #4] @ h - - vld1.8 {q2}, [r2], r3 - vext.8 d5, d4, d5, #1 - vmull.u8 q9, d4, d1 - vmlal.u8 q9, d5, d0 - vrshrn.u16 d22, q9, #3 -1: - subs r12, r12, #2 - vld1.8 {q3}, [r2], r3 - vext.8 d7, d6, d7, #1 - vmull.u8 q8, d6, d1 - vmlal.u8 q8, d7, d0 - vld1.8 {q2}, [r2], r3 - vext.8 d5, d4, d5, #1 - vmull.u8 q9, d4, d1 - vmlal.u8 q9, d5, d0 - vrshrn.u16 d16, q8, #3 - vmull.u8 q10, d22, d3 - vmlal.u8 q10, d16, d2 - vrshrn.u16 d22, q9, #3 - vmull.u8 q12, d16, d3 - vmlal.u8 q12, d22, d2 - vrshrn.u16 d20, q10, #3 - vst1.8 {d20}, [r0,:64], r1 - vrshrn.u16 d23, q12, #3 - vst1.8 {d23}, [r0,:64], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin4_h_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h -1: - subs r12, r12, #2 - vld1.8 {d2}, [r2], r3 - vext.8 d3, d2, d3, #1 - vld1.8 {d6}, [r2], r3 - vext.8 d7, d6, d7, #1 - vtrn.32 q1, q3 - vmull.u8 q2, d2, d1 - vmlal.u8 q2, d3, d0 - vrshrn.u16 d4, q2, #3 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin4_v_neon, export=1 - push {lr} - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr r12, [sp, #4] @ h - vld1.32 {d2[]}, [r2], r3 -1: - vld1.32 {d3[]}, [r2] - vld1.32 {d2[1]}, [r2], r3 - vld1.32 {d3[1]}, [r2], r3 - vmull.u8 q2, d2, d1 - vmlal.u8 q2, d3, d0 - vtrn.32 d3, d2 - vrshrn.u16 d4, q2, #3 - vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r0,:32], r1 - subs r12, r12, #2 - bgt 1b - - pop {pc} -endfunc - -function ff_put_vp8_bilin4_hv_neon, export=1 - push {lr} - ldr lr, [sp, #8] @ mx - rsb r12, lr, #8 - vdup.8 d0, lr - vdup.8 d1, r12 - ldr lr, [sp, #12] @ my - rsb r12, lr, #8 - vdup.8 d2, lr - vdup.8 d3, r12 - ldr r12, [sp, #4] @ h - - vld1.8 {d4}, [r2], r3 - vext.8 d5, d4, d4, #1 - vmull.u8 q9, d4, d1 - vmlal.u8 q9, d5, d0 - vrshrn.u16 d22, q9, #3 -1: - subs r12, r12, #2 - vld1.8 {d6}, [r2], r3 - vext.8 d7, d6, d6, #1 - vld1.8 {d4}, [r2], r3 - vext.8 d5, d4, d4, #1 - vtrn.32 q3, q2 - vmull.u8 q8, d6, d1 - vmlal.u8 q8, d7, d0 - vrshrn.u16 d16, q8, #3 - vmull.u8 q10, d16, d2 - vtrn.32 d22, d16 - vmlal.u8 q10, d22, d3 - vrev64.32 d22, d16 - vrshrn.u16 d20, q10, #3 - vst1.32 {d20[0]}, [r0,:32], r1 - vst1.32 {d20[1]}, [r0,:32], r1 - bgt 1b - - pop {pc} -endfunc |
