author     Tim Redfern <tim@eclectronics.org>    2014-02-17 13:36:38 +0000
committer  Tim Redfern <tim@eclectronics.org>    2014-02-17 13:36:38 +0000
commit     22e28216336da876e1fd17f380ce42eaf1446769 (patch)
tree       444dad3dc7e2656992d29f34f7bce31970c122a5 /ffmpeg/libavcodec/arm
parent     ae5e8541f6e06e64c28719467cdf366ac57aff31 (diff)
chasing indexing error
Diffstat (limited to 'ffmpeg/libavcodec/arm')
-rw-r--r--  ffmpeg/libavcodec/arm/Makefile | 98
-rw-r--r--  ffmpeg/libavcodec/arm/aac.h | 143
-rw-r--r--  ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c | 57
-rw-r--r--  ffmpeg/libavcodec/arm/aacpsdsp_neon.S | 272
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_arm.S | 36
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_armv6.S | 84
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_init_arm.c | 73
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_neon.S | 177
-rw-r--r--  ffmpeg/libavcodec/arm/asm-offsets.h | 39
-rw-r--r--  ffmpeg/libavcodec/arm/dca.h | 103
-rw-r--r--  ffmpeg/libavcodec/arm/dcadsp_init_arm.c | 70
-rw-r--r--  ffmpeg/libavcodec/arm/dcadsp_neon.S | 61
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_arm.S | 125
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_arm.h | 32
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_armv6.S | 381
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_arm.c | 86
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_armv5te.c | 37
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_armv6.c | 85
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_neon.c | 76
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_neon.S | 186
-rw-r--r--  ffmpeg/libavcodec/arm/fft_fixed_init_arm.c | 48
-rw-r--r--  ffmpeg/libavcodec/arm/fft_fixed_neon.S | 261
-rw-r--r--  ffmpeg/libavcodec/arm/fft_init_arm.c | 70
-rw-r--r--  ffmpeg/libavcodec/arm/fft_neon.S | 375
-rw-r--r--  ffmpeg/libavcodec/arm/flacdsp_arm.S | 146
-rw-r--r--  ffmpeg/libavcodec/arm/flacdsp_init_arm.c | 32
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_init_arm.c | 65
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_neon.S | 392
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_vfp.S | 221
-rw-r--r--  ffmpeg/libavcodec/arm/h264chroma_init_arm.c | 51
-rw-r--r--  ffmpeg/libavcodec/arm/h264cmc_neon.S | 411
-rw-r--r--  ffmpeg/libavcodec/arm/h264dsp_init_arm.c | 115
-rw-r--r--  ffmpeg/libavcodec/arm/h264dsp_neon.S | 541
-rw-r--r--  ffmpeg/libavcodec/arm/h264idct_neon.S | 413
-rw-r--r--  ffmpeg/libavcodec/arm/h264pred_init_arm.c | 92
-rw-r--r--  ffmpeg/libavcodec/arm/h264pred_neon.S | 359
-rw-r--r--  ffmpeg/libavcodec/arm/h264qpel_init_arm.c | 171
-rw-r--r--  ffmpeg/libavcodec/arm/h264qpel_neon.S | 955
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_arm.S | 611
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_arm.h | 29
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_armv6.S | 259
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_arm.c | 72
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c | 67
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_neon.c | 88
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_neon.S | 410
-rw-r--r--  ffmpeg/libavcodec/arm/int_neon.S | 92
-rw-r--r--  ffmpeg/libavcodec/arm/jrevdct_arm.S | 383
-rw-r--r--  ffmpeg/libavcodec/arm/mathops.h | 108
-rw-r--r--  ffmpeg/libavcodec/arm/mdct_fixed_neon.S | 193
-rw-r--r--  ffmpeg/libavcodec/arm/mdct_neon.S | 301
-rw-r--r--  ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 143
-rw-r--r--  ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c | 38
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_arm.c | 52
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_arm.h | 26
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_armv5te.c | 102
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S | 114
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_neon.S | 107
-rw-r--r--  ffmpeg/libavcodec/arm/neon.S | 59
-rw-r--r--  ffmpeg/libavcodec/arm/rdft_neon.S | 150
-rw-r--r--  ffmpeg/libavcodec/arm/rv34dsp_init_arm.c | 46
-rw-r--r--  ffmpeg/libavcodec/arm/rv34dsp_neon.S | 156
-rw-r--r--  ffmpeg/libavcodec/arm/rv40dsp_init_arm.c | 148
-rw-r--r--  ffmpeg/libavcodec/arm/rv40dsp_neon.S | 920
-rw-r--r--  ffmpeg/libavcodec/arm/sbrdsp_init_arm.c | 73
-rw-r--r--  ffmpeg/libavcodec/arm/sbrdsp_neon.S | 411
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_arm.S | 479
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_armv5te.S | 620
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_armv6.S | 425
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_neon.S | 375
-rw-r--r--  ffmpeg/libavcodec/arm/synth_filter_neon.S | 115
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_arm.h | 29
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_armv5te.S | 31
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_init_arm.c | 30
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_init_armv5te.c | 33
-rw-r--r--  ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c | 37
-rw-r--r--  ffmpeg/libavcodec/arm/vorbisdsp_neon.S | 83
-rw-r--r--  ffmpeg/libavcodec/arm/vp3dsp_init_arm.c | 45
-rw-r--r--  ffmpeg/libavcodec/arm/vp3dsp_neon.S | 395
-rw-r--r--  ffmpeg/libavcodec/arm/vp56_arith.h | 121
-rw-r--r--  ffmpeg/libavcodec/arm/vp8.h | 35
-rw-r--r--  ffmpeg/libavcodec/arm/vp8_armv6.S | 248
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp.h | 78
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_armv6.S | 1634
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_arm.c | 34
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c | 120
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_neon.c | 116
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_neon.S | 1876
87 files changed, 0 insertions(+), 19056 deletions(-)
diff --git a/ffmpeg/libavcodec/arm/Makefile b/ffmpeg/libavcodec/arm/Makefile
deleted file mode 100644
index 277abd9..0000000
--- a/ffmpeg/libavcodec/arm/Makefile
+++ /dev/null
@@ -1,98 +0,0 @@
-ARCH_HEADERS = mathops.h
-
-OBJS += arm/fmtconvert_init_arm.o
-
-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
- arm/sbrdsp_init_arm.o
-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
- arm/ac3dsp_arm.o
-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o
-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_arm.o \
- arm/dsputil_arm.o \
- arm/jrevdct_arm.o \
- arm/simple_idct_arm.o
-OBJS-$(CONFIG_FFT) += arm/fft_init_arm.o \
- arm/fft_fixed_init_arm.o
-OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
- arm/flacdsp_arm.o
-OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
-OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
- arm/hpeldsp_arm.o
-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
-OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_arm.o
-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_init_arm.o
-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
- arm/rv40dsp_init_arm.o
-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o \
-
-ARMV5TE-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv5te.o \
- arm/simple_idct_armv5te.o
-ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
- arm/mpegvideo_armv5te_s.o
-ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
- arm/videodsp_armv5te.o
-
-ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \
- arm/dsputil_armv6.o \
- arm/simple_idct_armv6.o \
-
-ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
- arm/hpeldsp_armv6.o
-ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
-ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
- arm/vp8dsp_init_armv6.o \
- arm/vp8dsp_armv6.o
-
-VFP-OBJS += arm/fmtconvert_vfp.o
-
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
- arm/synth_filter_vfp.o
-VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
-VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
-VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp_armv6.o
-
-NEON-OBJS += arm/fmtconvert_neon.o
-
-NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
-NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
- arm/sbrdsp_neon.o
-NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
- arm/synth_filter_neon.o
-NEON-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_neon.o \
- arm/dsputil_neon.o \
- arm/int_neon.o \
- arm/simple_idct_neon.o
-NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
- arm/fft_fixed_neon.o
-NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
-NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
- arm/h264idct_neon.o
-NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o
-NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
- arm/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_neon.o \
- arm/hpeldsp_neon.o
-NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
- arm/mdct_fixed_neon.o
-NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
-NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o
-NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
-NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
- arm/rv40dsp_neon.o
-NEON-OBJS-$(CONFIG_VC1_DECODER) += arm/vc1dsp_init_neon.o \
- arm/vc1dsp_neon.o
-NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
-NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
-NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp6dsp_neon.o
-NEON-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_neon.o \
- arm/vp8dsp_neon.o
diff --git a/ffmpeg/libavcodec/arm/aac.h b/ffmpeg/libavcodec/arm/aac.h
deleted file mode 100644
index cafa881..0000000
--- a/ffmpeg/libavcodec/arm/aac.h
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_AAC_H
-#define AVCODEC_ARM_AAC_H
-
-#include "config.h"
-
-#if HAVE_NEON_INLINE
-
-#define VMUL2 VMUL2
-static inline float *VMUL2(float *dst, const float *v, unsigned idx,
- const float *scale)
-{
- unsigned v0, v1;
- __asm__ ("ubfx %0, %6, #0, #4 \n\t"
- "ubfx %1, %6, #4, #4 \n\t"
- "ldr %0, [%5, %0, lsl #2] \n\t"
- "ldr %1, [%5, %1, lsl #2] \n\t"
- "vld1.32 {d1[]}, [%7,:32] \n\t"
- "vmov d0, %0, %1 \n\t"
- "vmul.f32 d0, d0, d1 \n\t"
- "vst1.32 {d0}, [%2,:64]! \n\t"
- : "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1])
- : "r"(v), "r"(idx), "r"(scale)
- : "d0", "d1");
- return dst;
-}
-
-#define VMUL4 VMUL4
-static inline float *VMUL4(float *dst, const float *v, unsigned idx,
- const float *scale)
-{
- unsigned v0, v1, v2, v3;
- __asm__ ("ubfx %0, %10, #0, #2 \n\t"
- "ubfx %1, %10, #2, #2 \n\t"
- "ldr %0, [%9, %0, lsl #2] \n\t"
- "ubfx %2, %10, #4, #2 \n\t"
- "ldr %1, [%9, %1, lsl #2] \n\t"
- "ubfx %3, %10, #6, #2 \n\t"
- "ldr %2, [%9, %2, lsl #2] \n\t"
- "vmov d0, %0, %1 \n\t"
- "ldr %3, [%9, %3, lsl #2] \n\t"
- "vld1.32 {d2[],d3[]},[%11,:32] \n\t"
- "vmov d1, %2, %3 \n\t"
- "vmul.f32 q0, q0, q1 \n\t"
- "vst1.32 {q0}, [%4,:128]! \n\t"
- : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
- "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
- : "r"(v), "r"(idx), "r"(scale)
- : "d0", "d1", "d2", "d3");
- return dst;
-}
-
-#define VMUL2S VMUL2S
-static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
- unsigned sign, const float *scale)
-{
- unsigned v0, v1, v2, v3;
- __asm__ ("ubfx %0, %8, #0, #4 \n\t"
- "ubfx %1, %8, #4, #4 \n\t"
- "ldr %0, [%7, %0, lsl #2] \n\t"
- "lsl %2, %10, #30 \n\t"
- "ldr %1, [%7, %1, lsl #2] \n\t"
- "lsl %3, %10, #31 \n\t"
- "vmov d0, %0, %1 \n\t"
- "bic %2, %2, #1<<30 \n\t"
- "vld1.32 {d1[]}, [%9,:32] \n\t"
- "vmov d2, %2, %3 \n\t"
- "veor d0, d0, d2 \n\t"
- "vmul.f32 d0, d0, d1 \n\t"
- "vst1.32 {d0}, [%4,:64]! \n\t"
- : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
- "=m"(dst[0]), "=m"(dst[1])
- : "r"(v), "r"(idx), "r"(scale), "r"(sign)
- : "d0", "d1", "d2");
- return dst;
-}
-
-#define VMUL4S VMUL4S
-static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
- unsigned sign, const float *scale)
-{
- unsigned v0, v1, v2, v3, nz;
- __asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t"
- "ubfx %0, %12, #0, #2 \n\t"
- "ubfx %1, %12, #2, #2 \n\t"
- "ldr %0, [%11,%0, lsl #2] \n\t"
- "ubfx %2, %12, #4, #2 \n\t"
- "ldr %1, [%11,%1, lsl #2] \n\t"
- "ubfx %3, %12, #6, #2 \n\t"
- "ldr %2, [%11,%2, lsl #2] \n\t"
- "vmov d0, %0, %1 \n\t"
- "ldr %3, [%11,%3, lsl #2] \n\t"
- "lsr %6, %12, #12 \n\t"
- "rbit %6, %6 \n\t"
- "vmov d1, %2, %3 \n\t"
- "lsls %6, %6, #1 \n\t"
- "and %0, %5, #1<<31 \n\t"
- "it cs \n\t"
- "lslcs %5, %5, #1 \n\t"
- "lsls %6, %6, #1 \n\t"
- "and %1, %5, #1<<31 \n\t"
- "it cs \n\t"
- "lslcs %5, %5, #1 \n\t"
- "lsls %6, %6, #1 \n\t"
- "and %2, %5, #1<<31 \n\t"
- "it cs \n\t"
- "lslcs %5, %5, #1 \n\t"
- "vmov d4, %0, %1 \n\t"
- "and %3, %5, #1<<31 \n\t"
- "vmov d5, %2, %3 \n\t"
- "veor q0, q0, q2 \n\t"
- "vmul.f32 q0, q0, q1 \n\t"
- "vst1.32 {q0}, [%4,:128]! \n\t"
- : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
- "+r"(sign), "=r"(nz),
- "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
- : "r"(v), "r"(idx), "r"(scale)
- : "cc", "d0", "d1", "d2", "d3", "d4", "d5");
- return dst;
-}
-
-#endif /* HAVE_NEON_INLINE */
-
-#endif /* AVCODEC_ARM_AAC_H */
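The VMUL2/VMUL4 family deleted above dequantizes AAC codebook entries: each 4-bit (or 2-bit) field of idx selects a codebook value, all scaled by a single factor, and the S variants additionally apply sign bits. As a reading of the ubfx/ldr/vmul sequence in VMUL2, here is a minimal portable-C sketch (illustrative name; it mirrors the generic fallback the NEON version replaces):

    static inline float *vmul2_ref(float *dst, const float *v, unsigned idx,
                                   const float *scale)
    {
        float s = *scale;                  /* vld1.32 {d1[]} duplicates one scale */
        *dst++ = v[idx        & 15] * s;   /* ubfx #0, #4: low nibble of idx      */
        *dst++ = v[(idx >> 4) & 15] * s;   /* ubfx #4, #4: next nibble of idx     */
        return dst;
    }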
diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
deleted file mode 100644
index e04787c..0000000
--- a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/arm/cpu.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/aacpsdsp.h"
-
-void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
-void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
- float *src1, int n);
-void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
- const float (*filter)[8][2],
- int stride, int n);
-void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
- int i, int len);
-void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
- int i, int len);
-void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2],
- float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
- const float phi_fract[2], float (*Q_fract)[2],
- const float *transient_gain, float g_decay_slope,
- int len);
-void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
- float h[2][4], float h_step[2][4],
- int len);
-
-av_cold void ff_psdsp_init_arm(PSDSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->add_squares = ff_ps_add_squares_neon;
- s->mul_pair_single = ff_ps_mul_pair_single_neon;
- s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon;
- s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
- s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
deleted file mode 100644
index a93bbfe..0000000
--- a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_ps_add_squares_neon, export=1
- mov r3, r0
- sub r2, r2, #4
- vld1.32 {q0}, [r1,:128]!
- vmul.f32 q0, q0, q0
- vld1.32 {q2}, [r1,:128]!
- vmul.f32 q2, q2, q2
- vld1.32 {q1}, [r0,:128]!
-1:
- vpadd.f32 d6, d0, d1
- vld1.32 {q0}, [r1,:128]!
- vpadd.f32 d7, d4, d5
- vmul.f32 q0, q0, q0
- vld1.32 {q2}, [r1,:128]!
- vadd.f32 q3, q1, q3
- vld1.32 {q1}, [r0,:128]!
- vmul.f32 q2, q2, q2
- vst1.32 {q3}, [r3,:128]!
- subs r2, r2, #4
- bgt 1b
- vpadd.f32 d6, d0, d1
- vpadd.f32 d7, d4, d5
- vadd.f32 q1, q1, q3
- vst1.32 {q1}, [r3,:128]!
- bx lr
-endfunc
-
-function ff_ps_mul_pair_single_neon, export=1
- sub r3, r3, #4
- tst r1, #8
- bne 2f
- vld1.32 {q0}, [r1,:128]!
-1:
- vld1.32 {q3}, [r2,:128]!
- vmul.f32 d4, d0, d6[0]
- vmul.f32 d5, d1, d6[1]
- vld1.32 {q1}, [r1,:128]!
- vmul.f32 d6, d2, d7[0]
- vmul.f32 d7, d3, d7[1]
- vld1.32 {q0}, [r1,:128]!
- vst1.32 {q2,q3}, [r0,:128]!
- subs r3, r3, #4
- bgt 1b
- vld1.32 {q3}, [r2,:128]!
- vmul.f32 d4, d0, d6[0]
- vmul.f32 d5, d1, d6[1]
- vld1.32 {q1}, [r1,:128]!
- vmul.f32 d6, d2, d7[0]
- vmul.f32 d7, d3, d7[1]
- vst1.32 {q2,q3}, [r0,:128]!
- bx lr
-2:
- vld1.32 {d0}, [r1,:64]!
- vld1.32 {d1,d2}, [r1,:128]!
-1:
- vld1.32 {q3}, [r2,:128]!
- vmul.f32 d4, d0, d6[0]
- vmul.f32 d5, d1, d6[1]
- vld1.32 {d0,d1}, [r1,:128]!
- vmul.f32 d6, d2, d7[0]
- vmul.f32 d7, d0, d7[1]
- vmov d0, d1
- vld1.32 {d1,d2}, [r1,:128]!
- vst1.32 {q2,q3}, [r0,:128]!
- subs r3, r3, #4
- bgt 1b
- vld1.32 {q3}, [r2,:128]!
- vmul.f32 d4, d0, d6[0]
- vmul.f32 d5, d1, d6[1]
- vld1.32 {d0}, [r1,:64]!
- vmul.f32 d6, d2, d7[0]
- vmul.f32 d7, d0, d7[1]
- vst1.32 {q2,q3}, [r0,:128]!
- bx lr
-endfunc
-
-function ff_ps_hybrid_synthesis_deint_neon, export=1
- push {r4-r8,lr}
- add r0, r0, r2, lsl #2
- add r1, r1, r2, lsl #5+1+2
- rsb r2, r2, #64
- mov r5, #64*4
- mov lr, r0
- add r4, r0, #38*64*4
- mov r12, r3
-2:
- vld1.32 {d0,d1}, [r1,:128]!
- vst1.32 {d0[0]}, [lr,:32], r5
- vst1.32 {d0[1]}, [r4,:32], r5
- vst1.32 {d1[0]}, [lr,:32], r5
- vst1.32 {d1[1]}, [r4,:32], r5
- subs r12, r12, #2
- bgt 2b
- add r0, r0, #4
- sub r2, r2, #1
- tst r2, #2
- bne 6f
-1:
- mov lr, r0
- add r4, r0, #38*64*4
- add r6, r1, # 32*2*4
- add r7, r1, #2*32*2*4
- add r8, r1, #3*32*2*4
- mov r12, r3
-2:
- vld1.32 {d0,d1}, [r1,:128]!
- vld1.32 {d2,d3}, [r6,:128]!
- vld1.32 {d4,d5}, [r7,:128]!
- vld1.32 {d6,d7}, [r8,:128]!
- vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
- vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
- vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
- vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
- subs r12, r12, #2
- bgt 2b
- add r0, r0, #16
- add r1, r1, #3*32*2*4
- subs r2, r2, #4
- bgt 1b
- pop {r4-r8,pc}
-6:
- mov lr, r0
- add r4, r0, #38*64*4
- add r6, r1, #32*2*4
- mov r12, r3
-2:
- vld1.32 {d0,d1}, [r1,:128]!
- vld1.32 {d2,d3}, [r6,:128]!
- vst2.32 {d0[0],d2[0]}, [lr,:64], r5
- vst2.32 {d0[1],d2[1]}, [r4,:64], r5
- vst2.32 {d1[0],d3[0]}, [lr,:64], r5
- vst2.32 {d1[1],d3[1]}, [r4,:64], r5
- subs r12, r12, #2
- bgt 2b
- add r0, r0, #8
- add r1, r1, #32*2*4
- sub r2, r2, #2
- b 1b
-endfunc
-
-function ff_ps_hybrid_analysis_neon, export=1
- vldm r1, {d19-d31}
- ldr r12, [sp]
- lsl r3, r3, #3
- vadd.f32 d16, d19, d31
- vadd.f32 d17, d20, d30
- vsub.f32 d18, d19, d31
- vsub.f32 d19, d20, d30
- vsub.f32 d0, d21, d29
- vsub.f32 d1, d22, d28
- vadd.f32 d2, d21, d29
- vadd.f32 d3, d22, d28
- vadd.f32 d20, d23, d27
- vadd.f32 d21, d24, d26
- vsub.f32 d22, d23, d27
- vsub.f32 d23, d24, d26
- vmov.i32 d6, #1<<31
- vmov.i32 d7, #0
- vmov.f32 q14, #0.0
- vmov.f32 q15, #0.0
- vtrn.32 d6, d7
- vrev64.32 q9, q9
- vrev64.32 q0, q0
- vrev64.32 q11, q11
- veor q9, q9, q3
- veor q0, q0, q3
- veor q11, q11, q3
- vld1.32 {q13}, [r2,:128]!
- vtrn.32 q8, q9
- vtrn.32 q1, q0
- vtrn.32 q10, q11
- sub r12, r12, #1
- vmla.f32 q14, q8, q13
- vld1.32 {q2}, [r2,:128]!
- vmla.f32 q15, q9, q13
-1:
- vmla.f32 q14, q1, q2
- vld1.32 {q13}, [r2,:128]!
- vmla.f32 q15, q0, q2
- vmla.f32 q14, q10, q13
- vld1.32 {q2}, [r2,:128]!
- vmla.f32 q15, q11, q13
- vld1.32 {q13}, [r2,:128]!
- vadd.f32 d6, d28, d29
- vadd.f32 d7, d30, d31
- vmov.f32 q14, #0.0
- vmov.f32 q15, #0.0
- vmla.f32 q14, q8, q13
- vpadd.f32 d6, d6, d7
- vmla.f32 q15, q9, q13
- vmla.f32 d6, d25, d4[0]
- vld1.32 {q2}, [r2,:128]!
- vst1.32 {d6}, [r0,:64], r3
- subs r12, r12, #1
- bgt 1b
- vmla.f32 q14, q1, q2
- vld1.32 {q13}, [r2,:128]!
- vmla.f32 q15, q0, q2
- vmla.f32 q14, q10, q13
- vld1.32 {q2}, [r2,:128]!
- vmla.f32 q15, q11, q13
- vadd.f32 d6, d28, d29
- vadd.f32 d7, d30, d31
- vpadd.f32 d6, d6, d7
- vmla.f32 d6, d25, d4[0]
- vst1.32 {d6}, [r0,:64], r3
- bx lr
-endfunc
-
-function ff_ps_stereo_interpolate_neon, export=1
- vld1.32 {q0}, [r2]
- vld1.32 {q14}, [r3]
- vadd.f32 q15, q14, q14
- mov r2, r0
- mov r3, r1
- ldr r12, [sp]
- vadd.f32 q1, q0, q14
- vadd.f32 q0, q0, q15
- vld1.32 {q2}, [r0,:64]!
- vld1.32 {q3}, [r1,:64]!
- subs r12, r12, #1
- beq 2f
-1:
- vmul.f32 d16, d4, d2[0]
- vmul.f32 d17, d5, d0[0]
- vmul.f32 d18, d4, d2[1]
- vmul.f32 d19, d5, d0[1]
- vmla.f32 d16, d6, d3[0]
- vmla.f32 d17, d7, d1[0]
- vmla.f32 d18, d6, d3[1]
- vmla.f32 d19, d7, d1[1]
- vadd.f32 q1, q1, q15
- vadd.f32 q0, q0, q15
- vld1.32 {q2}, [r0,:64]!
- vld1.32 {q3}, [r1,:64]!
- vst1.32 {q8}, [r2,:64]!
- vst1.32 {q9}, [r3,:64]!
- subs r12, r12, #2
- bgt 1b
- it lt
- bxlt lr
-2:
- vmul.f32 d16, d4, d2[0]
- vmul.f32 d18, d4, d2[1]
- vmla.f32 d16, d6, d3[0]
- vmla.f32 d18, d6, d3[1]
- vst1.32 {d16}, [r2,:64]!
- vst1.32 {d18}, [r3,:64]!
- bx lr
-endfunc
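Of the routines deleted above, ff_ps_add_squares_neon is the easiest to characterize: the vmul.f32 squares four floats (two complex samples) and the vpadd pairwise adds produce re^2 + im^2 per sample, accumulated into dst. A plain-C sketch of the same computation (illustrative name):

    static void ps_add_squares_ref(float *dst, const float (*src)[2], int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
    }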
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_arm.S b/ffmpeg/libavcodec/arm/ac3dsp_arm.S
deleted file mode 100644
index 1aea190..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_arm.S
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_ac3_update_bap_counts_arm, export=1
- push {lr}
- ldrb lr, [r1], #1
-1:
- lsl r3, lr, #1
- ldrh r12, [r0, r3]
- subs r2, r2, #1
- it gt
- ldrbgt lr, [r1], #1
- add r12, r12, #1
- strh r12, [r0, r3]
- bgt 1b
- pop {pc}
-endfunc
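ff_ac3_update_bap_counts_arm above is a histogram update: each bap byte indexes a 16-entry array of halfword counters, hence the lsl #1 before the ldrh/strh pair. Equivalent C, as a sketch with an illustrative name:

    #include <stdint.h>

    static void update_bap_counts_ref(uint16_t mant_cnt[16], const uint8_t *bap,
                                      int len)
    {
        while (len-- > 0)
            mant_cnt[*bap++]++;   /* count how often each bap value occurs */
    }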
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
deleted file mode 100644
index 1d2563d..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_ac3_bit_alloc_calc_bap_armv6, export=1
- ldr r12, [sp]
- cmp r12, #-960
- beq 4f
- push {r4-r11,lr}
- add r5, sp, #40
- movrelx r4, X(ff_ac3_bin_to_band_tab), r11
- movrelx lr, X(ff_ac3_band_start_tab)
- ldm r5, {r5-r7}
- ldrb r4, [r4, r2]
- add r1, r1, r2, lsl #1 @ psd + start
- add r0, r0, r4, lsl #1 @ mask + band
- add r4, r4, lr
- add r7, r7, r2 @ bap + start
-1:
- ldrsh r9, [r0], #2 @ mask[band]
- mov r8, #0xff0
- sub r9, r9, r12 @ - snr_offset
- ldrb r10, [r4, #1]! @ band_start_tab[++band]
- subs r9, r9, r5 @ - floor
- it lt
- movlt r9, #0
- cmp r10, r3 @ - end
- and r9, r9, r8, lsl #1 @ & 0x1fe0
- ite gt
- subgt r8, r3, r2
- suble r8, r10, r2
- mov r2, r10
- add r9, r9, r5 @ + floor => m
- tst r8, #1
- add r11, r7, r8
- bne 3f
- b 5f
-2:
- ldrsh r8, [r1], #2
- ldrsh lr, [r1], #2
- sub r8, r8, r9
- sub lr, lr, r9
- usat r8, #6, r8, asr #5 @ address
- usat lr, #6, lr, asr #5
- ldrb r8, [r6, r8] @ bap_tab[address]
- ldrb lr, [r6, lr]
- strb r8, [r7], #1 @ bap[bin]
- strb lr, [r7], #1
-5: cmp r7, r11
- blo 2b
- cmp r3, r10
- bgt 1b
- pop {r4-r11,pc}
-3:
- ldrsh r8, [r1], #2 @ psd[bin]
- sub r8, r8, r9 @ - m
- usat r8, #6, r8, asr #5 @ address
- ldrb r8, [r6, r8] @ bap_tab[address]
- strb r8, [r7], #1 @ bap[bin]
- b 5b
-4:
- ldr r0, [sp, #12]
- mov r1, #0
- mov r2, #256
- b X(memset)
-endfunc
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
deleted file mode 100644
index a3c32ff..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/arm/cpu.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/ac3dsp.h"
-#include "config.h"
-
-void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
-int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len);
-void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift);
-void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift);
-void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
-void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
-void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
- const int16_t *window, unsigned n);
-void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
- const int32_t *coef0,
- const int32_t *coef1,
- int len);
-void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
- const float *coef0,
- const float *coef1,
- int len);
-
-void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
- int start, int end,
- int snr_offset, int floor,
- const uint8_t *bap_tab, uint8_t *bap);
-
-void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
-
-av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
-{
- int cpu_flags = av_get_cpu_flags();
-
- c->update_bap_counts = ff_ac3_update_bap_counts_arm;
-
- if (have_armv6(cpu_flags)) {
- c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6;
- }
-
- if (have_neon(cpu_flags)) {
- c->ac3_exponent_min = ff_ac3_exponent_min_neon;
- c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_neon;
- c->ac3_lshift_int16 = ff_ac3_lshift_int16_neon;
- c->ac3_rshift_int32 = ff_ac3_rshift_int32_neon;
- c->float_to_fixed24 = ff_float_to_fixed24_neon;
- c->extract_exponents = ff_ac3_extract_exponents_neon;
- c->apply_window_int16 = ff_apply_window_int16_neon;
- c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
- c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
- }
-}
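Among the NEON routines registered above, ff_float_to_fixed24_neon converts floats to Q24 fixed point; its vcvt.s32.f32 with 24 fraction bits scales by 2^24 and truncates toward zero. A portable sketch of that behaviour (assumption: the truncating cast matches the NEON rounding; a round-to-nearest fallback via lrintf could differ in the last bit):

    #include <stdint.h>

    static void float_to_fixed24_ref(int32_t *dst, const float *src, unsigned len)
    {
        while (len-- > 0)
            *dst++ = (int32_t)(*src++ * 16777216.0f);   /* * 2^24, truncates */
    }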
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_neon.S b/ffmpeg/libavcodec/arm/ac3dsp_neon.S
deleted file mode 100644
index 89d0ae8..0000000
--- a/ffmpeg/libavcodec/arm/ac3dsp_neon.S
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_ac3_max_msb_abs_int16_neon, export=1
- vmov.i16 q0, #0
- vmov.i16 q2, #0
-1: vld1.16 {q1}, [r0,:128]!
- vabs.s16 q1, q1
- vld1.16 {q3}, [r0,:128]!
- vabs.s16 q3, q3
- vorr q0, q0, q1
- vorr q2, q2, q3
- subs r1, r1, #16
- bgt 1b
- vorr q0, q0, q2
- vorr d0, d0, d1
- vpmax.u16 d0, d0, d0
- vpmax.u16 d0, d0, d0
- vmov.u16 r0, d0[0]
- bx lr
-endfunc
-
-function ff_ac3_exponent_min_neon, export=1
- cmp r1, #0
- it eq
- bxeq lr
- push {lr}
- mov r12, #256
-1:
- vld1.8 {q0}, [r0,:128]
- mov lr, r1
- add r3, r0, #256
-2: vld1.8 {q1}, [r3,:128], r12
- subs lr, lr, #1
- vmin.u8 q0, q0, q1
- bgt 2b
- subs r2, r2, #16
- vst1.8 {q0}, [r0,:128]!
- bgt 1b
- pop {pc}
-endfunc
-
-function ff_ac3_lshift_int16_neon, export=1
- vdup.16 q0, r2
-1: vld1.16 {q1}, [r0,:128]
- vshl.s16 q1, q1, q0
- vst1.16 {q1}, [r0,:128]!
- subs r1, r1, #8
- bgt 1b
- bx lr
-endfunc
-
-function ff_ac3_rshift_int32_neon, export=1
- rsb r2, r2, #0
- vdup.32 q0, r2
-1: vld1.32 {q1}, [r0,:128]
- vshl.s32 q1, q1, q0
- vst1.32 {q1}, [r0,:128]!
- subs r1, r1, #4
- bgt 1b
- bx lr
-endfunc
-
-function ff_float_to_fixed24_neon, export=1
-1: vld1.32 {q0-q1}, [r1,:128]!
- vcvt.s32.f32 q0, q0, #24
- vld1.32 {q2-q3}, [r1,:128]!
- vcvt.s32.f32 q1, q1, #24
- vcvt.s32.f32 q2, q2, #24
- vst1.32 {q0-q1}, [r0,:128]!
- vcvt.s32.f32 q3, q3, #24
- vst1.32 {q2-q3}, [r0,:128]!
- subs r2, r2, #16
- bgt 1b
- bx lr
-endfunc
-
-function ff_ac3_extract_exponents_neon, export=1
- vmov.i32 q15, #8
-1:
- vld1.32 {q0}, [r1,:128]!
- vabs.s32 q1, q0
- vclz.i32 q3, q1
- vsub.i32 q3, q3, q15
- vmovn.i32 d6, q3
- vmovn.i16 d6, q3
- vst1.32 {d6[0]}, [r0,:32]!
- subs r2, r2, #4
- bgt 1b
- bx lr
-endfunc
-
-function ff_apply_window_int16_neon, export=1
- push {r4,lr}
- add r4, r1, r3, lsl #1
- add lr, r0, r3, lsl #1
- sub r4, r4, #16
- sub lr, lr, #16
- mov r12, #-16
-1:
- vld1.16 {q0}, [r1,:128]!
- vld1.16 {q2}, [r2,:128]!
- vld1.16 {q1}, [r4,:128], r12
- vrev64.16 q3, q2
- vqrdmulh.s16 q0, q0, q2
- vqrdmulh.s16 d2, d2, d7
- vqrdmulh.s16 d3, d3, d6
- vst1.16 {q0}, [r0,:128]!
- vst1.16 {q1}, [lr,:128], r12
- subs r3, r3, #16
- bgt 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_ac3_sum_square_butterfly_int32_neon, export=1
- vmov.i64 q0, #0
- vmov.i64 q1, #0
- vmov.i64 q2, #0
- vmov.i64 q3, #0
-1:
- vld1.32 {d16}, [r1]!
- vld1.32 {d17}, [r2]!
- vadd.s32 d18, d16, d17
- vsub.s32 d19, d16, d17
- vmlal.s32 q0, d16, d16
- vmlal.s32 q1, d17, d17
- vmlal.s32 q2, d18, d18
- vmlal.s32 q3, d19, d19
- subs r3, r3, #2
- bgt 1b
- vadd.s64 d0, d0, d1
- vadd.s64 d1, d2, d3
- vadd.s64 d2, d4, d5
- vadd.s64 d3, d6, d7
- vst1.64 {q0-q1}, [r0]
- bx lr
-endfunc
-
-function ff_ac3_sum_square_butterfly_float_neon, export=1
- vmov.f32 q0, #0.0
- vmov.f32 q1, #0.0
-1:
- vld1.32 {d16}, [r1]!
- vld1.32 {d17}, [r2]!
- vadd.f32 d18, d16, d17
- vsub.f32 d19, d16, d17
- vmla.f32 d0, d16, d16
- vmla.f32 d1, d17, d17
- vmla.f32 d2, d18, d18
- vmla.f32 d3, d19, d19
- subs r3, r3, #2
- bgt 1b
- vpadd.f32 d0, d0, d1
- vpadd.f32 d1, d2, d3
- vst1.32 {q0}, [r0]
- bx lr
-endfunc
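For orientation, ff_ac3_exponent_min_neon above takes, for each coefficient, the minimum exponent across the reused blocks that follow; consecutive blocks sit 256 bytes apart, which is the r12 stride in the inner loop. A scalar reading of the same loop (illustrative name):

    #include <stdint.h>

    static void ac3_exponent_min_ref(uint8_t *exp, int num_reuse_blocks,
                                     int nb_coefs)
    {
        for (int i = 0; i < nb_coefs; i++) {
            uint8_t m = exp[i];
            for (int b = 1; b <= num_reuse_blocks; b++)
                if (exp[b * 256 + i] < m)     /* blocks are 256 bytes apart */
                    m = exp[b * 256 + i];
            exp[i] = m;
        }
    }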
diff --git a/ffmpeg/libavcodec/arm/asm-offsets.h b/ffmpeg/libavcodec/arm/asm-offsets.h
deleted file mode 100644
index 5cfc5cb..0000000
--- a/ffmpeg/libavcodec/arm/asm-offsets.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_ASM_OFFSETS_H
-#define AVCODEC_ARM_ASM_OFFSETS_H
-
-#ifndef __ASSEMBLER__
-#include <stddef.h>
-#define CHK_OFFS(s, m, o) struct check_##o { \
- int x_##o[offsetof(s, m) == o? 1: -1]; \
- }
-#endif
-
-/* MpegEncContext */
-#define Y_DC_SCALE 0xa8
-#define C_DC_SCALE 0xac
-#define AC_PRED 0xb0
-#define BLOCK_LAST_INDEX 0xb4
-#define H263_AIC 0xe4
-#define INTER_SCANTAB_RASTER_END 0x12c
-
-#endif /* AVCODEC_ARM_ASM_OFFSETS_H */
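The CHK_OFFS macro in the header above turns a hard-coded structure offset into a compile-time assertion: when the offset is wrong, the array size evaluates to -1 and the translation unit fails to build. A self-contained toy showing the mechanism (the struct and constant here are invented for illustration):

    #include <stddef.h>

    #define CHK_OFFS(s, m, o) struct check_##o { \
                int x_##o[offsetof(s, m) == o ? 1 : -1]; \
            }

    struct toy { char pad[8]; int field; };   /* hypothetical layout */
    #define TOY_FIELD 8
    CHK_OFFS(struct toy, field, TOY_FIELD);   /* compiles iff offset == 8 */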
diff --git a/ffmpeg/libavcodec/arm/dca.h b/ffmpeg/libavcodec/arm/dca.h
deleted file mode 100644
index 35971a8..0000000
--- a/ffmpeg/libavcodec/arm/dca.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_DCA_H
-#define AVCODEC_ARM_DCA_H
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavcodec/mathops.h"
-
-#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
-
-#define decode_blockcodes decode_blockcodes
-static inline int decode_blockcodes(int code1, int code2, int levels,
- int32_t *values)
-{
- int32_t v0, v1, v2, v3, v4, v5;
-
- __asm__ ("smmul %0, %6, %10 \n"
- "smmul %3, %7, %10 \n"
- "smlabb %6, %0, %9, %6 \n"
- "smlabb %7, %3, %9, %7 \n"
- "smmul %1, %0, %10 \n"
- "smmul %4, %3, %10 \n"
- "sub %6, %6, %8, lsr #1 \n"
- "sub %7, %7, %8, lsr #1 \n"
- "smlabb %0, %1, %9, %0 \n"
- "smlabb %3, %4, %9, %3 \n"
- "smmul %2, %1, %10 \n"
- "smmul %5, %4, %10 \n"
- "str %6, [%11, #0] \n"
- "str %7, [%11, #16] \n"
- "sub %0, %0, %8, lsr #1 \n"
- "sub %3, %3, %8, lsr #1 \n"
- "smlabb %1, %2, %9, %1 \n"
- "smlabb %4, %5, %9, %4 \n"
- "smmul %6, %2, %10 \n"
- "smmul %7, %5, %10 \n"
- "str %0, [%11, #4] \n"
- "str %3, [%11, #20] \n"
- "sub %1, %1, %8, lsr #1 \n"
- "sub %4, %4, %8, lsr #1 \n"
- "smlabb %2, %6, %9, %2 \n"
- "smlabb %5, %7, %9, %5 \n"
- "str %1, [%11, #8] \n"
- "str %4, [%11, #24] \n"
- "sub %2, %2, %8, lsr #1 \n"
- "sub %5, %5, %8, lsr #1 \n"
- "str %2, [%11, #12] \n"
- "str %5, [%11, #28] \n"
- : "=&r"(v0), "=&r"(v1), "=&r"(v2),
- "=&r"(v3), "=&r"(v4), "=&r"(v5),
- "+&r"(code1), "+&r"(code2)
- : "r"(levels - 1), "r"(-levels),
- "r"(ff_inverse[levels]), "r"(values)
- : "memory");
-
- return code1 | code2;
-}
-
-#endif
-
-#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
-
-#define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
-{
- __asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
- "vld1.8 {d0}, [%1,:64] \n"
- "vmovl.s8 q0, d0 \n"
- "vmovl.s16 q1, d1 \n"
- "vmovl.s16 q0, d0 \n"
- "vcvt.f32.s32 q0, q0 \n"
- "vcvt.f32.s32 q1, q1 \n"
- "vmul.f32 q0, q0, %y2 \n"
- "vmul.f32 q1, q1, %y2 \n"
- "vst1.32 {q0-q1}, [%m0,:128] \n"
- : "=Um"(*(float (*)[8])dst)
- : "r"(src), "x"(scale)
- : "d0", "d1", "d2", "d3");
-}
-
-#endif
-
-#endif /* AVCODEC_ARM_DCA_H */
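int8x8_fmul_int32 above converts eight signed bytes to floats and scales them; the leading vcvt.f32.s32 with 4 fraction bits means the integer scale argument is interpreted as a Q4 fixed-point value. A plain-C equivalent sketch (illustrative name):

    #include <stdint.h>

    static void int8x8_fmul_int32_ref(float *dst, const int8_t *src, int scale)
    {
        float s = scale / 16.0f;   /* vcvt.f32.s32 #4: scale is Q4 fixed point */
        for (int i = 0; i < 8; i++)
            dst[i] = src[i] * s;
    }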
diff --git a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
deleted file mode 100644
index 8893f48..0000000
--- a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "libavutil/arm/cpu.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/dcadsp.h"
-
-void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
- int decifactor, float scale);
-void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
- SynthFilterContext *synth, FFTContext *imdct,
- float synth_buf_ptr[512],
- int *synth_buf_offset, float synth_buf2[32],
- const float window[512], float *samples_out,
- float raXin[32], float scale);
-void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
- int decifactor, float scale);
-
-void ff_synth_filter_float_vfp(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
-void ff_synth_filter_float_neon(FFTContext *imdct,
- float *synth_buf_ptr, int *synth_buf_offset,
- float synth_buf2[32], const float window[512],
- float out[32], const float in[32],
- float scale);
-
-av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
- s->lfe_fir = ff_dca_lfe_fir_vfp;
- s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
- }
- if (have_neon(cpu_flags))
- s->lfe_fir = ff_dca_lfe_fir_neon;
-}
-
-av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_vfp;
- if (have_neon(cpu_flags))
- s->synth_filter_float = ff_synth_filter_float_neon;
-}
diff --git a/ffmpeg/libavcodec/arm/dcadsp_neon.S b/ffmpeg/libavcodec/arm/dcadsp_neon.S
deleted file mode 100644
index 6a6c77a..0000000
--- a/ffmpeg/libavcodec/arm/dcadsp_neon.S
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_dca_lfe_fir_neon, export=1
- push {r4-r6,lr}
-
- add r4, r0, r3, lsl #2 @ out2
- add r5, r2, #256*4-16 @ cf1
- sub r1, r1, #12
- cmp r3, #32
- ite eq
- moveq r6, #256/32
- movne r6, #256/64
-NOVFP vldr s0, [sp, #16] @ scale
- mov lr, #-16
-1:
- vmov.f32 q2, #0.0 @ v0
- vmov.f32 q3, #0.0 @ v1
- mov r12, r6
-2:
- vld1.32 {q8}, [r2,:128]! @ cf0
- vld1.32 {q9}, [r5,:128], lr @ cf1
- vld1.32 {q1}, [r1], lr @ in
- subs r12, r12, #4
- vrev64.32 q10, q8
- vmla.f32 q3, q1, q9
- vmla.f32 d4, d2, d21
- vmla.f32 d5, d3, d20
- bne 2b
-
- add r1, r1, r6, lsl #2
- subs r3, r3, #1
- vadd.f32 d4, d4, d5
- vadd.f32 d6, d6, d7
- vpadd.f32 d4, d4, d6
- vmul.f32 d5, d4, d0[0]
- vst1.32 {d5[0]}, [r0,:32]!
- vst1.32 {d5[1]}, [r4,:32]!
- bne 1b
-
- pop {r4-r6,pc}
-endfunc
diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.S b/ffmpeg/libavcodec/arm/dsputil_arm.S
deleted file mode 100644
index 586a833..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_arm.S
+++ /dev/null
@@ -1,125 +0,0 @@
-@
-@ ARMv4 optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of FFmpeg.
-@
-@ FFmpeg is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ FFmpeg is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with FFmpeg; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-#if !HAVE_ARMV5TE_EXTERNAL
-#define pld @
-#endif
-
- .align 5
-@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
-function ff_add_pixels_clamped_arm, export=1
- push {r4-r10}
- mov r10, #8
-1:
- ldr r4, [r1] /* load dest */
- /* block[0] and block[1]*/
- ldrsh r5, [r0]
- ldrsh r7, [r0, #2]
- and r6, r4, #0xFF
- and r8, r4, #0xFF00
- add r6, r6, r5
- add r8, r7, r8, lsr #8
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- it ne
- movne r6, r5, lsr #24
- tst r8, #0x100
- it ne
- movne r8, r7, lsr #24
- mov r9, r6
- ldrsh r5, [r0, #4] /* moved from [A] */
- orr r9, r9, r8, lsl #8
- /* block[2] and block[3] */
- /* [A] */
- ldrsh r7, [r0, #6]
- and r6, r4, #0xFF0000
- and r8, r4, #0xFF000000
- add r6, r5, r6, lsr #16
- add r8, r7, r8, lsr #24
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- it ne
- movne r6, r5, lsr #24
- tst r8, #0x100
- it ne
- movne r8, r7, lsr #24
- orr r9, r9, r6, lsl #16
- ldr r4, [r1, #4] /* moved from [B] */
- orr r9, r9, r8, lsl #24
- /* store dest */
- ldrsh r5, [r0, #8] /* moved form [C] */
- str r9, [r1]
-
- /* load dest */
- /* [B] */
- /* block[4] and block[5] */
- /* [C] */
- ldrsh r7, [r0, #10]
- and r6, r4, #0xFF
- and r8, r4, #0xFF00
- add r6, r6, r5
- add r8, r7, r8, lsr #8
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- it ne
- movne r6, r5, lsr #24
- tst r8, #0x100
- it ne
- movne r8, r7, lsr #24
- mov r9, r6
- ldrsh r5, [r0, #12] /* moved from [D] */
- orr r9, r9, r8, lsl #8
- /* block[6] and block[7] */
- /* [D] */
- ldrsh r7, [r0, #14]
- and r6, r4, #0xFF0000
- and r8, r4, #0xFF000000
- add r6, r5, r6, lsr #16
- add r8, r7, r8, lsr #24
- mvn r5, r5
- mvn r7, r7
- tst r6, #0x100
- it ne
- movne r6, r5, lsr #24
- tst r8, #0x100
- it ne
- movne r8, r7, lsr #24
- orr r9, r9, r6, lsl #16
- add r0, r0, #16 /* moved from [E] */
- orr r9, r9, r8, lsl #24
- subs r10, r10, #1 /* moved from [F] */
- /* store dest */
- str r9, [r1, #4]
-
- /* [E] */
- /* [F] */
- add r1, r1, r2
- bne 1b
-
- pop {r4-r10}
- bx lr
-endfunc
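The ARMv4 routine above adds an 8x8 block of int16 residuals to 8-bit destination pixels with saturation, packing four pixels per word and clamping via the tst/movne trick. Reference C for the same operation, matching the int16_t block / uint8_t dest / stride signature (illustrative name):

    #include <stdint.h>

    static void add_pixels_clamped_ref(const int16_t *block, uint8_t *dest,
                                       int stride)
    {
        for (int y = 0; y < 8; y++) {
            for (int x = 0; x < 8; x++) {
                int v = dest[x] + block[x];
                dest[x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* clamp to 8 bits */
            }
            block += 8;
            dest  += stride;
        }
    }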
diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.h b/ffmpeg/libavcodec/arm/dsputil_arm.h
deleted file mode 100644
index b7b5bdc..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_arm.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_DSPUTIL_H
-#define AVCODEC_ARM_DSPUTIL_H
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/dsputil.h"
-
-void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);
-void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
-
-#endif /* AVCODEC_ARM_DSPUTIL_H */
diff --git a/ffmpeg/libavcodec/arm/dsputil_armv6.S b/ffmpeg/libavcodec/arm/dsputil_armv6.S
deleted file mode 100644
index 6ec238b..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_armv6.S
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_add_pixels_clamped_armv6, export=1
- push {r4-r8,lr}
- mov r3, #8
-1:
- ldm r0!, {r4,r5,r12,lr}
- ldrd r6, r7, [r1]
- pkhbt r8, r4, r5, lsl #16
- pkhtb r5, r5, r4, asr #16
- pkhbt r4, r12, lr, lsl #16
- pkhtb lr, lr, r12, asr #16
- pld [r1, r2]
- uxtab16 r8, r8, r6
- uxtab16 r5, r5, r6, ror #8
- uxtab16 r4, r4, r7
- uxtab16 lr, lr, r7, ror #8
- usat16 r8, #8, r8
- usat16 r5, #8, r5
- usat16 r4, #8, r4
- usat16 lr, #8, lr
- orr r6, r8, r5, lsl #8
- orr r7, r4, lr, lsl #8
- subs r3, r3, #1
- strd_post r6, r7, r1, r2
- bgt 1b
- pop {r4-r8,pc}
-endfunc
-
-function ff_get_pixels_armv6, export=1
- pld [r1, r2]
- push {r4-r8, lr}
- mov lr, #8
-1:
- ldrd_post r4, r5, r1, r2
- subs lr, lr, #1
- uxtb16 r6, r4
- uxtb16 r4, r4, ror #8
- uxtb16 r12, r5
- uxtb16 r8, r5, ror #8
- pld [r1, r2]
- pkhbt r5, r6, r4, lsl #16
- pkhtb r6, r4, r6, asr #16
- pkhbt r7, r12, r8, lsl #16
- pkhtb r12, r8, r12, asr #16
- stm r0!, {r5,r6,r7,r12}
- bgt 1b
-
- pop {r4-r8, pc}
-endfunc
-
-function ff_diff_pixels_armv6, export=1
- pld [r1, r3]
- pld [r2, r3]
- push {r4-r9, lr}
- mov lr, #8
-1:
- ldrd_post r4, r5, r1, r3
- ldrd_post r6, r7, r2, r3
- uxtb16 r8, r4
- uxtb16 r4, r4, ror #8
- uxtb16 r9, r6
- uxtb16 r6, r6, ror #8
- pld [r1, r3]
- ssub16 r9, r8, r9
- ssub16 r6, r4, r6
- uxtb16 r8, r5
- uxtb16 r5, r5, ror #8
- pld [r2, r3]
- pkhbt r4, r9, r6, lsl #16
- pkhtb r6, r6, r9, asr #16
- uxtb16 r9, r7
- uxtb16 r7, r7, ror #8
- ssub16 r9, r8, r9
- ssub16 r5, r5, r7
- subs lr, lr, #1
- pkhbt r8, r9, r5, lsl #16
- pkhtb r9, r5, r9, asr #16
- stm r0!, {r4,r6,r8,r9}
- bgt 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_pix_abs16_armv6, export=1
- ldr r0, [sp]
- push {r4-r9, lr}
- mov r12, #0
- mov lr, #0
- ldm r1, {r4-r7}
- ldr r8, [r2]
-1:
- ldr r9, [r2, #4]
- pld [r1, r3]
- usada8 r12, r4, r8, r12
- ldr r8, [r2, #8]
- pld [r2, r3]
- usada8 lr, r5, r9, lr
- ldr r9, [r2, #12]
- usada8 r12, r6, r8, r12
- subs r0, r0, #1
- usada8 lr, r7, r9, lr
- beq 2f
- add r1, r1, r3
- ldm r1, {r4-r7}
- add r2, r2, r3
- ldr r8, [r2]
- b 1b
-2:
- add r0, r12, lr
- pop {r4-r9, pc}
-endfunc
-
-function ff_pix_abs16_x2_armv6, export=1
- ldr r12, [sp]
- push {r4-r11, lr}
- mov r0, #0
- mov lr, #1
- orr lr, lr, lr, lsl #8
- orr lr, lr, lr, lsl #16
-1:
- ldr r8, [r2]
- ldr r9, [r2, #4]
- lsr r10, r8, #8
- ldr r4, [r1]
- lsr r6, r9, #8
- orr r10, r10, r9, lsl #24
- ldr r5, [r2, #8]
- eor r11, r8, r10
- uhadd8 r7, r8, r10
- orr r6, r6, r5, lsl #24
- and r11, r11, lr
- uadd8 r7, r7, r11
- ldr r8, [r1, #4]
- usada8 r0, r4, r7, r0
- eor r7, r9, r6
- lsr r10, r5, #8
- and r7, r7, lr
- uhadd8 r4, r9, r6
- ldr r6, [r2, #12]
- uadd8 r4, r4, r7
- pld [r1, r3]
- orr r10, r10, r6, lsl #24
- usada8 r0, r8, r4, r0
- ldr r4, [r1, #8]
- eor r11, r5, r10
- ldrb r7, [r2, #16]
- and r11, r11, lr
- uhadd8 r8, r5, r10
- ldr r5, [r1, #12]
- uadd8 r8, r8, r11
- pld [r2, r3]
- lsr r10, r6, #8
- usada8 r0, r4, r8, r0
- orr r10, r10, r7, lsl #24
- subs r12, r12, #1
- eor r11, r6, r10
- add r1, r1, r3
- uhadd8 r9, r6, r10
- and r11, r11, lr
- uadd8 r9, r9, r11
- add r2, r2, r3
- usada8 r0, r5, r9, r0
- bgt 1b
-
- pop {r4-r11, pc}
-endfunc
-
-.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
- ldr \n0, [r2]
- eor \n1, \p0, \n0
- uhadd8 \p0, \p0, \n0
- and \n1, \n1, lr
- ldr \n2, [r1]
- uadd8 \p0, \p0, \n1
- ldr \n1, [r2, #4]
- usada8 r0, \p0, \n2, r0
- pld [r1, r3]
- eor \n3, \p1, \n1
- uhadd8 \p1, \p1, \n1
- and \n3, \n3, lr
- ldr \p0, [r1, #4]
- uadd8 \p1, \p1, \n3
- ldr \n2, [r2, #8]
- usada8 r0, \p1, \p0, r0
- pld [r2, r3]
- eor \p0, \p2, \n2
- uhadd8 \p2, \p2, \n2
- and \p0, \p0, lr
- ldr \p1, [r1, #8]
- uadd8 \p2, \p2, \p0
- ldr \n3, [r2, #12]
- usada8 r0, \p2, \p1, r0
- eor \p1, \p3, \n3
- uhadd8 \p3, \p3, \n3
- and \p1, \p1, lr
- ldr \p0, [r1, #12]
- uadd8 \p3, \p3, \p1
- add r1, r1, r3
- usada8 r0, \p3, \p0, r0
- add r2, r2, r3
-.endm
-
-function ff_pix_abs16_y2_armv6, export=1
- pld [r1]
- pld [r2]
- ldr r12, [sp]
- push {r4-r11, lr}
- mov r0, #0
- mov lr, #1
- orr lr, lr, lr, lsl #8
- orr lr, lr, lr, lsl #16
- ldr r4, [r2]
- ldr r5, [r2, #4]
- ldr r6, [r2, #8]
- ldr r7, [r2, #12]
- add r2, r2, r3
-1:
- usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
- subs r12, r12, #2
- usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
- bgt 1b
-
- pop {r4-r11, pc}
-endfunc
-
-function ff_pix_abs8_armv6, export=1
- pld [r2, r3]
- ldr r12, [sp]
- push {r4-r9, lr}
- mov r0, #0
- mov lr, #0
- ldrd_post r4, r5, r1, r3
-1:
- subs r12, r12, #2
- ldr r7, [r2, #4]
- ldr_post r6, r2, r3
- ldrd_post r8, r9, r1, r3
- usada8 r0, r4, r6, r0
- pld [r2, r3]
- usada8 lr, r5, r7, lr
- ldr r7, [r2, #4]
- ldr_post r6, r2, r3
- beq 2f
- ldrd_post r4, r5, r1, r3
- usada8 r0, r8, r6, r0
- pld [r2, r3]
- usada8 lr, r9, r7, lr
- b 1b
-2:
- usada8 r0, r8, r6, r0
- usada8 lr, r9, r7, lr
- add r0, r0, lr
- pop {r4-r9, pc}
-endfunc
-
-function ff_sse16_armv6, export=1
- ldr r12, [sp]
- push {r4-r9, lr}
- mov r0, #0
-1:
- ldrd r4, r5, [r1]
- ldr r8, [r2]
- uxtb16 lr, r4
- uxtb16 r4, r4, ror #8
- uxtb16 r9, r8
- uxtb16 r8, r8, ror #8
- ldr r7, [r2, #4]
- usub16 lr, lr, r9
- usub16 r4, r4, r8
- smlad r0, lr, lr, r0
- uxtb16 r6, r5
- uxtb16 lr, r5, ror #8
- uxtb16 r8, r7
- uxtb16 r9, r7, ror #8
- smlad r0, r4, r4, r0
- ldrd r4, r5, [r1, #8]
- usub16 r6, r6, r8
- usub16 r8, lr, r9
- ldr r7, [r2, #8]
- smlad r0, r6, r6, r0
- uxtb16 lr, r4
- uxtb16 r4, r4, ror #8
- uxtb16 r9, r7
- uxtb16 r7, r7, ror #8
- smlad r0, r8, r8, r0
- ldr r8, [r2, #12]
- usub16 lr, lr, r9
- usub16 r4, r4, r7
- smlad r0, lr, lr, r0
- uxtb16 r6, r5
- uxtb16 r5, r5, ror #8
- uxtb16 r9, r8
- uxtb16 r8, r8, ror #8
- smlad r0, r4, r4, r0
- usub16 r6, r6, r9
- usub16 r5, r5, r8
- smlad r0, r6, r6, r0
- add r1, r1, r3
- add r2, r2, r3
- subs r12, r12, #1
- smlad r0, r5, r5, r0
- bgt 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_pix_norm1_armv6, export=1
- push {r4-r6, lr}
- mov r12, #16
- mov lr, #0
-1:
- ldm r0, {r2-r5}
- uxtb16 r6, r2
- uxtb16 r2, r2, ror #8
- smlad lr, r6, r6, lr
- uxtb16 r6, r3
- smlad lr, r2, r2, lr
- uxtb16 r3, r3, ror #8
- smlad lr, r6, r6, lr
- uxtb16 r6, r4
- smlad lr, r3, r3, lr
- uxtb16 r4, r4, ror #8
- smlad lr, r6, r6, lr
- uxtb16 r6, r5
- smlad lr, r4, r4, lr
- uxtb16 r5, r5, ror #8
- smlad lr, r6, r6, lr
- subs r12, r12, #1
- add r0, r0, r1
- smlad lr, r5, r5, lr
- bgt 1b
-
- mov r0, lr
- pop {r4-r6, pc}
-endfunc
-
-function ff_pix_sum_armv6, export=1
- push {r4-r7, lr}
- mov r12, #16
- mov r2, #0
- mov r3, #0
- mov lr, #0
- ldr r4, [r0]
-1:
- subs r12, r12, #1
- ldr r5, [r0, #4]
- usada8 r2, r4, lr, r2
- ldr r6, [r0, #8]
- usada8 r3, r5, lr, r3
- ldr r7, [r0, #12]
- usada8 r2, r6, lr, r2
- beq 2f
- ldr_pre r4, r0, r1
- usada8 r3, r7, lr, r3
- bgt 1b
-2:
- usada8 r3, r7, lr, r3
- add r0, r2, r3
- pop {r4-r7, pc}
-endfunc
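As one worked example from the file above, ff_sse16_armv6 computes the sum of squared differences over 16-pixel-wide rows, using uxtb16/usub16/smlad to process four bytes per instruction; the height comes in as the fifth argument on the stack, and the first argument is a context pointer the assembly ignores. A scalar sketch of the same metric (illustrative name, assumed signature):

    #include <stdint.h>

    static int sse16_ref(void *ctx, const uint8_t *pix1, const uint8_t *pix2,
                         int line_size, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++) {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return sum;
    }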
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_arm.c b/ffmpeg/libavcodec/arm/dsputil_init_arm.c
deleted file mode 100644
index 68991fa..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_arm.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * ARM optimized DSP utils
- * Copyright (c) 2001 Lionel Ulmer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "dsputil_arm.h"
-
-void ff_j_rev_dct_arm(int16_t *data);
-void ff_simple_idct_arm(int16_t *data);
-
-/* XXX: local hack */
-static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
-
-void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
- int line_size);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
- converted */
-static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_j_rev_dct_arm (block);
- ff_put_pixels_clamped(block, dest, line_size);
-}
-static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_j_rev_dct_arm (block);
- ff_add_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_simple_idct_arm (block);
- ff_put_pixels_clamped(block, dest, line_size);
-}
-static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
-{
- ff_simple_idct_arm (block);
- ff_add_pixels_clamped(block, dest, line_size);
-}
-
-av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
-{
- int cpu_flags = av_get_cpu_flags();
-
- ff_put_pixels_clamped = c->put_pixels_clamped;
- ff_add_pixels_clamped = c->add_pixels_clamped;
-
- if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_ARM) {
- c->idct_put = j_rev_dct_arm_put;
- c->idct_add = j_rev_dct_arm_add;
- c->idct = ff_j_rev_dct_arm;
- c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
- } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM) {
- c->idct_put = simple_idct_arm_put;
- c->idct_add = simple_idct_arm_add;
- c->idct = ff_simple_idct_arm;
- c->idct_permutation_type = FF_NO_IDCT_PERM;
- }
- }
-
- c->add_pixels_clamped = ff_add_pixels_clamped_arm;
-
- if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx);
- if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx);
- if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx);
-}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
deleted file mode 100644
index 841fbfa..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "dsputil_arm.h"
-
-void ff_simple_idct_armv5te(int16_t *data);
-void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
-
-av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx)
-{
- if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
- (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
- c->idct_put = ff_simple_idct_put_armv5te;
- c->idct_add = ff_simple_idct_add_armv5te;
- c->idct = ff_simple_idct_armv5te;
- c->idct_permutation_type = FF_NO_IDCT_PERM;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
deleted file mode 100644
index 8f38302..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavcodec/avcodec.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_armv6(int16_t *data);
-void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
-
-void ff_add_pixels_clamped_armv6(const int16_t *block,
- uint8_t *restrict pixels,
- int line_size);
-
-void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
-void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
- const uint8_t *s2, int stride);
-
-int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
- int line_size, int h);
-int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
- int line_size, int h);
-int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
- int line_size, int h);
-
-int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
- int line_size, int h);
-
-int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
- int line_size, int h);
-
-int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
-int ff_pix_sum_armv6(uint8_t *pix, int line_size);
-
-av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
-{
- const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-
- if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
- (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {
- c->idct_put = ff_simple_idct_put_armv6;
- c->idct_add = ff_simple_idct_add_armv6;
- c->idct = ff_simple_idct_armv6;
- c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
- }
-
- if (!high_bit_depth)
- c->get_pixels = ff_get_pixels_armv6;
- c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
- c->diff_pixels = ff_diff_pixels_armv6;
-
- c->pix_abs[0][0] = ff_pix_abs16_armv6;
- c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
- c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
-
- c->pix_abs[1][0] = ff_pix_abs8_armv6;
-
- c->sad[0] = ff_pix_abs16_armv6;
- c->sad[1] = ff_pix_abs8_armv6;
-
- c->sse[0] = ff_sse16_armv6;
-
- c->pix_norm1 = ff_pix_norm1_armv6;
- c->pix_sum = ff_pix_sum_armv6;
-}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_neon.c b/ffmpeg/libavcodec/arm/dsputil_init_neon.c
deleted file mode 100644
index c1f250a..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_init_neon.c
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/avcodec.h"
-#include "dsputil_arm.h"
-
-void ff_simple_idct_neon(int16_t *data);
-void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
-
-void ff_clear_block_neon(int16_t *block);
-void ff_clear_blocks_neon(int16_t *blocks);
-
-void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
-
-void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
- int len);
-void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
- int32_t max, unsigned int len);
-
-int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
- const int16_t *v3, int len, int mul);
-
-av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
-{
- const int high_bit_depth = avctx->bits_per_raw_sample > 8;
-
- if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
- if (avctx->idct_algo == FF_IDCT_AUTO ||
- avctx->idct_algo == FF_IDCT_SIMPLENEON) {
- c->idct_put = ff_simple_idct_put_neon;
- c->idct_add = ff_simple_idct_add_neon;
- c->idct = ff_simple_idct_neon;
- c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
- }
- }
-
- if (!high_bit_depth) {
- c->clear_block = ff_clear_block_neon;
- c->clear_blocks = ff_clear_blocks_neon;
- }
-
- c->add_pixels_clamped = ff_add_pixels_clamped_neon;
- c->put_pixels_clamped = ff_put_pixels_clamped_neon;
- c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
-
- c->vector_clipf = ff_vector_clipf_neon;
- c->vector_clip_int32 = ff_vector_clip_int32_neon;
-
- c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
- c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
-}
diff --git a/ffmpeg/libavcodec/arm/dsputil_neon.S b/ffmpeg/libavcodec/arm/dsputil_neon.S
deleted file mode 100644
index 6c8231e..0000000
--- a/ffmpeg/libavcodec/arm/dsputil_neon.S
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_clear_block_neon, export=1
- vmov.i16 q0, #0
- .rept 8
- vst1.16 {q0}, [r0,:128]!
- .endr
- bx lr
-endfunc
-
-function ff_clear_blocks_neon, export=1
- vmov.i16 q0, #0
- .rept 8*6
- vst1.16 {q0}, [r0,:128]!
- .endr
- bx lr
-endfunc
-
-function ff_put_pixels_clamped_neon, export=1
- vld1.16 {d16-d19}, [r0,:128]!
- vqmovun.s16 d0, q8
- vld1.16 {d20-d23}, [r0,:128]!
- vqmovun.s16 d1, q9
- vld1.16 {d24-d27}, [r0,:128]!
- vqmovun.s16 d2, q10
- vld1.16 {d28-d31}, [r0,:128]!
- vqmovun.s16 d3, q11
- vst1.8 {d0}, [r1,:64], r2
- vqmovun.s16 d4, q12
- vst1.8 {d1}, [r1,:64], r2
- vqmovun.s16 d5, q13
- vst1.8 {d2}, [r1,:64], r2
- vqmovun.s16 d6, q14
- vst1.8 {d3}, [r1,:64], r2
- vqmovun.s16 d7, q15
- vst1.8 {d4}, [r1,:64], r2
- vst1.8 {d5}, [r1,:64], r2
- vst1.8 {d6}, [r1,:64], r2
- vst1.8 {d7}, [r1,:64], r2
- bx lr
-endfunc
-
-function ff_put_signed_pixels_clamped_neon, export=1
- vmov.u8 d31, #128
- vld1.16 {d16-d17}, [r0,:128]!
- vqmovn.s16 d0, q8
- vld1.16 {d18-d19}, [r0,:128]!
- vqmovn.s16 d1, q9
- vld1.16 {d16-d17}, [r0,:128]!
- vqmovn.s16 d2, q8
- vld1.16 {d18-d19}, [r0,:128]!
- vadd.u8 d0, d0, d31
- vld1.16 {d20-d21}, [r0,:128]!
- vadd.u8 d1, d1, d31
- vld1.16 {d22-d23}, [r0,:128]!
- vadd.u8 d2, d2, d31
- vst1.8 {d0}, [r1,:64], r2
- vqmovn.s16 d3, q9
- vst1.8 {d1}, [r1,:64], r2
- vqmovn.s16 d4, q10
- vst1.8 {d2}, [r1,:64], r2
- vqmovn.s16 d5, q11
- vld1.16 {d24-d25}, [r0,:128]!
- vadd.u8 d3, d3, d31
- vld1.16 {d26-d27}, [r0,:128]!
- vadd.u8 d4, d4, d31
- vadd.u8 d5, d5, d31
- vst1.8 {d3}, [r1,:64], r2
- vqmovn.s16 d6, q12
- vst1.8 {d4}, [r1,:64], r2
- vqmovn.s16 d7, q13
- vst1.8 {d5}, [r1,:64], r2
- vadd.u8 d6, d6, d31
- vadd.u8 d7, d7, d31
- vst1.8 {d6}, [r1,:64], r2
- vst1.8 {d7}, [r1,:64], r2
- bx lr
-endfunc
-
-function ff_add_pixels_clamped_neon, export=1
- mov r3, r1
- vld1.8 {d16}, [r1,:64], r2
- vld1.16 {d0-d1}, [r0,:128]!
- vaddw.u8 q0, q0, d16
- vld1.8 {d17}, [r1,:64], r2
- vld1.16 {d2-d3}, [r0,:128]!
- vqmovun.s16 d0, q0
- vld1.8 {d18}, [r1,:64], r2
- vaddw.u8 q1, q1, d17
- vld1.16 {d4-d5}, [r0,:128]!
- vaddw.u8 q2, q2, d18
- vst1.8 {d0}, [r3,:64], r2
- vqmovun.s16 d2, q1
- vld1.8 {d19}, [r1,:64], r2
- vld1.16 {d6-d7}, [r0,:128]!
- vaddw.u8 q3, q3, d19
- vqmovun.s16 d4, q2
- vst1.8 {d2}, [r3,:64], r2
- vld1.8 {d16}, [r1,:64], r2
- vqmovun.s16 d6, q3
- vld1.16 {d0-d1}, [r0,:128]!
- vaddw.u8 q0, q0, d16
- vst1.8 {d4}, [r3,:64], r2
- vld1.8 {d17}, [r1,:64], r2
- vld1.16 {d2-d3}, [r0,:128]!
- vaddw.u8 q1, q1, d17
- vst1.8 {d6}, [r3,:64], r2
- vqmovun.s16 d0, q0
- vld1.8 {d18}, [r1,:64], r2
- vld1.16 {d4-d5}, [r0,:128]!
- vaddw.u8 q2, q2, d18
- vst1.8 {d0}, [r3,:64], r2
- vqmovun.s16 d2, q1
- vld1.8 {d19}, [r1,:64], r2
- vqmovun.s16 d4, q2
- vld1.16 {d6-d7}, [r0,:128]!
- vaddw.u8 q3, q3, d19
- vst1.8 {d2}, [r3,:64], r2
- vqmovun.s16 d6, q3
- vst1.8 {d4}, [r3,:64], r2
- vst1.8 {d6}, [r3,:64], r2
- bx lr
-endfunc
-
-function ff_vector_clipf_neon, export=1
-VFP vdup.32 q1, d0[1]
-VFP vdup.32 q0, d0[0]
-NOVFP vdup.32 q0, r2
-NOVFP vdup.32 q1, r3
-NOVFP ldr r2, [sp]
- vld1.f32 {q2},[r1,:128]!
- vmin.f32 q10, q2, q1
- vld1.f32 {q3},[r1,:128]!
- vmin.f32 q11, q3, q1
-1: vmax.f32 q8, q10, q0
- vmax.f32 q9, q11, q0
- subs r2, r2, #8
- beq 2f
- vld1.f32 {q2},[r1,:128]!
- vmin.f32 q10, q2, q1
- vld1.f32 {q3},[r1,:128]!
- vmin.f32 q11, q3, q1
- vst1.f32 {q8},[r0,:128]!
- vst1.f32 {q9},[r0,:128]!
- b 1b
-2: vst1.f32 {q8},[r0,:128]!
- vst1.f32 {q9},[r0,:128]!
- bx lr
-endfunc
-
-function ff_vector_clip_int32_neon, export=1
- vdup.32 q0, r2
- vdup.32 q1, r3
- ldr r2, [sp]
-1:
- vld1.32 {q2-q3}, [r1,:128]!
- vmin.s32 q2, q2, q1
- vmin.s32 q3, q3, q1
- vmax.s32 q2, q2, q0
- vmax.s32 q3, q3, q0
- vst1.32 {q2-q3}, [r0,:128]!
- subs r2, r2, #8
- bgt 1b
- bx lr
-endfunc
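
ff_vector_clip_int32_neon above clamps eight values per iteration with vmin.s32/vmax.s32 on two q registers, which is why its len argument must be a multiple of 8. A scalar sketch of the operation (ff_vector_clipf_neon is the same idea with floats):

    #include <stdint.h>

    /* Clamp len int32 values to [min, max]; len is assumed to be
     * a multiple of 8, matching the 8-per-loop NEON code above. */
    static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
                                      int32_t min, int32_t max,
                                      unsigned int len)
    {
        for (unsigned int i = 0; i < len; i++) {
            int32_t v = src[i];
            dst[i] = v < min ? min : v > max ? max : v;
        }
    }
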
diff --git a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
deleted file mode 100644
index ef098f4..0000000
--- a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/cpu.h"
-
-#define CONFIG_FFT_FLOAT 0
-#include "libavcodec/fft.h"
-
-void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
-void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
-void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
-
-av_cold void ff_fft_fixed_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
-#if CONFIG_FFT
- s->fft_calc = ff_fft_fixed_calc_neon;
-#endif
-
-#if CONFIG_MDCT
- if (!s->inverse && s->nbits >= 3) {
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
- s->mdct_calc = ff_mdct_fixed_calc_neon;
- s->mdct_calcw = ff_mdct_fixed_calcw_neon;
- }
-#endif
- }
-}
diff --git a/ffmpeg/libavcodec/arm/fft_fixed_neon.S b/ffmpeg/libavcodec/arm/fft_fixed_neon.S
deleted file mode 100644
index d4a38a2..0000000
--- a/ffmpeg/libavcodec/arm/fft_fixed_neon.S
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro bflies d0, d1, r0, r1
- vrev64.32 \r0, \d1 @ t5, t6, t1, t2
- vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
- vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
- vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
- vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
- @ t5, t6, t4, t3
- vhsub.s16 \d1, \d0, \r0
- vhadd.s16 \d0, \d0, \r0
-.endm
-
-.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
- vrev32.16 \r0, \d3
- vmull.s16 \w0, \d3, \c0
- vmlal.s16 \w0, \r0, \c1
- vshrn.s32 \d3, \w0, #15
- bflies \q0, \q1, \w0, \w1
-.endm
-
-.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
- r0, r1, w0, w1
- vrev32.16 \r0, \d1
- vrev32.16 \r1, \d3
- vmull.s16 \w0, \d1, \c0
- vmlal.s16 \w0, \r0, \c1
- vmull.s16 \w1, \d3, \c2
- vmlal.s16 \w1, \r1, \c3
- vshrn.s32 \d1, \w0, #15
- vshrn.s32 \d3, \w1, #15
- bflies \q0, \q1, \w0, \w1
-.endm
-
-.macro fft4 d0, d1, r0, r1
- vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
- vhsub.s16 \r1, \d1, \d0
- vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
- vmov.i64 \d1, #0xffff00000000
- vbit \r0, \r1, \d1
- vrev64.16 \r1, \r0 @ t7, t8, t4, t3
- vtrn.32 \r0, \r1 @ t3, t4, t7, t8
- vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
- vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
- vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
-.endm
-
-.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
- fft4 \d0, \d1, \r0, \r1
- vtrn.32 \d0, \d1 @ z0, z2, z1, z3
- vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
- vhsub.s16 \d3, \d2, \d3 @ z5, z7
- vmov \d2, \r0
- transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
-.endm
-
-function fft4_neon
- vld1.16 {d0-d1}, [r0]
- fft4 d0, d1, d2, d3
- vst1.16 {d0-d1}, [r0]
- bx lr
-endfunc
-
-function fft8_neon
- vld1.16 {d0-d3}, [r0,:128]
- movrel r1, coefs
- vld1.16 {d30}, [r1,:64]
- vdup.16 d31, d30[0]
- fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
- vtrn.32 d0, d1
- vtrn.32 d2, d3
- vst1.16 {d0-d3}, [r0,:128]
- bx lr
-endfunc
-
-function fft16_neon
- vld1.16 {d0-d3}, [r0,:128]!
- vld1.16 {d4-d7}, [r0,:128]
- movrel r1, coefs
- sub r0, r0, #32
- vld1.16 {d28-d31},[r1,:128]
- vdup.16 d31, d28[0]
- fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
- vswp d5, d6
- fft4 q2, q3, q8, q9
- vswp d5, d6
- vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
- vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
- vswp d1, d2
- vdup.16 d31, d28[0]
- transform01 q0, q2, d5, d31, d28, d20, q8, q9
- vdup.16 d26, d29[0]
- vdup.16 d27, d30[0]
- transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
- d20, d21, q8, q9
- vtrn.32 q0, q1
- vtrn.32 q2, q3
- vst1.16 {d0-d3}, [r0,:128]!
- vst1.16 {d4-d7}, [r0,:128]
- bx lr
-endfunc
-
-function fft_pass_neon
- push {r4,lr}
- movrel lr, coefs+24
- vld1.16 {d30}, [lr,:64]
- lsl r12, r2, #3
- vmov d31, d30
- add r3, r1, r2, lsl #2
- mov lr, #-8
- sub r3, r3, #2
- mov r4, r0
- vld1.16 {d27[]}, [r3,:16]
- sub r3, r3, #6
- vld1.16 {q0}, [r4,:128], r12
- vld1.16 {q1}, [r4,:128], r12
- vld1.16 {q2}, [r4,:128], r12
- vld1.16 {q3}, [r4,:128], r12
- vld1.16 {d28}, [r1,:64]!
- vld1.16 {d29}, [r3,:64], lr
- vswp d1, d2
- vswp d5, d6
- vtrn.32 d0, d1
- vtrn.32 d4, d5
- vdup.16 d25, d28[1]
- vmul.s16 d27, d27, d31
- transform01 q0, q2, d5, d25, d27, d20, q8, q9
- b 2f
-1:
- mov r4, r0
- vdup.16 d26, d29[0]
- vld1.16 {q0}, [r4,:128], r12
- vld1.16 {q1}, [r4,:128], r12
- vld1.16 {q2}, [r4,:128], r12
- vld1.16 {q3}, [r4,:128], r12
- vld1.16 {d28}, [r1,:64]!
- vld1.16 {d29}, [r3,:64], lr
- vswp d1, d2
- vswp d5, d6
- vtrn.32 d0, d1
- vtrn.32 d4, d5
- vdup.16 d24, d28[0]
- vdup.16 d25, d28[1]
- vdup.16 d27, d29[3]
- vmul.s16 q13, q13, q15
- transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
- d16, d17, q9, q10
-2:
- vtrn.32 d2, d3
- vtrn.32 d6, d7
- vdup.16 d24, d28[2]
- vdup.16 d26, d29[2]
- vdup.16 d25, d28[3]
- vdup.16 d27, d29[1]
- vmul.s16 q13, q13, q15
- transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
- d16, d17, q9, q10
- vtrn.32 d0, d1
- vtrn.32 d2, d3
- vtrn.32 d4, d5
- vtrn.32 d6, d7
- vswp d1, d2
- vswp d5, d6
- mov r4, r0
- vst1.16 {q0}, [r4,:128], r12
- vst1.16 {q1}, [r4,:128], r12
- vst1.16 {q2}, [r4,:128], r12
- vst1.16 {q3}, [r4,:128], r12
- add r0, r0, #16
- subs r2, r2, #2
- bgt 1b
- pop {r4,pc}
-endfunc
-
-#define F_SQRT1_2 23170
-#define F_COS_16_1 30274
-#define F_COS_16_3 12540
-
-const coefs, align=4
- .short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
- .short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
- .short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
- .short 1, -1, -1, 1
-endconst
-
-.macro def_fft n, n2, n4
-function fft\n\()_neon
- push {r4, lr}
- mov r4, r0
- bl fft\n2\()_neon
- add r0, r4, #\n4*2*4
- bl fft\n4\()_neon
- add r0, r4, #\n4*3*4
- bl fft\n4\()_neon
- mov r0, r4
- pop {r4, lr}
- movrelx r1, X(ff_cos_\n\()_fixed)
- mov r2, #\n4/2
- b fft_pass_neon
-endfunc
-.endm
-
- def_fft 32, 16, 8
- def_fft 64, 32, 16
- def_fft 128, 64, 32
- def_fft 256, 128, 64
- def_fft 512, 256, 128
- def_fft 1024, 512, 256
- def_fft 2048, 1024, 512
- def_fft 4096, 2048, 1024
- def_fft 8192, 4096, 2048
- def_fft 16384, 8192, 4096
- def_fft 32768, 16384, 8192
- def_fft 65536, 32768, 16384
-
-function ff_fft_fixed_calc_neon, export=1
- ldr r2, [r0]
- sub r2, r2, #2
- movrel r3, fft_fixed_tab_neon
- ldr r3, [r3, r2, lsl #2]
- mov r0, r1
- bx r3
-endfunc
-
-const fft_fixed_tab_neon
- .word fft4_neon
- .word fft8_neon
- .word fft16_neon
- .word fft32_neon
- .word fft64_neon
- .word fft128_neon
- .word fft256_neon
- .word fft512_neon
- .word fft1024_neon
- .word fft2048_neon
- .word fft4096_neon
- .word fft8192_neon
- .word fft16384_neon
- .word fft32768_neon
- .word fft65536_neon
-endconst
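
The coefficients above are Q15 fixed point (F_SQRT1_2 = 23170, roughly 0.7071 * 2^15), and the def_fft macro expresses the usual split-radix recursion: an N-point transform is one N/2-point transform on the first half plus two N/4-point transforms on the quarters, followed by one combining pass over the twiddle table. A C sketch of that control flow; the names complex_t, fft4/fft8/fft16, fft_pass and cos_table are placeholders, not FFmpeg APIs, and the element size is 4 bytes here (two int16 per complex sample) versus 8 in the float version:

    #include <stdint.h>

    typedef struct { int16_t re, im; } complex_t;  /* 4-byte element */
    void fft4(complex_t *z);                       /* placeholders for */
    void fft8(complex_t *z);                       /* fft4_neon etc.   */
    void fft16(complex_t *z);
    void fft_pass(complex_t *z, const int16_t *w, int n8);
    const int16_t *cos_table(int n);

    /* Split-radix recursion mirroring the def_fft macro above. */
    static void fft_recursive(complex_t *z, int n)
    {
        if (n == 4)  { fft4(z);  return; }
        if (n == 8)  { fft8(z);  return; }
        if (n == 16) { fft16(z); return; }
        fft_recursive(z,             n / 2);  /* bl  fftN2_neon           */
        fft_recursive(z + n / 2,     n / 4);  /* add r0, r4, #n4*2*elsize */
        fft_recursive(z + 3 * n / 4, n / 4);  /* add r0, r4, #n4*3*elsize */
        fft_pass(z, cos_table(n), n / 8);     /* b   fft_pass_neon        */
    }
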
diff --git a/ffmpeg/libavcodec/arm/fft_init_arm.c b/ffmpeg/libavcodec/arm/fft_init_arm.c
deleted file mode 100644
index 7e49b9c..0000000
--- a/ffmpeg/libavcodec/arm/fft_init_arm.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/fft.h"
-#include "libavcodec/rdft.h"
-#include "libavcodec/synth_filter.h"
-
-void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
-void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
-
-void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
-
-void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
-
-av_cold void ff_fft_init_arm(FFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp(cpu_flags)) {
-#if CONFIG_MDCT
- if (!have_vfpv3(cpu_flags))
- s->imdct_half = ff_imdct_half_vfp;
-#endif
- }
-
- if (have_neon(cpu_flags)) {
-#if CONFIG_FFT
- s->fft_permute = ff_fft_permute_neon;
- s->fft_calc = ff_fft_calc_neon;
-#endif
-#if CONFIG_MDCT
- s->imdct_calc = ff_imdct_calc_neon;
- s->imdct_half = ff_imdct_half_neon;
- s->mdct_calc = ff_mdct_calc_neon;
- s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
-#endif
- }
-}
-
-#if CONFIG_RDFT
-av_cold void ff_rdft_init_arm(RDFTContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags))
- s->rdft_calc = ff_rdft_calc_neon;
-}
-#endif
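
These init hooks run from inside the generic ff_fft_init()/ff_rdft_init(), so callers never select the NEON versions directly. A minimal usage sketch with the FFT API of this FFmpeg generation (error checking of ff_fft_init() omitted):

    #include "libavcodec/fft.h"

    /* In-place 1024-point forward complex FFT; ff_fft_init_arm()
     * runs inside ff_fft_init() and installs the NEON fft_permute/
     * fft_calc above when the CPU supports them. */
    static void run_fft_1024(FFTComplex buf[1024])
    {
        FFTContext ctx;
        ff_fft_init(&ctx, 10, 0);    /* nbits = 10, forward transform */
        ctx.fft_permute(&ctx, buf);  /* bit-reversal reordering       */
        ctx.fft_calc(&ctx, buf);     /* transform proper              */
        ff_fft_end(&ctx);
    }
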
diff --git a/ffmpeg/libavcodec/arm/fft_neon.S b/ffmpeg/libavcodec/arm/fft_neon.S
deleted file mode 100644
index 8b9ae2a..0000000
--- a/ffmpeg/libavcodec/arm/fft_neon.S
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * ARM NEON optimised FFT
- *
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- * Copyright (c) 2009 Naotoshi Nojiri
- *
- * This algorithm (though not any of the implementation details) is
- * based on libdjbfft by D. J. Bernstein.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-#define M_SQRT1_2 0.70710678118654752440
-
-
-function fft4_neon
- vld1.32 {d0-d3}, [r0,:128]
-
- vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
- vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
- vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
- vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
- vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
- vadd.f32 d1, d6, d7
- vsub.f32 d3, d6, d7
- vadd.f32 d0, d4, d5
- vsub.f32 d2, d4, d5
-
- vst1.32 {d0-d3}, [r0,:128]
-
- bx lr
-endfunc
-
-function fft8_neon
- mov r1, r0
- vld1.32 {d0-d3}, [r1,:128]!
- vld1.32 {d16-d19}, [r1,:128]
-
- movw r2, #0x04f3 @ sqrt(1/2)
- movt r2, #0x3f35
- eor r3, r2, #1<<31
- vdup.32 d31, r2
-
- vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
- vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
- vmov d28, r3, r2
- vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
- vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
- vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
- vrev64.32 d29, d28
- vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
- vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
- vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
- vext.32 q3, q2, q2, #1
- vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
- vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
- vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
- vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
- vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
- vadd.f32 d0, d20, d21
- vsub.f32 d2, d20, d21
- vadd.f32 d1, d22, d23
- vrev64.32 q13, q13
- vsub.f32 d3, d22, d23
- vsub.f32 d6, d6, d7
- vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
- vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
- vadd.f32 d7, d4, d5
- vsub.f32 d18, d2, d6
- vext.32 q13, q12, q12, #1
- vadd.f32 d2, d2, d6
- vsub.f32 d16, d0, d7
- vadd.f32 d5, d25, d24
- vsub.f32 d4, d26, d27
- vadd.f32 d0, d0, d7
- vsub.f32 d17, d1, d5
- vsub.f32 d19, d3, d4
- vadd.f32 d3, d3, d4
- vadd.f32 d1, d1, d5
-
- vst1.32 {d16-d19}, [r1,:128]
- vst1.32 {d0-d3}, [r0,:128]
-
- bx lr
-endfunc
-
-function fft16_neon
- movrel r1, mppm
- vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
- pld [r0, #32]
- vld1.32 {d2-d3}, [r1,:128]
- vext.32 q13, q9, q9, #1
- vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
- vadd.f32 d4, d16, d17
- vsub.f32 d5, d16, d17
- vadd.f32 d18, d18, d19
- vsub.f32 d19, d26, d27
-
- vadd.f32 d20, d22, d23
- vsub.f32 d22, d22, d23
- vsub.f32 d23, d24, d25
- vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
- vadd.f32 d21, d24, d25
- vmul.f32 d24, d22, d2
- vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
- vmul.f32 d25, d23, d3
- vuzp.32 d16, d17 @ {r0,r1,i0,i1}
- vmul.f32 q1, q11, d2[1]
- vuzp.32 d18, d19 @ {r2,r3,i2,i3}
- vrev64.32 q12, q12
- vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
- vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
- vzip.32 q10, q11
- vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
- vadd.f32 d0, d22, d20
- vadd.f32 d1, d21, d23
- vsub.f32 d2, d21, d23
- vsub.f32 d3, d22, d20
- sub r0, r0, #96
- vext.32 q13, q13, q13, #1
- vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
- vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
- vext.32 q15, q15, q15, #1
- vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
- vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
- vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
- vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
- vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
- vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
- movrelx r2, X(ff_cos_16)
- vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
- vrev64.32 d1, d1
- vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
- vrev64.32 d3, d3
- movrel r3, pmmp
- vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
- vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
- vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
- vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
- vld1.32 {d4-d5}, [r2,:64]
- vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
- vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
- vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
- vld1.32 {d6-d7}, [r3,:128]
- vrev64.32 q1, q14
- vmul.f32 q14, q14, d4[1]
- vmul.f32 q1, q1, q3
- vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
- vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
- vzip.32 q12, q14
- vadd.f32 d0, d28, d24
- vadd.f32 d1, d25, d29
- vsub.f32 d2, d25, d29
- vsub.f32 d3, d28, d24
- vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
- vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
- vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
- mov r1, #32
- vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
- vrev64.32 q0, q13
- vmul.f32 q13, q13, d5[0]
- vrev64.32 q1, q15
- vmul.f32 q15, q15, d5[1]
- vst2.32 {d16-d17},[r0,:128], r1
- vmul.f32 q0, q0, q3
- vst2.32 {d20-d21},[r0,:128], r1
- vmul.f32 q1, q1, q3
- vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
- vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
- vst2.32 {d24-d25},[r0,:128], r1
- vst2.32 {d28-d29},[r0,:128]
- vzip.32 q13, q15
- sub r0, r0, #80
- vadd.f32 d0, d30, d26
- vadd.f32 d1, d27, d31
- vsub.f32 d2, d27, d31
- vsub.f32 d3, d30, d26
- vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
- vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
- vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
- vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
- vst2.32 {d18-d19},[r0,:128], r1
- vst2.32 {d22-d23},[r0,:128], r1
- vst2.32 {d26-d27},[r0,:128], r1
- vst2.32 {d30-d31},[r0,:128]
- bx lr
-endfunc
-
-function fft_pass_neon
- push {r4-r6,lr}
- mov r6, r2 @ n
- lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
- lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
- lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
- add r3, r2, r4
- add r4, r4, r0 @ &z[o1]
- add r2, r2, r0 @ &z[o2]
- add r3, r3, r0 @ &z[o3]
- vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
- movrel r12, pmmp
- vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
- add r5, r5, r1 @ wim
- vld1.32 {d6-d7}, [r12,:128] @ pmmp
- vswp d21, d22
- vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
- sub r5, r5, #4 @ wim--
- vrev64.32 q1, q11
- vmul.f32 q11, q11, d4[1]
- vmul.f32 q1, q1, q3
- vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
- vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
- vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
- sub r6, r6, #1 @ n--
- vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
- vzip.32 q10, q11
- vadd.f32 d0, d22, d20
- vadd.f32 d1, d21, d23
- vsub.f32 d2, d21, d23
- vsub.f32 d3, d22, d20
- vsub.f32 q10, q8, q0
- vadd.f32 q8, q8, q0
- vsub.f32 q11, q9, q1
- vadd.f32 q9, q9, q1
- vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
- vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
- vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
- vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
- sub r5, r5, #8 @ wim -= 2
-1:
- vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
- vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
- vswp d21, d22
- vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
- vrev64.32 q0, q10
- vmul.f32 q10, q10, d4[0]
- vrev64.32 q1, q11
- vmul.f32 q11, q11, d4[1]
- vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
- vmul.f32 q0, q0, q3
- sub r5, r5, #8 @ wim -= 2
- vmul.f32 q1, q1, q3
- vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
- vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
- vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
- subs r6, r6, #1 @ n--
- vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
- vzip.32 q10, q11
- vadd.f32 d0, d22, d20
- vadd.f32 d1, d21, d23
- vsub.f32 d2, d21, d23
- vsub.f32 d3, d22, d20
- vsub.f32 q10, q8, q0
- vadd.f32 q8, q8, q0
- vsub.f32 q11, q9, q1
- vadd.f32 q9, q9, q1
- vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
- vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
- vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
- vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
- bne 1b
-
- pop {r4-r6,pc}
-endfunc
-
-.macro def_fft n, n2, n4
- .align 6
-function fft\n\()_neon
- push {r4, lr}
- mov r4, r0
- bl fft\n2\()_neon
- add r0, r4, #\n4*2*8
- bl fft\n4\()_neon
- add r0, r4, #\n4*3*8
- bl fft\n4\()_neon
- mov r0, r4
- pop {r4, lr}
- movrelx r1, X(ff_cos_\n)
- mov r2, #\n4/2
- b fft_pass_neon
-endfunc
-.endm
-
- def_fft 32, 16, 8
- def_fft 64, 32, 16
- def_fft 128, 64, 32
- def_fft 256, 128, 64
- def_fft 512, 256, 128
- def_fft 1024, 512, 256
- def_fft 2048, 1024, 512
- def_fft 4096, 2048, 1024
- def_fft 8192, 4096, 2048
- def_fft 16384, 8192, 4096
- def_fft 32768, 16384, 8192
- def_fft 65536, 32768, 16384
-
-function ff_fft_calc_neon, export=1
- ldr r2, [r0]
- sub r2, r2, #2
- movrel r3, fft_tab_neon
- ldr r3, [r3, r2, lsl #2]
- mov r0, r1
- bx r3
-endfunc
-
-function ff_fft_permute_neon, export=1
- push {r4,lr}
- mov r12, #1
- ldr r2, [r0] @ nbits
- ldr r3, [r0, #12] @ tmp_buf
- ldr r0, [r0, #8] @ revtab
- lsl r12, r12, r2
- mov r2, r12
-1:
- vld1.32 {d0-d1}, [r1,:128]!
- ldr r4, [r0], #4
- uxth lr, r4
- uxth r4, r4, ror #16
- add lr, r3, lr, lsl #3
- add r4, r3, r4, lsl #3
- vst1.32 {d0}, [lr,:64]
- vst1.32 {d1}, [r4,:64]
- subs r12, r12, #2
- bgt 1b
-
- sub r1, r1, r2, lsl #3
-1:
- vld1.32 {d0-d3}, [r3,:128]!
- vst1.32 {d0-d3}, [r1,:128]!
- subs r2, r2, #4
- bgt 1b
-
- pop {r4,pc}
-endfunc
-
-const fft_tab_neon
- .word fft4_neon
- .word fft8_neon
- .word fft16_neon
- .word fft32_neon
- .word fft64_neon
- .word fft128_neon
- .word fft256_neon
- .word fft512_neon
- .word fft1024_neon
- .word fft2048_neon
- .word fft4096_neon
- .word fft8192_neon
- .word fft16384_neon
- .word fft32768_neon
- .word fft65536_neon
-endconst
-
-const pmmp, align=4
- .float +1.0, -1.0, -1.0, +1.0
-endconst
-
-const mppm, align=4
- .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
-endconst
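
ff_fft_permute_neon implements the usual out-of-place bit reversal: it scatters pairs of complex samples through the context's scratch buffer using packed 16-bit revtab entries (hence the uxth / uxth ror #16 pair per 32-bit load), then copies the buffer back. A scalar sketch, assuming the standard FFTContext field types of this FFmpeg generation (uint16_t *revtab, FFTComplex *tmp_buf):

    #include <string.h>
    #include "libavcodec/fft.h"

    /* Scalar equivalent of ff_fft_permute_neon. */
    static void fft_permute_ref(FFTContext *s, FFTComplex *z)
    {
        int n = 1 << s->nbits;
        for (int i = 0; i < n; i++)
            s->tmp_buf[s->revtab[i]] = z[i];   /* scatter via revtab */
        memcpy(z, s->tmp_buf, n * sizeof(*z)); /* copy back in place */
    }
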
diff --git a/ffmpeg/libavcodec/arm/flacdsp_arm.S b/ffmpeg/libavcodec/arm/flacdsp_arm.S
deleted file mode 100644
index f8861c5..0000000
--- a/ffmpeg/libavcodec/arm/flacdsp_arm.S
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function flac_lpc_16_1_arm
- ldr r12, [sp]
- push {r4, lr}
- ldr r1, [r1]
- subs r12, r12, #2
- ldr lr, [r0], #4
- beq 2f
- it lt
- poplt {r4, pc}
-1:
- mul r4, lr, r1
- ldm r0, {r2, lr}
- add_sh r2, r2, r4, asr r3
- mul r4, r2, r1
- subs r12, r12, #2
- add_sh lr, lr, r4, asr r3
- stm r0!, {r2, lr}
- bgt 1b
- it lt
- poplt {r4, pc}
-2:
- mul r4, lr, r1
- ldr r2, [r0]
- add_sh r2, r2, r4, asr r3
- str r2, [r0]
- pop {r4, pc}
-endfunc
-
-function flac_lpc_16_2_arm
- ldr r12, [sp]
- subs r12, r12, r2
- it le
- bxle lr
-
- push {r4-r9, lr}
- ldm r0!, {r6, r7}
- ldm r1, {r8, r9}
- subs r12, r12, #1
- beq 2f
-1:
- mul r4, r6, r8
- mul r5, r7, r8
- mla r4, r7, r9, r4
- ldm r0, {r6, r7}
- add_sh r6, r6, r4, asr r3
- mla r5, r6, r9, r5
- add_sh r7, r7, r5, asr r3
- stm r0!, {r6, r7}
- subs r12, r12, #2
- bgt 1b
- it lt
- poplt {r4-r9, pc}
-2:
- mul r4, r6, r8
- mla r4, r7, r9, r4
- ldr r5, [r0]
- add_sh r5, r5, r4, asr r3
- str r5, [r0]
- pop {r4-r9, pc}
-endfunc
-
-function ff_flac_lpc_16_arm, export=1
- cmp r2, #2
- blt flac_lpc_16_1_arm
- beq flac_lpc_16_2_arm
-
- ldr r12, [sp]
- subs r12, r12, r2
- it le
- bxle lr
-
- push {r4-r9, lr}
-
- subs r12, r12, #1
- beq 3f
-1:
- sub lr, r2, #2
- mov r4, #0
- mov r5, #0
-
- ldr r7, [r0], #4
- ldr r9, [r1], #4
-2:
- mla r4, r7, r9, r4
- ldm r0!, {r6, r7}
- mla r5, r6, r9, r5
- ldm r1!, {r8, r9}
- mla r4, r6, r8, r4
- subs lr, lr, #2
- mla r5, r7, r8, r5
- bgt 2b
- blt 6f
-
- mla r4, r7, r9, r4
- ldr r7, [r0], #4
- mla r5, r7, r9, r5
- ldr r9, [r1], #4
-6:
- mla r4, r7, r9, r4
- ldm r0, {r6, r7}
- add_sh r6, r6, r4, asr r3
- mla r5, r6, r9, r5
- add_sh r7, r7, r5, asr r3
- stm r0!, {r6, r7}
- sub r0, r0, r2, lsl #2
- sub r1, r1, r2, lsl #2
-
- subs r12, r12, #2
- bgt 1b
- it lt
- poplt {r4-r9, pc}
-3:
- mov r4, #0
-4:
- ldr r5, [r1], #4
- ldr r6, [r0], #4
- mla r4, r5, r6, r4
- subs r2, r2, #1
- bgt 4b
- ldr r5, [r0]
- add_sh r5, r5, r4, asr r3
- str r5, [r0]
- pop {r4-r9, pc}
-endfunc
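
All three entry points compute the same thing: flac_lpc_16_1_arm and flac_lpc_16_2_arm are unrolled special cases for order 1 and 2 that ff_flac_lpc_16_arm branches to. A scalar sketch (my reading of the assembly): the buffer starts with `order` warm-up samples followed by residuals, and each residual gets the quantised LPC prediction added in place, which is what the add_sh macro (add with arithmetic-shift-right by qlevel) performs.

    #include <stdint.h>

    /* In-place FLAC LPC reconstruction with 32-bit accumulation;
     * the narrow accumulator is why flacdsp_init_arm.c only
     * selects this path for bps <= 16. */
    static void flac_lpc_16_ref(int32_t *samples, const int coeffs[32],
                                int order, int qlevel, int len)
    {
        for (int i = order; i < len; i++) {
            int sum = 0;
            for (int j = 0; j < order; j++)
                sum += coeffs[j] * samples[i - order + j];
            samples[i] += sum >> qlevel;  /* add_sh in the assembly */
        }
    }
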
diff --git a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
deleted file mode 100644
index 9b93942..0000000
--- a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavcodec/flacdsp.h"
-#include "config.h"
-
-void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
- int qlevel, int len);
-
-av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
- int bps)
-{
- if (bps <= 16)
- c->lpc = ff_flac_lpc_16_arm;
-}
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
deleted file mode 100644
index 37319ed..0000000
--- a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * ARM optimized Format Conversion Utils
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/fmtconvert.h"
-
-void ff_int32_to_float_fmul_scalar_neon(float *dst, const int32_t *src,
- float mul, int len);
-
-void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
- float mul, int len);
-void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
- const int32_t *src, const float *mul,
- int len);
-
-void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
-void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
-
-void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
-
-av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_vfp(cpu_flags)) {
- if (!have_vfpv3(cpu_flags)) {
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
- c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
- }
-
- if (have_armv6(cpu_flags)) {
- c->float_to_int16 = ff_float_to_int16_vfp;
- }
- }
-
- if (have_neon(cpu_flags)) {
- c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
-
- if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
- c->float_to_int16 = ff_float_to_int16_neon;
- c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
- }
- }
-}
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_neon.S b/ffmpeg/libavcodec/arm/fmtconvert_neon.S
deleted file mode 100644
index 55d070e..0000000
--- a/ffmpeg/libavcodec/arm/fmtconvert_neon.S
+++ /dev/null
@@ -1,392 +0,0 @@
-/*
- * ARM NEON optimised Format Conversion Utils
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-function ff_float_to_int16_neon, export=1
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q9, q1, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vshrn.s32 d4, q8, #16
- vld1.64 {d0-d1}, [r1,:128]!
- vcvt.s32.f32 q0, q0, #16
- vshrn.s32 d5, q9, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vld1.64 {d16-d17},[r1,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r1,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vld1.64 {d0-d1}, [r1,:128]!
- vshrn.s32 d4, q8, #16
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r1,:128]!
- vshrn.s32 d5, q9, #16
- vcvt.s32.f32 q1, q1, #16
- vshrn.s32 d6, q0, #16
- vst1.64 {d4-d5}, [r0,:128]!
- vshrn.s32 d7, q1, #16
- vst1.64 {d6-d7}, [r0,:128]!
- bx lr
-3: vshrn.s32 d4, q8, #16
- vshrn.s32 d5, q9, #16
- vst1.64 {d4-d5}, [r0,:128]!
- bx lr
-endfunc
-
-function ff_float_to_int16_interleave_neon, export=1
- cmp r3, #2
- itt lt
- ldrlt r1, [r1]
- blt ff_float_to_int16_neon
- bne 4f
-
- ldr r3, [r1]
- ldr r1, [r1, #4]
-
- subs r2, r2, #8
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q8, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q9, q1, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 3f
- bics ip, r2, #15
- beq 2f
-1: subs ip, ip, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q10, q8, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vld1.64 {d26-d27},[r1,:128]!
- vsri.32 q11, q9, #16
- vst1.64 {d20-d21},[r0,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q12, q0, #16
- vld1.64 {d16-d17},[r3,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d25},[r0,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r3,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r1,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r1,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d26-d27},[r0,:128]!
- bne 1b
- ands r2, r2, #15
- beq 3f
-2: vsri.32 q10, q8, #16
- vld1.64 {d0-d1}, [r3,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r3,:128]!
- vcvt.s32.f32 q1, q1, #16
- vld1.64 {d24-d25},[r1,:128]!
- vcvt.s32.f32 q12, q12, #16
- vsri.32 q11, q9, #16
- vld1.64 {d26-d27},[r1,:128]!
- vcvt.s32.f32 q13, q13, #16
- vst1.64 {d20-d21},[r0,:128]!
- vsri.32 q12, q0, #16
- vst1.64 {d22-d23},[r0,:128]!
- vsri.32 q13, q1, #16
- vst1.64 {d24-d27},[r0,:128]!
- bx lr
-3: vsri.32 q10, q8, #16
- vsri.32 q11, q9, #16
- vst1.64 {d20-d23},[r0,:128]!
- bx lr
-
-4: push {r4-r8,lr}
- cmp r3, #4
- lsl ip, r3, #1
- blt 4f
-
- @ 4 channels
-5: ldmia r1!, {r4-r7}
- mov lr, r2
- mov r8, r0
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #8
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 q9, q8, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 q11, q10, #16
- vld1.64 {d4-d5}, [r6,:128]!
- vcvt.s32.f32 q2, q2, #16
- vzip.32 d18, d22
- vld1.64 {d6-d7}, [r7,:128]!
- vcvt.s32.f32 q3, q3, #16
- vzip.32 d19, d23
- vst1.64 {d18}, [r8], ip
- vsri.32 q1, q0, #16
- vst1.64 {d22}, [r8], ip
- vsri.32 q3, q2, #16
- vst1.64 {d19}, [r8], ip
- vzip.32 d2, d6
- vst1.64 {d23}, [r8], ip
- vzip.32 d3, d7
- beq 7f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.64 {d2}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.64 {d6}, [r8], ip
- vld1.64 {d20-d21},[r6,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.64 {d3}, [r8], ip
- vld1.64 {d22-d23},[r7,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.64 {d7}, [r8], ip
- b 6b
-7: vst1.64 {d2}, [r8], ip
- vst1.64 {d6}, [r8], ip
- vst1.64 {d3}, [r8], ip
- vst1.64 {d7}, [r8], ip
- subs r3, r3, #4
- it eq
- popeq {r4-r8,pc}
- cmp r3, #4
- add r0, r0, #8
- bge 5b
-
- @ 2 channels
-4: cmp r3, #2
- blt 4f
- ldmia r1!, {r4-r5}
- mov lr, r2
- mov r8, r0
- tst lr, #8
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- beq 6f
- subs lr, lr, #8
- beq 7f
- vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
-6: subs lr, lr, #16
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vsri.32 d18, d16, #16
- vld1.64 {d2-d3}, [r5,:128]!
- vcvt.s32.f32 q1, q1, #16
- vsri.32 d19, d17, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r5,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vsri.32 d2, d0, #16
- vst1.32 {d19[1]}, [r8], ip
- vsri.32 d3, d1, #16
- vst1.32 {d22[0]}, [r8], ip
- vsri.32 d6, d4, #16
- vst1.32 {d22[1]}, [r8], ip
- vsri.32 d7, d5, #16
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
- beq 6f
- vld1.64 {d16-d17},[r4,:128]!
- vcvt.s32.f32 q8, q8, #16
- vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vld1.64 {d18-d19},[r5,:128]!
- vcvt.s32.f32 q9, q9, #16
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vld1.64 {d20-d21},[r4,:128]!
- vcvt.s32.f32 q10, q10, #16
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vld1.64 {d22-d23},[r5,:128]!
- vcvt.s32.f32 q11, q11, #16
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- bgt 6b
-6: vst1.32 {d2[0]}, [r8], ip
- vst1.32 {d2[1]}, [r8], ip
- vst1.32 {d3[0]}, [r8], ip
- vst1.32 {d3[1]}, [r8], ip
- vst1.32 {d6[0]}, [r8], ip
- vst1.32 {d6[1]}, [r8], ip
- vst1.32 {d7[0]}, [r8], ip
- vst1.32 {d7[1]}, [r8], ip
- b 8f
-7: vsri.32 d18, d16, #16
- vsri.32 d19, d17, #16
- vst1.32 {d18[0]}, [r8], ip
- vsri.32 d22, d20, #16
- vst1.32 {d18[1]}, [r8], ip
- vsri.32 d23, d21, #16
- vst1.32 {d19[0]}, [r8], ip
- vst1.32 {d19[1]}, [r8], ip
- vst1.32 {d22[0]}, [r8], ip
- vst1.32 {d22[1]}, [r8], ip
- vst1.32 {d23[0]}, [r8], ip
- vst1.32 {d23[1]}, [r8], ip
-8: subs r3, r3, #2
- add r0, r0, #4
- it eq
- popeq {r4-r8,pc}
-
- @ 1 channel
-4: ldr r4, [r1],#4
- tst r2, #8
- mov lr, r2
- mov r5, r0
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- bne 8f
-6: subs lr, lr, #16
- vld1.64 {d4-d5}, [r4,:128]!
- vcvt.s32.f32 q2, q2, #16
- vld1.64 {d6-d7}, [r4,:128]!
- vcvt.s32.f32 q3, q3, #16
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- beq 7f
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
-7: vst1.16 {d4[1]}, [r5,:16], ip
- vst1.16 {d4[3]}, [r5,:16], ip
- vst1.16 {d5[1]}, [r5,:16], ip
- vst1.16 {d5[3]}, [r5,:16], ip
- vst1.16 {d6[1]}, [r5,:16], ip
- vst1.16 {d6[3]}, [r5,:16], ip
- vst1.16 {d7[1]}, [r5,:16], ip
- vst1.16 {d7[3]}, [r5,:16], ip
- bgt 6b
- pop {r4-r8,pc}
-8: subs lr, lr, #8
- vst1.16 {d0[1]}, [r5,:16], ip
- vst1.16 {d0[3]}, [r5,:16], ip
- vst1.16 {d1[1]}, [r5,:16], ip
- vst1.16 {d1[3]}, [r5,:16], ip
- vst1.16 {d2[1]}, [r5,:16], ip
- vst1.16 {d2[3]}, [r5,:16], ip
- vst1.16 {d3[1]}, [r5,:16], ip
- vst1.16 {d3[3]}, [r5,:16], ip
- it eq
- popeq {r4-r8,pc}
- vld1.64 {d0-d1}, [r4,:128]!
- vcvt.s32.f32 q0, q0, #16
- vld1.64 {d2-d3}, [r4,:128]!
- vcvt.s32.f32 q1, q1, #16
- b 6b
-endfunc
-
-function ff_int32_to_float_fmul_scalar_neon, export=1
-VFP vdup.32 q0, d0[0]
-VFP len .req r2
-NOVFP vdup.32 q0, r2
-NOVFP len .req r3
-
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
-1: subs len, len, #8
- pld [r1, #16]
- vmul.f32 q9, q3, q0
- vmul.f32 q10, q8, q0
- beq 2f
- vld1.32 {q1},[r1,:128]!
- vcvt.f32.s32 q3, q1
- vld1.32 {q2},[r1,:128]!
- vcvt.f32.s32 q8, q2
- vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- b 1b
-2: vst1.32 {q9}, [r0,:128]!
- vst1.32 {q10},[r0,:128]!
- bx lr
- .unreq len
-endfunc
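
ff_float_to_int16_neon relies on NEON's fixed-point convert: vcvt.s32.f32 with #16 turns each float into saturated 16.16 fixed point, and vshrn #16 keeps the integer half, so inputs are expected in roughly [-1.0, 1.0) scaled to 16-bit PCM. A scalar model of that rounding behaviour; it is not bit-exact to the lrintf-based C fallback, which is why fmtconvert_init_arm.c only installs it when CODEC_FLAG_BITEXACT is unset:

    #include <stdint.h>

    /* Scalar model of the NEON convert path; len is assumed to be
     * a multiple of 8 as in the assembly above. */
    static void float_to_int16_ref(int16_t *dst, const float *src, long len)
    {
        for (long i = 0; i < len; i++) {
            double v = src[i] * 65536.0;               /* to 16.16 fixed pt */
            if (v >=  2147483647.0) v =  2147483647.0; /* vcvt saturates    */
            if (v <= -2147483648.0) v = -2147483648.0;
            dst[i] = (int16_t)((int32_t)v >> 16);      /* vshrn: top half   */
        }
    }
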
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
deleted file mode 100644
index b14af45..0000000
--- a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-/**
- * ARM VFP optimised int32 to float conversion.
- * Assumes len is a multiple of 8 and the destination buffer is at least
- * 4-byte aligned (16-byte alignment is best for BCM2835); little-endian only.
- */
-@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
-function ff_int32_to_float_fmul_array8_vfp, export=1
- push {lr}
- ldr a1, [sp, #4]
- subs lr, a1, #3*8
- bcc 50f @ too short to pipeline
- @ Now need to find (len / 8) % 3. The approximation
- @ x / 24 = (x * 0xAB) >> 12
- @ is exact for multiples of 8 below 4096 (x here is len - 24,
- @ itself a multiple of 8), which covers both AC3 and DCA.
- mov a1, #0xAB
- ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
- mul a1, lr, a1
- vpush {s16-s31}
- mov a1, a1, lsr #12
- add a1, a1, a1, lsl #1
- rsb a1, a1, lr, lsr #3
- cmp a1, #1
- fmrx a1, FPSCR
- fmxr FPSCR, ip
- beq 11f
- blo 10f
- @ Array is (2 + multiple of 3) x 8 floats long
- @ drop through...
- vldmia a3!, {s16-s23}
- vldmia a4!, {s2,s3}
- vldmia a3!, {s24-s31}
- vcvt.f32.s32 s16, s16
- vcvt.f32.s32 s17, s17
- vcvt.f32.s32 s18, s18
- vcvt.f32.s32 s19, s19
- vcvt.f32.s32 s20, s20
- vcvt.f32.s32 s21, s21
- vcvt.f32.s32 s22, s22
- vcvt.f32.s32 s23, s23
- vmul.f32 s16, s16, s2
- @ drop through...
-3:
- vldmia a3!, {s8-s15}
- vldmia a4!, {s1}
- vcvt.f32.s32 s24, s24
- vcvt.f32.s32 s25, s25
- vcvt.f32.s32 s26, s26
- vcvt.f32.s32 s27, s27
- vcvt.f32.s32 s28, s28
- vcvt.f32.s32 s29, s29
- vcvt.f32.s32 s30, s30
- vcvt.f32.s32 s31, s31
- vmul.f32 s24, s24, s3
- vstmia a2!, {s16-s19}
- vstmia a2!, {s20-s23}
-2:
- vldmia a3!, {s16-s23}
- vldmia a4!, {s2}
- vcvt.f32.s32 s8, s8
- vcvt.f32.s32 s9, s9
- vcvt.f32.s32 s10, s10
- vcvt.f32.s32 s11, s11
- vcvt.f32.s32 s12, s12
- vcvt.f32.s32 s13, s13
- vcvt.f32.s32 s14, s14
- vcvt.f32.s32 s15, s15
- vmul.f32 s8, s8, s1
- vstmia a2!, {s24-s27}
- vstmia a2!, {s28-s31}
-1:
- vldmia a3!, {s24-s31}
- vldmia a4!, {s3}
- vcvt.f32.s32 s16, s16
- vcvt.f32.s32 s17, s17
- vcvt.f32.s32 s18, s18
- vcvt.f32.s32 s19, s19
- vcvt.f32.s32 s20, s20
- vcvt.f32.s32 s21, s21
- vcvt.f32.s32 s22, s22
- vcvt.f32.s32 s23, s23
- vmul.f32 s16, s16, s2
- vstmia a2!, {s8-s11}
- vstmia a2!, {s12-s15}
-
- subs lr, lr, #8*3
- bpl 3b
-
- vcvt.f32.s32 s24, s24
- vcvt.f32.s32 s25, s25
- vcvt.f32.s32 s26, s26
- vcvt.f32.s32 s27, s27
- vcvt.f32.s32 s28, s28
- vcvt.f32.s32 s29, s29
- vcvt.f32.s32 s30, s30
- vcvt.f32.s32 s31, s31
- vmul.f32 s24, s24, s3
- vstmia a2!, {s16-s19}
- vstmia a2!, {s20-s23}
- vstmia a2!, {s24-s27}
- vstmia a2!, {s28-s31}
-
- fmxr FPSCR, a1
- vpop {s16-s31}
- pop {pc}
-
-10: @ Array is (multiple of 3) x 8 floats long
- vldmia a3!, {s8-s15}
- vldmia a4!, {s1,s2}
- vldmia a3!, {s16-s23}
- vcvt.f32.s32 s8, s8
- vcvt.f32.s32 s9, s9
- vcvt.f32.s32 s10, s10
- vcvt.f32.s32 s11, s11
- vcvt.f32.s32 s12, s12
- vcvt.f32.s32 s13, s13
- vcvt.f32.s32 s14, s14
- vcvt.f32.s32 s15, s15
- vmul.f32 s8, s8, s1
- b 1b
-
-11: @ Array is (1 + multiple of 3) x 8 floats long
- vldmia a3!, {s24-s31}
- vldmia a4!, {s3}
- vldmia a3!, {s8-s15}
- vldmia a4!, {s1}
- vcvt.f32.s32 s24, s24
- vcvt.f32.s32 s25, s25
- vcvt.f32.s32 s26, s26
- vcvt.f32.s32 s27, s27
- vcvt.f32.s32 s28, s28
- vcvt.f32.s32 s29, s29
- vcvt.f32.s32 s30, s30
- vcvt.f32.s32 s31, s31
- vmul.f32 s24, s24, s3
- b 2b
-
-50:
- ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
- fmrx ip, FPSCR
- fmxr FPSCR, lr
-51:
- vldmia a3!, {s8-s15}
- vldmia a4!, {s0}
- vcvt.f32.s32 s8, s8
- vcvt.f32.s32 s9, s9
- vcvt.f32.s32 s10, s10
- vcvt.f32.s32 s11, s11
- vcvt.f32.s32 s12, s12
- vcvt.f32.s32 s13, s13
- vcvt.f32.s32 s14, s14
- vcvt.f32.s32 s15, s15
- vmul.f32 s8, s8, s0
- subs a1, a1, #8
- vstmia a2!, {s8-s11}
- vstmia a2!, {s12-s15}
- bne 51b
-
- fmxr FPSCR, ip
- pop {pc}
-endfunc
-
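The (len / 8) % 3 computation above replaces a division by 24 with a reciprocal multiply. A minimal standalone C check of the approximation over the inputs the assembly actually feeds it (lr = len - 24, always a multiple of 8); this is an illustrative test program, not FFmpeg code:

    #include <assert.h>

    int main(void)
    {
        /* x / 24 == (x * 0xAB) >> 12 holds for every multiple of 8
         * below 4096, the range used by the function above. */
        for (unsigned x = 0; x < 4096; x += 8)
            assert((x * 0xABu) >> 12 == x / 24);
        return 0;
    }
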
-/**
- * ARM VFP optimised int32-to-float conversion.
- * Assumes len is a multiple of 8 and that the destination buffer is at least
- * 4-byte aligned (16-byte alignment is best for BCM2835); little-endian.
- * TODO: could be further optimised by unrolling and interleaving, as above.
- */
-@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
-function ff_int32_to_float_fmul_scalar_vfp, export=1
-VFP tmp .req a4
-VFP len .req a3
-NOVFP tmp .req a3
-NOVFP len .req a4
-NOVFP vmov s0, a3
- ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
- fmrx ip, FPSCR
- fmxr FPSCR, tmp
-1:
- vldmia a2!, {s8-s15}
- vcvt.f32.s32 s8, s8
- vcvt.f32.s32 s9, s9
- vcvt.f32.s32 s10, s10
- vcvt.f32.s32 s11, s11
- vcvt.f32.s32 s12, s12
- vcvt.f32.s32 s13, s13
- vcvt.f32.s32 s14, s14
- vcvt.f32.s32 s15, s15
- vmul.f32 s8, s8, s0
- subs len, len, #8
- vstmia a1!, {s8-s11}
- vstmia a1!, {s12-s15}
- bne 1b
-
- fmxr FPSCR, ip
- bx lr
-endfunc
- .unreq tmp
- .unreq len
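For reference, a scalar C sketch of what the VFP loop above computes; the standalone helper name is illustrative, not the FFmpeg C version:

    #include <stdint.h>

    /* Convert len 32-bit integers to float, scaling each by mul.
     * len is a multiple of 8, matching the assumption documented above. */
    static void int32_to_float_fmul_scalar(float *dst, const int32_t *src,
                                           float mul, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src[i] * mul;
    }
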
diff --git a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
deleted file mode 100644
index 13f7e0d..0000000
--- a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * ARM NEON optimised H.264 chroma functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/h264chroma.h"
-
-void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
-{
- const int high_bit_depth = bit_depth > 8;
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags) && !high_bit_depth) {
- c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
- c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
- c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
-
- c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
- c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
- c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
- }
-}
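Callers never invoke the NEON symbols directly; they dispatch through the table this init function fills in, where indices 0/1/2 select the 8-, 4- and 2-pixel-wide variants. A hedged usage sketch (the wrapper name and locals are illustrative; ff_h264chroma_init() is the generic entry point that ends up calling the ARM init):

    #include <stdint.h>
    #include "libavcodec/h264chroma.h"

    /* Illustrative: run an 8-wide chroma MC through the dispatch table. */
    static void copy_chroma_block(uint8_t *dst, uint8_t *src, int stride,
                                  int mx, int my)
    {
        H264ChromaContext c;
        ff_h264chroma_init(&c, 8);          /* 8-bit depth: NEON eligible */
        c.put_h264_chroma_pixels_tab[0](dst, src, stride, 8, mx, my);
    }
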
diff --git a/ffmpeg/libavcodec/arm/h264cmc_neon.S b/ffmpeg/libavcodec/arm/h264cmc_neon.S
deleted file mode 100644
index 0bcae11..0000000
--- a/ffmpeg/libavcodec/arm/h264cmc_neon.S
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro h264_chroma_mc8 type, codec=h264
-function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
- push {r4-r7, lr}
- ldrd r4, r5, [sp, #20]
- .ifc \type,avg
- mov lr, r0
- .endif
- pld [r1]
- pld [r1, r2]
-
- .ifc \codec,rv40
- movrel r6, rv40bias
- lsr r7, r5, #1
- add r6, r6, r7, lsl #3
- lsr r7, r4, #1
- add r6, r6, r7, lsl #1
- vld1.16 {d22[],d23[]}, [r6,:16]
- .endif
- .ifc \codec,vc1
- vmov.u16 q11, #28
- .endif
-
-A muls r7, r4, r5
-T mul r7, r4, r5
-T cmp r7, #0
- rsb r6, r7, r5, lsl #3
- rsb r12, r7, r4, lsl #3
- sub r4, r7, r4, lsl #3
- sub r4, r4, r5, lsl #3
- add r4, r4, #64
-
- beq 2f
-
- vdup.8 d0, r4
- vdup.8 d1, r12
- vld1.8 {d4, d5}, [r1], r2
- vdup.8 d2, r6
- vdup.8 d3, r7
- vext.8 d5, d4, d5, #1
-
-1: vld1.8 {d6, d7}, [r1], r2
- vmull.u8 q8, d4, d0
- vmlal.u8 q8, d5, d1
- vext.8 d7, d6, d7, #1
- vld1.8 {d4, d5}, [r1], r2
- vmlal.u8 q8, d6, d2
- pld [r1]
- vext.8 d5, d4, d5, #1
- vmlal.u8 q8, d7, d3
- vmull.u8 q9, d6, d0
- subs r3, r3, #2
- vmlal.u8 q9, d7, d1
- vmlal.u8 q9, d4, d2
- vmlal.u8 q9, d5, d3
- pld [r1, r2]
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- vrshrn.u16 d17, q9, #6
- .else
- vadd.u16 q8, q8, q11
- vadd.u16 q9, q9, q11
- vshrn.u16 d16, q8, #6
- vshrn.u16 d17, q9, #6
- .endif
- .ifc \type,avg
- vld1.8 {d20}, [lr,:64], r2
- vld1.8 {d21}, [lr,:64], r2
- vrhadd.u8 q8, q8, q10
- .endif
- vst1.8 {d16}, [r0,:64], r2
- vst1.8 {d17}, [r0,:64], r2
- bgt 1b
-
- pop {r4-r7, pc}
-
-2: tst r6, r6
- add r12, r12, r6
- vdup.8 d0, r4
- vdup.8 d1, r12
-
- beq 4f
-
- vld1.8 {d4}, [r1], r2
-
-3: vld1.8 {d6}, [r1], r2
- vmull.u8 q8, d4, d0
- vmlal.u8 q8, d6, d1
- vld1.8 {d4}, [r1], r2
- vmull.u8 q9, d6, d0
- vmlal.u8 q9, d4, d1
- pld [r1]
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- vrshrn.u16 d17, q9, #6
- .else
- vadd.u16 q8, q8, q11
- vadd.u16 q9, q9, q11
- vshrn.u16 d16, q8, #6
- vshrn.u16 d17, q9, #6
- .endif
- pld [r1, r2]
- .ifc \type,avg
- vld1.8 {d20}, [lr,:64], r2
- vld1.8 {d21}, [lr,:64], r2
- vrhadd.u8 q8, q8, q10
- .endif
- subs r3, r3, #2
- vst1.8 {d16}, [r0,:64], r2
- vst1.8 {d17}, [r0,:64], r2
- bgt 3b
-
- pop {r4-r7, pc}
-
-4: vld1.8 {d4, d5}, [r1], r2
- vld1.8 {d6, d7}, [r1], r2
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- pld [r1]
- subs r3, r3, #2
- vmull.u8 q8, d4, d0
- vmlal.u8 q8, d5, d1
- vmull.u8 q9, d6, d0
- vmlal.u8 q9, d7, d1
- pld [r1, r2]
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- vrshrn.u16 d17, q9, #6
- .else
- vadd.u16 q8, q8, q11
- vadd.u16 q9, q9, q11
- vshrn.u16 d16, q8, #6
- vshrn.u16 d17, q9, #6
- .endif
- .ifc \type,avg
- vld1.8 {d20}, [lr,:64], r2
- vld1.8 {d21}, [lr,:64], r2
- vrhadd.u8 q8, q8, q10
- .endif
- vst1.8 {d16}, [r0,:64], r2
- vst1.8 {d17}, [r0,:64], r2
- bgt 4b
-
- pop {r4-r7, pc}
-endfunc
-.endm
-
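The register setup at the top of the macro (r4, r6, r7, r12) computes the four standard H.264 chroma bilinear coefficients A = (8-x)(8-y), B = x(8-y), C = (8-x)y and D = xy, which sum to 64. A scalar sketch of the put/h264 path (the rv40 and vc1 variants substitute a bias table or #28 for the rounding term); the 2:/3:/4: branches above are the x == 0 and/or y == 0 fast paths of the same formula:

    #include <stdint.h>

    /* Bilinear chroma interpolation with H.264 rounding (put, 8-wide). */
    static void put_h264_chroma_mc8(uint8_t *dst, const uint8_t *src,
                                    int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;     /* A + B + C + D == 64 */

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1] +
                          32) >> 6;
            dst += stride;
            src += stride;
        }
    }
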
-/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
-.macro h264_chroma_mc4 type, codec=h264
-function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
- push {r4-r7, lr}
- ldrd r4, r5, [sp, #20]
- .ifc \type,avg
- mov lr, r0
- .endif
- pld [r1]
- pld [r1, r2]
-
- .ifc \codec,rv40
- movrel r6, rv40bias
- lsr r7, r5, #1
- add r6, r6, r7, lsl #3
- lsr r7, r4, #1
- add r6, r6, r7, lsl #1
- vld1.16 {d22[],d23[]}, [r6,:16]
- .endif
- .ifc \codec,vc1
- vmov.u16 q11, #28
- .endif
-
-A muls r7, r4, r5
-T mul r7, r4, r5
-T cmp r7, #0
- rsb r6, r7, r5, lsl #3
- rsb r12, r7, r4, lsl #3
- sub r4, r7, r4, lsl #3
- sub r4, r4, r5, lsl #3
- add r4, r4, #64
-
- beq 2f
-
- vdup.8 d0, r4
- vdup.8 d1, r12
- vld1.8 {d4}, [r1], r2
- vdup.8 d2, r6
- vdup.8 d3, r7
-
- vext.8 d5, d4, d5, #1
- vtrn.32 d4, d5
-
- vtrn.32 d0, d1
- vtrn.32 d2, d3
-
-1: vld1.8 {d6}, [r1], r2
- vext.8 d7, d6, d7, #1
- vtrn.32 d6, d7
- vmull.u8 q8, d4, d0
- vmlal.u8 q8, d6, d2
- vld1.8 {d4}, [r1], r2
- vext.8 d5, d4, d5, #1
- vtrn.32 d4, d5
- pld [r1]
- vmull.u8 q9, d6, d0
- vmlal.u8 q9, d4, d2
- vadd.i16 d16, d16, d17
- vadd.i16 d17, d18, d19
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- .else
- vadd.u16 q8, q8, q11
- vshrn.u16 d16, q8, #6
- .endif
- subs r3, r3, #2
- pld [r1, r2]
- .ifc \type,avg
- vld1.32 {d20[0]}, [lr,:32], r2
- vld1.32 {d20[1]}, [lr,:32], r2
- vrhadd.u8 d16, d16, d20
- .endif
- vst1.32 {d16[0]}, [r0,:32], r2
- vst1.32 {d16[1]}, [r0,:32], r2
- bgt 1b
-
- pop {r4-r7, pc}
-
-2: tst r6, r6
- add r12, r12, r6
- vdup.8 d0, r4
- vdup.8 d1, r12
- vtrn.32 d0, d1
-
- beq 4f
-
- vext.32 d1, d0, d1, #1
- vld1.32 {d4[0]}, [r1], r2
-
-3: vld1.32 {d4[1]}, [r1], r2
- vmull.u8 q8, d4, d0
- vld1.32 {d4[0]}, [r1], r2
- vmull.u8 q9, d4, d1
- vadd.i16 d16, d16, d17
- vadd.i16 d17, d18, d19
- pld [r1]
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- .else
- vadd.u16 q8, q8, q11
- vshrn.u16 d16, q8, #6
- .endif
- .ifc \type,avg
- vld1.32 {d20[0]}, [lr,:32], r2
- vld1.32 {d20[1]}, [lr,:32], r2
- vrhadd.u8 d16, d16, d20
- .endif
- subs r3, r3, #2
- pld [r1, r2]
- vst1.32 {d16[0]}, [r0,:32], r2
- vst1.32 {d16[1]}, [r0,:32], r2
- bgt 3b
-
- pop {r4-r7, pc}
-
-4: vld1.8 {d4}, [r1], r2
- vld1.8 {d6}, [r1], r2
- vext.8 d5, d4, d5, #1
- vext.8 d7, d6, d7, #1
- vtrn.32 d4, d5
- vtrn.32 d6, d7
- vmull.u8 q8, d4, d0
- vmull.u8 q9, d6, d0
- subs r3, r3, #2
- vadd.i16 d16, d16, d17
- vadd.i16 d17, d18, d19
- pld [r1]
- .ifc \codec,h264
- vrshrn.u16 d16, q8, #6
- .else
- vadd.u16 q8, q8, q11
- vshrn.u16 d16, q8, #6
- .endif
- .ifc \type,avg
- vld1.32 {d20[0]}, [lr,:32], r2
- vld1.32 {d20[1]}, [lr,:32], r2
- vrhadd.u8 d16, d16, d20
- .endif
- pld [r1]
- vst1.32 {d16[0]}, [r0,:32], r2
- vst1.32 {d16[1]}, [r0,:32], r2
- bgt 4b
-
- pop {r4-r7, pc}
-endfunc
-.endm
-
-.macro h264_chroma_mc2 type
-function ff_\type\()_h264_chroma_mc2_neon, export=1
- push {r4-r6, lr}
- ldr r4, [sp, #16]
- ldr lr, [sp, #20]
- pld [r1]
- pld [r1, r2]
- orrs r5, r4, lr
- beq 2f
-
- mul r5, r4, lr
- rsb r6, r5, lr, lsl #3
- rsb r12, r5, r4, lsl #3
- sub r4, r5, r4, lsl #3
- sub r4, r4, lr, lsl #3
- add r4, r4, #64
- vdup.8 d0, r4
- vdup.8 d2, r12
- vdup.8 d1, r6
- vdup.8 d3, r5
- vtrn.16 q0, q1
-1:
- vld1.32 {d4[0]}, [r1], r2
- vld1.32 {d4[1]}, [r1], r2
- vrev64.32 d5, d4
- vld1.32 {d5[1]}, [r1]
- vext.8 q3, q2, q2, #1
- vtrn.16 q2, q3
- vmull.u8 q8, d4, d0
- vmlal.u8 q8, d5, d1
- .ifc \type,avg
- vld1.16 {d18[0]}, [r0,:16], r2
- vld1.16 {d18[1]}, [r0,:16]
- sub r0, r0, r2
- .endif
- vtrn.32 d16, d17
- vadd.i16 d16, d16, d17
- vrshrn.u16 d16, q8, #6
- .ifc \type,avg
- vrhadd.u8 d16, d16, d18
- .endif
- vst1.16 {d16[0]}, [r0,:16], r2
- vst1.16 {d16[1]}, [r0,:16], r2
- subs r3, r3, #2
- bgt 1b
- pop {r4-r6, pc}
-2:
- .ifc \type,put
- ldrh_post r5, r1, r2
- strh_post r5, r0, r2
- ldrh_post r6, r1, r2
- strh_post r6, r0, r2
- .else
- vld1.16 {d16[0]}, [r1], r2
- vld1.16 {d16[1]}, [r1], r2
- vld1.16 {d18[0]}, [r0,:16], r2
- vld1.16 {d18[1]}, [r0,:16]
- sub r0, r0, r2
- vrhadd.u8 d16, d16, d18
- vst1.16 {d16[0]}, [r0,:16], r2
- vst1.16 {d16[1]}, [r0,:16], r2
- .endif
- subs r3, r3, #2
- bgt 2b
- pop {r4-r6, pc}
-endfunc
-.endm
-
- h264_chroma_mc8 put
- h264_chroma_mc8 avg
- h264_chroma_mc4 put
- h264_chroma_mc4 avg
- h264_chroma_mc2 put
- h264_chroma_mc2 avg
-
-#if CONFIG_RV40_DECODER
-const rv40bias
- .short 0, 16, 32, 16
- .short 32, 28, 32, 28
- .short 0, 32, 16, 32
- .short 32, 28, 32, 28
-endconst
-
- h264_chroma_mc8 put, rv40
- h264_chroma_mc8 avg, rv40
- h264_chroma_mc4 put, rv40
- h264_chroma_mc4 avg, rv40
-#endif
-
-#if CONFIG_VC1_DECODER
- h264_chroma_mc8 put, vc1
- h264_chroma_mc8 avg, vc1
- h264_chroma_mc4 put, vc1
- h264_chroma_mc4 avg, vc1
-#endif
diff --git a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
deleted file mode 100644
index 2cafbaf..0000000
--- a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/h264dsp.h"
-
-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
-
-void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0);
-void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0);
-void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
- int beta, int8_t *tc0);
-
-void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
- int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
- int log2_den, int weight, int offset);
-void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
- int log2_den, int weight, int offset);
-
-void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
- int height, int log2_den, int weightd,
- int weights, int offset);
-void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
- int height, int log2_den, int weightd,
- int weights, int offset);
-void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
- int height, int log2_den, int weightd,
- int weights, int offset);
-
-void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
- int16_t *block, int stride,
- const uint8_t nnzc[6*8]);
-void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
- int16_t *block, int stride,
- const uint8_t nnzc[6*8]);
-void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
- int16_t *block, int stride,
- const uint8_t nnzc[6*8]);
-
-void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
-void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
- int16_t *block, int stride,
- const uint8_t nnzc[6*8]);
-
-static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
- const int chroma_format_idc)
-{
-#if HAVE_NEON
- if (bit_depth == 8) {
- c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
- c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
- if(chroma_format_idc == 1){
- c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
- c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
- }
-
- c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
- c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
- c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
-
- c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
- c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
- c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
-
- c->h264_idct_add = ff_h264_idct_add_neon;
- c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
- c->h264_idct_add16 = ff_h264_idct_add16_neon;
- c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
- if (chroma_format_idc == 1)
- c->h264_idct_add8 = ff_h264_idct_add8_neon;
- c->h264_idct8_add = ff_h264_idct8_add_neon;
- c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
- c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
- }
-#endif // HAVE_NEON
-}
-
-av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
- const int chroma_format_idc)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_armv6(cpu_flags))
- c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
- if (have_neon(cpu_flags))
- h264dsp_init_neon(c, bit_depth, chroma_format_idc);
-}
diff --git a/ffmpeg/libavcodec/arm/h264dsp_neon.S b/ffmpeg/libavcodec/arm/h264dsp_neon.S
deleted file mode 100644
index 274a547..0000000
--- a/ffmpeg/libavcodec/arm/h264dsp_neon.S
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "neon.S"
-
- /* H.264 loop filter */
-
-.macro h264_loop_filter_start
- ldr r12, [sp]
- tst r2, r2
- ldr r12, [r12]
- it ne
- tstne r3, r3
- vmov.32 d24[0], r12
- and r12, r12, r12, lsl #16
- it eq
- bxeq lr
- ands r12, r12, r12, lsl #8
- it lt
- bxlt lr
-.endm
-
-.macro h264_loop_filter_luma
- vdup.8 q11, r2 @ alpha
- vmovl.u8 q12, d24
- vabd.u8 q6, q8, q0 @ abs(p0 - q0)
- vmovl.u16 q12, d24
- vabd.u8 q14, q9, q8 @ abs(p1 - p0)
- vsli.16 q12, q12, #8
- vabd.u8 q15, q1, q0 @ abs(q1 - q0)
- vsli.32 q12, q12, #16
- vclt.u8 q6, q6, q11 @ < alpha
- vdup.8 q11, r3 @ beta
- vclt.s8 q7, q12, #0
- vclt.u8 q14, q14, q11 @ < beta
- vclt.u8 q15, q15, q11 @ < beta
- vbic q6, q6, q7
- vabd.u8 q4, q10, q8 @ abs(p2 - p0)
- vand q6, q6, q14
- vabd.u8 q5, q2, q0 @ abs(q2 - q0)
- vclt.u8 q4, q4, q11 @ < beta
- vand q6, q6, q15
- vclt.u8 q5, q5, q11 @ < beta
- vand q4, q4, q6
- vand q5, q5, q6
- vand q12, q12, q6
- vrhadd.u8 q14, q8, q0
- vsub.i8 q6, q12, q4
- vqadd.u8 q7, q9, q12
- vhadd.u8 q10, q10, q14
- vsub.i8 q6, q6, q5
- vhadd.u8 q14, q2, q14
- vmin.u8 q7, q7, q10
- vqsub.u8 q11, q9, q12
- vqadd.u8 q2, q1, q12
- vmax.u8 q7, q7, q11
- vqsub.u8 q11, q1, q12
- vmin.u8 q14, q2, q14
- vmovl.u8 q2, d0
- vmax.u8 q14, q14, q11
- vmovl.u8 q10, d1
- vsubw.u8 q2, q2, d16
- vsubw.u8 q10, q10, d17
- vshl.i16 q2, q2, #2
- vshl.i16 q10, q10, #2
- vaddw.u8 q2, q2, d18
- vaddw.u8 q10, q10, d19
- vsubw.u8 q2, q2, d2
- vsubw.u8 q10, q10, d3
- vrshrn.i16 d4, q2, #3
- vrshrn.i16 d5, q10, #3
- vbsl q4, q7, q9
- vbsl q5, q14, q1
- vneg.s8 q7, q6
- vmovl.u8 q14, d16
- vmin.s8 q2, q2, q6
- vmovl.u8 q6, d17
- vmax.s8 q2, q2, q7
- vmovl.u8 q11, d0
- vmovl.u8 q12, d1
- vaddw.s8 q14, q14, d4
- vaddw.s8 q6, q6, d5
- vsubw.s8 q11, q11, d4
- vsubw.s8 q12, q12, d5
- vqmovun.s16 d16, q14
- vqmovun.s16 d17, q6
- vqmovun.s16 d0, q11
- vqmovun.s16 d1, q12
-.endm
-
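h264_loop_filter_luma is the vectorised form of the standard bS < 4 luma filter: an edge pixel is touched only where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta, and the correction is clipped to +/-tc. A scalar sketch of the per-pixel rule, assuming libavutil's av_clip()/av_clip_uint8(); negative tc0 means the edge was already rejected, which the asm handles with the vclt/vbic mask:

    #include <stdint.h>
    #include <stdlib.h>
    #include "libavutil/common.h"

    static void filter_luma_edge_pixel(uint8_t *pix, int xstride,
                                       int alpha, int beta, int tc0)
    {
        int p2 = pix[-3*xstride], p1 = pix[-2*xstride], p0 = pix[-xstride];
        int q0 = pix[0],          q1 = pix[xstride],    q2 = pix[2*xstride];

        if (abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta) {
            int tc = tc0;
            if (abs(p2 - p0) < beta) {          /* also filter p1 */
                pix[-2*xstride] = p1 + av_clip((p2 + ((p0 + q0 + 1) >> 1)
                                                - 2*p1) >> 1, -tc0, tc0);
                tc++;
            }
            if (abs(q2 - q0) < beta) {          /* also filter q1 */
                pix[xstride] = q1 + av_clip((q2 + ((p0 + q0 + 1) >> 1)
                                             - 2*q1) >> 1, -tc0, tc0);
                tc++;
            }
            {
                int delta = av_clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc);
                pix[-xstride] = av_clip_uint8(p0 + delta);
                pix[0]        = av_clip_uint8(q0 - delta);
            }
        }
    }
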
-function ff_h264_v_loop_filter_luma_neon, export=1
- h264_loop_filter_start
-
- vld1.8 {d0, d1}, [r0,:128], r1
- vld1.8 {d2, d3}, [r0,:128], r1
- vld1.8 {d4, d5}, [r0,:128], r1
- sub r0, r0, r1, lsl #2
- sub r0, r0, r1, lsl #1
- vld1.8 {d20,d21}, [r0,:128], r1
- vld1.8 {d18,d19}, [r0,:128], r1
- vld1.8 {d16,d17}, [r0,:128], r1
-
- vpush {d8-d15}
-
- h264_loop_filter_luma
-
- sub r0, r0, r1, lsl #1
- vst1.8 {d8, d9}, [r0,:128], r1
- vst1.8 {d16,d17}, [r0,:128], r1
- vst1.8 {d0, d1}, [r0,:128], r1
- vst1.8 {d10,d11}, [r0,:128]
-
- vpop {d8-d15}
- bx lr
-endfunc
-
-function ff_h264_h_loop_filter_luma_neon, export=1
- h264_loop_filter_start
-
- sub r0, r0, #4
- vld1.8 {d6}, [r0], r1
- vld1.8 {d20}, [r0], r1
- vld1.8 {d18}, [r0], r1
- vld1.8 {d16}, [r0], r1
- vld1.8 {d0}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d26}, [r0], r1
- vld1.8 {d7}, [r0], r1
- vld1.8 {d21}, [r0], r1
- vld1.8 {d19}, [r0], r1
- vld1.8 {d17}, [r0], r1
- vld1.8 {d1}, [r0], r1
- vld1.8 {d3}, [r0], r1
- vld1.8 {d5}, [r0], r1
- vld1.8 {d27}, [r0], r1
-
- transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
-
- vpush {d8-d15}
-
- h264_loop_filter_luma
-
- transpose_4x4 q4, q8, q0, q5
-
- sub r0, r0, r1, lsl #4
- add r0, r0, #2
- vst1.32 {d8[0]}, [r0], r1
- vst1.32 {d16[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d10[0]}, [r0], r1
- vst1.32 {d8[1]}, [r0], r1
- vst1.32 {d16[1]}, [r0], r1
- vst1.32 {d0[1]}, [r0], r1
- vst1.32 {d10[1]}, [r0], r1
- vst1.32 {d9[0]}, [r0], r1
- vst1.32 {d17[0]}, [r0], r1
- vst1.32 {d1[0]}, [r0], r1
- vst1.32 {d11[0]}, [r0], r1
- vst1.32 {d9[1]}, [r0], r1
- vst1.32 {d17[1]}, [r0], r1
- vst1.32 {d1[1]}, [r0], r1
- vst1.32 {d11[1]}, [r0], r1
-
- vpop {d8-d15}
- bx lr
-endfunc
-
-.macro h264_loop_filter_chroma
- vdup.8 d22, r2 @ alpha
- vmovl.u8 q12, d24
- vabd.u8 d26, d16, d0 @ abs(p0 - q0)
- vmovl.u8 q2, d0
- vabd.u8 d28, d18, d16 @ abs(p1 - p0)
- vsubw.u8 q2, q2, d16
- vsli.16 d24, d24, #8
- vshl.i16 q2, q2, #2
- vabd.u8 d30, d2, d0 @ abs(q1 - q0)
- vaddw.u8 q2, q2, d18
- vclt.u8 d26, d26, d22 @ < alpha
- vsubw.u8 q2, q2, d2
- vdup.8 d22, r3 @ beta
- vrshrn.i16 d4, q2, #3
- vclt.u8 d28, d28, d22 @ < beta
- vclt.u8 d30, d30, d22 @ < beta
- vmin.s8 d4, d4, d24
- vneg.s8 d25, d24
- vand d26, d26, d28
- vmax.s8 d4, d4, d25
- vand d26, d26, d30
- vmovl.u8 q11, d0
- vand d4, d4, d26
- vmovl.u8 q14, d16
- vaddw.s8 q14, q14, d4
- vsubw.s8 q11, q11, d4
- vqmovun.s16 d16, q14
- vqmovun.s16 d0, q11
-.endm
-
-function ff_h264_v_loop_filter_chroma_neon, export=1
- h264_loop_filter_start
-
- sub r0, r0, r1, lsl #1
- vld1.8 {d18}, [r0,:64], r1
- vld1.8 {d16}, [r0,:64], r1
- vld1.8 {d0}, [r0,:64], r1
- vld1.8 {d2}, [r0,:64]
-
- h264_loop_filter_chroma
-
- sub r0, r0, r1, lsl #1
- vst1.8 {d16}, [r0,:64], r1
- vst1.8 {d0}, [r0,:64], r1
-
- bx lr
-endfunc
-
-function ff_h264_h_loop_filter_chroma_neon, export=1
- h264_loop_filter_start
-
- sub r0, r0, #2
- vld1.32 {d18[0]}, [r0], r1
- vld1.32 {d16[0]}, [r0], r1
- vld1.32 {d0[0]}, [r0], r1
- vld1.32 {d2[0]}, [r0], r1
- vld1.32 {d18[1]}, [r0], r1
- vld1.32 {d16[1]}, [r0], r1
- vld1.32 {d0[1]}, [r0], r1
- vld1.32 {d2[1]}, [r0], r1
-
- vtrn.16 d18, d0
- vtrn.16 d16, d2
- vtrn.8 d18, d16
- vtrn.8 d0, d2
-
- h264_loop_filter_chroma
-
- vtrn.16 d18, d0
- vtrn.16 d16, d2
- vtrn.8 d18, d16
- vtrn.8 d0, d2
-
- sub r0, r0, r1, lsl #3
- vst1.32 {d18[0]}, [r0], r1
- vst1.32 {d16[0]}, [r0], r1
- vst1.32 {d0[0]}, [r0], r1
- vst1.32 {d2[0]}, [r0], r1
- vst1.32 {d18[1]}, [r0], r1
- vst1.32 {d16[1]}, [r0], r1
- vst1.32 {d0[1]}, [r0], r1
- vst1.32 {d2[1]}, [r0], r1
-
- bx lr
-endfunc
-
-@ Biweighted prediction
-
-.macro biweight_16 macs, macd
- vdup.8 d0, r4
- vdup.8 d1, r5
- vmov q2, q8
- vmov q3, q8
-1: subs r3, r3, #2
- vld1.8 {d20-d21},[r0,:128], r2
- \macd q2, d0, d20
- pld [r0]
- \macd q3, d0, d21
- vld1.8 {d22-d23},[r1,:128], r2
- \macs q2, d1, d22
- pld [r1]
- \macs q3, d1, d23
- vmov q12, q8
- vld1.8 {d28-d29},[r0,:128], r2
- vmov q13, q8
- \macd q12, d0, d28
- pld [r0]
- \macd q13, d0, d29
- vld1.8 {d30-d31},[r1,:128], r2
- \macs q12, d1, d30
- pld [r1]
- \macs q13, d1, d31
- vshl.s16 q2, q2, q9
- vshl.s16 q3, q3, q9
- vqmovun.s16 d4, q2
- vqmovun.s16 d5, q3
- vshl.s16 q12, q12, q9
- vshl.s16 q13, q13, q9
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vmov q3, q8
- vst1.8 {d4- d5}, [r6,:128], r2
- vmov q2, q8
- vst1.8 {d24-d25},[r6,:128], r2
- bne 1b
- pop {r4-r6, pc}
-.endm
-
-.macro biweight_8 macs, macd
- vdup.8 d0, r4
- vdup.8 d1, r5
- vmov q1, q8
- vmov q10, q8
-1: subs r3, r3, #2
- vld1.8 {d4},[r0,:64], r2
- \macd q1, d0, d4
- pld [r0]
- vld1.8 {d5},[r1,:64], r2
- \macs q1, d1, d5
- pld [r1]
- vld1.8 {d6},[r0,:64], r2
- \macd q10, d0, d6
- pld [r0]
- vld1.8 {d7},[r1,:64], r2
- \macs q10, d1, d7
- pld [r1]
- vshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- vshl.s16 q10, q10, q9
- vqmovun.s16 d4, q10
- vmov q10, q8
- vst1.8 {d2},[r6,:64], r2
- vmov q1, q8
- vst1.8 {d4},[r6,:64], r2
- bne 1b
- pop {r4-r6, pc}
-.endm
-
-.macro biweight_4 macs, macd
- vdup.8 d0, r4
- vdup.8 d1, r5
- vmov q1, q8
- vmov q10, q8
-1: subs r3, r3, #4
- vld1.32 {d4[0]},[r0,:32], r2
- vld1.32 {d4[1]},[r0,:32], r2
- \macd q1, d0, d4
- pld [r0]
- vld1.32 {d5[0]},[r1,:32], r2
- vld1.32 {d5[1]},[r1,:32], r2
- \macs q1, d1, d5
- pld [r1]
- blt 2f
- vld1.32 {d6[0]},[r0,:32], r2
- vld1.32 {d6[1]},[r0,:32], r2
- \macd q10, d0, d6
- pld [r0]
- vld1.32 {d7[0]},[r1,:32], r2
- vld1.32 {d7[1]},[r1,:32], r2
- \macs q10, d1, d7
- pld [r1]
- vshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- vshl.s16 q10, q10, q9
- vqmovun.s16 d4, q10
- vmov q10, q8
- vst1.32 {d2[0]},[r6,:32], r2
- vst1.32 {d2[1]},[r6,:32], r2
- vmov q1, q8
- vst1.32 {d4[0]},[r6,:32], r2
- vst1.32 {d4[1]},[r6,:32], r2
- bne 1b
- pop {r4-r6, pc}
-2: vshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- vst1.32 {d2[0]},[r6,:32], r2
- vst1.32 {d2[1]},[r6,:32], r2
- pop {r4-r6, pc}
-.endm
-
-.macro biweight_func w
-function ff_biweight_h264_pixels_\w\()_neon, export=1
- push {r4-r6, lr}
- ldr r12, [sp, #16]
- add r4, sp, #20
- ldm r4, {r4-r6}
- lsr lr, r4, #31
- add r6, r6, #1
- eors lr, lr, r5, lsr #30
- orr r6, r6, #1
- vdup.16 q9, r12
- lsl r6, r6, r12
- vmvn q9, q9
- vdup.16 q8, r6
- mov r6, r0
- beq 10f
- subs lr, lr, #1
- beq 20f
- subs lr, lr, #1
- beq 30f
- b 40f
-10: biweight_\w vmlal.u8, vmlal.u8
-20: rsb r4, r4, #0
- biweight_\w vmlal.u8, vmlsl.u8
-30: rsb r4, r4, #0
- rsb r5, r5, #0
- biweight_\w vmlsl.u8, vmlsl.u8
-40: rsb r5, r5, #0
- biweight_\w vmlsl.u8, vmlal.u8
-endfunc
-.endm
-
- biweight_func 16
- biweight_func 8
- biweight_func 4
-
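The four entry points (10: through 40: in biweight_func) are one computation specialised on the signs of the two weights, so the multiply-accumulates can stay unsigned. A scalar sketch of what they all compute, assuming libavutil's av_clip_uint8(); the rounding constant matches the q8 setup at the top of biweight_func:

    #include <stdint.h>
    #include "libavutil/common.h"

    /* H.264 explicit biprediction:
     * dst = clip((dst*weightd + src*weights + rnd) >> (log2_den + 1)) */
    static void biweight_pixels(uint8_t *dst, const uint8_t *src, int stride,
                                int width, int height, int log2_den,
                                int weightd, int weights, int offset)
    {
        const int rnd = ((offset + 1) | 1) << log2_den;

        for (int y = 0; y < height; y++, dst += stride, src += stride)
            for (int x = 0; x < width; x++)
                dst[x] = av_clip_uint8((dst[x] * weightd + src[x] * weights + rnd)
                                       >> (log2_den + 1));
    }
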
-@ Weighted prediction
-
-.macro weight_16 add
- vdup.8 d0, r12
-1: subs r2, r2, #2
- vld1.8 {d20-d21},[r0,:128], r1
- vmull.u8 q2, d0, d20
- pld [r0]
- vmull.u8 q3, d0, d21
- vld1.8 {d28-d29},[r0,:128], r1
- vmull.u8 q12, d0, d28
- pld [r0]
- vmull.u8 q13, d0, d29
- \add q2, q8, q2
- vrshl.s16 q2, q2, q9
- \add q3, q8, q3
- vrshl.s16 q3, q3, q9
- vqmovun.s16 d4, q2
- vqmovun.s16 d5, q3
- \add q12, q8, q12
- vrshl.s16 q12, q12, q9
- \add q13, q8, q13
- vrshl.s16 q13, q13, q9
- vqmovun.s16 d24, q12
- vqmovun.s16 d25, q13
- vst1.8 {d4- d5}, [r4,:128], r1
- vst1.8 {d24-d25},[r4,:128], r1
- bne 1b
- pop {r4, pc}
-.endm
-
-.macro weight_8 add
- vdup.8 d0, r12
-1: subs r2, r2, #2
- vld1.8 {d4},[r0,:64], r1
- vmull.u8 q1, d0, d4
- pld [r0]
- vld1.8 {d6},[r0,:64], r1
- vmull.u8 q10, d0, d6
- \add q1, q8, q1
- pld [r0]
- vrshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- \add q10, q8, q10
- vrshl.s16 q10, q10, q9
- vqmovun.s16 d4, q10
- vst1.8 {d2},[r4,:64], r1
- vst1.8 {d4},[r4,:64], r1
- bne 1b
- pop {r4, pc}
-.endm
-
-.macro weight_4 add
- vdup.8 d0, r12
- vmov q1, q8
- vmov q10, q8
-1: subs r2, r2, #4
- vld1.32 {d4[0]},[r0,:32], r1
- vld1.32 {d4[1]},[r0,:32], r1
- vmull.u8 q1, d0, d4
- pld [r0]
- blt 2f
- vld1.32 {d6[0]},[r0,:32], r1
- vld1.32 {d6[1]},[r0,:32], r1
- vmull.u8 q10, d0, d6
- pld [r0]
- \add q1, q8, q1
- vrshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- \add q10, q8, q10
- vrshl.s16 q10, q10, q9
- vqmovun.s16 d4, q10
- vmov q10, q8
- vst1.32 {d2[0]},[r4,:32], r1
- vst1.32 {d2[1]},[r4,:32], r1
- vmov q1, q8
- vst1.32 {d4[0]},[r4,:32], r1
- vst1.32 {d4[1]},[r4,:32], r1
- bne 1b
- pop {r4, pc}
-2: \add q1, q8, q1
- vrshl.s16 q1, q1, q9
- vqmovun.s16 d2, q1
- vst1.32 {d2[0]},[r4,:32], r1
- vst1.32 {d2[1]},[r4,:32], r1
- pop {r4, pc}
-.endm
-
-.macro weight_func w
-function ff_weight_h264_pixels_\w\()_neon, export=1
- push {r4, lr}
- ldr r12, [sp, #8]
- ldr r4, [sp, #12]
- cmp r3, #1
- lsl r4, r4, r3
- vdup.16 q8, r4
- mov r4, r0
- ble 20f
- rsb lr, r3, #1
- vdup.16 q9, lr
- cmp r12, #0
- blt 10f
- weight_\w vhadd.s16
-10: rsb r12, r12, #0
- weight_\w vhsub.s16
-20: rsb lr, r3, #0
- vdup.16 q9, lr
- cmp r12, #0
- blt 10f
- weight_\w vadd.s16
-10: rsb r12, r12, #0
- weight_\w vsub.s16
-endfunc
-.endm
-
- weight_func 16
- weight_func 8
- weight_func 4
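Unidirectional weighting follows the same pattern with a single source; the ble 20f / blt 10f ladder in weight_func picks add/sub and rounding variants for the log2_den == 0 and negative-weight cases. A scalar sketch under the same av_clip_uint8() assumption, with offset and rounding folded into one constant as the asm does with q8:

    #include <stdint.h>
    #include "libavutil/common.h"

    static void weight_pixels(uint8_t *dst, int stride, int width, int height,
                              int log2_den, int weight, int offset)
    {
        offset <<= log2_den;
        if (log2_den)
            offset += 1 << (log2_den - 1);      /* rounding term */

        for (int y = 0; y < height; y++, dst += stride)
            for (int x = 0; x < width; x++)
                dst[x] = av_clip_uint8((dst[x] * weight + offset) >> log2_den);
    }
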
diff --git a/ffmpeg/libavcodec/arm/h264idct_neon.S b/ffmpeg/libavcodec/arm/h264idct_neon.S
deleted file mode 100644
index 2edeca2..0000000
--- a/ffmpeg/libavcodec/arm/h264idct_neon.S
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_h264_idct_add_neon, export=1
- vld1.64 {d0-d3}, [r1,:128]
- vmov.i16 q15, #0
-
- vswp d1, d2
- vst1.16 {q15}, [r1,:128]!
- vadd.i16 d4, d0, d1
- vst1.16 {q15}, [r1,:128]!
- vshr.s16 q8, q1, #1
- vsub.i16 d5, d0, d1
- vadd.i16 d6, d2, d17
- vsub.i16 d7, d16, d3
- vadd.i16 q0, q2, q3
- vsub.i16 q1, q2, q3
-
- vtrn.16 d0, d1
- vtrn.16 d3, d2
- vtrn.32 d0, d3
- vtrn.32 d1, d2
-
- vadd.i16 d4, d0, d3
- vld1.32 {d18[0]}, [r0,:32], r2
- vswp d1, d3
- vshr.s16 q8, q1, #1
- vld1.32 {d19[1]}, [r0,:32], r2
- vsub.i16 d5, d0, d1
- vld1.32 {d18[1]}, [r0,:32], r2
- vadd.i16 d6, d16, d3
- vld1.32 {d19[0]}, [r0,:32], r2
- vsub.i16 d7, d2, d17
- sub r0, r0, r2, lsl #2
- vadd.i16 q0, q2, q3
- vsub.i16 q1, q2, q3
-
- vrshr.s16 q0, q0, #6
- vrshr.s16 q1, q1, #6
-
- vaddw.u8 q0, q0, d18
- vaddw.u8 q1, q1, d19
-
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
-
- vst1.32 {d0[0]}, [r0,:32], r2
- vst1.32 {d1[1]}, [r0,:32], r2
- vst1.32 {d0[1]}, [r0,:32], r2
- vst1.32 {d1[0]}, [r0,:32], r2
-
- sub r1, r1, #32
- bx lr
-endfunc
-
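The function above is the NEON form of the separable 4x4 inverse core transform: a shift-and-add butterfly per row, a transpose (the vtrn steps), the same butterfly per column, then round by 32, shift, add to the prediction and saturate. A scalar sketch of that transform, assuming libavutil's av_clip_uint8(); the asm also zeroes the coefficient block for the next call, which this sketch omits:

    #include <stdint.h>
    #include "libavutil/common.h"

    static void h264_idct4_add(uint8_t *dst, const int16_t *block, int stride)
    {
        int tmp[16];

        for (int i = 0; i < 4; i++) {           /* row butterflies */
            int z0 =  block[4*i + 0]       +  block[4*i + 2];
            int z1 =  block[4*i + 0]       -  block[4*i + 2];
            int z2 = (block[4*i + 1] >> 1) -  block[4*i + 3];
            int z3 =  block[4*i + 1]       + (block[4*i + 3] >> 1);
            tmp[4*i + 0] = z0 + z3;
            tmp[4*i + 1] = z1 + z2;
            tmp[4*i + 2] = z1 - z2;
            tmp[4*i + 3] = z0 - z3;
        }
        for (int i = 0; i < 4; i++) {           /* column butterflies + add */
            int z0 =  tmp[i + 4*0]       +  tmp[i + 4*2];
            int z1 =  tmp[i + 4*0]       -  tmp[i + 4*2];
            int z2 = (tmp[i + 4*1] >> 1) -  tmp[i + 4*3];
            int z3 =  tmp[i + 4*1]       + (tmp[i + 4*3] >> 1);
            dst[i + 0*stride] = av_clip_uint8(dst[i + 0*stride] + ((z0 + z3 + 32) >> 6));
            dst[i + 1*stride] = av_clip_uint8(dst[i + 1*stride] + ((z1 + z2 + 32) >> 6));
            dst[i + 2*stride] = av_clip_uint8(dst[i + 2*stride] + ((z1 - z2 + 32) >> 6));
            dst[i + 3*stride] = av_clip_uint8(dst[i + 3*stride] + ((z0 - z3 + 32) >> 6));
        }
    }
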
-function ff_h264_idct_dc_add_neon, export=1
- mov r3, #0
- vld1.16 {d2[],d3[]}, [r1,:16]
- strh r3, [r1]
- vrshr.s16 q1, q1, #6
- vld1.32 {d0[0]}, [r0,:32], r2
- vld1.32 {d0[1]}, [r0,:32], r2
- vaddw.u8 q2, q1, d0
- vld1.32 {d1[0]}, [r0,:32], r2
- vld1.32 {d1[1]}, [r0,:32], r2
- vaddw.u8 q1, q1, d1
- vqmovun.s16 d0, q2
- vqmovun.s16 d1, q1
- sub r0, r0, r2, lsl #2
- vst1.32 {d0[0]}, [r0,:32], r2
- vst1.32 {d0[1]}, [r0,:32], r2
- vst1.32 {d1[0]}, [r0,:32], r2
- vst1.32 {d1[1]}, [r0,:32], r2
- bx lr
-endfunc
-
-function ff_h264_idct_add16_neon, export=1
- push {r4-r8,lr}
- mov r4, r0
- mov r5, r1
- mov r1, r2
- mov r2, r3
- ldr r6, [sp, #24]
- movrel r7, scan8
- mov ip, #16
-1: ldrb r8, [r7], #1
- ldr r0, [r5], #4
- ldrb r8, [r6, r8]
- subs r8, r8, #1
- blt 2f
- ldrsh lr, [r1]
- add r0, r0, r4
- it ne
- movne lr, #0
- cmp lr, #0
- ite ne
- adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
- adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
- blx lr
-2: subs ip, ip, #1
- add r1, r1, #32
- bne 1b
- pop {r4-r8,pc}
-endfunc
-
-function ff_h264_idct_add16intra_neon, export=1
- push {r4-r8,lr}
- mov r4, r0
- mov r5, r1
- mov r1, r2
- mov r2, r3
- ldr r6, [sp, #24]
- movrel r7, scan8
- mov ip, #16
-1: ldrb r8, [r7], #1
- ldr r0, [r5], #4
- ldrb r8, [r6, r8]
- add r0, r0, r4
- cmp r8, #0
- ldrsh r8, [r1]
- iteet ne
- adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
- adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
- cmpeq r8, #0
- blxne lr
- subs ip, ip, #1
- add r1, r1, #32
- bne 1b
- pop {r4-r8,pc}
-endfunc
-
-function ff_h264_idct_add8_neon, export=1
- push {r4-r10,lr}
- ldm r0, {r4,r9}
- add r5, r1, #16*4
- add r1, r2, #16*32
- mov r2, r3
- mov r10, r1
- ldr r6, [sp, #32]
- movrel r7, scan8+16
- mov r12, #0
-1: ldrb r8, [r7, r12]
- ldr r0, [r5, r12, lsl #2]
- ldrb r8, [r6, r8]
- add r0, r0, r4
- add r1, r10, r12, lsl #5
- cmp r8, #0
- ldrsh r8, [r1]
- iteet ne
- adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
- adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
- cmpeq r8, #0
- blxne lr
- add r12, r12, #1
- cmp r12, #4
- itt eq
- moveq r12, #16
- moveq r4, r9
- cmp r12, #20
- blt 1b
- pop {r4-r10,pc}
-endfunc
-
-.macro idct8x8_cols pass
- .if \pass == 0
- qa .req q2
- qb .req q14
- vshr.s16 q2, q10, #1
- vadd.i16 q0, q8, q12
- vld1.16 {q14-q15},[r1,:128]
- vst1.16 {q3}, [r1,:128]!
- vst1.16 {q3}, [r1,:128]!
- vsub.i16 q1, q8, q12
- vshr.s16 q3, q14, #1
- vsub.i16 q2, q2, q14
- vadd.i16 q3, q3, q10
- .else
- qa .req q14
- qb .req q2
- vtrn.32 q8, q10
- vtrn.16 q12, q13
- vtrn.32 q9, q11
- vtrn.32 q12, q2
- vtrn.32 q13, q15
- vswp d21, d4
- vshr.s16 q14, q10, #1
- vswp d17, d24
- vshr.s16 q3, q2, #1
- vswp d19, d26
- vadd.i16 q0, q8, q12
- vswp d23, d30
- vsub.i16 q1, q8, q12
- vsub.i16 q14, q14, q2
- vadd.i16 q3, q3, q10
- .endif
- vadd.i16 q10, q1, qa
- vsub.i16 q12, q1, qa
- vadd.i16 q8, q0, q3
- vsub.i16 qb, q0, q3
- vsub.i16 q0, q13, q11
- vadd.i16 q1, q15, q9
- vsub.i16 qa, q15, q9
- vadd.i16 q3, q13, q11
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q11
- vadd.i16 qa, qa, q13
- vadd.i16 q3, q3, q9
- vshr.s16 q9, q9, #1
- vshr.s16 q11, q11, #1
- vshr.s16 q13, q13, #1
- vshr.s16 q15, q15, #1
- vsub.i16 q0, q0, q15
- vsub.i16 q1, q1, q11
- vadd.i16 qa, qa, q13
- vadd.i16 q3, q3, q9
- vshr.s16 q9, q0, #2
- vshr.s16 q11, q1, #2
- vshr.s16 q13, qa, #2
- vshr.s16 q15, q3, #2
- vsub.i16 q3, q3, q9
- vsub.i16 qa, q11, qa
- vadd.i16 q1, q1, q13
- vadd.i16 q0, q0, q15
- .if \pass == 0
- vsub.i16 q15, q8, q3
- vadd.i16 q8, q8, q3
- vadd.i16 q9, q10, q2
- vsub.i16 q2, q10, q2
- vtrn.16 q8, q9
- vadd.i16 q10, q12, q1
- vtrn.16 q2, q15
- vadd.i16 q11, q14, q0
- vsub.i16 q13, q12, q1
- vtrn.16 q10, q11
- vsub.i16 q12, q14, q0
- .else
- vsub.i16 q15, q8, q3
- vadd.i16 q8, q8, q3
- vadd.i16 q9, q10, q14
- vsub.i16 q14, q10, q14
- vadd.i16 q10, q12, q1
- vsub.i16 q13, q12, q1
- vadd.i16 q11, q2, q0
- vsub.i16 q12, q2, q0
- .endif
- .unreq qa
- .unreq qb
-.endm
-
-function ff_h264_idct8_add_neon, export=1
- vmov.i16 q3, #0
- vld1.16 {q8-q9}, [r1,:128]
- vst1.16 {q3}, [r1,:128]!
- vst1.16 {q3}, [r1,:128]!
- vld1.16 {q10-q11},[r1,:128]
- vst1.16 {q3}, [r1,:128]!
- vst1.16 {q3}, [r1,:128]!
- vld1.16 {q12-q13},[r1,:128]
- vst1.16 {q3}, [r1,:128]!
- vst1.16 {q3}, [r1,:128]!
-
- idct8x8_cols 0
- idct8x8_cols 1
-
- mov r3, r0
- vrshr.s16 q8, q8, #6
- vld1.8 {d0}, [r0,:64], r2
- vrshr.s16 q9, q9, #6
- vld1.8 {d1}, [r0,:64], r2
- vrshr.s16 q10, q10, #6
- vld1.8 {d2}, [r0,:64], r2
- vrshr.s16 q11, q11, #6
- vld1.8 {d3}, [r0,:64], r2
- vrshr.s16 q12, q12, #6
- vld1.8 {d4}, [r0,:64], r2
- vrshr.s16 q13, q13, #6
- vld1.8 {d5}, [r0,:64], r2
- vrshr.s16 q14, q14, #6
- vld1.8 {d6}, [r0,:64], r2
- vrshr.s16 q15, q15, #6
- vld1.8 {d7}, [r0,:64], r2
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vqmovun.s16 d0, q8
- vaddw.u8 q11, q11, d3
- vqmovun.s16 d1, q9
- vaddw.u8 q12, q12, d4
- vqmovun.s16 d2, q10
- vst1.8 {d0}, [r3,:64], r2
- vaddw.u8 q13, q13, d5
- vqmovun.s16 d3, q11
- vst1.8 {d1}, [r3,:64], r2
- vaddw.u8 q14, q14, d6
- vqmovun.s16 d4, q12
- vst1.8 {d2}, [r3,:64], r2
- vaddw.u8 q15, q15, d7
- vqmovun.s16 d5, q13
- vst1.8 {d3}, [r3,:64], r2
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
- vst1.8 {d4}, [r3,:64], r2
- vst1.8 {d5}, [r3,:64], r2
- vst1.8 {d6}, [r3,:64], r2
- vst1.8 {d7}, [r3,:64], r2
-
- sub r1, r1, #128
- bx lr
-endfunc
-
-function ff_h264_idct8_dc_add_neon, export=1
- mov r3, #0
- vld1.16 {d30[],d31[]},[r1,:16]
- strh r3, [r1]
- vld1.32 {d0}, [r0,:64], r2
- vrshr.s16 q15, q15, #6
- vld1.32 {d1}, [r0,:64], r2
- vld1.32 {d2}, [r0,:64], r2
- vaddw.u8 q8, q15, d0
- vld1.32 {d3}, [r0,:64], r2
- vaddw.u8 q9, q15, d1
- vld1.32 {d4}, [r0,:64], r2
- vaddw.u8 q10, q15, d2
- vld1.32 {d5}, [r0,:64], r2
- vaddw.u8 q11, q15, d3
- vld1.32 {d6}, [r0,:64], r2
- vaddw.u8 q12, q15, d4
- vld1.32 {d7}, [r0,:64], r2
- vaddw.u8 q13, q15, d5
- vaddw.u8 q14, q15, d6
- vaddw.u8 q15, q15, d7
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- sub r0, r0, r2, lsl #3
- vst1.32 {d0}, [r0,:64], r2
- vqmovun.s16 d4, q12
- vst1.32 {d1}, [r0,:64], r2
- vqmovun.s16 d5, q13
- vst1.32 {d2}, [r0,:64], r2
- vqmovun.s16 d6, q14
- vst1.32 {d3}, [r0,:64], r2
- vqmovun.s16 d7, q15
- vst1.32 {d4}, [r0,:64], r2
- vst1.32 {d5}, [r0,:64], r2
- vst1.32 {d6}, [r0,:64], r2
- vst1.32 {d7}, [r0,:64], r2
- bx lr
-endfunc
-
-function ff_h264_idct8_add4_neon, export=1
- push {r4-r8,lr}
- mov r4, r0
- mov r5, r1
- mov r1, r2
- mov r2, r3
- ldr r6, [sp, #24]
- movrel r7, scan8
- mov r12, #16
-1: ldrb r8, [r7], #4
- ldr r0, [r5], #16
- ldrb r8, [r6, r8]
- subs r8, r8, #1
- blt 2f
- ldrsh lr, [r1]
- add r0, r0, r4
- it ne
- movne lr, #0
- cmp lr, #0
- ite ne
- adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
- adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
- blx lr
-2: subs r12, r12, #4
- add r1, r1, #128
- bne 1b
- pop {r4-r8,pc}
-endfunc
-
-const scan8
- .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
- .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
- .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
- .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
- .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
- .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
- .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
- .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
- .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
- .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
- .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
- .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
-endconst
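scan8 maps a 4x4 block index to its cell in the decoder's 8-wide non-zero-count cache; the add16/add16intra/add8/add4 loops above read nnzc[scan8[i]] to pick a path per block. An illustrative C model of the add16 control flow, assuming scan8 is in scope; the 16-coefficient block stride matches the #32-byte advance in the asm:

    #include <stdint.h>

    static void idct_add16(uint8_t *dst, const int *block_offset,
                           int16_t *block, int stride, const uint8_t nnzc[6*8])
    {
        for (int i = 0; i < 16; i++) {
            int nnz = nnzc[scan8[i]];
            if (!nnz)
                continue;                       /* all-zero block: skip  */
            if (nnz == 1 && block[i * 16])      /* DC-only block         */
                ff_h264_idct_dc_add_neon(dst + block_offset[i], block + i * 16, stride);
            else
                ff_h264_idct_add_neon(dst + block_offset[i], block + i * 16, stride);
        }
    }
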
diff --git a/ffmpeg/libavcodec/arm/h264pred_init_arm.c b/ffmpeg/libavcodec/arm/h264pred_init_arm.c
deleted file mode 100644
index 1562f0b..0000000
--- a/ffmpeg/libavcodec/arm/h264pred_init_arm.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/h264pred.h"
-
-void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
-
-void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
-void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
-
-static av_cold void h264_pred_init_neon(H264PredContext *h, int codec_id,
- const int bit_depth,
- const int chroma_format_idc)
-{
-#if HAVE_NEON
- const int high_depth = bit_depth > 8;
-
- if (high_depth)
- return;
- if(chroma_format_idc == 1){
- h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
- h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
- if (codec_id != AV_CODEC_ID_VP8)
- h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
- h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
- if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) {
- h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
- h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
- h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
- h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
- h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
- h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
- h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
- }
- }
-
- h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
- h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
- h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
- h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
- h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
- h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
- if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8)
- h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
-#endif // HAVE_NEON
-}
-
-av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
- int bit_depth, const int chroma_format_idc)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags))
- h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
-}
diff --git a/ffmpeg/libavcodec/arm/h264pred_neon.S b/ffmpeg/libavcodec/arm/h264pred_neon.S
deleted file mode 100644
index 4dc47ba..0000000
--- a/ffmpeg/libavcodec/arm/h264pred_neon.S
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
- .macro ldcol.8 rd, rs, rt, n=8, hi=0
-.if \n == 8 || \hi == 0
- vld1.8 {\rd[0]}, [\rs], \rt
- vld1.8 {\rd[1]}, [\rs], \rt
- vld1.8 {\rd[2]}, [\rs], \rt
- vld1.8 {\rd[3]}, [\rs], \rt
-.endif
-.if \n == 8 || \hi == 1
- vld1.8 {\rd[4]}, [\rs], \rt
- vld1.8 {\rd[5]}, [\rs], \rt
- vld1.8 {\rd[6]}, [\rs], \rt
- vld1.8 {\rd[7]}, [\rs], \rt
-.endif
- .endm
-
- .macro add16x8 dq, dl, dh, rl, rh
- vaddl.u8 \dq, \rl, \rh
- vadd.u16 \dl, \dl, \dh
- vpadd.u16 \dl, \dl, \dl
- vpadd.u16 \dl, \dl, \dl
- .endm
-
-function ff_pred16x16_128_dc_neon, export=1
- vmov.i8 q0, #128
- b .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_top_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {q0}, [r2,:128]
- add16x8 q0, d0, d1, d0, d1
- vrshrn.u16 d0, q0, #4
- vdup.8 q0, d0[0]
- b .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_left_dc_neon, export=1
- sub r2, r0, #1
- ldcol.8 d0, r2, r1
- ldcol.8 d1, r2, r1
- add16x8 q0, d0, d1, d0, d1
- vrshrn.u16 d0, q0, #4
- vdup.8 q0, d0[0]
- b .L_pred16x16_dc_end
-endfunc
-
-function ff_pred16x16_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {q0}, [r2,:128]
- sub r2, r0, #1
- ldcol.8 d2, r2, r1
- ldcol.8 d3, r2, r1
- vaddl.u8 q0, d0, d1
- vaddl.u8 q1, d2, d3
- vadd.u16 q0, q0, q1
- vadd.u16 d0, d0, d1
- vpadd.u16 d0, d0, d0
- vpadd.u16 d0, d0, d0
- vrshrn.u16 d0, q0, #5
- vdup.8 q0, d0[0]
-.L_pred16x16_dc_end:
- mov r3, #8
-6: vst1.8 {q0}, [r0,:128], r1
- vst1.8 {q0}, [r0,:128], r1
- subs r3, r3, #1
- bne 6b
- bx lr
-endfunc
-
-function ff_pred16x16_hor_neon, export=1
- sub r2, r0, #1
- mov r3, #16
-1: vld1.8 {d0[],d1[]},[r2], r1
- vst1.8 {q0}, [r0,:128], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-function ff_pred16x16_vert_neon, export=1
- sub r0, r0, r1
- vld1.8 {q0}, [r0,:128], r1
- mov r3, #8
-1: vst1.8 {q0}, [r0,:128], r1
- vst1.8 {q0}, [r0,:128], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-function ff_pred16x16_plane_neon, export=1
- sub r3, r0, r1
- add r2, r3, #8
- sub r3, r3, #1
- vld1.8 {d0}, [r3]
- vld1.8 {d2}, [r2,:64], r1
- ldcol.8 d1, r3, r1
- add r3, r3, r1
- ldcol.8 d3, r3, r1
- vrev64.8 q0, q0
- vaddl.u8 q8, d2, d3
- vsubl.u8 q2, d2, d0
- vsubl.u8 q3, d3, d1
- movrel r3, p16weight
- vld1.8 {q0}, [r3,:128]
- vmul.s16 q2, q2, q0
- vmul.s16 q3, q3, q0
- vadd.i16 d4, d4, d5
- vadd.i16 d5, d6, d7
- vpadd.i16 d4, d4, d5
- vpadd.i16 d4, d4, d4
- vshll.s16 q3, d4, #2
- vaddw.s16 q2, q3, d4
- vrshrn.s32 d4, q2, #6
- mov r3, #0
- vtrn.16 d4, d5
- vadd.i16 d2, d4, d5
- vshl.i16 d3, d2, #3
- vrev64.16 d16, d17
- vsub.i16 d3, d3, d2
- vadd.i16 d16, d16, d0
- vshl.i16 d2, d16, #4
- vsub.i16 d2, d2, d3
- vshl.i16 d3, d4, #4
- vext.16 q0, q0, q0, #7
- vsub.i16 d6, d5, d3
- vmov.16 d0[0], r3
- vmul.i16 q0, q0, d4[0]
- vdup.16 q1, d2[0]
- vdup.16 q2, d4[0]
- vdup.16 q3, d6[0]
- vshl.i16 q2, q2, #3
- vadd.i16 q1, q1, q0
- vadd.i16 q3, q3, q2
- mov r3, #16
-1:
- vqshrun.s16 d0, q1, #5
- vadd.i16 q1, q1, q2
- vqshrun.s16 d1, q1, #5
- vadd.i16 q1, q1, q3
- vst1.8 {q0}, [r0,:128], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-const p16weight, align=4
- .short 1,2,3,4,5,6,7,8
-endconst
-
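ff_pred16x16_plane_neon above implements plane prediction (H.264 subclause 8.3.3.4), with p16weight supplying the 1..8 weights: H and V are weighted sums of differences across the top row and left column, and each pixel is a + b*(x-7) + c*(y-7), rounded, shifted and clipped. A scalar sketch, assuming libavutil's av_clip_uint8():

    #include <stdint.h>
    #include <stddef.h>
    #include "libavutil/common.h"

    static void pred16x16_plane(uint8_t *src, ptrdiff_t stride)
    {
        const uint8_t *top  = src - stride;     /* row above the block    */
        const uint8_t *left = src - 1;          /* column left of the block */
        int H = 0, V = 0;

        for (int i = 0; i < 8; i++) {
            H += (i + 1) * (top[8 + i] - top[6 - i]);
            V += (i + 1) * (left[(8 + i) * stride] - left[(6 - i) * stride]);
        }

        {
            int a = 16 * (left[15 * stride] + top[15]);
            int b = (5 * H + 32) >> 6;
            int c = (5 * V + 32) >> 6;

            for (int y = 0; y < 16; y++, src += stride)
                for (int x = 0; x < 16; x++)
                    src[x] = av_clip_uint8((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
        }
    }
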
-function ff_pred8x8_hor_neon, export=1
- sub r2, r0, #1
- mov r3, #8
-1: vld1.8 {d0[]}, [r2], r1
- vst1.8 {d0}, [r0,:64], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-function ff_pred8x8_vert_neon, export=1
- sub r0, r0, r1
- vld1.8 {d0}, [r0,:64], r1
- mov r3, #4
-1: vst1.8 {d0}, [r0,:64], r1
- vst1.8 {d0}, [r0,:64], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-function ff_pred8x8_plane_neon, export=1
- sub r3, r0, r1
- add r2, r3, #4
- sub r3, r3, #1
- vld1.32 {d0[0]}, [r3]
- vld1.32 {d2[0]}, [r2,:32], r1
- ldcol.8 d0, r3, r1, 4, hi=1
- add r3, r3, r1
- ldcol.8 d3, r3, r1, 4
- vaddl.u8 q8, d2, d3
- vrev32.8 d0, d0
- vtrn.32 d2, d3
- vsubl.u8 q2, d2, d0
- movrel r3, p16weight
- vld1.16 {q0}, [r3,:128]
- vmul.s16 d4, d4, d0
- vmul.s16 d5, d5, d0
- vpadd.i16 d4, d4, d5
- vpaddl.s16 d4, d4
- vshl.i32 d5, d4, #4
- vadd.s32 d4, d4, d5
- vrshrn.s32 d4, q2, #5
- mov r3, #0
- vtrn.16 d4, d5
- vadd.i16 d2, d4, d5
- vshl.i16 d3, d2, #2
- vrev64.16 d16, d16
- vsub.i16 d3, d3, d2
- vadd.i16 d16, d16, d0
- vshl.i16 d2, d16, #4
- vsub.i16 d2, d2, d3
- vshl.i16 d3, d4, #3
- vext.16 q0, q0, q0, #7
- vsub.i16 d6, d5, d3
- vmov.16 d0[0], r3
- vmul.i16 q0, q0, d4[0]
- vdup.16 q1, d2[0]
- vdup.16 q2, d4[0]
- vdup.16 q3, d6[0]
- vshl.i16 q2, q2, #3
- vadd.i16 q1, q1, q0
- vadd.i16 q3, q3, q2
- mov r3, #8
-1:
- vqshrun.s16 d0, q1, #5
- vadd.i16 q1, q1, q3
- vst1.8 {d0}, [r0,:64], r1
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-function ff_pred8x8_128_dc_neon, export=1
- vmov.i8 q0, #128
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_top_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {d0}, [r2,:64]
- vpaddl.u8 d0, d0
- vpadd.u16 d0, d0, d0
- vrshrn.u16 d0, q0, #2
- vdup.8 d1, d0[1]
- vdup.8 d0, d0[0]
- vtrn.32 d0, d1
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_left_dc_neon, export=1
- sub r2, r0, #1
- ldcol.8 d0, r2, r1
- vpaddl.u8 d0, d0
- vpadd.u16 d0, d0, d0
- vrshrn.u16 d0, q0, #2
- vdup.8 d1, d0[1]
- vdup.8 d0, d0[0]
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {d0}, [r2,:64]
- sub r2, r0, #1
- ldcol.8 d1, r2, r1
- vtrn.32 d0, d1
- vpaddl.u8 q0, q0
- vpadd.u16 d0, d0, d1
- vpadd.u16 d1, d0, d0
- vrshrn.u16 d2, q0, #3
- vrshrn.u16 d3, q0, #2
- vdup.8 d0, d2[4]
- vdup.8 d1, d3[3]
- vdup.8 d4, d3[2]
- vdup.8 d5, d2[5]
- vtrn.32 q0, q2
-.L_pred8x8_dc_end:
- mov r3, #4
- add r2, r0, r1, lsl #2
-6: vst1.8 {d0}, [r0,:64], r1
- vst1.8 {d1}, [r2,:64], r1
- subs r3, r3, #1
- bne 6b
- bx lr
-endfunc
-
-function ff_pred8x8_l0t_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {d0}, [r2,:64]
- sub r2, r0, #1
- ldcol.8 d1, r2, r1, 4
- vtrn.32 d0, d1
- vpaddl.u8 q0, q0
- vpadd.u16 d0, d0, d1
- vpadd.u16 d1, d0, d0
- vrshrn.u16 d2, q0, #3
- vrshrn.u16 d3, q0, #2
- vdup.8 d0, d2[4]
- vdup.8 d1, d3[0]
- vdup.8 q2, d3[2]
- vtrn.32 q0, q2
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_l00_dc_neon, export=1
- sub r2, r0, #1
- ldcol.8 d0, r2, r1, 4
- vpaddl.u8 d0, d0
- vpadd.u16 d0, d0, d0
- vrshrn.u16 d0, q0, #2
- vmov.i8 d1, #128
- vdup.8 d0, d0[0]
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_0lt_dc_neon, export=1
- sub r2, r0, r1
- vld1.8 {d0}, [r2,:64]
- add r2, r0, r1, lsl #2
- sub r2, r2, #1
- ldcol.8 d1, r2, r1, 4, hi=1
- vtrn.32 d0, d1
- vpaddl.u8 q0, q0
- vpadd.u16 d0, d0, d1
- vpadd.u16 d1, d0, d0
- vrshrn.u16 d3, q0, #2
- vrshrn.u16 d2, q0, #3
- vdup.8 d0, d3[0]
- vdup.8 d1, d3[3]
- vdup.8 d4, d3[2]
- vdup.8 d5, d2[5]
- vtrn.32 q0, q2
- b .L_pred8x8_dc_end
-endfunc
-
-function ff_pred8x8_0l0_dc_neon, export=1
- add r2, r0, r1, lsl #2
- sub r2, r2, #1
- ldcol.8 d1, r2, r1, 4
- vpaddl.u8 d2, d1
- vpadd.u16 d2, d2, d2
- vrshrn.u16 d1, q1, #2
- vmov.i8 d0, #128
- vdup.8 d1, d1[0]
- b .L_pred8x8_dc_end
-endfunc
diff --git a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
deleted file mode 100644
index eaa1324..0000000
--- a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/h264qpel.h"
-
-void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
-
-void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
-
-void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
-
-void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
-void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
-
-av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth)
-{
- const int high_bit_depth = bit_depth > 8;
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags) && !high_bit_depth) {
- c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
- c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
- c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
- c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
- c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
- c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
- c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
- c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
- c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
- c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
- c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
- c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
- c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
- c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
- c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
- c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
-
- c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
- c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
- c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
- c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
- c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
- c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
- c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
- c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
- c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
- c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
- c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
- c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
- c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
- c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
- c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
- c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
-
- c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
- c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
- c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
- c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
- c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
- c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
- c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
- c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
- c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
- c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
- c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
- c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
- c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
- c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
- c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
- c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
-
- c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
- c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
- c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
- c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
- c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
- c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
- c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
- c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
- c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
- c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
- c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
- c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
- c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
- c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
- c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
- c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/h264qpel_neon.S b/ffmpeg/libavcodec/arm/h264qpel_neon.S
deleted file mode 100644
index 21336c6..0000000
--- a/ffmpeg/libavcodec/arm/h264qpel_neon.S
+++ /dev/null
@@ -1,955 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "neon.S"
-
- /* H.264 qpel MC */
-
-.macro lowpass_const r
- movw \r, #5
- movt \r, #20
- vmov.32 d6[0], \r
-.endm
-
-.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
- .if \narrow
- t0 .req q0
- t1 .req q8
- .else
- t0 .req \d0
- t1 .req \d1
- .endif
- vext.8 d2, \r0, \r1, #2
- vext.8 d3, \r0, \r1, #3
- vaddl.u8 q1, d2, d3
- vext.8 d4, \r0, \r1, #1
- vext.8 d5, \r0, \r1, #4
- vaddl.u8 q2, d4, d5
- vext.8 d30, \r0, \r1, #5
- vaddl.u8 t0, \r0, d30
- vext.8 d18, \r2, \r3, #2
- vmla.i16 t0, q1, d6[1]
- vext.8 d19, \r2, \r3, #3
- vaddl.u8 q9, d18, d19
- vext.8 d20, \r2, \r3, #1
- vmls.i16 t0, q2, d6[0]
- vext.8 d21, \r2, \r3, #4
- vaddl.u8 q10, d20, d21
- vext.8 d31, \r2, \r3, #5
- vaddl.u8 t1, \r2, d31
- vmla.i16 t1, q9, d6[1]
- vmls.i16 t1, q10, d6[0]
- .if \narrow
- vqrshrun.s16 \d0, t0, #5
- vqrshrun.s16 \d1, t1, #5
- .endif
- .unreq t0
- .unreq t1
-.endm
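
For reference, lowpass_8 above is the standard H.264 six-tap half-pel filter (taps 1, -5, 20, 20, -5, 1) applied to two rows at once; lowpass_const packs the 5 and 20 multipliers into d6 so vmla/vmls can use them as scalar operands, and vqrshrun.s16 #5 performs the rounding shift with unsigned saturation. A scalar C sketch of one output pixel (the helper name is illustrative, not part of the file):

    #include <stdint.h>

    /* One 6-tap half-pel output: s points at the leftmost of the six
     * source pixels; (v + 16) >> 5 with clipping matches vqrshrun #5. */
    static inline uint8_t h264_lowpass_px(const uint8_t *s)
    {
        int v = s[0] - 5*s[1] + 20*s[2] + 20*s[3] - 5*s[4] + s[5];
        v = (v + 16) >> 5;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }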
-
-.macro lowpass_8_1 r0, r1, d0, narrow=1
- .if \narrow
- t0 .req q0
- .else
- t0 .req \d0
- .endif
- vext.8 d2, \r0, \r1, #2
- vext.8 d3, \r0, \r1, #3
- vaddl.u8 q1, d2, d3
- vext.8 d4, \r0, \r1, #1
- vext.8 d5, \r0, \r1, #4
- vaddl.u8 q2, d4, d5
- vext.8 d30, \r0, \r1, #5
- vaddl.u8 t0, \r0, d30
- vmla.i16 t0, q1, d6[1]
- vmls.i16 t0, q2, d6[0]
- .if \narrow
- vqrshrun.s16 \d0, t0, #5
- .endif
- .unreq t0
-.endm
-
-.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
- vext.16 q1, \r0, \r1, #2
- vext.16 q0, \r0, \r1, #3
- vaddl.s16 q9, d2, d0
- vext.16 q2, \r0, \r1, #1
- vaddl.s16 q1, d3, d1
- vext.16 q3, \r0, \r1, #4
- vaddl.s16 q10, d4, d6
- vext.16 \r1, \r0, \r1, #5
- vaddl.s16 q2, d5, d7
- vaddl.s16 q0, \h0, \h1
- vaddl.s16 q8, \l0, \l1
-
- vshl.i32 q3, q9, #4
- vshl.i32 q9, q9, #2
- vshl.i32 q15, q10, #2
- vadd.i32 q9, q9, q3
- vadd.i32 q10, q10, q15
-
- vshl.i32 q3, q1, #4
- vshl.i32 q1, q1, #2
- vshl.i32 q15, q2, #2
- vadd.i32 q1, q1, q3
- vadd.i32 q2, q2, q15
-
- vadd.i32 q9, q9, q8
- vsub.i32 q9, q9, q10
-
- vadd.i32 q1, q1, q0
- vsub.i32 q1, q1, q2
-
- vrshrn.s32 d18, q9, #10
- vrshrn.s32 d19, q1, #10
-
- vqmovun.s16 \d, q9
-.endm
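
lowpass_8.16 runs the same six taps a second time over the 16-bit intermediates left by a first horizontal pass: the *20 multiplier is synthesized as (x << 4) + (x << 2), accumulation is widened to 32 bits, and vrshrn.s32 #10 plus vqmovun.s16 give the (v + 512) >> 10 rounding of the two-dimensional case. A scalar sketch, same caveats as above:

    #include <stdint.h>

    /* Second filter pass over 16-bit intermediates (hv position). */
    static inline uint8_t h264_lowpass2_px(const int16_t *s)
    {
        int32_t v = s[0] - 5*s[1] + 20*s[2] + 20*s[3] - 5*s[4] + s[5];
        v = (v + 512) >> 10;
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }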
-
-function put_h264_qpel16_h_lowpass_neon_packed
- mov r4, lr
- mov r12, #16
- mov r3, #8
- bl put_h264_qpel8_h_lowpass_neon
- sub r1, r1, r2, lsl #4
- add r1, r1, #8
- mov r12, #16
- mov lr, r4
- b put_h264_qpel8_h_lowpass_neon
-endfunc
-
-.macro h264_qpel_h_lowpass type
-function \type\()_h264_qpel16_h_lowpass_neon
- push {lr}
- mov r12, #16
- bl \type\()_h264_qpel8_h_lowpass_neon
- sub r0, r0, r3, lsl #4
- sub r1, r1, r2, lsl #4
- add r0, r0, #8
- add r1, r1, #8
- mov r12, #16
- pop {lr}
-endfunc
-
-function \type\()_h264_qpel8_h_lowpass_neon
-1: vld1.8 {d0, d1}, [r1], r2
- vld1.8 {d16,d17}, [r1], r2
- subs r12, r12, #2
- lowpass_8 d0, d1, d16, d17, d0, d16
- .ifc \type,avg
- vld1.8 {d2}, [r0,:64], r3
- vrhadd.u8 d0, d0, d2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 d16, d16, d3
- sub r0, r0, r3
- .endif
- vst1.8 {d0}, [r0,:64], r3
- vst1.8 {d16}, [r0,:64], r3
- bne 1b
- bx lr
-endfunc
-.endm
-
- h264_qpel_h_lowpass put
- h264_qpel_h_lowpass avg
-
-.macro h264_qpel_h_lowpass_l2 type
-function \type\()_h264_qpel16_h_lowpass_l2_neon
- push {lr}
- mov r12, #16
- bl \type\()_h264_qpel8_h_lowpass_l2_neon
- sub r0, r0, r2, lsl #4
- sub r1, r1, r2, lsl #4
- sub r3, r3, r2, lsl #4
- add r0, r0, #8
- add r1, r1, #8
- add r3, r3, #8
- mov r12, #16
- pop {lr}
-endfunc
-
-function \type\()_h264_qpel8_h_lowpass_l2_neon
-1: vld1.8 {d0, d1}, [r1], r2
- vld1.8 {d16,d17}, [r1], r2
- vld1.8 {d28}, [r3], r2
- vld1.8 {d29}, [r3], r2
- subs r12, r12, #2
- lowpass_8 d0, d1, d16, d17, d0, d1
- vrhadd.u8 q0, q0, q14
- .ifc \type,avg
- vld1.8 {d2}, [r0,:64], r2
- vrhadd.u8 d0, d0, d2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 d1, d1, d3
- sub r0, r0, r2
- .endif
- vst1.8 {d0}, [r0,:64], r2
- vst1.8 {d1}, [r0,:64], r2
- bne 1b
- bx lr
-endfunc
-.endm
-
- h264_qpel_h_lowpass_l2 put
- h264_qpel_h_lowpass_l2 avg
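
The _l2 ("second source") variants rounding-average the filtered row with an unfiltered row loaded via r3; this is how the quarter-pel positions are built, e.g. mc10 passes the source itself as the second operand and mc30 passes the source shifted one pixel. Per pixel, with illustrative names:

    #include <stdint.h>

    /* Quarter-pel sample: half-pel filter output averaged with a
     * neighbouring integer (or half) sample, rounding up. */
    static inline uint8_t h264_qpel_px(uint8_t halfpel, uint8_t neighbour)
    {
        return (uint8_t)((halfpel + neighbour + 1) >> 1);
    }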
-
-function put_h264_qpel16_v_lowpass_neon_packed
- mov r4, lr
- mov r2, #8
- bl put_h264_qpel8_v_lowpass_neon
- sub r1, r1, r3, lsl #2
- bl put_h264_qpel8_v_lowpass_neon
- sub r1, r1, r3, lsl #4
- sub r1, r1, r3, lsl #2
- add r1, r1, #8
- bl put_h264_qpel8_v_lowpass_neon
- sub r1, r1, r3, lsl #2
- mov lr, r4
- b put_h264_qpel8_v_lowpass_neon
-endfunc
-
-.macro h264_qpel_v_lowpass type
-function \type\()_h264_qpel16_v_lowpass_neon
- mov r4, lr
- bl \type\()_h264_qpel8_v_lowpass_neon
- sub r1, r1, r3, lsl #2
- bl \type\()_h264_qpel8_v_lowpass_neon
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- sub r1, r1, r3, lsl #4
- sub r1, r1, r3, lsl #2
- add r1, r1, #8
- bl \type\()_h264_qpel8_v_lowpass_neon
- sub r1, r1, r3, lsl #2
- mov lr, r4
-endfunc
-
-function \type\()_h264_qpel8_v_lowpass_neon
- vld1.8 {d8}, [r1], r3
- vld1.8 {d10}, [r1], r3
- vld1.8 {d12}, [r1], r3
- vld1.8 {d14}, [r1], r3
- vld1.8 {d22}, [r1], r3
- vld1.8 {d24}, [r1], r3
- vld1.8 {d26}, [r1], r3
- vld1.8 {d28}, [r1], r3
- vld1.8 {d9}, [r1], r3
- vld1.8 {d11}, [r1], r3
- vld1.8 {d13}, [r1], r3
- vld1.8 {d15}, [r1], r3
- vld1.8 {d23}, [r1]
-
- transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
- lowpass_8 d8, d9, d10, d11, d8, d10
- lowpass_8 d12, d13, d14, d15, d12, d14
- lowpass_8 d22, d23, d24, d25, d22, d24
- lowpass_8 d26, d27, d28, d29, d26, d28
- transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
-
- .ifc \type,avg
- vld1.8 {d9}, [r0,:64], r2
- vrhadd.u8 d8, d8, d9
- vld1.8 {d11}, [r0,:64], r2
- vrhadd.u8 d10, d10, d11
- vld1.8 {d13}, [r0,:64], r2
- vrhadd.u8 d12, d12, d13
- vld1.8 {d15}, [r0,:64], r2
- vrhadd.u8 d14, d14, d15
- vld1.8 {d23}, [r0,:64], r2
- vrhadd.u8 d22, d22, d23
- vld1.8 {d25}, [r0,:64], r2
- vrhadd.u8 d24, d24, d25
- vld1.8 {d27}, [r0,:64], r2
- vrhadd.u8 d26, d26, d27
- vld1.8 {d29}, [r0,:64], r2
- vrhadd.u8 d28, d28, d29
- sub r0, r0, r2, lsl #3
- .endif
-
- vst1.8 {d8}, [r0,:64], r2
- vst1.8 {d10}, [r0,:64], r2
- vst1.8 {d12}, [r0,:64], r2
- vst1.8 {d14}, [r0,:64], r2
- vst1.8 {d22}, [r0,:64], r2
- vst1.8 {d24}, [r0,:64], r2
- vst1.8 {d26}, [r0,:64], r2
- vst1.8 {d28}, [r0,:64], r2
-
- bx lr
-endfunc
-.endm
-
- h264_qpel_v_lowpass put
- h264_qpel_v_lowpass avg
-
-.macro h264_qpel_v_lowpass_l2 type
-function \type\()_h264_qpel16_v_lowpass_l2_neon
- mov r4, lr
- bl \type\()_h264_qpel8_v_lowpass_l2_neon
- sub r1, r1, r3, lsl #2
- bl \type\()_h264_qpel8_v_lowpass_l2_neon
- sub r0, r0, r3, lsl #4
- sub r12, r12, r2, lsl #4
- add r0, r0, #8
- add r12, r12, #8
- sub r1, r1, r3, lsl #4
- sub r1, r1, r3, lsl #2
- add r1, r1, #8
- bl \type\()_h264_qpel8_v_lowpass_l2_neon
- sub r1, r1, r3, lsl #2
- mov lr, r4
-endfunc
-
-function \type\()_h264_qpel8_v_lowpass_l2_neon
- vld1.8 {d8}, [r1], r3
- vld1.8 {d10}, [r1], r3
- vld1.8 {d12}, [r1], r3
- vld1.8 {d14}, [r1], r3
- vld1.8 {d22}, [r1], r3
- vld1.8 {d24}, [r1], r3
- vld1.8 {d26}, [r1], r3
- vld1.8 {d28}, [r1], r3
- vld1.8 {d9}, [r1], r3
- vld1.8 {d11}, [r1], r3
- vld1.8 {d13}, [r1], r3
- vld1.8 {d15}, [r1], r3
- vld1.8 {d23}, [r1]
-
- transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
- lowpass_8 d8, d9, d10, d11, d8, d9
- lowpass_8 d12, d13, d14, d15, d12, d13
- lowpass_8 d22, d23, d24, d25, d22, d23
- lowpass_8 d26, d27, d28, d29, d26, d27
- transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
-
- vld1.8 {d0}, [r12], r2
- vld1.8 {d1}, [r12], r2
- vld1.8 {d2}, [r12], r2
- vld1.8 {d3}, [r12], r2
- vld1.8 {d4}, [r12], r2
- vrhadd.u8 q0, q0, q4
- vld1.8 {d5}, [r12], r2
- vrhadd.u8 q1, q1, q6
- vld1.8 {d10}, [r12], r2
- vrhadd.u8 q2, q2, q11
- vld1.8 {d11}, [r12], r2
- vrhadd.u8 q5, q5, q13
-
- .ifc \type,avg
- vld1.8 {d16}, [r0,:64], r3
- vrhadd.u8 d0, d0, d16
- vld1.8 {d17}, [r0,:64], r3
- vrhadd.u8 d1, d1, d17
- vld1.8 {d16}, [r0,:64], r3
- vrhadd.u8 d2, d2, d16
- vld1.8 {d17}, [r0,:64], r3
- vrhadd.u8 d3, d3, d17
- vld1.8 {d16}, [r0,:64], r3
- vrhadd.u8 d4, d4, d16
- vld1.8 {d17}, [r0,:64], r3
- vrhadd.u8 d5, d5, d17
- vld1.8 {d16}, [r0,:64], r3
- vrhadd.u8 d10, d10, d16
- vld1.8 {d17}, [r0,:64], r3
- vrhadd.u8 d11, d11, d17
- sub r0, r0, r3, lsl #3
- .endif
-
- vst1.8 {d0}, [r0,:64], r3
- vst1.8 {d1}, [r0,:64], r3
- vst1.8 {d2}, [r0,:64], r3
- vst1.8 {d3}, [r0,:64], r3
- vst1.8 {d4}, [r0,:64], r3
- vst1.8 {d5}, [r0,:64], r3
- vst1.8 {d10}, [r0,:64], r3
- vst1.8 {d11}, [r0,:64], r3
-
- bx lr
-endfunc
-.endm
-
- h264_qpel_v_lowpass_l2 put
- h264_qpel_v_lowpass_l2 avg
-
-function put_h264_qpel8_hv_lowpass_neon_top
- lowpass_const r12
- mov r12, #12
-1: vld1.8 {d0, d1}, [r1], r3
- vld1.8 {d16,d17}, [r1], r3
- subs r12, r12, #2
- lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
- vst1.8 {d22-d25}, [r4,:128]!
- bne 1b
-
- vld1.8 {d0, d1}, [r1]
- lowpass_8_1 d0, d1, q12, narrow=0
-
- mov r12, #-16
- add r4, r4, r12
- vld1.8 {d30,d31}, [r4,:128], r12
- vld1.8 {d20,d21}, [r4,:128], r12
- vld1.8 {d18,d19}, [r4,:128], r12
- vld1.8 {d16,d17}, [r4,:128], r12
- vld1.8 {d14,d15}, [r4,:128], r12
- vld1.8 {d12,d13}, [r4,:128], r12
- vld1.8 {d10,d11}, [r4,:128], r12
- vld1.8 {d8, d9}, [r4,:128], r12
- vld1.8 {d6, d7}, [r4,:128], r12
- vld1.8 {d4, d5}, [r4,:128], r12
- vld1.8 {d2, d3}, [r4,:128], r12
- vld1.8 {d0, d1}, [r4,:128]
-
- swap4 d1, d3, d5, d7, d8, d10, d12, d14
- transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
-
- swap4 d17, d19, d21, d31, d24, d26, d28, d22
- transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
-
- vst1.8 {d30,d31}, [r4,:128]!
- vst1.8 {d6, d7}, [r4,:128]!
- vst1.8 {d20,d21}, [r4,:128]!
- vst1.8 {d4, d5}, [r4,:128]!
- vst1.8 {d18,d19}, [r4,:128]!
- vst1.8 {d2, d3}, [r4,:128]!
- vst1.8 {d16,d17}, [r4,:128]!
- vst1.8 {d0, d1}, [r4,:128]
-
- lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
- lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
- lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
- lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
-
- vld1.8 {d16,d17}, [r4,:128], r12
- vld1.8 {d30,d31}, [r4,:128], r12
- lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
- vld1.8 {d16,d17}, [r4,:128], r12
- vld1.8 {d30,d31}, [r4,:128], r12
- lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
- vld1.8 {d16,d17}, [r4,:128], r12
- vld1.8 {d30,d31}, [r4,:128], r12
- lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
- vld1.8 {d16,d17}, [r4,:128], r12
- vld1.8 {d30,d31}, [r4,:128]
- lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
-
- transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
-
- bx lr
-endfunc
-
-.macro h264_qpel8_hv_lowpass type
-function \type\()_h264_qpel8_hv_lowpass_neon
- mov r10, lr
- bl put_h264_qpel8_hv_lowpass_neon_top
- .ifc \type,avg
- vld1.8 {d0}, [r0,:64], r2
- vrhadd.u8 d12, d12, d0
- vld1.8 {d1}, [r0,:64], r2
- vrhadd.u8 d13, d13, d1
- vld1.8 {d2}, [r0,:64], r2
- vrhadd.u8 d14, d14, d2
- vld1.8 {d3}, [r0,:64], r2
- vrhadd.u8 d15, d15, d3
- vld1.8 {d4}, [r0,:64], r2
- vrhadd.u8 d8, d8, d4
- vld1.8 {d5}, [r0,:64], r2
- vrhadd.u8 d9, d9, d5
- vld1.8 {d6}, [r0,:64], r2
- vrhadd.u8 d10, d10, d6
- vld1.8 {d7}, [r0,:64], r2
- vrhadd.u8 d11, d11, d7
- sub r0, r0, r2, lsl #3
- .endif
-
- vst1.8 {d12}, [r0,:64], r2
- vst1.8 {d13}, [r0,:64], r2
- vst1.8 {d14}, [r0,:64], r2
- vst1.8 {d15}, [r0,:64], r2
- vst1.8 {d8}, [r0,:64], r2
- vst1.8 {d9}, [r0,:64], r2
- vst1.8 {d10}, [r0,:64], r2
- vst1.8 {d11}, [r0,:64], r2
-
- mov lr, r10
- bx lr
-endfunc
-.endm
-
- h264_qpel8_hv_lowpass put
- h264_qpel8_hv_lowpass avg
-
-.macro h264_qpel8_hv_lowpass_l2 type
-function \type\()_h264_qpel8_hv_lowpass_l2_neon
- mov r10, lr
- bl put_h264_qpel8_hv_lowpass_neon_top
-
- vld1.8 {d0, d1}, [r2,:128]!
- vld1.8 {d2, d3}, [r2,:128]!
- vrhadd.u8 q0, q0, q6
- vld1.8 {d4, d5}, [r2,:128]!
- vrhadd.u8 q1, q1, q7
- vld1.8 {d6, d7}, [r2,:128]!
- vrhadd.u8 q2, q2, q4
- vrhadd.u8 q3, q3, q5
- .ifc \type,avg
- vld1.8 {d16}, [r0,:64], r3
- vrhadd.u8 d0, d0, d16
- vld1.8 {d17}, [r0,:64], r3
- vrhadd.u8 d1, d1, d17
- vld1.8 {d18}, [r0,:64], r3
- vrhadd.u8 d2, d2, d18
- vld1.8 {d19}, [r0,:64], r3
- vrhadd.u8 d3, d3, d19
- vld1.8 {d20}, [r0,:64], r3
- vrhadd.u8 d4, d4, d20
- vld1.8 {d21}, [r0,:64], r3
- vrhadd.u8 d5, d5, d21
- vld1.8 {d22}, [r0,:64], r3
- vrhadd.u8 d6, d6, d22
- vld1.8 {d23}, [r0,:64], r3
- vrhadd.u8 d7, d7, d23
- sub r0, r0, r3, lsl #3
- .endif
- vst1.8 {d0}, [r0,:64], r3
- vst1.8 {d1}, [r0,:64], r3
- vst1.8 {d2}, [r0,:64], r3
- vst1.8 {d3}, [r0,:64], r3
- vst1.8 {d4}, [r0,:64], r3
- vst1.8 {d5}, [r0,:64], r3
- vst1.8 {d6}, [r0,:64], r3
- vst1.8 {d7}, [r0,:64], r3
-
- mov lr, r10
- bx lr
-endfunc
-.endm
-
- h264_qpel8_hv_lowpass_l2 put
- h264_qpel8_hv_lowpass_l2 avg
-
-.macro h264_qpel16_hv type
-function \type\()_h264_qpel16_hv_lowpass_neon
- mov r9, lr
- bl \type\()_h264_qpel8_hv_lowpass_neon
- sub r1, r1, r3, lsl #2
- bl \type\()_h264_qpel8_hv_lowpass_neon
- sub r1, r1, r3, lsl #4
- sub r1, r1, r3, lsl #2
- add r1, r1, #8
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- bl \type\()_h264_qpel8_hv_lowpass_neon
- sub r1, r1, r3, lsl #2
- mov lr, r9
- b \type\()_h264_qpel8_hv_lowpass_neon
-endfunc
-
-function \type\()_h264_qpel16_hv_lowpass_l2_neon
- mov r9, lr
- sub r2, r4, #256
- bl \type\()_h264_qpel8_hv_lowpass_l2_neon
- sub r1, r1, r3, lsl #2
- bl \type\()_h264_qpel8_hv_lowpass_l2_neon
- sub r1, r1, r3, lsl #4
- sub r1, r1, r3, lsl #2
- add r1, r1, #8
- sub r0, r0, r3, lsl #4
- add r0, r0, #8
- bl \type\()_h264_qpel8_hv_lowpass_l2_neon
- sub r1, r1, r3, lsl #2
- mov lr, r9
- b \type\()_h264_qpel8_hv_lowpass_l2_neon
-endfunc
-.endm
-
- h264_qpel16_hv put
- h264_qpel16_hv avg
-
-.macro h264_qpel8 type
-function ff_\type\()_h264_qpel8_mc10_neon, export=1
- lowpass_const r3
- mov r3, r1
- sub r1, r1, #2
- mov r12, #8
- b \type\()_h264_qpel8_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc20_neon, export=1
- lowpass_const r3
- sub r1, r1, #2
- mov r3, r2
- mov r12, #8
- b \type\()_h264_qpel8_h_lowpass_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc30_neon, export=1
- lowpass_const r3
- add r3, r1, #1
- sub r1, r1, #2
- mov r12, #8
- b \type\()_h264_qpel8_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel8_mc01_neon, export=1
- push {lr}
- mov r12, r1
-\type\()_h264_qpel8_mc01:
- lowpass_const r3
- mov r3, r2
- sub r1, r1, r2, lsl #1
- vpush {d8-d15}
- bl \type\()_h264_qpel8_v_lowpass_l2_neon
- vpop {d8-d15}
- pop {pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc11_neon, export=1
- push {r0, r1, r11, lr}
-\type\()_h264_qpel8_mc11:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #64
- mov r0, sp
- sub r1, r1, #2
- mov r3, #8
- mov r12, #8
- vpush {d8-d15}
- bl put_h264_qpel8_h_lowpass_neon
- ldrd r0, r1, [r11], #8
- mov r3, r2
- add r12, sp, #64
- sub r1, r1, r2, lsl #1
- mov r2, #8
- bl \type\()_h264_qpel8_v_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc21_neon, export=1
- push {r0, r1, r4, r10, r11, lr}
-\type\()_h264_qpel8_mc21:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #(8*8+16*12)
- sub r1, r1, #2
- mov r3, #8
- mov r0, sp
- mov r12, #8
- vpush {d8-d15}
- bl put_h264_qpel8_h_lowpass_neon
- mov r4, r0
- ldrd r0, r1, [r11], #8
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, r2
- sub r2, r4, #64
- bl \type\()_h264_qpel8_hv_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc31_neon, export=1
- add r1, r1, #1
- push {r0, r1, r11, lr}
- sub r1, r1, #1
- b \type\()_h264_qpel8_mc11
-endfunc
-
-function ff_\type\()_h264_qpel8_mc02_neon, export=1
- push {lr}
- lowpass_const r3
- sub r1, r1, r2, lsl #1
- mov r3, r2
- vpush {d8-d15}
- bl \type\()_h264_qpel8_v_lowpass_neon
- vpop {d8-d15}
- pop {pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc12_neon, export=1
- push {r0, r1, r4, r10, r11, lr}
-\type\()_h264_qpel8_mc12:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #(8*8+16*12)
- sub r1, r1, r2, lsl #1
- mov r3, r2
- mov r2, #8
- mov r0, sp
- vpush {d8-d15}
- bl put_h264_qpel8_v_lowpass_neon
- mov r4, r0
- ldrd r0, r1, [r11], #8
- sub r1, r1, r3, lsl #1
- sub r1, r1, #2
- sub r2, r4, #64
- bl \type\()_h264_qpel8_hv_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc22_neon, export=1
- push {r4, r10, r11, lr}
- mov r11, sp
-A bic sp, sp, #15
-T bic r4, r11, #15
-T mov sp, r4
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, r2
- sub sp, sp, #(16*12)
- mov r4, sp
- vpush {d8-d15}
- bl \type\()_h264_qpel8_hv_lowpass_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4, r10, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel8_mc32_neon, export=1
- push {r0, r1, r4, r10, r11, lr}
- add r1, r1, #1
- b \type\()_h264_qpel8_mc12
-endfunc
-
-function ff_\type\()_h264_qpel8_mc03_neon, export=1
- push {lr}
- add r12, r1, r2
- b \type\()_h264_qpel8_mc01
-endfunc
-
-function ff_\type\()_h264_qpel8_mc13_neon, export=1
- push {r0, r1, r11, lr}
- add r1, r1, r2
- b \type\()_h264_qpel8_mc11
-endfunc
-
-function ff_\type\()_h264_qpel8_mc23_neon, export=1
- push {r0, r1, r4, r10, r11, lr}
- add r1, r1, r2
- b \type\()_h264_qpel8_mc21
-endfunc
-
-function ff_\type\()_h264_qpel8_mc33_neon, export=1
- add r1, r1, #1
- push {r0, r1, r11, lr}
- add r1, r1, r2
- sub r1, r1, #1
- b \type\()_h264_qpel8_mc11
-endfunc
-.endm
-
- h264_qpel8 put
- h264_qpel8 avg
-
-.macro h264_qpel16 type
-function ff_\type\()_h264_qpel16_mc10_neon, export=1
- lowpass_const r3
- mov r3, r1
- sub r1, r1, #2
- b \type\()_h264_qpel16_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc20_neon, export=1
- lowpass_const r3
- sub r1, r1, #2
- mov r3, r2
- b \type\()_h264_qpel16_h_lowpass_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc30_neon, export=1
- lowpass_const r3
- add r3, r1, #1
- sub r1, r1, #2
- b \type\()_h264_qpel16_h_lowpass_l2_neon
-endfunc
-
-function ff_\type\()_h264_qpel16_mc01_neon, export=1
- push {r4, lr}
- mov r12, r1
-\type\()_h264_qpel16_mc01:
- lowpass_const r3
- mov r3, r2
- sub r1, r1, r2, lsl #1
- vpush {d8-d15}
- bl \type\()_h264_qpel16_v_lowpass_l2_neon
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc11_neon, export=1
- push {r0, r1, r4, r11, lr}
-\type\()_h264_qpel16_mc11:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #256
- mov r0, sp
- sub r1, r1, #2
- mov r3, #16
- vpush {d8-d15}
- bl put_h264_qpel16_h_lowpass_neon
- ldrd r0, r1, [r11], #8
- mov r3, r2
- add r12, sp, #64
- sub r1, r1, r2, lsl #1
- mov r2, #16
- bl \type\()_h264_qpel16_v_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4, r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc21_neon, export=1
- push {r0, r1, r4-r5, r9-r11, lr}
-\type\()_h264_qpel16_mc21:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #(16*16+16*12)
- sub r1, r1, #2
- mov r0, sp
- vpush {d8-d15}
- bl put_h264_qpel16_h_lowpass_neon_packed
- mov r4, r0
- ldrd r0, r1, [r11], #8
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, r2
- bl \type\()_h264_qpel16_hv_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4-r5, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc31_neon, export=1
- add r1, r1, #1
- push {r0, r1, r4, r11, lr}
- sub r1, r1, #1
- b \type\()_h264_qpel16_mc11
-endfunc
-
-function ff_\type\()_h264_qpel16_mc02_neon, export=1
- push {r4, lr}
- lowpass_const r3
- sub r1, r1, r2, lsl #1
- mov r3, r2
- vpush {d8-d15}
- bl \type\()_h264_qpel16_v_lowpass_neon
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc12_neon, export=1
- push {r0, r1, r4-r5, r9-r11, lr}
-\type\()_h264_qpel16_mc12:
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r0, r11, #15
-T mov sp, r0
- sub sp, sp, #(16*16+16*12)
- sub r1, r1, r2, lsl #1
- mov r0, sp
- mov r3, r2
- vpush {d8-d15}
- bl put_h264_qpel16_v_lowpass_neon_packed
- mov r4, r0
- ldrd r0, r1, [r11], #8
- sub r1, r1, r3, lsl #1
- sub r1, r1, #2
- mov r2, r3
- bl \type\()_h264_qpel16_hv_lowpass_l2_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4-r5, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc22_neon, export=1
- push {r4, r9-r11, lr}
- lowpass_const r3
- mov r11, sp
-A bic sp, sp, #15
-T bic r4, r11, #15
-T mov sp, r4
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, r2
- sub sp, sp, #(16*12)
- mov r4, sp
- vpush {d8-d15}
- bl \type\()_h264_qpel16_hv_lowpass_neon
- vpop {d8-d15}
- mov sp, r11
- pop {r4, r9-r11, pc}
-endfunc
-
-function ff_\type\()_h264_qpel16_mc32_neon, export=1
- push {r0, r1, r4-r5, r9-r11, lr}
- add r1, r1, #1
- b \type\()_h264_qpel16_mc12
-endfunc
-
-function ff_\type\()_h264_qpel16_mc03_neon, export=1
- push {r4, lr}
- add r12, r1, r2
- b \type\()_h264_qpel16_mc01
-endfunc
-
-function ff_\type\()_h264_qpel16_mc13_neon, export=1
- push {r0, r1, r4, r11, lr}
- add r1, r1, r2
- b \type\()_h264_qpel16_mc11
-endfunc
-
-function ff_\type\()_h264_qpel16_mc23_neon, export=1
- push {r0, r1, r4-r5, r9-r11, lr}
- add r1, r1, r2
- b \type\()_h264_qpel16_mc21
-endfunc
-
-function ff_\type\()_h264_qpel16_mc33_neon, export=1
- add r1, r1, #1
- push {r0, r1, r4, r11, lr}
- add r1, r1, r2
- sub r1, r1, #1
- b \type\()_h264_qpel16_mc11
-endfunc
-.endm
-
- h264_qpel16 put
- h264_qpel16 avg
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.S b/ffmpeg/libavcodec/arm/hpeldsp_arm.S
deleted file mode 100644
index 2f3d311..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_arm.S
+++ /dev/null
@@ -1,611 +0,0 @@
-@
-@ ARMv4 optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of FFmpeg.
-@
-@ FFmpeg is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ FFmpeg is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with FFmpeg; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-#if !HAVE_ARMV5TE_EXTERNAL
-#define pld @
-#endif
-
-.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
- mov \Rd0, \Rn0, lsr #(\shift * 8)
- mov \Rd1, \Rn1, lsr #(\shift * 8)
- mov \Rd2, \Rn2, lsr #(\shift * 8)
- mov \Rd3, \Rn3, lsr #(\shift * 8)
- orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
- orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
- orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
- orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD shift, R0, R1, R2
- mov \R0, \R0, lsr #(\shift * 8)
- orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
- mov \R1, \R1, lsr #(\shift * 8)
- orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
-.endm
-.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
- mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
- mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
- orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
- orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
-.endm
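
The ALIGN_* macros reconstruct unaligned data from word-aligned ldm loads: each destination word is the current source word shifted right by the misalignment, OR-ed with the next word shifted left by the remainder (JMP_ALIGN below selects the 1/2/3-byte variant from the low bits of the pointer, which it then clears). Equivalent C for one word, shift given in bytes:

    #include <stdint.h>

    /* Re-align one 32-bit word read from a word-aligned buffer;
     * shift is the misalignment in bytes, 1..3. */
    static inline uint32_t realign_word(uint32_t lo, uint32_t hi, int shift)
    {
        return (lo >> (shift * 8)) | (hi << (32 - shift * 8));
    }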
-
-.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroyed
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- orr \Rn0, \Rn0, \Rm0
- orr \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- sub \Rd0, \Rn0, \Rd0, lsr #1
- sub \Rd1, \Rn1, \Rd1, lsr #1
-.endm
-
-.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
- @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
- @ Rmask = 0xFEFEFEFE
- @ Rn = destroyed
- eor \Rd0, \Rn0, \Rm0
- eor \Rd1, \Rn1, \Rm1
- and \Rn0, \Rn0, \Rm0
- and \Rn1, \Rn1, \Rm1
- and \Rd0, \Rd0, \Rmask
- and \Rd1, \Rd1, \Rmask
- add \Rd0, \Rn0, \Rd0, lsr #1
- add \Rd1, \Rn1, \Rd1, lsr #1
-.endm
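
Both macros are the usual SWAR byte-averaging identities: the 0xFEFEFEFE mask clears each byte's low bit before the shift so nothing borrows or carries across lanes, giving (a + b + 1) >> 1 or (a + b) >> 1 per byte, four bytes per word. In C (these correspond to the rnd_avg32/no_rnd_avg32 helpers in libavcodec/rnd_avg.h, included by hpeldsp_init_arm.c below):

    #include <stdint.h>

    static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEu) >> 1);  /* round up */
    }

    static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
    {
        return (a & b) + (((a ^ b) & 0xFEFEFEFEu) >> 1);  /* round down */
    }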
-
-.macro JMP_ALIGN tmp, reg
- ands \tmp, \reg, #3
- bic \reg, \reg, #3
- beq 1f
- subs \tmp, \tmp, #1
- beq 2f
- subs \tmp, \tmp, #1
- beq 3f
- b 4f
-.endm
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels16_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r11, lr}
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r7}
- add r1, r1, r2
- stm r0, {r4-r7}
- pld [r1]
- subs r3, r3, #1
- add r0, r0, r2
- bne 1b
- pop {r4-r11, pc}
- .align 5
-2:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 2b
- pop {r4-r11, pc}
- .align 5
-3:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 3b
- pop {r4-r11, pc}
- .align 5
-4:
- ldm r1, {r4-r8}
- add r1, r1, r2
- ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
- pld [r1]
- subs r3, r3, #1
- stm r0, {r9-r12}
- add r0, r0, r2
- bne 4b
- pop {r4-r11,pc}
-endfunc
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r5,lr}
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
- subs r3, r3, #1
- pld [r1]
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 1b
- pop {r4-r5,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 1, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r5,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 2, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r5,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r12}
- add r1, r1, r2
- ALIGN_DWORD 3, r4, r5, r12
- pld [r1]
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 4b
- pop {r4-r5,pc}
-endfunc
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r10,lr}
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- pop {r4-r10,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r10,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r10,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- pop {r4-r10,pc}
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_x2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r10,lr}
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 1b
- pop {r4-r10,pc}
- .align 5
-2:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 2b
- pop {r4-r10,pc}
- .align 5
-3:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
- ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
- subs r3, r3, #1
- stm r0, {r4-r5}
- add r0, r0, r2
- bne 3b
- pop {r4-r10,pc}
- .align 5
-4:
- ldm r1, {r4-r5, r10}
- add r1, r1, r2
- ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 4b
- pop {r4-r10,pc}
-endfunc
-
-
-@ ----------------------------------------------------------------
- .align 5
-function ff_put_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r11,lr}
- mov r3, r3, lsr #1
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
-6: ldm r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldm r1, {r4-r5}
- add r1, r1, r2
- stm r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-2:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-3:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-4:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r7, r8, r9
- RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
- subs r3, r3, #1
- RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_y2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r11,lr}
- mov r3, r3, lsr #1
- ldr r12, =0xfefefefe
- JMP_ALIGN r5, r1
-1:
- ldm r1, {r4-r5}
- add r1, r1, r2
-6: ldm r1, {r6-r7}
- add r1, r1, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
- ldm r1, {r4-r5}
- add r1, r1, r2
- stm r0, {r8-r9}
- add r0, r0, r2
- pld [r1]
- NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
- subs r3, r3, #1
- stm r0, {r8-r9}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-2:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 1, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-3:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 2, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
- .align 5
-4:
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
-6: ldm r1, {r7-r9}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r7, r8, r9
- NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- ldm r1, {r4-r6}
- add r1, r1, r2
- pld [r1]
- ALIGN_DWORD 3, r4, r5, r6
- subs r3, r3, #1
- NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
- stm r0, {r10-r11}
- add r0, r0, r2
- bne 6b
- pop {r4-r11,pc}
-endfunc
-
- .ltorg
-
-@ ----------------------------------------------------------------
-.macro RND_XY2_IT align, rnd
- @ l1 = (a & 0x03030303) + (b & 0x03030303) (+ 0x02020202 when rounding)
- @ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
-.if \align == 0
- ldm r1, {r6-r8}
-.elseif \align == 3
- ldm r1, {r5-r7}
-.else
- ldm r1, {r8-r10}
-.endif
- add r1, r1, r2
- pld [r1]
-.if \align == 0
- ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
-.elseif \align == 1
- ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
- ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
-.elseif \align == 2
- ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
- ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
-.elseif \align == 3
- ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
-.endif
- ldr r14, =0x03030303
- tst r3, #1
- and r8, r4, r14
- and r9, r5, r14
- and r10, r6, r14
- and r11, r7, r14
- it eq
- andeq r14, r14, r14, \rnd #1
- add r8, r8, r10
- add r9, r9, r11
- ldr r12, =0xfcfcfcfc >> 2
- itt eq
- addeq r8, r8, r14
- addeq r9, r9, r14
- and r4, r12, r4, lsr #2
- and r5, r12, r5, lsr #2
- and r6, r12, r6, lsr #2
- and r7, r12, r7, lsr #2
- add r10, r4, r6
- add r11, r5, r7
- subs r3, r3, #1
-.endm
-
-.macro RND_XY2_EXPAND align, rnd
- RND_XY2_IT \align, \rnd
-6: push {r8-r11}
- RND_XY2_IT \align, \rnd
- pop {r4-r7}
- add r4, r4, r8
- add r5, r5, r9
- ldr r14, =0x0f0f0f0f
- add r6, r6, r10
- add r7, r7, r11
- and r4, r14, r4, lsr #2
- and r5, r14, r5, lsr #2
- add r4, r4, r6
- add r5, r5, r7
- stm r0, {r4-r5}
- add r0, r0, r2
- bge 6b
- pop {r4-r11,pc}
-.endm
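
RND_XY2_IT/RND_XY2_EXPAND implement the diagonal (xy2) case, a 2x2 box average per pixel. Each byte is split into its low two bits (0x03030303 mask) and its high six bits (>> 2, 0xFCFCFCFC mask) so four partial sums fit in a word without lane overflow; the alternating tst/andeq sequence adds the rounding constant exactly once per output row, 0x02020202 for the rounding variant (rnd=lsl) and 0x01010101 for no_rnd (rnd=lsr). The per-pixel arithmetic reduces to:

    #include <stdint.h>

    /* 2x2 box filter; rnd is 2 (rounded) or 1 (no-round variant). */
    static inline uint8_t avg2x2_px(uint8_t a, uint8_t b,
                                    uint8_t c, uint8_t d, int rnd)
    {
        return (uint8_t)((a + b + c + d + rnd) >> 2);
    }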
-
- .align 5
-function ff_put_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r11,lr} @ R14 is also called LR
- JMP_ALIGN r5, r1
-1: RND_XY2_EXPAND 0, lsl
- .align 5
-2: RND_XY2_EXPAND 1, lsl
- .align 5
-3: RND_XY2_EXPAND 2, lsl
- .align 5
-4: RND_XY2_EXPAND 3, lsl
-endfunc
-
- .align 5
-function ff_put_no_rnd_pixels8_xy2_arm, export=1
- @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
- @ block = word aligned, pixels = unaligned
- pld [r1]
- push {r4-r11,lr}
- JMP_ALIGN r5, r1
-1: RND_XY2_EXPAND 0, lsr
- .align 5
-2: RND_XY2_EXPAND 1, lsr
- .align 5
-3: RND_XY2_EXPAND 2, lsr
- .align 5
-4: RND_XY2_EXPAND 3, lsr
-endfunc
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.h b/ffmpeg/libavcodec/arm/hpeldsp_arm.h
deleted file mode 100644
index 3f18c62..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_arm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_HPELDSP_H
-#define AVCODEC_ARM_HPELDSP_H
-
-#include "libavcodec/hpeldsp.h"
-
-void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
-void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
-
-#endif /* AVCODEC_ARM_HPELDSP_H */
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
deleted file mode 100644
index cd50150..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro call_2x_pixels type, subp
-function ff_\type\()_pixels16\subp\()_armv6, export=1
- push {r0-r3, lr}
- bl ff_\type\()_pixels8\subp\()_armv6
- pop {r0-r3, lr}
- add r0, r0, #8
- add r1, r1, #8
- b ff_\type\()_pixels8\subp\()_armv6
-endfunc
-.endm
-
-call_2x_pixels avg
-call_2x_pixels put, _x2
-call_2x_pixels put, _y2
-call_2x_pixels put, _x2_no_rnd
-call_2x_pixels put, _y2_no_rnd
-
-function ff_put_pixels16_armv6, export=1
- push {r4-r11}
-1:
- ldr r5, [r1, #4]
- ldr r6, [r1, #8]
- ldr r7, [r1, #12]
- ldr_post r4, r1, r2
- strd r6, r7, [r0, #8]
- ldr r9, [r1, #4]
- strd_post r4, r5, r0, r2
- ldr r10, [r1, #8]
- ldr r11, [r1, #12]
- ldr_post r8, r1, r2
- strd r10, r11, [r0, #8]
- subs r3, r3, #2
- strd_post r8, r9, r0, r2
- bne 1b
-
- pop {r4-r11}
- bx lr
-endfunc
-
-function ff_put_pixels8_armv6, export=1
- push {r4-r7}
-1:
- ldr r5, [r1, #4]
- ldr_post r4, r1, r2
- ldr r7, [r1, #4]
- strd_post r4, r5, r0, r2
- ldr_post r6, r1, r2
- subs r3, r3, #2
- strd_post r6, r7, r0, r2
- bne 1b
-
- pop {r4-r7}
- bx lr
-endfunc
-
-function ff_put_pixels8_x2_armv6, export=1
- push {r4-r11, lr}
- mov r12, #1
- orr r12, r12, r12, lsl #8
- orr r12, r12, r12, lsl #16
-1:
- ldr r4, [r1]
- subs r3, r3, #2
- ldr r5, [r1, #4]
- ldr r7, [r1, #5]
- lsr r6, r4, #8
- ldr_pre r8, r1, r2
- orr r6, r6, r5, lsl #24
- ldr r9, [r1, #4]
- ldr r11, [r1, #5]
- lsr r10, r8, #8
- add r1, r1, r2
- orr r10, r10, r9, lsl #24
- eor r14, r4, r6
- uhadd8 r4, r4, r6
- eor r6, r5, r7
- uhadd8 r5, r5, r7
- and r14, r14, r12
- and r6, r6, r12
- uadd8 r4, r4, r14
- eor r14, r8, r10
- uadd8 r5, r5, r6
- eor r6, r9, r11
- uhadd8 r8, r8, r10
- and r14, r14, r12
- uhadd8 r9, r9, r11
- and r6, r6, r12
- uadd8 r8, r8, r14
- strd_post r4, r5, r0, r2
- uadd8 r9, r9, r6
- strd_post r8, r9, r0, r2
- bne 1b
-
- pop {r4-r11, pc}
-endfunc
-
-function ff_put_pixels8_y2_armv6, export=1
- push {r4-r11}
- mov r12, #1
- orr r12, r12, r12, lsl #8
- orr r12, r12, r12, lsl #16
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr_pre r6, r1, r2
- ldr r7, [r1, #4]
-1:
- subs r3, r3, #2
- uhadd8 r8, r4, r6
- eor r10, r4, r6
- uhadd8 r9, r5, r7
- eor r11, r5, r7
- and r10, r10, r12
- ldr_pre r4, r1, r2
- uadd8 r8, r8, r10
- and r11, r11, r12
- uadd8 r9, r9, r11
- ldr r5, [r1, #4]
- uhadd8 r10, r4, r6
- eor r6, r4, r6
- uhadd8 r11, r5, r7
- and r6, r6, r12
- eor r7, r5, r7
- uadd8 r10, r10, r6
- and r7, r7, r12
- ldr_pre r6, r1, r2
- uadd8 r11, r11, r7
- strd_post r8, r9, r0, r2
- ldr r7, [r1, #4]
- strd_post r10, r11, r0, r2
- bne 1b
-
- pop {r4-r11}
- bx lr
-endfunc
-
-function ff_put_pixels8_x2_no_rnd_armv6, export=1
- push {r4-r9, lr}
-1:
- subs r3, r3, #2
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr r7, [r1, #5]
- ldr_pre r8, r1, r2
- ldr r9, [r1, #4]
- ldr r14, [r1, #5]
- add r1, r1, r2
- lsr r6, r4, #8
- orr r6, r6, r5, lsl #24
- lsr r12, r8, #8
- orr r12, r12, r9, lsl #24
- uhadd8 r4, r4, r6
- uhadd8 r5, r5, r7
- uhadd8 r8, r8, r12
- uhadd8 r9, r9, r14
- stm r0, {r4,r5}
- add r0, r0, r2
- stm r0, {r8,r9}
- add r0, r0, r2
- bne 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_put_pixels8_y2_no_rnd_armv6, export=1
- push {r4-r9, lr}
- ldr r4, [r1]
- ldr r5, [r1, #4]
- ldr_pre r6, r1, r2
- ldr r7, [r1, #4]
-1:
- subs r3, r3, #2
- uhadd8 r8, r4, r6
- ldr_pre r4, r1, r2
- uhadd8 r9, r5, r7
- ldr r5, [r1, #4]
- uhadd8 r12, r4, r6
- ldr_pre r6, r1, r2
- uhadd8 r14, r5, r7
- ldr r7, [r1, #4]
- stm r0, {r8,r9}
- add r0, r0, r2
- stm r0, {r12,r14}
- add r0, r0, r2
- bne 1b
-
- pop {r4-r9, pc}
-endfunc
-
-function ff_avg_pixels8_armv6, export=1
- pld [r1, r2]
- push {r4-r10, lr}
- mov lr, #1
- orr lr, lr, lr, lsl #8
- orr lr, lr, lr, lsl #16
- ldrd r4, r5, [r0]
- ldr r10, [r1, #4]
- ldr_post r9, r1, r2
- subs r3, r3, #2
-1:
- pld [r1, r2]
- eor r8, r4, r9
- uhadd8 r4, r4, r9
- eor r12, r5, r10
- ldrd_reg r6, r7, r0, r2
- uhadd8 r5, r5, r10
- and r8, r8, lr
- ldr r10, [r1, #4]
- and r12, r12, lr
- uadd8 r4, r4, r8
- ldr_post r9, r1, r2
- eor r8, r6, r9
- uadd8 r5, r5, r12
- pld [r1, r2, lsl #1]
- eor r12, r7, r10
- uhadd8 r6, r6, r9
- strd_post r4, r5, r0, r2
- uhadd8 r7, r7, r10
- beq 2f
- and r8, r8, lr
- ldrd_reg r4, r5, r0, r2
- uadd8 r6, r6, r8
- ldr r10, [r1, #4]
- and r12, r12, lr
- subs r3, r3, #2
- uadd8 r7, r7, r12
- ldr_post r9, r1, r2
- strd_post r6, r7, r0, r2
- b 1b
-2:
- and r8, r8, lr
- and r12, r12, lr
- uadd8 r6, r6, r8
- uadd8 r7, r7, r12
- strd_post r6, r7, r0, r2
-
- pop {r4-r10, pc}
-endfunc
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
deleted file mode 100644
index 2cc2b78..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * ARM optimized DSP utils
- * Copyright (c) 2001 Lionel Ulmer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/cpu.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS
-#include "libavcodec/rnd_avg.h"
-#include "hpeldsp_arm.h"
-
-void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
-
-CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
-CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
-CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
-CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm, 8)
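
CALL_2X_PIXELS comes from the bit_depth_template.c include above and builds each 16-pixel-wide entry point out of its 8-pixel counterpart; each invocation expands to roughly the following (sketch, not the exact macro body):

    /* Approximate expansion of CALL_2X_PIXELS(wide, narrow, 8). */
    static void ff_put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
    {
        ff_put_pixels8_x2_arm(block,     pixels,     line_size, h);
        ff_put_pixels8_x2_arm(block + 8, pixels + 8, line_size, h);
    }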
-
-av_cold void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
-{
- int cpu_flags = av_get_cpu_flags();
-
- c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
- c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
- c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
- c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
- c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
- c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
-
- if (have_armv6(cpu_flags))
- ff_hpeldsp_init_armv6(c, flags);
- if (have_neon(cpu_flags))
- ff_hpeldsp_init_neon(c, flags);
-}
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
deleted file mode 100644
index 967a8e0..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "hpeldsp_arm.h"
-
-void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
-{
- c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
-/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
- c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
-/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
-/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
-/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
-
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
-}
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
deleted file mode 100644
index d9feadd..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "hpeldsp_arm.h"
-
-void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
-
-av_cold void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
-{
- c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
- c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
- c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
- c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
- c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
- c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
- c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
-
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
- c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
- c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
- c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
-
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
- c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
- c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
- c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
- c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
- c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
- c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
-
- c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
- c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
- c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
- c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
-}
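
The initializer above wires the NEON half-pel routines into two-dimensional dispatch tables: the first index selects the block width (0 = 16 pixels, 1 = 8 pixels) and the second the half-pel position (0 = aligned copy, 1 = x half-pel, 2 = y half-pel, 3 = x+y half-pel). A minimal C sketch of how a motion-compensation caller typically indexes such a table, assuming a simplified, hypothetical stand-in for HpelDSPContext:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*op_pixels_func)(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t line_size, int h);

    struct hpel_ctx {                    /* hypothetical stand-in for HpelDSPContext */
        op_pixels_func put_pixels_tab[2][4];
    };

    /* Dispatch on block size and half-pel position, as MC callers do:
     * idx 0 = aligned copy, 1 = x half-pel, 2 = y, 3 = x+y. */
    static void copy_block(struct hpel_ctx *c, uint8_t *dst, const uint8_t *src,
                           ptrdiff_t stride, int is_8x8, int dx_half, int dy_half)
    {
        int idx = (dy_half << 1) | dx_half;
        c->put_pixels_tab[is_8x8][idx](dst, src, stride, is_8x8 ? 8 : 16);
    }
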
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_neon.S b/ffmpeg/libavcodec/arm/hpeldsp_neon.S
deleted file mode 100644
index cf4a6cf..0000000
--- a/ffmpeg/libavcodec/arm/hpeldsp_neon.S
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro pixels16 rnd=1, avg=0
- .if \avg
- mov r12, r0
- .endif
-1: vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
- vld1.8 {q2}, [r1], r2
- pld [r1, r2, lsl #2]
- vld1.8 {q3}, [r1], r2
- pld [r1]
- pld [r1, r2]
- pld [r1, r2, lsl #1]
- .if \avg
- vld1.8 {q8}, [r12,:128], r2
- vrhadd.u8 q0, q0, q8
- vld1.8 {q9}, [r12,:128], r2
- vrhadd.u8 q1, q1, q9
- vld1.8 {q10}, [r12,:128], r2
- vrhadd.u8 q2, q2, q10
- vld1.8 {q11}, [r12,:128], r2
- vrhadd.u8 q3, q3, q11
- .endif
- subs r3, r3, #4
- vst1.64 {q0}, [r0,:128], r2
- vst1.64 {q1}, [r0,:128], r2
- vst1.64 {q2}, [r0,:128], r2
- vst1.64 {q3}, [r0,:128], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels16_x2 rnd=1, avg=0
-1: vld1.8 {d0-d2}, [r1], r2
- vld1.8 {d4-d6}, [r1], r2
- pld [r1]
- pld [r1, r2]
- subs r3, r3, #2
- vext.8 q1, q0, q1, #1
- avg q0, q0, q1
- vext.8 q3, q2, q3, #1
- avg q2, q2, q3
- .if \avg
- vld1.8 {q1}, [r0,:128], r2
- vld1.8 {q3}, [r0,:128]
- vrhadd.u8 q0, q0, q1
- vrhadd.u8 q2, q2, q3
- sub r0, r0, r2
- .endif
- vst1.8 {q0}, [r0,:128], r2
- vst1.8 {q2}, [r0,:128], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels16_y2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
-1: subs r3, r3, #2
- avg q2, q0, q1
- vld1.8 {q0}, [r1], r2
- avg q3, q0, q1
- vld1.8 {q1}, [r1], r2
- pld [r1]
- pld [r1, r2]
- .if \avg
- vld1.8 {q8}, [r0,:128], r2
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q2, q2, q8
- vrhadd.u8 q3, q3, q9
- sub r0, r0, r2
- .endif
- vst1.8 {q2}, [r0,:128], r2
- vst1.8 {q3}, [r0,:128], r2
- bne 1b
-
- avg q2, q0, q1
- vld1.8 {q0}, [r1], r2
- avg q3, q0, q1
- .if \avg
- vld1.8 {q8}, [r0,:128], r2
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q2, q2, q8
- vrhadd.u8 q3, q3, q9
- sub r0, r0, r2
- .endif
- vst1.8 {q2}, [r0,:128], r2
- vst1.8 {q3}, [r0,:128], r2
-
- bx lr
-.endm
-
-.macro pixels16_xy2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {d0-d2}, [r1], r2
- vld1.8 {d4-d6}, [r1], r2
-NRND vmov.i16 q13, #1
- pld [r1]
- pld [r1, r2]
- vext.8 q1, q0, q1, #1
- vext.8 q3, q2, q3, #1
- vaddl.u8 q8, d0, d2
- vaddl.u8 q10, d1, d3
- vaddl.u8 q9, d4, d6
- vaddl.u8 q11, d5, d7
-1: subs r3, r3, #2
- vld1.8 {d0-d2}, [r1], r2
- vadd.u16 q12, q8, q9
- pld [r1]
-NRND vadd.u16 q12, q12, q13
- vext.8 q15, q0, q1, #1
- vadd.u16 q1 , q10, q11
- shrn d28, q12, #2
-NRND vadd.u16 q1, q1, q13
- shrn d29, q1, #2
- .if \avg
- vld1.8 {q8}, [r0,:128]
- vrhadd.u8 q14, q14, q8
- .endif
- vaddl.u8 q8, d0, d30
- vld1.8 {d2-d4}, [r1], r2
- vaddl.u8 q10, d1, d31
- vst1.8 {q14}, [r0,:128], r2
- vadd.u16 q12, q8, q9
- pld [r1, r2]
-NRND vadd.u16 q12, q12, q13
- vext.8 q2, q1, q2, #1
- vadd.u16 q0, q10, q11
- shrn d30, q12, #2
-NRND vadd.u16 q0, q0, q13
- shrn d31, q0, #2
- .if \avg
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q15, q15, q9
- .endif
- vaddl.u8 q9, d2, d4
- vaddl.u8 q11, d3, d5
- vst1.8 {q15}, [r0,:128], r2
- bgt 1b
-
- vld1.8 {d0-d2}, [r1], r2
- vadd.u16 q12, q8, q9
-NRND vadd.u16 q12, q12, q13
- vext.8 q15, q0, q1, #1
- vadd.u16 q1 , q10, q11
- shrn d28, q12, #2
-NRND vadd.u16 q1, q1, q13
- shrn d29, q1, #2
- .if \avg
- vld1.8 {q8}, [r0,:128]
- vrhadd.u8 q14, q14, q8
- .endif
- vaddl.u8 q8, d0, d30
- vaddl.u8 q10, d1, d31
- vst1.8 {q14}, [r0,:128], r2
- vadd.u16 q12, q8, q9
-NRND vadd.u16 q12, q12, q13
- vadd.u16 q0, q10, q11
- shrn d30, q12, #2
-NRND vadd.u16 q0, q0, q13
- shrn d31, q0, #2
- .if \avg
- vld1.8 {q9}, [r0,:128]
- vrhadd.u8 q15, q15, q9
- .endif
- vst1.8 {q15}, [r0,:128], r2
-
- bx lr
-.endm
-
-.macro pixels8 rnd=1, avg=0
-1: vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
- vld1.8 {d2}, [r1], r2
- pld [r1, r2, lsl #2]
- vld1.8 {d3}, [r1], r2
- pld [r1]
- pld [r1, r2]
- pld [r1, r2, lsl #1]
- .if \avg
- vld1.8 {d4}, [r0,:64], r2
- vrhadd.u8 d0, d0, d4
- vld1.8 {d5}, [r0,:64], r2
- vrhadd.u8 d1, d1, d5
- vld1.8 {d6}, [r0,:64], r2
- vrhadd.u8 d2, d2, d6
- vld1.8 {d7}, [r0,:64], r2
- vrhadd.u8 d3, d3, d7
- sub r0, r0, r2, lsl #2
- .endif
- subs r3, r3, #4
- vst1.8 {d0}, [r0,:64], r2
- vst1.8 {d1}, [r0,:64], r2
- vst1.8 {d2}, [r0,:64], r2
- vst1.8 {d3}, [r0,:64], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels8_x2 rnd=1, avg=0
-1: vld1.8 {q0}, [r1], r2
- vext.8 d1, d0, d1, #1
- vld1.8 {q1}, [r1], r2
- vext.8 d3, d2, d3, #1
- pld [r1]
- pld [r1, r2]
- subs r3, r3, #2
- vswp d1, d2
- avg q0, q0, q1
- .if \avg
- vld1.8 {d4}, [r0,:64], r2
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 q0, q0, q2
- sub r0, r0, r2
- .endif
- vst1.8 {d0}, [r0,:64], r2
- vst1.8 {d1}, [r0,:64], r2
- bne 1b
- bx lr
-.endm
-
-.macro pixels8_y2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {d0}, [r1], r2
- vld1.8 {d1}, [r1], r2
-1: subs r3, r3, #2
- avg d4, d0, d1
- vld1.8 {d0}, [r1], r2
- avg d5, d0, d1
- vld1.8 {d1}, [r1], r2
- pld [r1]
- pld [r1, r2]
- .if \avg
- vld1.8 {d2}, [r0,:64], r2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 q2, q2, q1
- sub r0, r0, r2
- .endif
- vst1.8 {d4}, [r0,:64], r2
- vst1.8 {d5}, [r0,:64], r2
- bne 1b
-
- avg d4, d0, d1
- vld1.8 {d0}, [r1], r2
- avg d5, d0, d1
- .if \avg
- vld1.8 {d2}, [r0,:64], r2
- vld1.8 {d3}, [r0,:64]
- vrhadd.u8 q2, q2, q1
- sub r0, r0, r2
- .endif
- vst1.8 {d4}, [r0,:64], r2
- vst1.8 {d5}, [r0,:64], r2
-
- bx lr
-.endm
-
-.macro pixels8_xy2 rnd=1, avg=0
- sub r3, r3, #2
- vld1.8 {q0}, [r1], r2
- vld1.8 {q1}, [r1], r2
-NRND vmov.i16 q11, #1
- pld [r1]
- pld [r1, r2]
- vext.8 d4, d0, d1, #1
- vext.8 d6, d2, d3, #1
- vaddl.u8 q8, d0, d4
- vaddl.u8 q9, d2, d6
-1: subs r3, r3, #2
- vld1.8 {q0}, [r1], r2
- pld [r1]
- vadd.u16 q10, q8, q9
- vext.8 d4, d0, d1, #1
-NRND vadd.u16 q10, q10, q11
- vaddl.u8 q8, d0, d4
- shrn d5, q10, #2
- vld1.8 {q1}, [r1], r2
- vadd.u16 q10, q8, q9
- pld [r1, r2]
- .if \avg
- vld1.8 {d7}, [r0,:64]
- vrhadd.u8 d5, d5, d7
- .endif
-NRND vadd.u16 q10, q10, q11
- vst1.8 {d5}, [r0,:64], r2
- shrn d7, q10, #2
- .if \avg
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 d7, d7, d5
- .endif
- vext.8 d6, d2, d3, #1
- vaddl.u8 q9, d2, d6
- vst1.8 {d7}, [r0,:64], r2
- bgt 1b
-
- vld1.8 {q0}, [r1], r2
- vadd.u16 q10, q8, q9
- vext.8 d4, d0, d1, #1
-NRND vadd.u16 q10, q10, q11
- vaddl.u8 q8, d0, d4
- shrn d5, q10, #2
- vadd.u16 q10, q8, q9
- .if \avg
- vld1.8 {d7}, [r0,:64]
- vrhadd.u8 d5, d5, d7
- .endif
-NRND vadd.u16 q10, q10, q11
- vst1.8 {d5}, [r0,:64], r2
- shrn d7, q10, #2
- .if \avg
- vld1.8 {d5}, [r0,:64]
- vrhadd.u8 d7, d7, d5
- .endif
- vst1.8 {d7}, [r0,:64], r2
-
- bx lr
-.endm
-
-.macro pixfunc pfx, name, suf, rnd=1, avg=0
- .if \rnd
- .macro avg rd, rn, rm
- vrhadd.u8 \rd, \rn, \rm
- .endm
- .macro shrn rd, rn, rm
- vrshrn.u16 \rd, \rn, \rm
- .endm
- .macro NRND insn:vararg
- .endm
- .else
- .macro avg rd, rn, rm
- vhadd.u8 \rd, \rn, \rm
- .endm
- .macro shrn rd, rn, rm
- vshrn.u16 \rd, \rn, \rm
- .endm
- .macro NRND insn:vararg
- \insn
- .endm
- .endif
-function ff_\pfx\name\suf\()_neon, export=1
- \name \rnd, \avg
-endfunc
- .purgem avg
- .purgem shrn
- .purgem NRND
-.endm
-
-.macro pixfunc2 pfx, name, avg=0
- pixfunc \pfx, \name, rnd=1, avg=\avg
- pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
-.endm
-
-function ff_put_h264_qpel16_mc00_neon, export=1
- mov r3, #16
-endfunc
-
- pixfunc put_, pixels16, avg=0
- pixfunc2 put_, pixels16_x2, avg=0
- pixfunc2 put_, pixels16_y2, avg=0
- pixfunc2 put_, pixels16_xy2, avg=0
-
-function ff_avg_h264_qpel16_mc00_neon, export=1
- mov r3, #16
-endfunc
-
- pixfunc avg_, pixels16, avg=1
- pixfunc2 avg_, pixels16_x2, avg=1
- pixfunc2 avg_, pixels16_y2, avg=1
- pixfunc2 avg_, pixels16_xy2, avg=1
-
-function ff_put_h264_qpel8_mc00_neon, export=1
- mov r3, #8
-endfunc
-
- pixfunc put_, pixels8, avg=0
- pixfunc2 put_, pixels8_x2, avg=0
- pixfunc2 put_, pixels8_y2, avg=0
- pixfunc2 put_, pixels8_xy2, avg=0
-
-function ff_avg_h264_qpel8_mc00_neon, export=1
- mov r3, #8
-endfunc
-
- pixfunc avg_, pixels8, avg=1
- pixfunc avg_, pixels8_x2, avg=1
- pixfunc avg_, pixels8_y2, avg=1
- pixfunc avg_, pixels8_xy2, avg=1
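
The pixfunc/pixfunc2 wrappers above instantiate each macro twice, swapping the rounded NEON forms (vrhadd.u8, vrshrn.u16) for truncating ones (vhadd.u8, and vshrn.u16 plus the NRND +1 bias) in the _no_rnd variants. A plain-C sketch of the four averaging formulas these choices produce (reference arithmetic only, not the FFmpeg C fallback itself):

    #include <stdint.h>

    /* 2-tap average: vrhadd.u8 vs vhadd.u8 */
    static inline uint8_t avg2_rnd(uint8_t a, uint8_t b)    { return (a + b + 1) >> 1; }
    static inline uint8_t avg2_no_rnd(uint8_t a, uint8_t b) { return (a + b)     >> 1; }

    /* 4-tap (xy2) average: the macros accumulate a+b+c+d in 16 bits, then
     * narrow with vrshrn (which adds 2 before >>2) in the rounded case, or
     * add the NRND constant 1 and narrow with vshrn otherwise. */
    static inline uint8_t avg4_rnd(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
    { return (a + b + c + d + 2) >> 2; }
    static inline uint8_t avg4_no_rnd(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
    { return (a + b + c + d + 1) >> 2; }
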
diff --git a/ffmpeg/libavcodec/arm/int_neon.S b/ffmpeg/libavcodec/arm/int_neon.S
deleted file mode 100644
index b3f5a69..0000000
--- a/ffmpeg/libavcodec/arm/int_neon.S
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * ARM NEON optimised integer operations
- * Copyright (c) 2009 Konstantin Shishkov
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
- .fpu neon
-
-function ff_scalarproduct_int16_neon, export=1
- vmov.i16 q0, #0
- vmov.i16 q1, #0
- vmov.i16 q2, #0
- vmov.i16 q3, #0
-1: vld1.16 {d16-d17}, [r0]!
- vld1.16 {d20-d21}, [r1,:128]!
- vmlal.s16 q0, d16, d20
- vld1.16 {d18-d19}, [r0]!
- vmlal.s16 q1, d17, d21
- vld1.16 {d22-d23}, [r1,:128]!
- vmlal.s16 q2, d18, d22
- vmlal.s16 q3, d19, d23
- subs r2, r2, #16
- bne 1b
-
- vpadd.s32 d16, d0, d1
- vpadd.s32 d17, d2, d3
- vpadd.s32 d18, d4, d5
- vpadd.s32 d19, d6, d7
- vpadd.s32 d0, d16, d17
- vpadd.s32 d1, d18, d19
- vpadd.s32 d2, d0, d1
- vpaddl.s32 d3, d2
- vmov.32 r0, d3[0]
- bx lr
-endfunc
-
-@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
-function ff_scalarproduct_and_madd_int16_neon, export=1
- vld1.16 {d28[],d29[]}, [sp]
- vmov.i16 q0, #0
- vmov.i16 q1, #0
- vmov.i16 q2, #0
- vmov.i16 q3, #0
- mov r12, r0
-
-1: vld1.16 {d16-d17}, [r0,:128]!
- vld1.16 {d18-d19}, [r1]!
- vld1.16 {d20-d21}, [r2]!
- vld1.16 {d22-d23}, [r0,:128]!
- vld1.16 {d24-d25}, [r1]!
- vld1.16 {d26-d27}, [r2]!
- vmul.s16 q10, q10, q14
- vmul.s16 q13, q13, q14
- vmlal.s16 q0, d16, d18
- vmlal.s16 q1, d17, d19
- vadd.s16 q10, q8, q10
- vadd.s16 q13, q11, q13
- vmlal.s16 q2, d22, d24
- vmlal.s16 q3, d23, d25
- vst1.16 {q10}, [r12,:128]!
- subs r3, r3, #16
- vst1.16 {q13}, [r12,:128]!
- bne 1b
-
- vpadd.s32 d16, d0, d1
- vpadd.s32 d17, d2, d3
- vpadd.s32 d18, d4, d5
- vpadd.s32 d19, d6, d7
- vpadd.s32 d0, d16, d17
- vpadd.s32 d1, d18, d19
- vpadd.s32 d2, d0, d1
- vpaddl.s32 d3, d2
- vmov.32 r0, d3[0]
- bx lr
-endfunc
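
Both functions above keep four independent 32-bit accumulator vectors (q0-q3) to hide multiply-accumulate latency, then reduce them with a tree of vpadd pairwise adds. A plain-C reference for the first one (a sketch; per the `subs r2, r2, #16`, the order argument is assumed to be a multiple of 16):

    #include <stdint.h>

    static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                         int order)
    {
        int32_t res = 0;               /* q0-q3 collapsed into one scalar */
        while (order--)
            res += *v1++ * *v2++;
        return res;
    }
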
diff --git a/ffmpeg/libavcodec/arm/jrevdct_arm.S b/ffmpeg/libavcodec/arm/jrevdct_arm.S
deleted file mode 100644
index f951e2a..0000000
--- a/ffmpeg/libavcodec/arm/jrevdct_arm.S
+++ /dev/null
@@ -1,383 +0,0 @@
-/*
- C-like prototype:
- void j_rev_dct_arm(DCTBLOCK data)
-
- With DCTBLOCK being a pointer to an array of 64 'signed shorts'
-
- Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
- IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-*/
-
-#include "libavutil/arm/asm.S"
-
-#define FIX_0_298631336 2446
-#define FIX_0_541196100 4433
-#define FIX_0_765366865 6270
-#define FIX_1_175875602 9633
-#define FIX_1_501321110 12299
-#define FIX_2_053119869 16819
-#define FIX_3_072711026 25172
-#define FIX_M_0_390180644 -3196
-#define FIX_M_0_899976223 -7373
-#define FIX_M_1_847759065 -15137
-#define FIX_M_1_961570560 -16069
-#define FIX_M_2_562915447 -20995
-#define FIX_0xFFFF 0xFFFF
-
-#define FIX_0_298631336_ID 0
-#define FIX_0_541196100_ID 4
-#define FIX_0_765366865_ID 8
-#define FIX_1_175875602_ID 12
-#define FIX_1_501321110_ID 16
-#define FIX_2_053119869_ID 20
-#define FIX_3_072711026_ID 24
-#define FIX_M_0_390180644_ID 28
-#define FIX_M_0_899976223_ID 32
-#define FIX_M_1_847759065_ID 36
-#define FIX_M_1_961570560_ID 40
-#define FIX_M_2_562915447_ID 44
-#define FIX_0xFFFF_ID 48
-
-function ff_j_rev_dct_arm, export=1
- push {r0, r4 - r11, lr}
-
- mov lr, r0 @ lr = pointer to the current row
- mov r12, #8 @ r12 = row-counter
- movrel r11, const_array @ r11 = base pointer to the constants array
-row_loop:
- ldrsh r0, [lr, # 0] @ r0 = 'd0'
- ldrsh r2, [lr, # 2] @ r2 = 'd2'
-
- @ Optimization for rows that have all items except the first set to 0
- @ (this works as the int16_t block is always 4-byte aligned)
- ldr r5, [lr, # 0]
- ldr r6, [lr, # 4]
- ldr r3, [lr, # 8]
- ldr r4, [lr, #12]
- orr r3, r3, r4
- orr r3, r3, r6
- orrs r5, r3, r5
- beq end_of_row_loop @ nothing to be done as ALL of them are '0'
- orrs r3, r3, r2
- beq empty_row
-
- ldrsh r1, [lr, # 8] @ r1 = 'd1'
- ldrsh r4, [lr, # 4] @ r4 = 'd4'
- ldrsh r6, [lr, # 6] @ r6 = 'd6'
-
- ldr r3, [r11, #FIX_0_541196100_ID]
- add r7, r2, r6
- ldr r5, [r11, #FIX_M_1_847759065_ID]
- mul r7, r3, r7 @ r7 = z1
- ldr r3, [r11, #FIX_0_765366865_ID]
- mla r6, r5, r6, r7 @ r6 = tmp2
- add r5, r0, r4 @ r5 = tmp0
- mla r2, r3, r2, r7 @ r2 = tmp3
- sub r3, r0, r4 @ r3 = tmp1
-
- add r0, r2, r5, lsl #13 @ r0 = tmp10
- rsb r2, r2, r5, lsl #13 @ r2 = tmp13
- add r4, r6, r3, lsl #13 @ r4 = tmp11
- rsb r3, r6, r3, lsl #13 @ r3 = tmp12
-
- push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
-
- ldrsh r3, [lr, #10] @ r3 = 'd3'
- ldrsh r5, [lr, #12] @ r5 = 'd5'
- ldrsh r7, [lr, #14] @ r7 = 'd7'
-
- add r0, r3, r5 @ r0 = 'z2'
- add r2, r1, r7 @ r2 = 'z1'
- add r4, r3, r7 @ r4 = 'z3'
- add r6, r1, r5 @ r6 = 'z4'
- ldr r9, [r11, #FIX_1_175875602_ID]
- add r8, r4, r6 @ r8 = z3 + z4
- ldr r10, [r11, #FIX_M_0_899976223_ID]
- mul r8, r9, r8 @ r8 = 'z5'
- ldr r9, [r11, #FIX_M_2_562915447_ID]
- mul r2, r10, r2 @ r2 = 'z1'
- ldr r10, [r11, #FIX_M_1_961570560_ID]
- mul r0, r9, r0 @ r0 = 'z2'
- ldr r9, [r11, #FIX_M_0_390180644_ID]
- mla r4, r10, r4, r8 @ r4 = 'z3'
- ldr r10, [r11, #FIX_0_298631336_ID]
- mla r6, r9, r6, r8 @ r6 = 'z4'
- ldr r9, [r11, #FIX_2_053119869_ID]
- mla r7, r10, r7, r2 @ r7 = tmp0 + z1
- ldr r10, [r11, #FIX_3_072711026_ID]
- mla r5, r9, r5, r0 @ r5 = tmp1 + z2
- ldr r9, [r11, #FIX_1_501321110_ID]
- mla r3, r10, r3, r0 @ r3 = tmp2 + z2
- add r7, r7, r4 @ r7 = tmp0
- mla r1, r9, r1, r2 @ r1 = tmp3 + z1
- add r5, r5, r6 @ r5 = tmp1
- add r3, r3, r4 @ r3 = tmp2
- add r1, r1, r6 @ r1 = tmp3
-
- pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
- @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
-
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
- add r8, r0, r1
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 0]
-
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
- sub r8, r0, r1
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #14]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
- add r8, r6, r3
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 2]
-
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
- sub r8, r6, r3
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #12]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
- add r8, r4, r5
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 4]
-
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
- sub r8, r4, r5
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, #10]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
- add r8, r2, r7
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 6]
-
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
- sub r8, r2, r7
- add r8, r8, #(1<<10)
- mov r8, r8, asr #11
- strh r8, [lr, # 8]
-
- @ End of row loop
- add lr, lr, #16
- subs r12, r12, #1
- bne row_loop
- beq start_column_loop
-
-empty_row:
- ldr r1, [r11, #FIX_0xFFFF_ID]
- mov r0, r0, lsl #2
- and r0, r0, r1
- add r0, r0, r0, lsl #16
- str r0, [lr, # 0]
- str r0, [lr, # 4]
- str r0, [lr, # 8]
- str r0, [lr, #12]
-
-end_of_row_loop:
- @ End of loop
- add lr, lr, #16
- subs r12, r12, #1
- bne row_loop
-
-start_column_loop:
- @ Start of column loop
- pop {lr}
- mov r12, #8
-column_loop:
- ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
- ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
- ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
- ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
-
- ldr r3, [r11, #FIX_0_541196100_ID]
- add r1, r2, r6
- ldr r5, [r11, #FIX_M_1_847759065_ID]
- mul r1, r3, r1 @ r1 = z1
- ldr r3, [r11, #FIX_0_765366865_ID]
- mla r6, r5, r6, r1 @ r6 = tmp2
- add r5, r0, r4 @ r5 = tmp0
- mla r2, r3, r2, r1 @ r2 = tmp3
- sub r3, r0, r4 @ r3 = tmp1
-
- add r0, r2, r5, lsl #13 @ r0 = tmp10
- rsb r2, r2, r5, lsl #13 @ r2 = tmp13
- add r4, r6, r3, lsl #13 @ r4 = tmp11
- rsb r6, r6, r3, lsl #13 @ r6 = tmp12
-
- ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
- ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
- ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
- ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
-
- @ Check for an empty odd column (happens about 20 to 25% of the time according to my stats)
- orr r9, r1, r3
- orr r10, r5, r7
- orrs r10, r9, r10
- beq empty_odd_column
-
- push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
-
- add r0, r3, r5 @ r0 = 'z2'
- add r2, r1, r7 @ r2 = 'z1'
- add r4, r3, r7 @ r4 = 'z3'
- add r6, r1, r5 @ r6 = 'z4'
- ldr r9, [r11, #FIX_1_175875602_ID]
- add r8, r4, r6
- ldr r10, [r11, #FIX_M_0_899976223_ID]
- mul r8, r9, r8 @ r8 = 'z5'
- ldr r9, [r11, #FIX_M_2_562915447_ID]
- mul r2, r10, r2 @ r2 = 'z1'
- ldr r10, [r11, #FIX_M_1_961570560_ID]
- mul r0, r9, r0 @ r0 = 'z2'
- ldr r9, [r11, #FIX_M_0_390180644_ID]
- mla r4, r10, r4, r8 @ r4 = 'z3'
- ldr r10, [r11, #FIX_0_298631336_ID]
- mla r6, r9, r6, r8 @ r6 = 'z4'
- ldr r9, [r11, #FIX_2_053119869_ID]
- mla r7, r10, r7, r2 @ r7 = tmp0 + z1
- ldr r10, [r11, #FIX_3_072711026_ID]
- mla r5, r9, r5, r0 @ r5 = tmp1 + z2
- ldr r9, [r11, #FIX_1_501321110_ID]
- mla r3, r10, r3, r0 @ r3 = tmp2 + z2
- add r7, r7, r4 @ r7 = tmp0
- mla r1, r9, r1, r2 @ r1 = tmp3 + z1
- add r5, r5, r6 @ r5 = tmp1
- add r3, r3, r4 @ r3 = tmp2
- add r1, r1, r6 @ r1 = tmp3
-
- pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
- @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
-
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
- add r8, r0, r1
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 0*8)]
-
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
- sub r8, r0, r1
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(14*8)]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
- add r8, r4, r3
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 2*8)]
-
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
- sub r8, r4, r3
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(12*8)]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
- add r8, r6, r5
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 4*8)]
-
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
- sub r8, r6, r5
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #(10*8)]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
- add r8, r2, r7
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 6*8)]
-
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
- sub r8, r2, r7
- add r8, r8, #(1<<17)
- mov r8, r8, asr #18
- strh r8, [lr, #( 8*8)]
-
- @ End of row loop
- add lr, lr, #2
- subs r12, r12, #1
- bne column_loop
- beq the_end
-
-empty_odd_column:
- @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
- add r0, r0, #(1<<17)
- mov r0, r0, asr #18
- strh r0, [lr, #( 0*8)]
- strh r0, [lr, #(14*8)]
-
- @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
- add r4, r4, #(1<<17)
- mov r4, r4, asr #18
- strh r4, [lr, #( 2*8)]
- strh r4, [lr, #(12*8)]
-
- @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
- add r6, r6, #(1<<17)
- mov r6, r6, asr #18
- strh r6, [lr, #( 4*8)]
- strh r6, [lr, #(10*8)]
-
- @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
- @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
- add r2, r2, #(1<<17)
- mov r2, r2, asr #18
- strh r2, [lr, #( 6*8)]
- strh r2, [lr, #( 8*8)]
-
- @ End of row loop
- add lr, lr, #2
- subs r12, r12, #1
- bne column_loop
-
-the_end:
- @ The end....
- pop {r4 - r11, pc}
-endfunc
-
-const const_array
- .word FIX_0_298631336
- .word FIX_0_541196100
- .word FIX_0_765366865
- .word FIX_1_175875602
- .word FIX_1_501321110
- .word FIX_2_053119869
- .word FIX_3_072711026
- .word FIX_M_0_390180644
- .word FIX_M_0_899976223
- .word FIX_M_1_847759065
- .word FIX_M_1_961570560
- .word FIX_M_2_562915447
- .word FIX_0xFFFF
-endconst
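
The FIX_* constants above are the jpeglib IDCT multipliers in 13-bit fixed point (CONST_BITS = 13): for example FIX_0_541196100 = round(0.541196100 * 2^13) = round(4433.48) = 4433. DESCALE rounds while shifting the scale back out, which is what every `add r8, r8, #(1<<10)` / `asr #11` pair implements for the row pass (shift by CONST_BITS - PASS1_BITS = 13 - 2 = 11) and every `#(1<<17)` / `asr #18` pair for the column pass (CONST_BITS + PASS1_BITS + 3 = 18). The equivalent C macro:

    /* round-to-nearest right shift, as used by both IDCT passes above */
    #define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))
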
diff --git a/ffmpeg/libavcodec/arm/mathops.h b/ffmpeg/libavcodec/arm/mathops.h
deleted file mode 100644
index dc57c55..0000000
--- a/ffmpeg/libavcodec/arm/mathops.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * simple math operations
- * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_MATHOPS_H
-#define AVCODEC_ARM_MATHOPS_H
-
-#include <stdint.h>
-#include "config.h"
-#include "libavutil/common.h"
-
-#if HAVE_INLINE_ASM
-
-#if HAVE_ARMV6_INLINE
-#define MULH MULH
-static inline av_const int MULH(int a, int b)
-{
- int r;
- __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
- return r;
-}
-
-#define FASTDIV FASTDIV
-static av_always_inline av_const int FASTDIV(int a, int b)
-{
- int r;
- __asm__ ("cmp %2, #2 \n\t"
- "ldr %0, [%3, %2, lsl #2] \n\t"
- "ite le \n\t"
- "lsrle %0, %1, #1 \n\t"
- "smmulgt %0, %0, %1 \n\t"
- : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
- return r;
-}
-
-#else /* HAVE_ARMV6_INLINE */
-
-#define FASTDIV FASTDIV
-static av_always_inline av_const int FASTDIV(int a, int b)
-{
- int r, t;
- __asm__ ("umull %1, %0, %2, %3"
- : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
- return r;
-}
-#endif
-
-#define MLS64(d, a, b) MAC64(d, -(a), b)
-
-#if HAVE_ARMV5TE_INLINE
-
-/* signed 16x16 -> 32 multiply add accumulate */
-# define MAC16(rt, ra, rb) \
- __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
-
-/* signed 16x16 -> 32 multiply */
-# define MUL16 MUL16
-static inline av_const int MUL16(int ra, int rb)
-{
- int rt;
- __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
- return rt;
-}
-
-#endif
-
-#define mid_pred mid_pred
-static inline av_const int mid_pred(int a, int b, int c)
-{
- int m;
- __asm__ (
- "mov %0, %2 \n\t"
- "cmp %1, %2 \n\t"
- "itt gt \n\t"
- "movgt %0, %1 \n\t"
- "movgt %1, %2 \n\t"
- "cmp %1, %3 \n\t"
- "it le \n\t"
- "movle %1, %3 \n\t"
- "cmp %0, %1 \n\t"
- "it gt \n\t"
- "movgt %0, %1 \n\t"
- : "=&r"(m), "+r"(a)
- : "r"(b), "r"(c)
- : "cc");
- return m;
-}
-
-#endif /* HAVE_INLINE_ASM */
-
-#endif /* AVCODEC_ARM_MATHOPS_H */
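
FASTDIV above replaces a division by a multiplication with a precomputed reciprocal: ff_inverse[b] holds roughly 2^32 / b, so taking the high 32 bits of the 64-bit product a * ff_inverse[b] yields a / b without a hardware divider. A hedged C sketch of the idea (the exact rounding of the real ff_inverse table and its valid operand range are defined in libavutil, not here):

    #include <stdint.h>

    static inline uint32_t fastdiv_ref(uint32_t a, uint32_t b)
    {
        /* ceil(2^32 / b); b >= 2 assumed, as in the ARMv6 path above */
        uint64_t inv = (0xFFFFFFFFULL + b) / b;
        return (uint32_t)(((uint64_t)a * inv) >> 32);
    }
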
diff --git a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
deleted file mode 100644
index 365c5e7..0000000
--- a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro prerot dst, rt
- lsr r3, r6, #2 @ n4
- add \rt, r4, r6, lsr #1 @ revtab + n4
- add r9, r3, r3, lsl #1 @ n3
- add r8, r7, r6 @ tcos + n4
- add r3, r2, r6, lsr #1 @ in + n4
- add r9, r2, r9, lsl #1 @ in + n3
- sub r8, r8, #16
- sub r10, r3, #16
- sub r11, r9, #16
- mov r12, #-16
-1:
- vld2.16 {d0,d1}, [r9, :128]!
- vld2.16 {d2,d3}, [r11,:128], r12
- vld2.16 {d4,d5}, [r3, :128]!
- vld2.16 {d6,d7}, [r10,:128], r12
- vld2.16 {d16,d17},[r7, :128]! @ cos, sin
- vld2.16 {d18,d19},[r8, :128], r12
- vrev64.16 q1, q1
- vrev64.16 q3, q3
- vrev64.16 q9, q9
- vneg.s16 d0, d0
- vneg.s16 d2, d2
- vneg.s16 d16, d16
- vneg.s16 d18, d18
- vhsub.s16 d0, d0, d3 @ re
- vhsub.s16 d4, d7, d4 @ im
- vhsub.s16 d6, d6, d5
- vhsub.s16 d2, d2, d1
- vmull.s16 q10, d0, d16
- vmlsl.s16 q10, d4, d17
- vmull.s16 q11, d0, d17
- vmlal.s16 q11, d4, d16
- vmull.s16 q12, d6, d18
- vmlsl.s16 q12, d2, d19
- vmull.s16 q13, d6, d19
- vmlal.s16 q13, d2, d18
- vshrn.s32 d0, q10, #15
- vshrn.s32 d1, q11, #15
- vshrn.s32 d2, q12, #15
- vshrn.s32 d3, q13, #15
- vzip.16 d0, d1
- vzip.16 d2, d3
- ldrh lr, [r4], #2
- ldrh r2, [\rt, #-2]!
- add lr, \dst, lr, lsl #2
- add r2, \dst, r2, lsl #2
- vst1.32 {d0[0]}, [lr,:32]
- vst1.32 {d2[0]}, [r2,:32]
- ldrh lr, [r4], #2
- ldrh r2, [\rt, #-2]!
- add lr, \dst, lr, lsl #2
- add r2, \dst, r2, lsl #2
- vst1.32 {d0[1]}, [lr,:32]
- vst1.32 {d2[1]}, [r2,:32]
- ldrh lr, [r4], #2
- ldrh r2, [\rt, #-2]!
- add lr, \dst, lr, lsl #2
- add r2, \dst, r2, lsl #2
- vst1.32 {d1[0]}, [lr,:32]
- vst1.32 {d3[0]}, [r2,:32]
- ldrh lr, [r4], #2
- ldrh r2, [\rt, #-2]!
- add lr, \dst, lr, lsl #2
- add r2, \dst, r2, lsl #2
- vst1.32 {d1[1]}, [lr,:32]
- vst1.32 {d3[1]}, [r2,:32]
- subs r6, r6, #32
- bgt 1b
-.endm
-
-function ff_mdct_fixed_calc_neon, export=1
- push {r1,r4-r11,lr}
-
- ldr r4, [r0, #8] @ revtab
- ldr r6, [r0, #16] @ mdct_size; n
- ldr r7, [r0, #24] @ tcos
-
- prerot r1, r5
-
- mov r4, r0
- bl X(ff_fft_fixed_calc_neon)
-
- pop {r5}
- mov r12, #-16
- ldr r6, [r4, #16] @ mdct_size; n
- ldr r7, [r4, #24] @ tcos
- add r5, r5, r6, lsr #1
- add r7, r7, r6, lsr #1
- sub r1, r5, #16
- sub r2, r7, #16
-1:
- vld2.16 {d4,d5}, [r7,:128]!
- vld2.16 {d6,d7}, [r2,:128], r12
- vld2.16 {d0,d1}, [r5,:128]
- vld2.16 {d2,d3}, [r1,:128]
- vrev64.16 q3, q3
- vrev64.16 q1, q1
- vneg.s16 q3, q3
- vneg.s16 q2, q2
- vmull.s16 q11, d2, d6
- vmlal.s16 q11, d3, d7
- vmull.s16 q8, d0, d5
- vmlsl.s16 q8, d1, d4
- vmull.s16 q9, d0, d4
- vmlal.s16 q9, d1, d5
- vmull.s16 q10, d2, d7
- vmlsl.s16 q10, d3, d6
- vshrn.s32 d0, q11, #15
- vshrn.s32 d1, q8, #15
- vshrn.s32 d2, q9, #15
- vshrn.s32 d3, q10, #15
- vrev64.16 q0, q0
- vst2.16 {d2,d3}, [r5,:128]!
- vst2.16 {d0,d1}, [r1,:128], r12
- subs r6, r6, #32
- bgt 1b
-
- pop {r4-r11,pc}
-endfunc
-
-function ff_mdct_fixed_calcw_neon, export=1
- push {r1,r4-r11,lr}
-
- ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
- ldr r6, [r0, #16] @ mdct_size; n
- ldr r7, [r0, #24] @ tcos
-
- prerot r5, r1
-
- mov r4, r0
- mov r1, r5
- bl X(ff_fft_fixed_calc_neon)
-
- pop {r7}
- mov r12, #-16
- ldr r6, [r4, #16] @ mdct_size; n
- ldr r9, [r4, #24] @ tcos
- add r5, r5, r6, lsr #1
- add r7, r7, r6
- add r9, r9, r6, lsr #1
- sub r3, r5, #16
- sub r1, r7, #16
- sub r2, r9, #16
-1:
- vld2.16 {d4,d5}, [r9,:128]!
- vld2.16 {d6,d7}, [r2,:128], r12
- vld2.16 {d0,d1}, [r5,:128]!
- vld2.16 {d2,d3}, [r3,:128], r12
- vrev64.16 q3, q3
- vrev64.16 q1, q1
- vneg.s16 q3, q3
- vneg.s16 q2, q2
- vmull.s16 q8, d2, d6
- vmlal.s16 q8, d3, d7
- vmull.s16 q9, d0, d5
- vmlsl.s16 q9, d1, d4
- vmull.s16 q10, d0, d4
- vmlal.s16 q10, d1, d5
- vmull.s16 q11, d2, d7
- vmlsl.s16 q11, d3, d6
- vrev64.32 q8, q8
- vrev64.32 q9, q9
- vst2.32 {q10,q11},[r7,:128]!
- vst2.32 {d16,d18},[r1,:128], r12
- vst2.32 {d17,d19},[r1,:128], r12
- subs r6, r6, #32
- bgt 1b
-
- pop {r4-r11,pc}
-endfunc
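
Each vmull/vmlsl (or vmlal) pair above followed by vshrn #15 is a Q15 fixed-point complex multiply by a twiddle factor. A C reference of that core operation (a sketch; the surrounding assembly additionally folds, reverses, and negates the inputs as the MDCT pre/post-rotation requires):

    #include <stdint.h>

    static inline void cmul_q15(int16_t *out_re, int16_t *out_im,
                                int16_t re, int16_t im,
                                int16_t cos_q15, int16_t sin_q15)
    {
        *out_re = (int16_t)(((int32_t)re * cos_q15 - (int32_t)im * sin_q15) >> 15);
        *out_im = (int16_t)(((int32_t)re * sin_q15 + (int32_t)im * cos_q15) >> 15);
    }
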
diff --git a/ffmpeg/libavcodec/arm/mdct_neon.S b/ffmpeg/libavcodec/arm/mdct_neon.S
deleted file mode 100644
index e481cd1..0000000
--- a/ffmpeg/libavcodec/arm/mdct_neon.S
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * ARM NEON optimised MDCT
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-#define ff_fft_calc_neon X(ff_fft_calc_neon)
-
-function ff_imdct_half_neon, export=1
- push {r4-r8,lr}
-
- mov r12, #1
- ldr lr, [r0, #20] @ mdct_bits
- ldr r4, [r0, #24] @ tcos
- ldr r3, [r0, #8] @ revtab
- lsl r12, r12, lr @ n = 1 << nbits
- lsr lr, r12, #2 @ n4 = n >> 2
- add r7, r2, r12, lsl #1
- mov r12, #-16
- sub r7, r7, #16
-
- vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
- vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
- vrev64.32 d17, d17
- vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
- vmul.f32 d6, d17, d2
- vmul.f32 d7, d0, d2
-1:
- subs lr, lr, #2
- ldr r6, [r3], #4
- vmul.f32 d4, d0, d3
- vmul.f32 d5, d17, d3
- vsub.f32 d4, d6, d4
- vadd.f32 d5, d5, d7
- uxth r8, r6, ror #16
- uxth r6, r6
- add r8, r1, r8, lsl #3
- add r6, r1, r6, lsl #3
- beq 1f
- vld2.32 {d16-d17},[r7,:128],r12
- vld2.32 {d0-d1}, [r2,:128]!
- vrev64.32 d17, d17
- vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
- vmul.f32 d6, d17, d2
- vmul.f32 d7, d0, d2
- vst2.32 {d4[0],d5[0]}, [r6,:64]
- vst2.32 {d4[1],d5[1]}, [r8,:64]
- b 1b
-1:
- vst2.32 {d4[0],d5[0]}, [r6,:64]
- vst2.32 {d4[1],d5[1]}, [r8,:64]
-
- mov r4, r0
- mov r6, r1
- bl ff_fft_calc_neon
-
- mov r12, #1
- ldr lr, [r4, #20] @ mdct_bits
- ldr r4, [r4, #24] @ tcos
- lsl r12, r12, lr @ n = 1 << nbits
- lsr lr, r12, #3 @ n8 = n >> 3
-
- add r4, r4, lr, lsl #3
- add r6, r6, lr, lsl #3
- sub r1, r4, #16
- sub r3, r6, #16
-
- mov r7, #-16
- mov r8, r6
- mov r0, r3
-
- vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
- vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
- vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
-1:
- subs lr, lr, #2
- vmul.f32 d7, d0, d18
- vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
- vmul.f32 d4, d1, d18
- vmul.f32 d5, d21, d19
- vmul.f32 d6, d20, d19
- vmul.f32 d22, d1, d16
- vmul.f32 d23, d21, d17
- vmul.f32 d24, d0, d16
- vmul.f32 d25, d20, d17
- vadd.f32 d7, d7, d22
- vadd.f32 d6, d6, d23
- vsub.f32 d4, d4, d24
- vsub.f32 d5, d5, d25
- beq 1f
- vld2.32 {d0-d1}, [r3,:128], r7
- vld2.32 {d20-d21},[r6,:128]!
- vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
- vrev64.32 q3, q3
- vst2.32 {d4,d6}, [r0,:128], r7
- vst2.32 {d5,d7}, [r8,:128]!
- b 1b
-1:
- vrev64.32 q3, q3
- vst2.32 {d4,d6}, [r0,:128]
- vst2.32 {d5,d7}, [r8,:128]
-
- pop {r4-r8,pc}
-endfunc
-
-function ff_imdct_calc_neon, export=1
- push {r4-r6,lr}
-
- ldr r3, [r0, #20]
- mov r4, #1
- mov r5, r1
- lsl r4, r4, r3
- add r1, r1, r4
-
- bl ff_imdct_half_neon
-
- add r0, r5, r4, lsl #2
- add r1, r5, r4, lsl #1
- sub r0, r0, #8
- sub r2, r1, #16
- mov r3, #-16
- mov r6, #-8
- vmov.i32 d30, #1<<31
-1:
- vld1.32 {d0-d1}, [r2,:128], r3
- pld [r0, #-16]
- vrev64.32 q0, q0
- vld1.32 {d2-d3}, [r1,:128]!
- veor d4, d1, d30
- pld [r2, #-16]
- vrev64.32 q1, q1
- veor d5, d0, d30
- vst1.32 {d2}, [r0,:64], r6
- vst1.32 {d3}, [r0,:64], r6
- vst1.32 {d4-d5}, [r5,:128]!
- subs r4, r4, #16
- bgt 1b
-
- pop {r4-r6,pc}
-endfunc
-
-function ff_mdct_calc_neon, export=1
- push {r4-r10,lr}
-
- mov r12, #1
- ldr lr, [r0, #20] @ mdct_bits
- ldr r4, [r0, #24] @ tcos
- ldr r3, [r0, #8] @ revtab
- lsl lr, r12, lr @ n = 1 << nbits
- add r7, r2, lr @ in4u
- sub r9, r7, #16 @ in4d
- add r2, r7, lr, lsl #1 @ in3u
- add r8, r9, lr, lsl #1 @ in3d
- add r5, r4, lr, lsl #1
- sub r5, r5, #16
- sub r3, r3, #4
- mov r12, #-16
-
- vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
- vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
- vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
- vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
- vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
- vsub.f32 d0, d18, d0 @ in4d-in4u I
- vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
- vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
- vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
- vadd.f32 d1, d1, d19 @ in3u+in3d -R
- vsub.f32 d16, d16, d2 @ in0u-in2d R
- vadd.f32 d17, d17, d3 @ in2u+in1d -I
-1:
- vmul.f32 d7, d0, d21 @ I*s
-A ldr r10, [r3, lr, lsr #1]
-T lsr r10, lr, #1
-T ldr r10, [r3, r10]
- vmul.f32 d6, d1, d20 @ -R*c
- ldr r6, [r3, #4]!
- vmul.f32 d4, d1, d21 @ -R*s
- vmul.f32 d5, d0, d20 @ I*c
- vmul.f32 d24, d16, d30 @ R*c
- vmul.f32 d25, d17, d31 @ -I*s
- vmul.f32 d22, d16, d31 @ R*s
- vmul.f32 d23, d17, d30 @ I*c
- subs lr, lr, #16
- vsub.f32 d6, d6, d7 @ -R*c-I*s
- vadd.f32 d7, d4, d5 @ -R*s+I*c
- vsub.f32 d24, d25, d24 @ I*s-R*c
- vadd.f32 d25, d22, d23 @ R*s-I*c
- beq 1f
- mov r12, #-16
- vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
- vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
- vneg.f32 d7, d7 @ R*s-I*c
- vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
- vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
- vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
- vsub.f32 d0, d18, d0 @ in4d-in4u I
- vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
- vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
- vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
- vadd.f32 d1, d1, d19 @ in3u+in3d -R
- vsub.f32 d16, d16, d2 @ in0u-in2d R
- vadd.f32 d17, d17, d3 @ in2u+in1d -I
- uxth r12, r6, ror #16
- uxth r6, r6
- add r12, r1, r12, lsl #3
- add r6, r1, r6, lsl #3
- vst2.32 {d6[0],d7[0]}, [r6,:64]
- vst2.32 {d6[1],d7[1]}, [r12,:64]
- uxth r6, r10, ror #16
- uxth r10, r10
- add r6 , r1, r6, lsl #3
- add r10, r1, r10, lsl #3
- vst2.32 {d24[0],d25[0]},[r10,:64]
- vst2.32 {d24[1],d25[1]},[r6,:64]
- b 1b
-1:
- vneg.f32 d7, d7 @ R*s-I*c
- uxth r12, r6, ror #16
- uxth r6, r6
- add r12, r1, r12, lsl #3
- add r6, r1, r6, lsl #3
- vst2.32 {d6[0],d7[0]}, [r6,:64]
- vst2.32 {d6[1],d7[1]}, [r12,:64]
- uxth r6, r10, ror #16
- uxth r10, r10
- add r6 , r1, r6, lsl #3
- add r10, r1, r10, lsl #3
- vst2.32 {d24[0],d25[0]},[r10,:64]
- vst2.32 {d24[1],d25[1]},[r6,:64]
-
- mov r4, r0
- mov r6, r1
- bl ff_fft_calc_neon
-
- mov r12, #1
- ldr lr, [r4, #20] @ mdct_bits
- ldr r4, [r4, #24] @ tcos
- lsl r12, r12, lr @ n = 1 << nbits
- lsr lr, r12, #3 @ n8 = n >> 3
-
- add r4, r4, lr, lsl #3
- add r6, r6, lr, lsl #3
- sub r1, r4, #16
- sub r3, r6, #16
-
- mov r7, #-16
- mov r8, r6
- mov r0, r3
-
- vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
- vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
- vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
-1:
- subs lr, lr, #2
- vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
- vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
- vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
- vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
- vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
- vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
- vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
- vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
- vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
- vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
- vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
- vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
- vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
- vneg.f32 q2, q2
- beq 1f
- vld2.32 {d0-d1}, [r3,:128], r7
- vld2.32 {d20-d21},[r6,:128]!
- vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
- vrev64.32 q3, q3
- vst2.32 {d4,d6}, [r0,:128], r7
- vst2.32 {d5,d7}, [r8,:128]!
- b 1b
-1:
- vrev64.32 q3, q3
- vst2.32 {d4,d6}, [r0,:128]
- vst2.32 {d5,d7}, [r8,:128]
-
- pop {r4-r10,pc}
-endfunc
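
As in the fixed-point version, the pre- and post-rotations above are complex multiplications by unit twiddles (c, s), here in single-precision float with the two ends of the buffer processed per iteration. A generic rotation for reference (a sketch; the sign and conjugation conventions in the assembly follow the MDCT definition and differ between the forward and inverse paths):

    static inline void crot(float *out_re, float *out_im,
                            float re, float im, float c, float s)
    {
        *out_re = re * c - im * s;
        *out_im = re * s + im * c;
    }
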
diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
deleted file mode 100644
index 977abb6..0000000
--- a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro skip args:vararg
-.endm
-
-.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0
- ldr \t1, [\w, #4*\offs]
- ldr \t2, [\p, #4]!
- \rsb \t1, \t1, #0
- .irpc i, 135
- ldr \t3, [\w, #4*64*\i+4*\offs]
- ldr \t4, [\p, #4*64*\i]
- smlal \lo, \hi, \t1, \t2
- \rsb \t3, \t3, #0
- ldr \t1, [\w, #4*64*(\i+1)+4*\offs]
- ldr \t2, [\p, #4*64*(\i+1)]
- smlal \lo, \hi, \t3, \t4
- \rsb \t1, \t1, #0
- .endr
- ldr \t3, [\w, #4*64*7+4*\offs]
- ldr \t4, [\p, #4*64*7]
- smlal \lo, \hi, \t1, \t2
- \rsb \t3, \t3, #0
- smlal \lo, \hi, \t3, \t4
-.endm
-
-.macro round rd, lo, hi
- lsr \rd, \lo, #24
- bic \lo, \lo, #0xff000000
- orr \rd, \rd, \hi, lsl #8
- mov \hi, #0
- ssat \rd, #16, \rd
-.endm
-
-function ff_mpadsp_apply_window_fixed_armv6, export=1
- push {r2,r4-r11,lr}
-
- add r4, r0, #4*512 @ synth_buf + 512
- .rept 4
- ldm r0!, {r5-r12}
- stm r4!, {r5-r12}
- .endr
-
- ldr r4, [sp, #40] @ incr
- sub r0, r0, #4*17 @ synth_buf + 16
- ldr r8, [r2] @ sum:low
- add r2, r0, #4*32 @ synth_buf + 48
- rsb r5, r4, r4, lsl #5 @ 31 * incr
- lsl r4, r4, #1
- asr r9, r8, #31 @ sum:high
- add r5, r3, r5, lsl #1 @ samples2
- add r6, r1, #4*32 @ w2
- str r4, [sp, #40]
-
- sum8 r8, r9, r1, r0, r10, r11, r12, lr
- sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
- round r10, r8, r9
- strh_post r10, r3, r4
-
- mov lr, #15
-1:
- ldr r12, [r0, #4]!
- ldr r11, [r6, #-4]!
- ldr r10, [r1, #4]!
- .irpc i, 0246
- .if \i
- ldr r11, [r6, #4*64*\i]
- ldr r10, [r1, #4*64*\i]
- .endif
- rsb r11, r11, #0
- smlal r8, r9, r10, r12
- ldr r10, [r0, #4*64*(\i+1)]
- .ifeq \i
- smull r4, r7, r11, r12
- .else
- smlal r4, r7, r11, r12
- .endif
- ldr r11, [r6, #4*64*(\i+1)]
- ldr r12, [r1, #4*64*(\i+1)]
- rsb r11, r11, #0
- smlal r8, r9, r12, r10
- .iflt \i-6
- ldr r12, [r0, #4*64*(\i+2)]
- .else
- ldr r12, [r2, #-4]!
- .endif
- smlal r4, r7, r11, r10
- .endr
- .irpc i, 0246
- ldr r10, [r1, #4*64*\i+4*32]
- rsb r12, r12, #0
- ldr r11, [r6, #4*64*\i+4*32]
- smlal r8, r9, r10, r12
- ldr r10, [r2, #4*64*(\i+1)]
- smlal r4, r7, r11, r12
- ldr r12, [r1, #4*64*(\i+1)+4*32]
- rsb r10, r10, #0
- ldr r11, [r6, #4*64*(\i+1)+4*32]
- smlal r8, r9, r12, r10
- .iflt \i-6
- ldr r12, [r2, #4*64*(\i+2)]
- .else
- ldr r12, [sp, #40]
- .endif
- smlal r4, r7, r11, r10
- .endr
- round r10, r8, r9
- adds r8, r8, r4
- adc r9, r9, r7
- strh_post r10, r3, r12
- round r11, r8, r9
- subs lr, lr, #1
- strh_dpost r11, r5, r12
- bgt 1b
-
- sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
- pop {r4}
- round r10, r8, r9
- str r8, [r4]
- strh r10, [r3]
-
- pop {r4-r11,pc}
-endfunc
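
The `round` macro above converts the 64-bit accumulator held in a register pair into an output sample: shift right by 24, saturate to 16 bits (ssat), and keep only the low 24 fractional bits in the accumulator as error feedback for the next sample (the bic/mov pair). In C, approximately:

    #include <stdint.h>

    static int16_t round_sample(int64_t *acc)
    {
        int64_t v = *acc >> 24;        /* lsr #24 / orr rd, rd, hi, lsl #8 */
        *acc &= 0x00ffffff;            /* bic + mov: keep the fractional residue */
        if (v >  32767) v =  32767;    /* ssat #16 */
        if (v < -32768) v = -32768;
        return (int16_t)v;
    }
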
diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
deleted file mode 100644
index 98e0c8a..0000000
--- a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2011 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/mpegaudiodsp.h"
-#include "config.h"
-
-void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
- int *dither, int16_t *out, int incr);
-
-av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_armv6(cpu_flags)) {
- s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.c b/ffmpeg/libavcodec/arm/mpegvideo_arm.c
deleted file mode 100644
index 6566798..0000000
--- a/ffmpeg/libavcodec/arm/mpegvideo_arm.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2002 Michael Niedermayer
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/mpegvideo.h"
-#include "mpegvideo_arm.h"
-#include "asm-offsets.h"
-
-#if HAVE_NEON
-CHK_OFFS(MpegEncContext, y_dc_scale, Y_DC_SCALE);
-CHK_OFFS(MpegEncContext, c_dc_scale, C_DC_SCALE);
-CHK_OFFS(MpegEncContext, ac_pred, AC_PRED);
-CHK_OFFS(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
-CHK_OFFS(MpegEncContext, inter_scantable.raster_end, INTER_SCANTAB_RASTER_END);
-CHK_OFFS(MpegEncContext, h263_aic, H263_AIC);
-#endif
-
-void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
- int n, int qscale);
-
-av_cold void ff_MPV_common_init_arm(MpegEncContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_armv5te(cpu_flags))
- ff_MPV_common_init_armv5te(s);
-
- if (have_neon(cpu_flags)) {
- s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
- s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
- }
-}
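
The CHK_OFFS lines above are compile-time assertions that the numeric struct offsets hard-coded into the assembly (via asm-offsets.h) still match the C layout of MpegEncContext. A minimal sketch of that idiom with a hypothetical macro name (the real macro is defined elsewhere in the tree):

    #include <stddef.h>

    /* Fails the build with a negative array size whenever the
     * assembly-side constant drifts from the C layout. */
    #define MY_CHK_OFFS(type, member, value) \
        typedef char chk_##member[2 * (offsetof(type, member) == (value)) - 1]

Used at file scope, e.g. MY_CHK_OFFS(MpegEncContext, h263_aic, H263_AIC);, a mismatch is caught at compile time rather than as silent corruption at run time.
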
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.h b/ffmpeg/libavcodec/arm/mpegvideo_arm.h
deleted file mode 100644
index 4ff93b7..0000000
--- a/ffmpeg/libavcodec/arm/mpegvideo_arm.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_MPEGVIDEO_H
-#define AVCODEC_ARM_MPEGVIDEO_H
-
-#include "libavcodec/mpegvideo.h"
-
-void ff_MPV_common_init_armv5te(MpegEncContext *s);
-
-#endif /* AVCODEC_ARM_MPEGVIDEO_H */
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
deleted file mode 100644
index a572290..0000000
--- a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Optimization of some functions from mpegvideo.c for armv5te
- * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/avassert.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/mpegvideo.h"
-#include "mpegvideo_arm.h"
-
-void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count);
-
-#ifdef ENABLE_ARM_TESTS
-/**
- * H.263 dequantizer supplementary function; it is performance critical and
- * needs optimized implementations for each architecture. It is also used as
- * a reference implementation in regression tests.
- */
-static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count)
-{
- int i, level;
- for (i = 0; i < count; i++) {
- level = block[i];
- if (level) {
- if (level < 0) {
- level = level * qmul - qadd;
- } else {
- level = level * qmul + qadd;
- }
- block[i] = level;
- }
- }
-}
-#endif
-
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- int level, qmul, qadd;
- int nCoeffs;
-
- av_assert2(s->block_last_index[n]>=0);
-
- qmul = qscale << 1;
-
- if (!s->h263_aic) {
- if (n < 4)
- level = block[0] * s->y_dc_scale;
- else
- level = block[0] * s->c_dc_scale;
- qadd = (qscale - 1) | 1;
- }else{
- qadd = 0;
- level = block[0];
- }
- if(s->ac_pred)
- nCoeffs=63;
- else
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
- ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
- block[0] = level;
-}
-
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
- int16_t *block, int n, int qscale)
-{
- int qmul, qadd;
- int nCoeffs;
-
- av_assert2(s->block_last_index[n]>=0);
-
- qadd = (qscale - 1) | 1;
- qmul = qscale << 1;
-
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
-
- ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
-}
-
-av_cold void ff_MPV_common_init_armv5te(MpegEncContext *s)
-{
- s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
- s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
-}
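
A worked example of the constants used throughout this file: with qscale = 6, qmul = 2 * 6 = 12 and qadd = (6 - 1) | 1 = 5, so a coefficient of +3 dequantizes to 3 * 12 + 5 = 41 and -3 to -3 * 12 - 5 = -41; the `(qscale - 1) | 1` form guarantees qadd is odd for every qscale.
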
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
deleted file mode 100644
index 8687d6b..0000000
--- a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Optimization of some functions from mpegvideo.c for armv5te
- * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-/*
- * Specially optimized version of dct_unquantize_h263_helper_c. It
- * requires the block to be at least 8-byte aligned and may process
- * more elements than requested, but it is guaranteed never to
- * process more than 64 elements provided the count argument is <= 64,
- * so it is safe. The function is tuned for the common distribution
- * of nCoeffs values (mostly a multiple of 8 plus one or two extra
- * elements), so it processes data 8 elements per loop iteration with
- * optional processing of 2 elements at the end.
- *
- * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770)
- */
-
-.macro dequant_t dst, src, mul, add, tmp
- rsbs \tmp, ip, \src, asr #16
- it gt
- addgt \tmp, \add, #0
- it lt
- rsblt \tmp, \add, #0
- it ne
- smlatbne \dst, \src, \mul, \tmp
-.endm
-
-.macro dequant_b dst, src, mul, add, tmp
- rsbs \tmp, ip, \src, lsl #16
- it gt
- addgt \tmp, \add, #0
- it lt
- rsblt \tmp, \add, #0
- it ne
- smlabbne \dst, \src, \mul, \tmp
-.endm
-
-function ff_dct_unquantize_h263_armv5te, export=1
- push {r4-r9,lr}
- mov ip, #0
- subs r3, r3, #2
- ble 2f
- ldrd r4, r5, [r0, #0]
-1:
- ldrd r6, r7, [r0, #8]
-
- dequant_t r9, r4, r1, r2, r9
- dequant_t lr, r5, r1, r2, lr
- dequant_b r4, r4, r1, r2, r8
- dequant_b r5, r5, r1, r2, r8
-
- strh r4, [r0], #2
- strh r9, [r0], #2
- strh r5, [r0], #2
- strh lr, [r0], #2
-
- dequant_t r9, r6, r1, r2, r9
- dequant_t lr, r7, r1, r2, lr
- dequant_b r6, r6, r1, r2, r8
- dequant_b r7, r7, r1, r2, r8
-
- strh r6, [r0], #2
- strh r9, [r0], #2
- strh r7, [r0], #2
- strh lr, [r0], #2
-
- subs r3, r3, #8
- it gt
- ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
- bgt 1b
-
- adds r3, r3, #2
- it le
- pople {r4-r9,pc}
-2:
- ldrsh r9, [r0, #0]
- ldrsh lr, [r0, #2]
- mov r8, r2
- cmp r9, #0
- it lt
- rsblt r8, r2, #0
- it ne
- smlabbne r9, r9, r1, r8
- mov r8, r2
- cmp lr, #0
- it lt
- rsblt r8, r2, #0
- it ne
- smlabbne lr, lr, r1, r8
- strh r9, [r0], #2
- strh lr, [r0], #2
- pop {r4-r9,pc}
-endfunc
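
The main loop above loads four int16_t coefficients per ldrd and applies the same formula as the C reference helper earlier in this change, using the smlabb forms for the bottom halfword of each 32-bit word and smlatb for the top. A C sketch of one such packed pair (little-endian halfword order assumed):

    #include <stdint.h>

    static inline void dequant_pair(uint32_t packed, int qmul, int qadd,
                                    int16_t *lo, int16_t *hi)
    {
        int16_t b = (int16_t)(packed & 0xffff);   /* smlabb operand */
        int16_t t = (int16_t)(packed >> 16);      /* smlatb operand */
        *lo = b ? (int16_t)(b * qmul + (b < 0 ? -qadd : qadd)) : 0;
        *hi = t ? (int16_t)(t * qmul + (t < 0 ? -qadd : qadd)) : 0;
    }
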
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_neon.S b/ffmpeg/libavcodec/arm/mpegvideo_neon.S
deleted file mode 100644
index e05df8e..0000000
--- a/ffmpeg/libavcodec/arm/mpegvideo_neon.S
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "asm-offsets.h"
-
-function ff_dct_unquantize_h263_inter_neon, export=1
- add r12, r0, #BLOCK_LAST_INDEX
- ldr r12, [r12, r2, lsl #2]
- add r0, r0, #INTER_SCANTAB_RASTER_END
- ldrb r12, [r0, r12]
- sub r2, r3, #1
- lsl r0, r3, #1
- orr r2, r2, #1
- add r3, r12, #1
-endfunc
-
-function ff_dct_unquantize_h263_neon, export=1
- vdup.16 q15, r0 @ qmul
- vdup.16 q14, r2 @ qadd
- vneg.s16 q13, q14
- cmp r3, #4
- mov r0, r1
- ble 2f
-1:
- vld1.16 {q0}, [r0,:128]!
- vclt.s16 q3, q0, #0
- vld1.16 {q8}, [r0,:128]!
- vceq.s16 q1, q0, #0
- vmul.s16 q2, q0, q15
- vclt.s16 q11, q8, #0
- vmul.s16 q10, q8, q15
- vbsl q3, q13, q14
- vbsl q11, q13, q14
- vadd.s16 q2, q2, q3
- vceq.s16 q9, q8, #0
- vadd.s16 q10, q10, q11
- vbif q0, q2, q1
- vbif q8, q10, q9
- subs r3, r3, #16
- vst1.16 {q0}, [r1,:128]!
- vst1.16 {q8}, [r1,:128]!
- it le
- bxle lr
- cmp r3, #8
- bgt 1b
-2:
- vld1.16 {d0}, [r0,:64]
- vclt.s16 d3, d0, #0
- vceq.s16 d1, d0, #0
- vmul.s16 d2, d0, d30
- vbsl d3, d26, d28
- vadd.s16 d2, d2, d3
- vbif d0, d2, d1
- vst1.16 {d0}, [r1,:64]
- bx lr
-endfunc
-
-function ff_dct_unquantize_h263_intra_neon, export=1
- push {r4-r6,lr}
- add r12, r0, #BLOCK_LAST_INDEX
- ldr r6, [r0, #AC_PRED]
- add lr, r0, #INTER_SCANTAB_RASTER_END
- cmp r6, #0
- it ne
- movne r12, #63
- bne 1f
- ldr r12, [r12, r2, lsl #2]
- ldrb r12, [lr, r12]
-1: ldr r5, [r0, #H263_AIC]
- ldrsh r4, [r1]
- cmp r5, #0
- mov r5, r1
- it ne
- movne r2, #0
- bne 2f
- cmp r2, #4
- it ge
- addge r0, r0, #4
- sub r2, r3, #1
- ldr r6, [r0, #Y_DC_SCALE]
- orr r2, r2, #1
- smulbb r4, r4, r6
-2: lsl r0, r3, #1
- add r3, r12, #1
- bl ff_dct_unquantize_h263_neon
- vmov.16 d0[0], r4
- vst1.16 {d0[0]}, [r5]
- pop {r4-r6,pc}
-endfunc
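
ff_dct_unquantize_h263_neon above is fully branchless per element: vclt builds a sign mask, vbsl selects +qadd or -qadd from it, and vbif writes the product back only where the coefficient was non-zero. The same logic per coefficient, in C (a sketch):

    #include <stdint.h>

    static inline int16_t dequant_branchless(int16_t x, int16_t qmul, int16_t qadd)
    {
        int16_t add  = (x < 0) ? (int16_t)-qadd : qadd;  /* vclt + vbsl */
        int16_t prod = (int16_t)(x * qmul + add);        /* vmul + vadd */
        return x ? prod : 0;                             /* vceq + vbif */
    }
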
diff --git a/ffmpeg/libavcodec/arm/neon.S b/ffmpeg/libavcodec/arm/neon.S
deleted file mode 100644
index 787bc4b..0000000
--- a/ffmpeg/libavcodec/arm/neon.S
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
- vtrn.32 \r0, \r4
- vtrn.32 \r1, \r5
- vtrn.32 \r2, \r6
- vtrn.32 \r3, \r7
- vtrn.16 \r0, \r2
- vtrn.16 \r1, \r3
- vtrn.16 \r4, \r6
- vtrn.16 \r5, \r7
- vtrn.8 \r0, \r1
- vtrn.8 \r2, \r3
- vtrn.8 \r4, \r5
- vtrn.8 \r6, \r7
-.endm
-
-.macro transpose_4x4 r0, r1, r2, r3
- vtrn.16 \r0, \r2
- vtrn.16 \r1, \r3
- vtrn.8 \r0, \r1
- vtrn.8 \r2, \r3
-.endm
-
-.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7
- vswp \r0, \r4
- vswp \r1, \r5
- vswp \r2, \r6
- vswp \r3, \r7
-.endm
-
-.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7
- vtrn.32 \r0, \r2
- vtrn.32 \r1, \r3
- vtrn.32 \r4, \r6
- vtrn.32 \r5, \r7
- vtrn.16 \r0, \r1
- vtrn.16 \r2, \r3
- vtrn.16 \r4, \r5
- vtrn.16 \r6, \r7
-.endm
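transpose_8x8 realizes a full 8x8 byte-matrix transpose across eight d registers in three vtrn passes of decreasing granularity (32-, 16-, then 8-bit); transpose_4x4 and transpose16_4x4 are the smaller-block analogues. A scalar C model of the net effect, assuming \r<k> holds row k (a sketch, not the author's code):

    #include <stdint.h>

    /* Scalar model of transpose_8x8: m becomes its own transpose,
     * i.e. m[j][i] takes the old value of m[i][j]. */
    static void transpose_8x8_ref(uint8_t m[8][8])
    {
        for (int i = 0; i < 8; i++)
            for (int j = i + 1; j < 8; j++) {
                uint8_t t = m[i][j];
                m[i][j] = m[j][i];
                m[j][i] = t;
            }
    }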
diff --git a/ffmpeg/libavcodec/arm/rdft_neon.S b/ffmpeg/libavcodec/arm/rdft_neon.S
deleted file mode 100644
index 781d976..0000000
--- a/ffmpeg/libavcodec/arm/rdft_neon.S
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * ARM NEON optimised RDFT
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_rdft_calc_neon, export=1
- push {r4-r8,lr}
-
- ldr r6, [r0, #4] @ inverse
- mov r4, r0
- mov r5, r1
-
- lsls r6, r6, #31
- bne 1f
- add r0, r4, #20
- bl X(ff_fft_permute_neon)
- add r0, r4, #20
- mov r1, r5
- bl X(ff_fft_calc_neon)
-1:
- ldr r12, [r4, #0] @ nbits
- mov r2, #1
- lsl r12, r2, r12
- add r0, r5, #8
- add r1, r5, r12, lsl #2
- lsr r12, r12, #2
- ldr r2, [r4, #12] @ tcos
- sub r12, r12, #2
- ldr r3, [r4, #16] @ tsin
- mov r7, r0
- sub r1, r1, #8
- mov lr, r1
- mov r8, #-8
- vld1.32 {d0}, [r0,:64]! @ d1[0,1]
- vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
- vld1.32 {d4}, [r2,:64]! @ tcos[i]
- vld1.32 {d5}, [r3,:64]! @ tsin[i]
- vmov.f32 d18, #0.5 @ k1
- vdup.32 d19, r6
- pld [r0, #32]
- veor d19, d18, d19 @ k2
- vmov.i32 d16, #0
- vmov.i32 d17, #1<<31
- pld [r1, #-32]
- vtrn.32 d16, d17
- pld [r2, #32]
- vrev64.32 d16, d16 @ d16=1,0 d17=0,1
- pld [r3, #32]
-2:
- veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
- vld1.32 {d24}, [r0,:64]! @ d1[0,1]
- vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
- vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
- vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
- veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
- pld [r0, #32]
- vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
- pld [r1, #-32]
- vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
- vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
- vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
- veor d7, d21, d16 @ -od.im, od.re
- vrev64.32 d3, d21 @ od.re, od.im
- veor d6, d20, d17 @ ev.re,-ev.im
- veor d2, d3, d16 @ -od.re, od.im
- vmla.f32 d20, d3, d4[1]
- vmla.f32 d20, d7, d5[1]
- vmla.f32 d6, d2, d4[1]
- vmla.f32 d6, d21, d5[1]
- vld1.32 {d4}, [r2,:64]! @ tcos[i]
- veor d7, d23, d16 @ -od.im, od.re
- vld1.32 {d5}, [r3,:64]! @ tsin[i]
- veor d24, d22, d17 @ ev.re,-ev.im
- vrev64.32 d3, d23 @ od.re, od.im
- pld [r2, #32]
- veor d2, d3, d16 @ -od.re, od.im
- pld [r3, #32]
- vmla.f32 d22, d3, d4[0]
- vmla.f32 d22, d7, d5[0]
- vmla.f32 d24, d2, d4[0]
- vmla.f32 d24, d23, d5[0]
- vld1.32 {d0}, [r0,:64]! @ d1[0,1]
- vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
- vst1.32 {d20}, [r7,:64]!
- vst1.32 {d6}, [lr,:64], r8
- vst1.32 {d22}, [r7,:64]!
- vst1.32 {d24}, [lr,:64], r8
- subs r12, r12, #2
- bgt 2b
-
- veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
- vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
- vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
- ldr r2, [r4, #8] @ sign_convention
- vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
- add r0, r0, #4
- bfc r2, #0, #31
- vld1.32 {d0[0]}, [r0,:32]
- veor d7, d21, d16 @ -od.im, od.re
- vrev64.32 d3, d21 @ od.re, od.im
- veor d6, d20, d17 @ ev.re,-ev.im
- vld1.32 {d22}, [r5,:64]
- vdup.32 d1, r2
- vmov d23, d22
- veor d2, d3, d16 @ -od.re, od.im
- vtrn.32 d22, d23
- veor d0, d0, d1
- veor d23, d23, d17
- vmla.f32 d20, d3, d4[1]
- vmla.f32 d20, d7, d5[1]
- vmla.f32 d6, d2, d4[1]
- vmla.f32 d6, d21, d5[1]
- vadd.f32 d22, d22, d23
- vst1.32 {d20}, [r7,:64]
- vst1.32 {d6}, [lr,:64]
- vst1.32 {d0[0]}, [r0,:32]
- vst1.32 {d22}, [r5,:64]
-
- cmp r6, #0
- it eq
- popeq {r4-r8,pc}
-
- vmul.f32 d22, d22, d18
- vst1.32 {d22}, [r5,:64]
- add r0, r4, #20
- mov r1, r5
- bl X(ff_fft_permute_neon)
- add r0, r4, #20
- mov r1, r5
- pop {r4-r8,lr}
- b X(ff_fft_calc_neon)
-endfunc
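ff_rdft_calc_neon turns the half-size complex FFT of the packed real input into the real transform by the usual even/odd split; tcos/tsin hold the twiddle factors and k1/k2 the 0.5/±0.5 scale factors noted in the code. A textbook sketch of that recombination in complex arithmetic (illustrative only; the in-place layout, direction handling and exact sign conventions of rdft.c differ in detail):

    #include <complex.h>

    /* z: n2-point complex FFT of the packed real signal, w[k] the
     * twiddle factor built from tcos/tsin.
     *   E_k = (z[k] + conj(z[n2-k])) / 2        even part
     *   O_k = (z[k] - conj(z[n2-k])) / (2*I)    odd part
     *   X_k = E_k + w[k]*O_k,  X_{n2-k} = conj(E_k - w[k]*O_k)
     */
    static void rdft_postprocess_ref(float complex *z, const float complex *w, int n2)
    {
        for (int k = 1; k < n2 / 2; k++) {
            float complex e = 0.5f * (z[k] + conjf(z[n2 - k]));
            float complex o = -0.5f * I * (z[k] - conjf(z[n2 - k]));
            z[k]      = e + w[k] * o;
            z[n2 - k] = conjf(e - w[k] * o);
        }
    }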
diff --git a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
deleted file mode 100644
index 8bfe90b..0000000
--- a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/rv34dsp.h"
-#include "libavutil/arm/cpu.h"
-
-void ff_rv34_inv_transform_noround_neon(int16_t *block);
-
-void ff_rv34_inv_transform_noround_dc_neon(int16_t *block);
-
-void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block);
-void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc);
-
-av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon;
- c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
-
- c->rv34_idct_add = ff_rv34_idct_add_neon;
- c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/rv34dsp_neon.S b/ffmpeg/libavcodec/arm/rv34dsp_neon.S
deleted file mode 100644
index 3d4a83d..0000000
--- a/ffmpeg/libavcodec/arm/rv34dsp_neon.S
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "neon.S"
-
-.macro rv34_inv_transform r0
- vld1.16 {q14-q15}, [\r0,:128]
- vmov.s16 d0, #13
- vshll.s16 q12, d29, #3
- vshll.s16 q13, d29, #4
- vshll.s16 q9, d31, #3
- vshll.s16 q1, d31, #4
- vmull.s16 q10, d28, d0
- vmlal.s16 q10, d30, d0
- vmull.s16 q11, d28, d0
- vmlsl.s16 q11, d30, d0
- vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7
- vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17
- vsubw.s16 q9, q9, d31
- vaddw.s16 q1, q1, d31
- vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
- vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
- vadd.s32 q1, q10, q13 @ z0 + z3
- vadd.s32 q2, q11, q12 @ z1 + z2
- vsub.s32 q8, q10, q13 @ z0 - z3
- vsub.s32 q3, q11, q12 @ z1 - z2
- vtrn.32 q1, q2
- vtrn.32 q3, q8
- vswp d3, d6
- vswp d5, d16
- vmov.s32 d0, #13
- vadd.s32 q10, q1, q3
- vsub.s32 q11, q1, q3
- vshl.s32 q12, q2, #3
- vshl.s32 q9, q2, #4
- vmul.s32 q13, q11, d0[0]
- vshl.s32 q11, q8, #4
- vadd.s32 q9, q9, q2
- vshl.s32 q15, q8, #3
- vsub.s32 q12, q12, q2
- vadd.s32 q11, q11, q8
- vmul.s32 q14, q10, d0[0]
- vsub.s32 q8, q15, q8
- vsub.s32 q12, q12, q11
- vadd.s32 q9, q9, q8
- vadd.s32 q2, q13, q12 @ z1 + z2
- vadd.s32 q1, q14, q9 @ z0 + z3
- vsub.s32 q3, q13, q12 @ z1 - z2
- vsub.s32 q15, q14, q9 @ z0 - z3
-.endm
-
-/* void rv34_idct_add_c(uint8_t *dst, int stride, int16_t *block) */
-function ff_rv34_idct_add_neon, export=1
- mov r3, r0
- rv34_inv_transform r2
- vmov.i16 q12, #0
- vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
- vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10
- vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10
- vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10
- vld1.32 {d28[]}, [r0,:32], r1
- vld1.32 {d29[]}, [r0,:32], r1
- vtrn.32 q8, q9
- vld1.32 {d28[1]}, [r0,:32], r1
- vld1.32 {d29[1]}, [r0,:32], r1
- vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16)
- vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16)
- vtrn.16 d16, d17
- vtrn.32 d28, d29
- vtrn.16 d18, d19
- vaddw.u8 q0, q8, d28
- vaddw.u8 q1, q9, d29
- vqmovun.s16 d28, q0
- vqmovun.s16 d29, q1
- vst1.32 {d28[0]}, [r3,:32], r1
- vst1.32 {d28[1]}, [r3,:32], r1
- vst1.32 {d29[0]}, [r3,:32], r1
- vst1.32 {d29[1]}, [r3,:32], r1
- bx lr
-endfunc
-
-/* void rv34_inv_transform_noround_neon(int16_t *block); */
-function ff_rv34_inv_transform_noround_neon, export=1
- rv34_inv_transform r0
- vshl.s32 q11, q2, #1
- vshl.s32 q10, q1, #1
- vshl.s32 q12, q3, #1
- vshl.s32 q13, q15, #1
- vadd.s32 q11, q11, q2
- vadd.s32 q10, q10, q1
- vadd.s32 q12, q12, q3
- vadd.s32 q13, q13, q15
- vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11
- vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11
- vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11
- vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11
- vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
- vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
- vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
- vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
- bx lr
-endfunc
-
-/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc) */
-function ff_rv34_idct_dc_add_neon, export=1
- mov r3, r0
- vld1.32 {d28[]}, [r0,:32], r1
- vld1.32 {d29[]}, [r0,:32], r1
- vdup.16 d0, r2
- vmov.s16 d1, #169
- vld1.32 {d28[1]}, [r0,:32], r1
- vmull.s16 q1, d0, d1 @ dc * 13 * 13
- vld1.32 {d29[1]}, [r0,:32], r1
- vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
- vmov d1, d0
- vaddw.u8 q2, q0, d28
- vaddw.u8 q3, q0, d29
- vqmovun.s16 d28, q2
- vqmovun.s16 d29, q3
- vst1.32 {d28[0]}, [r3,:32], r1
- vst1.32 {d29[0]}, [r3,:32], r1
- vst1.32 {d28[1]}, [r3,:32], r1
- vst1.32 {d29[1]}, [r3,:32], r1
- bx lr
-endfunc
-
-/* void ff_rv34_inv_transform_noround_dc_neon(int16_t *block) */
-function ff_rv34_inv_transform_noround_dc_neon, export=1
- vld1.16 {d28[]}, [r0,:16] @ block[0]
- vmov.i16 d4, #251
- vorr.s16 d4, #256 @ 13^2 * 3
- vmull.s16 q3, d28, d4
- vshrn.s32 d0, q3, #11
- vmov.i16 d1, d0
- vst1.64 {q0}, [r0,:128]!
- vst1.64 {q0}, [r0,:128]!
- bx lr
-endfunc
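The z0..z3 annotations in the rv34_inv_transform macro above are the RV34 4-point butterfly with factors 13, 17 and 7. A C model of one 1-D pass over a column, with b[k] = block[i + 4*k] (a sketch; the reference C implementation lives in libavcodec/rv34dsp.c):

    #include <stdint.h>

    /* One column of the RV34 inverse transform, matching the z0..z3
     * comments above. */
    static void rv34_col_ref(const int16_t b[4], int out[4])
    {
        const int z0 = 13 * (b[0] + b[2]);
        const int z1 = 13 * (b[0] - b[2]);
        const int z2 =  7 *  b[1] - 17 * b[3];
        const int z3 = 17 *  b[1] +  7 * b[3];
        out[0] = z0 + z3;
        out[1] = z1 + z2;
        out[2] = z1 - z2;
        out[3] = z0 - z3;
    }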
diff --git a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
deleted file mode 100644
index 3bf9ac7..0000000
--- a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/avcodec.h"
-#include "libavcodec/rv34dsp.h"
-#include "libavutil/arm/cpu.h"
-
-#define DECL_QPEL3(type, w, pos) \
- void ff_##type##_rv40_qpel##w##_mc##pos##_neon(uint8_t *dst, uint8_t *src,\
- ptrdiff_t stride)
-#define DECL_QPEL2(w, pos) \
- DECL_QPEL3(put, w, pos); \
- DECL_QPEL3(avg, w, pos)
-
-#define DECL_QPEL_XY(x, y) \
- DECL_QPEL2(16, x ## y); \
- DECL_QPEL2(8, x ## y)
-
-#define DECL_QPEL_Y(y) \
- DECL_QPEL_XY(0, y); \
- DECL_QPEL_XY(1, y); \
- DECL_QPEL_XY(2, y); \
- DECL_QPEL_XY(3, y); \
-
-DECL_QPEL_Y(0);
-DECL_QPEL_Y(1);
-DECL_QPEL_Y(2);
-DECL_QPEL_Y(3);
-
-void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
-void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
-
-void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
-void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
-
-int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
- int beta, int beta2, int edge,
- int *p1, int *q1);
-int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
- int beta, int beta2, int edge,
- int *p1, int *q1);
-
-void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
- int filter_q1, int alpha, int beta,
- int lim_p0q0, int lim_q1, int lim_p1);
-void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
- int filter_q1, int alpha, int beta,
- int lim_p0q0, int lim_q1, int lim_p1);
-
-static av_cold void rv40dsp_init_neon(RV34DSPContext *c)
-{
- c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
- c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon;
- c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon;
- c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon;
- c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon;
- c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon;
- c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon;
- c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon;
- c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon;
- c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon;
- c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon;
- c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon;
- c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon;
- c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon;
- c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon;
- c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon;
- c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon;
- c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon;
- c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon;
- c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon;
- c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon;
- c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon;
- c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon;
- c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon;
- c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon;
- c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon;
- c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon;
- c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon;
- c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon;
- c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon;
- c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon;
- c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon;
- c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon;
- c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon;
- c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon;
- c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon;
- c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon;
- c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon;
- c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon;
- c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon;
- c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon;
- c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon;
- c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon;
- c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon;
- c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon;
- c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon;
- c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon;
- c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon;
- c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon;
- c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon;
- c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon;
- c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon;
-
- c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
- c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
- c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
- c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
-
- c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
- c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
-
- c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
- c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
- c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon;
- c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon;
-}
-
-av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags))
- rv40dsp_init_neon(c);
-}
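Table layout note: row [0] holds the 16x16 motion-compensation functions and row [1] the 8x8 ones, with the second index encoding the quarter-pel offset as 4*dy + dx (the mcXY suffix reads mc<dx><dy>); indices 0, 2 and 8 are not set here and presumably keep their generic initializations. A hypothetical dispatch sketch:

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*qpel_mc_func)(uint8_t *dst, uint8_t *src, ptrdiff_t stride);

    /* dx, dy: quarter-pel offsets in 0..3; tab: one row of
     * put_pixels_tab or avg_pixels_tab. */
    static void mc_dispatch(qpel_mc_func tab[16], uint8_t *dst, uint8_t *src,
                            ptrdiff_t stride, int dx, int dy)
    {
        tab[4 * dy + dx](dst, src, stride);
    }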
diff --git a/ffmpeg/libavcodec/arm/rv40dsp_neon.S b/ffmpeg/libavcodec/arm/rv40dsp_neon.S
deleted file mode 100644
index 099f88c..0000000
--- a/ffmpeg/libavcodec/arm/rv40dsp_neon.S
+++ /dev/null
@@ -1,920 +0,0 @@
-/*
- * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "neon.S"
-
-.macro qpel_lowpass r0, r1, rc1, rc2, shift
- vext.8 d25, \r0, \r1, #1 @ src[-1]
- vext.8 d26, \r0, \r1, #4 @ src[ 2]
- vext.8 d24, \r0, \r1, #5 @ src[ 3]
- vaddl.u8 q9, d25, d26
- vaddl.u8 q8, \r0, d24
- vext.8 d27, \r0, \r1, #2 @ src[ 0]
- vshl.s16 q12, q9, #2
- vsub.s16 q8, q8, q9
- vext.8 d28, \r0, \r1, #3 @ src[ 1]
- vsub.s16 q8, q8, q12
- vmlal.u8 q8, d27, \rc1
- vmlal.u8 q8, d28, \rc2
- vqrshrun.s16 \r0, q8, #\shift
-.endm
-
-.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift
- vext.8 d25, \r0, \r1, #1 @ src[-1]
- vext.8 d26, \r0, \r1, #4 @ src[ 2]
- vext.8 d24, \r0, \r1, #5 @ src[ 3]
- vaddl.u8 q9, d25, d26
- vaddl.u8 q8, \r0, d24
- vext.8 d29, \r0, \r1, #2 @ src[ 0]
- vext.8 d28, \r0, \r1, #3 @ src[ 1]
- vshl.s16 q10, q9, #2
- vext.8 \r1, \r2, \r3, #1 @ src[-1]
- vsub.s16 q8, q8, q9
- vext.8 d22, \r2, \r3, #4 @ src[ 2]
- vext.8 \r0, \r2, \r3, #5 @ src[ 3]
- vaddl.u8 q13, \r1, d22
- vaddl.u8 q12, \r2, \r0
- vsub.s16 q8, q8, q10
- vshl.s16 q9, q13, #2
- vsub.s16 q12, q12, q13
- vmlal.u8 q8, d29, \rc1
- vmlal.u8 q8, d28, \rc2
- vsub.s16 q12, q12, q9
- vext.8 d26, \r2, \r3, #2 @ src[ 0]
- vext.8 d27, \r2, \r3, #3 @ src[ 1]
- vmlal.u8 q12, d26, \rc1
- vmlal.u8 q12, d27, \rc2
- vqrshrun.s16 \r0, q8, #\shift
- vqrshrun.s16 \r2, q12, #\shift
-.endm
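Per the src[] annotations, both macros evaluate the RV40 six-tap filter (src[-2] + src[3]) - 5*(src[-1] + src[2]) + c1*src[0] + c2*src[1], then narrow with a rounding, saturating shift; the callers below pass (c1, c2) = (52, 20), (20, 52) or (20, 20) and a shift of 6 or 5. A scalar sketch of one output sample (helper names hypothetical):

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    /* One sample of the RV40 quarter-pel lowpass, as in qpel_lowpass:
     * vqrshrun = add rounding bias, shift, clamp to [0,255]. */
    static uint8_t rv40_lowpass_ref(const uint8_t *src, int c1, int c2, int shift)
    {
        int v = src[-2] + src[3] - 5 * (src[-1] + src[2])
              + c1 * src[0] + c2 * src[1];
        return clip_u8((v + (1 << (shift - 1))) >> shift);
    }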
-
-.macro rv40_qpel8_h shift
-function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
-1:
- vld1.8 {q2}, [r1], r2
- vld1.8 {q3}, [r1], r2
- qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift
- vst1.8 {d4}, [r12,:64]!
- vst1.8 {d6}, [r12,:64]!
- subs r3, r3, #2
- bgt 1b
- vld1.8 {q2}, [r1]
- qpel_lowpass d4, d5, d0, d1, \shift
- vst1.8 {d4}, [r12,:64]!
- bx lr
-endfunc
-.endm
-
-.macro rv40_qpel8_v shift, type
-function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
- vld1.64 {d2}, [r1,:64]!
- vld1.64 {d3}, [r1,:64]!
- vld1.64 {d4}, [r1,:64]!
- vld1.64 {d5}, [r1,:64]!
- vld1.64 {d6}, [r1,:64]!
- vld1.64 {d7}, [r1,:64]!
- vld1.64 {d8}, [r1,:64]!
- vld1.64 {d9}, [r1,:64]!
- vld1.64 {d10}, [r1,:64]!
- vld1.64 {d11}, [r1,:64]!
- vld1.64 {d12}, [r1,:64]!
- vld1.64 {d13}, [r1,:64]!
- vld1.64 {d14}, [r1,:64]!
- transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
- transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
- qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift
- qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift
- qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift
- qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift
- transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
- .ifc \type,avg
- vld1.64 d12, [r0,:64], r2
- vld1.64 d13, [r0,:64], r2
- vld1.64 d14, [r0,:64], r2
- vld1.64 d15, [r0,:64], r2
- vld1.64 d16, [r0,:64], r2
- vld1.64 d17, [r0,:64], r2
- vld1.64 d18, [r0,:64], r2
- vld1.64 d19, [r0,:64], r2
- sub r0, r0, r2, lsl #3
- vrhadd.u8 q1, q1, q6
- vrhadd.u8 q2, q2, q7
- vrhadd.u8 q3, q3, q8
- vrhadd.u8 q4, q4, q9
- .endif
- vst1.64 d2, [r0,:64], r2
- vst1.64 d3, [r0,:64], r2
- vst1.64 d4, [r0,:64], r2
- vst1.64 d5, [r0,:64], r2
- vst1.64 d6, [r0,:64], r2
- vst1.64 d7, [r0,:64], r2
- vst1.64 d8, [r0,:64], r2
- vst1.64 d9, [r0,:64], r2
- bx lr
-endfunc
-.endm
-
- rv40_qpel8_h 5
- rv40_qpel8_h 6
-
-.macro rv40_qpel type
-function \type\()_rv40_qpel8_h_lowpass_neon
- .ifc \type,avg
- mov r12, r0
- .endif
-1:
- vld1.8 {q2}, [r1], r2
- vld1.8 {q3}, [r1], r2
- qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6
- .ifc \type,avg
- vld1.8 {d3}, [r12,:64], r2
- vld1.8 {d16}, [r12,:64], r2
- vrhadd.u8 d4, d4, d3
- vrhadd.u8 d6, d6, d16
- .endif
- vst1.8 {d4}, [r0,:64], r2
- vst1.8 {d6}, [r0,:64], r2
- subs r3, r3, #2
- bgt 1b
- bx lr
-endfunc
-
-function \type\()_rv40_qpel8_v_lowpass_neon
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1], r2
- vld1.64 {d8}, [r1], r2
- vld1.64 {d9}, [r1], r2
- vld1.64 {d10}, [r1], r2
- vld1.64 {d11}, [r1], r2
- vld1.64 {d12}, [r1], r2
- vld1.64 {d13}, [r1], r2
- vld1.64 {d14}, [r1]
- transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
- transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
- qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6
- qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6
- qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6
- qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6
- transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
- .ifc \type,avg
- vld1.64 d12, [r0,:64], r2
- vld1.64 d13, [r0,:64], r2
- vld1.64 d14, [r0,:64], r2
- vld1.64 d15, [r0,:64], r2
- vld1.64 d16, [r0,:64], r2
- vld1.64 d17, [r0,:64], r2
- vld1.64 d18, [r0,:64], r2
- vld1.64 d19, [r0,:64], r2
- sub r0, r0, r2, lsl #3
- vrhadd.u8 q1, q1, q6
- vrhadd.u8 q2, q2, q7
- vrhadd.u8 q3, q3, q8
- vrhadd.u8 q4, q4, q9
- .endif
- vst1.64 d2, [r0,:64], r2
- vst1.64 d3, [r0,:64], r2
- vst1.64 d4, [r0,:64], r2
- vst1.64 d5, [r0,:64], r2
- vst1.64 d6, [r0,:64], r2
- vst1.64 d7, [r0,:64], r2
- vst1.64 d8, [r0,:64], r2
- vst1.64 d9, [r0,:64], r2
- bx lr
-endfunc
-
- rv40_qpel8_v 5, \type
- rv40_qpel8_v 6, \type
-
-function ff_\type\()_rv40_qpel8_mc10_neon, export=1
- sub r1, r1, #2
- mov r3, #8
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- b \type\()_rv40_qpel8_h_lowpass_neon
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc30_neon, export=1
- sub r1, r1, #2
- mov r3, #8
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- b \type\()_rv40_qpel8_h_lowpass_neon
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc01_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub r1, r1, r2, lsl #1
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl \type\()_rv40_qpel8_v_lowpass_neon
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc11_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- add r1, sp, #7
- bic r1, r1, #7
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc21_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- add r1, sp, #7
- bic r1, r1, #7
- vmov.i8 d0, #52
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc31_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- add r1, sp, #7
- bic r1, r1, #7
- vswp d0, d1
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc12_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- add r1, sp, #7
- bic r1, r1, #7
- vmov.i8 d0, #20
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc22_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- add r1, sp, #7
- bic r1, r1, #7
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc32_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- add r1, sp, #7
- bic r1, r1, #7
- vmov.i8 d1, #20
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc03_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub r1, r1, r2, lsl #1
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- bl \type\()_rv40_qpel8_v_lowpass_neon
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc33_neon, export=1
- mov r3, #8
- b X(ff_\type\()_pixels8_xy2_neon)
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc13_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- add r1, sp, #7
- bic r1, r1, #7
- vswp d0, d1
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel8_mc23_neon, export=1
- push {r4, lr}
- vpush {d8-d15}
- sub sp, sp, #14*8
- add r12, sp, #7
- bic r12, r12, #7
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- mov r3, #12
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- add r1, sp, #7
- bic r1, r1, #7
- vmov.i8 d1, #52
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #14*8
- vpop {d8-d15}
- pop {r4, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc10_neon, export=1
- vmov.i8 d0, #52
- vmov.i8 d1, #20
-.L\type\()_rv40_qpel16_h:
- push {r1, lr}
- sub r1, r1, #2
- mov r3, #16
- bl \type\()_rv40_qpel8_h_lowpass_neon
- pop {r1, lr}
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- add r1, r1, #6
- mov r3, #16
- b \type\()_rv40_qpel8_h_lowpass_neon
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc30_neon, export=1
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- b .L\type\()_rv40_qpel16_h
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc01_neon, export=1
- vmov.i8 d0, #52
- vmov.i8 d1, #20
-.L\type\()_rv40_qpel16_v:
- sub r1, r1, r2, lsl #1
- push {r1, lr}
- vpush {d8-d15}
- bl \type\()_rv40_qpel8_v_lowpass_neon
- sub r1, r1, r2, lsl #2
- bl \type\()_rv40_qpel8_v_lowpass_neon
- ldr r1, [sp, #64]
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- add r1, r1, #8
- bl \type\()_rv40_qpel8_v_lowpass_neon
- sub r1, r1, r2, lsl #2
- bl \type\()_rv40_qpel8_v_lowpass_neon
- vpop {d8-d15}
- pop {r1, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc11_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
-.L\type\()_rv40_qpel16_v_s6:
- add r1, sp, #7
- bic r1, r1, #7
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- sub r1, r1, #40
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- sub r1, r1, #40
- bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
- add sp, sp, #44*8
- vpop {d8-d15}
- pop {r1, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc21_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- vmov.i8 d0, #52
- b .L\type\()_rv40_qpel16_v_s6
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc31_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- vswp d0, d1
- b .L\type\()_rv40_qpel16_v_s6
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc12_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- vmov.i8 d0, #20
-.L\type\()_rv40_qpel16_v_s5:
- add r1, sp, #7
- bic r1, r1, #7
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- sub r1, r1, #40
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- sub r0, r0, r2, lsl #4
- add r0, r0, #8
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- sub r1, r1, #40
- bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
- add sp, sp, #44*8
- vpop {d8-d15}
- pop {r1, pc}
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc22_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- b .L\type\()_rv40_qpel16_v_s5
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc32_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- vmov.i8 d1, #20
- b .L\type\()_rv40_qpel16_v_s5
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc03_neon, export=1
- vmov.i8 d0, #20
- vmov.i8 d1, #52
- b .L\type\()_rv40_qpel16_v
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc13_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #52
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s6_neon
- vswp d0, d1
- b .L\type\()_rv40_qpel16_v_s6
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc23_neon, export=1
- sub r1, r1, r2, lsl #1
- sub r1, r1, #2
- push {r1, lr}
- vpush {d8-d15}
- sub sp, sp, #44*8
- add r12, sp, #7
- bic r12, r12, #7
- mov r3, #20
- vmov.i8 d0, #20
- vmov.i8 d1, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- ldr r1, [sp, #416]
- add r1, r1, #8
- mov r3, #20
- bl put_rv40_qpel8_h_lp_packed_s5_neon
- vmov.i8 d1, #52
- b .L\type\()_rv40_qpel16_v_s6
-endfunc
-
-function ff_\type\()_rv40_qpel16_mc33_neon, export=1
- mov r3, #16
- b X(ff_\type\()_pixels16_xy2_neon)
-endfunc
-.endm
-
- rv40_qpel put
- rv40_qpel avg
-
-.macro rv40_weight
- vmovl.u8 q8, d2
- vmovl.u8 q9, d3
- vmovl.u8 q10, d4
- vmovl.u8 q11, d5
- vmull.u16 q2, d16, d0[2]
- vmull.u16 q3, d17, d0[2]
- vmull.u16 q8, d18, d0[2]
- vmull.u16 q9, d19, d0[2]
- vmull.u16 q12, d20, d0[0]
- vmull.u16 q13, d21, d0[0]
- vmull.u16 q14, d22, d0[0]
- vmull.u16 q15, d23, d0[0]
- vshrn.i32 d4, q2, #9
- vshrn.i32 d5, q3, #9
- vshrn.i32 d6, q8, #9
- vshrn.i32 d7, q9, #9
- vshrn.i32 d16, q12, #9
- vshrn.i32 d17, q13, #9
- vshrn.i32 d18, q14, #9
- vshrn.i32 d19, q15, #9
- vadd.u16 q2, q2, q8
- vadd.u16 q3, q3, q9
- vrshrn.i16 d2, q2, #5
- vrshrn.i16 d3, q3, #5
-.endm
-
-/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int w1, int w2, int stride) */
-function ff_rv40_weight_func_16_neon, export=1
- ldr r12, [sp]
- vmov d0, r3, r12
- ldr r12, [sp, #4]
- mov r3, #16
-1:
- vld1.8 {q1}, [r1,:128], r12
- vld1.8 {q2}, [r2,:128], r12
- rv40_weight
- vst1.8 {q1}, [r0,:128], r12
- subs r3, r3, #1
- bne 1b
- bx lr
-endfunc
-
-/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
- int w1, int w2, int stride) */
-function ff_rv40_weight_func_8_neon, export=1
- ldr r12, [sp]
- vmov d0, r3, r12
- ldr r12, [sp, #4]
- mov r3, #8
-1:
- vld1.8 {d2}, [r1,:64], r12
- vld1.8 {d3}, [r1,:64], r12
- vld1.8 {d4}, [r2,:64], r12
- vld1.8 {d5}, [r2,:64], r12
- rv40_weight
- vst1.8 {d2}, [r0,:64], r12
- vst1.8 {d3}, [r0,:64], r12
- subs r3, r3, #2
- bne 1b
- bx lr
-endfunc
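Both weight functions compute the bidirectional blend documented by the prototypes above: each source is weighted and pre-shifted by 9 bits, then the sum is rounded down by another 5. A per-pixel scalar model (a sketch; the reference C is in libavcodec/rv40dsp.c):

    #include <stdint.h>

    /* Model of the rv40_weight macro: note src1 pairs with w2 and
     * src2 with w1, matching the d0[] lane usage above. */
    static void rv40_weight_ref(uint8_t *dst, const uint8_t *src1,
                                const uint8_t *src2, int w1, int w2, int n)
    {
        for (int i = 0; i < n; i++)
            dst[i] = (((w2 * src1[i]) >> 9) + ((w1 * src2[i]) >> 9) + 16) >> 5;
    }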
-
-function ff_rv40_h_loop_filter_strength_neon, export=1
- pkhbt r2, r3, r2, lsl #18
-
- ldr r3, [r0]
- ldr_dpre r12, r0, r1
- teq r3, r12
- beq 1f
-
- sub r0, r0, r1, lsl #1
-
- vld1.32 {d4[]}, [r0,:32], r1 @ -3
- vld1.32 {d0[]}, [r0,:32], r1 @ -2
- vld1.32 {d4[1]}, [r0,:32], r1 @ -1
- vld1.32 {d5[]}, [r0,:32], r1 @ 0
- vld1.32 {d1[]}, [r0,:32], r1 @ 1
- vld1.32 {d5[0]}, [r0,:32], r1 @ 2
-
- vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
- vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0
- vdup.32 d30, r2 @ beta2, beta << 2
- vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
- vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
- vabd.u16 d16, d18, d16
- vclt.u16 d16, d16, d30
-
- ldrd r2, r3, [sp, #4]
- vmovl.u16 q12, d16
- vtrn.16 d16, d17
- vshr.u32 q12, q12, #15
- ldr r0, [sp]
- vst1.32 {d24[1]}, [r2,:32]
- vst1.32 {d25[1]}, [r3,:32]
-
- cmp r0, #0
- it eq
- bxeq lr
-
- vand d18, d16, d17
- vtrn.32 d18, d19
- vand d18, d18, d19
- vmov.u16 r0, d18[0]
- bx lr
-1:
- ldrd r2, r3, [sp, #4]
- mov r0, #0
- str r0, [r2]
- str r0, [r3]
- bx lr
-endfunc
-
-function ff_rv40_v_loop_filter_strength_neon, export=1
- sub r0, r0, #3
- pkhbt r2, r3, r2, lsl #18
-
- vld1.8 {d0}, [r0], r1
- vld1.8 {d1}, [r0], r1
- vld1.8 {d2}, [r0], r1
- vld1.8 {d3}, [r0], r1
-
- vaddl.u8 q0, d0, d1
- vaddl.u8 q1, d2, d3
- vdup.32 q15, r2
- vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
- vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
- vabd.u16 q0, q1, q0
- vclt.u16 q0, q0, q15
-
- ldrd r2, r3, [sp, #4]
- vmovl.u16 q1, d0
- vext.16 d1, d0, d1, #3
- vshr.u32 q1, q1, #15
- ldr r0, [sp]
- vst1.32 {d2[1]}, [r2,:32]
- vst1.32 {d3[1]}, [r3,:32]
-
- cmp r0, #0
- it eq
- bxeq lr
-
- vand d0, d0, d1
- vtrn.16 d0, d1
- vand d0, d0, d1
- vmov.u16 r0, d0[0]
- bx lr
-endfunc
-
-.macro rv40_weak_loop_filter
- vdup.16 d30, r2 @ filter_p1
- vdup.16 d31, r3 @ filter_q1
- ldrd r2, r3, [sp]
- vdup.16 d28, r2 @ alpha
- vdup.16 d29, r3 @ beta
- ldr r12, [sp, #8]
- vdup.16 d25, r12 @ lim_p0q0
- ldrd r2, r3, [sp, #12]
- vsubl.u8 q9, d5, d4 @ x, t
- vabdl.u8 q8, d5, d4 @ x, abs(t)
- vneg.s16 q15, q15
- vceq.i16 d16, d19, #0 @ !t
- vshl.s16 d19, d19, #2 @ t << 2
- vmul.u16 d18, d17, d28 @ alpha * abs(t)
- vand d24, d30, d31 @ filter_p1 & filter_q1
- vsubl.u8 q1, d0, d4 @ p1p2, p1p0
- vsubl.u8 q3, d1, d5 @ q1q2, q1q0
- vmov.i16 d22, #3
- vshr.u16 d18, d18, #7
- vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1)
- vsubl.u8 q10, d0, d1 @ src[-2] - src[1]
- vcle.u16 d18, d18, d22
- vand d20, d20, d24
- vneg.s16 d23, d25 @ -lim_p0q0
- vadd.s16 d19, d19, d20
- vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1)
- vtrn.32 d4, d5 @ -3, 2, -1, 0
- vrshr.s16 d19, d19, #3
- vmov d28, d29 @ beta
- vswp d3, d6 @ q1q2, p1p0
- vmin.s16 d19, d19, d25
- vand d30, d30, d16
- vand d31, d31, d16
- vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0
- vmax.s16 d19, d19, d23 @ diff
- vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2)
- vand d18, d19, d16 @ diff
- vcle.u16 q1, q1, q14
- vneg.s16 d19, d18 @ -diff
- vdup.16 d26, r3 @ lim_p1
- vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff
- vhsub.s16 q11, q10, q9
- vand q1, q1, q15
- vqmovun.s16 d4, q2 @ -1, 0
- vand q9, q11, q1
- vdup.16 d27, r2 @ lim_q1
- vneg.s16 q9, q9
- vneg.s16 q14, q13
- vmin.s16 q9, q9, q13
- vtrn.32 d0, d1 @ -2, 1, -2, 1
- vmax.s16 q9, q9, q14
- vaddw.u8 q3, q9, d0
- vqmovun.s16 d5, q3 @ -2, 1
-.endm
-
-function ff_rv40_h_weak_loop_filter_neon, export=1
- sub r0, r0, r1, lsl #1
- sub r0, r0, r1
-
- vld1.32 {d4[]}, [r0,:32], r1
- vld1.32 {d0[]}, [r0,:32], r1
- vld1.32 {d4[1]}, [r0,:32], r1
- vld1.32 {d5[]}, [r0,:32], r1
- vld1.32 {d1[]}, [r0,:32], r1
- vld1.32 {d5[0]}, [r0,:32]
-
- sub r0, r0, r1, lsl #2
-
- rv40_weak_loop_filter
-
- vst1.32 {d5[0]}, [r0,:32], r1
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r0,:32], r1
- vst1.32 {d5[1]}, [r0,:32], r1
-
- bx lr
-endfunc
-
-function ff_rv40_v_weak_loop_filter_neon, export=1
- sub r12, r0, #3
- sub r0, r0, #2
-
- vld1.8 {d4}, [r12], r1
- vld1.8 {d5}, [r12], r1
- vld1.8 {d2}, [r12], r1
- vld1.8 {d3}, [r12], r1
-
- vtrn.16 q2, q1
- vtrn.8 d4, d5
- vtrn.8 d2, d3
-
- vrev64.32 d5, d5
- vtrn.32 q2, q1
- vdup.32 d0, d3[0]
- vdup.32 d1, d2[0]
-
- rv40_weak_loop_filter
-
- vtrn.32 q2, q3
- vswp d4, d5
-
- vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
- vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
- vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
- vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1
-
- bx lr
-endfunc
diff --git a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
deleted file mode 100644
index 4fb69f9..0000000
--- a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-#include "libavutil/arm/cpu.h"
-#include "libavutil/attributes.h"
-#include "libavcodec/sbrdsp.h"
-
-void ff_sbr_sum64x5_neon(float *z);
-float ff_sbr_sum_square_neon(float (*x)[2], int n);
-void ff_sbr_neg_odd_64_neon(float *x);
-void ff_sbr_qmf_pre_shuffle_neon(float *z);
-void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
-void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
-void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
-void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
- const float *g_filt, int m_max, intptr_t ixh);
-void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
- const float alpha0[2], const float alpha1[2],
- float bw, int start, int end);
-void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
-
-void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
- const float *q_filt, int noise,
- int kx, int m_max);
-
-av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- s->sum64x5 = ff_sbr_sum64x5_neon;
- s->sum_square = ff_sbr_sum_square_neon;
- s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
- s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
- s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
- s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
- s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
- s->hf_g_filt = ff_sbr_hf_g_filt_neon;
- s->hf_gen = ff_sbr_hf_gen_neon;
- s->autocorrelate = ff_sbr_autocorrelate_neon;
- s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
- s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
- s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
- s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/sbrdsp_neon.S b/ffmpeg/libavcodec/arm/sbrdsp_neon.S
deleted file mode 100644
index e66abd6..0000000
--- a/ffmpeg/libavcodec/arm/sbrdsp_neon.S
+++ /dev/null
@@ -1,411 +0,0 @@
-/*
- * Copyright (c) 2012 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_sbr_sum64x5_neon, export=1
- push {lr}
- add r1, r0, # 64*4
- add r2, r0, #128*4
- add r3, r0, #192*4
- add lr, r0, #256*4
- mov r12, #64
-1:
- vld1.32 {q0}, [r0,:128]
- vld1.32 {q1}, [r1,:128]!
- vadd.f32 q0, q0, q1
- vld1.32 {q2}, [r2,:128]!
- vadd.f32 q0, q0, q2
- vld1.32 {q3}, [r3,:128]!
- vadd.f32 q0, q0, q3
- vld1.32 {q8}, [lr,:128]!
- vadd.f32 q0, q0, q8
- vst1.32 {q0}, [r0,:128]!
- subs r12, #4
- bgt 1b
- pop {pc}
-endfunc
-
-function ff_sbr_sum_square_neon, export=1
- vmov.f32 q0, #0.0
-1:
- vld1.32 {q1}, [r0,:128]!
- vmla.f32 q0, q1, q1
- subs r1, r1, #2
- bgt 1b
- vadd.f32 d0, d0, d1
- vpadd.f32 d0, d0, d0
-NOVFP vmov.32 r0, d0[0]
- bx lr
-endfunc
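Scalar models of these two kernels (sketches; the reference C versions are in libavcodec/sbrdsp.c): ff_sbr_sum64x5_neon folds five 64-float blocks into the first, and ff_sbr_sum_square_neon accumulates the squared magnitudes of n complex samples.

    static void sbr_sum64x5_ref(float *z)
    {
        for (int k = 0; k < 64; k++)
            z[k] += z[k + 64] + z[k + 128] + z[k + 192] + z[k + 256];
    }

    static float sbr_sum_square_ref(float (*x)[2], int n)
    {
        float sum = 0.0f;
        for (int i = 0; i < n; i++)
            sum += x[i][0] * x[i][0] + x[i][1] * x[i][1];
        return sum;
    }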
-
-function ff_sbr_neg_odd_64_neon, export=1
- mov r1, r0
- vmov.i32 q8, #1<<31
- vld2.32 {q0,q1}, [r0,:128]!
- veor q1, q1, q8
- vld2.32 {q2,q3}, [r0,:128]!
- .rept 3
- vst2.32 {q0,q1}, [r1,:128]!
- veor q3, q3, q8
- vld2.32 {q0,q1}, [r0,:128]!
- vst2.32 {q2,q3}, [r1,:128]!
- veor q1, q1, q8
- vld2.32 {q2,q3}, [r0,:128]!
- .endr
- veor q3, q3, q8
- vst2.32 {q0,q1}, [r1,:128]!
- vst2.32 {q2,q3}, [r1,:128]!
- bx lr
-endfunc
-
-function ff_sbr_qmf_pre_shuffle_neon, export=1
- add r1, r0, #60*4
- add r2, r0, #64*4
- vld1.32 {d0}, [r0,:64]!
- vst1.32 {d0}, [r2,:64]!
- mov r3, #-16
- mov r12, #24
- vmov.i32 q8, #1<<31
- vld1.32 {q0}, [r1,:128], r3
- vld1.32 {d2}, [r0,:64]!
-1:
- vld1.32 {d3,d4}, [r0,:128]!
- vrev64.32 q0, q0
- vld1.32 {q9}, [r1,:128], r3
- veor q0, q0, q8
- vld1.32 {d5,d6}, [r0,:128]!
- vswp d0, d1
- vrev64.32 q9, q9
- vst2.32 {q0,q1}, [r2,:64]!
- vmov q10, q2
- veor q9, q9, q8
- vmov d2, d6
- vswp d18, d19
- vld1.32 {q0}, [r1,:128], r3
- vst2.32 {q9,q10}, [r2,:64]!
- subs r12, r12, #8
- bgt 1b
- vld1.32 {d3,d4}, [r0,:128]!
- vrev64.32 q0, q0
- vld1.32 {q9}, [r1,:128], r3
- veor q0, q0, q8
- vld1.32 {d5}, [r0,:64]!
- vswp d0, d1
- vrev64.32 q9, q9
- vst2.32 {q0,q1}, [r2,:64]!
- vswp d4, d5
- veor q1, q9, q8
- vst2.32 {d3,d5}, [r2,:64]!
- vst2.32 {d2[0],d4[0]}, [r2,:64]!
- bx lr
-endfunc
-
-function ff_sbr_qmf_post_shuffle_neon, export=1
- add r2, r1, #60*4
- mov r3, #-16
- mov r12, #32
- vmov.i32 q8, #1<<31
- vld1.32 {q0}, [r2,:128], r3
- vld1.32 {q1}, [r1,:128]!
-1:
- pld [r2, #-32]
- vrev64.32 q0, q0
- vswp d2, d3
- veor q0, q0, q8
- vld1.32 {q2}, [r2,:128], r3
- vld1.32 {q3}, [r1,:128]!
- vst2.32 {d1,d3}, [r0,:128]!
- vst2.32 {d0,d2}, [r0,:128]!
- pld [r2, #-32]
- vrev64.32 q2, q2
- vswp d6, d7
- veor q2, q2, q8
- vld1.32 {q0}, [r2,:128], r3
- vld1.32 {q1}, [r1,:128]!
- vst2.32 {d5,d7}, [r0,:128]!
- vst2.32 {d4,d6}, [r0,:128]!
- subs r12, r12, #8
- bgt 1b
- bx lr
-endfunc
-
-function ff_sbr_qmf_deint_neg_neon, export=1
- add r1, r1, #60*4
- add r2, r0, #62*4
- mov r3, #-16
- mov r12, #32
- vmov.i32 d2, #1<<31
-1:
- vld2.32 {d0,d1}, [r1,:128], r3
- veor d0, d0, d2
- vrev64.32 d1, d1
- vst1.32 {d0}, [r2,:64]
- vst1.32 {d1}, [r0,:64]!
- sub r2, r2, #8
- subs r12, r12, #2
- bgt 1b
- bx lr
-endfunc
-
-function ff_sbr_qmf_deint_bfly_neon, export=1
- push {lr}
- add r2, r2, #60*4
- add r3, r0, #124*4
- mov r12, #64
- mov lr, #-16
-1:
- vld1.32 {q0}, [r1,:128]!
- vld1.32 {q1}, [r2,:128], lr
- vrev64.32 q2, q0
- vrev64.32 q3, q1
- vadd.f32 d3, d4, d3
- vadd.f32 d2, d5, d2
- vsub.f32 d0, d0, d7
- vsub.f32 d1, d1, d6
- vst1.32 {q1}, [r3,:128], lr
- vst1.32 {q0}, [r0,:128]!
- subs r12, r12, #4
- bgt 1b
- pop {pc}
-endfunc
-
-function ff_sbr_hf_g_filt_neon, export=1
- ldr r12, [sp]
- add r1, r1, r12, lsl #3
- mov r12, #40*2*4
- sub r3, r3, #1
- vld2.32 {d2[],d3[]},[r2,:64]!
- vld1.32 {d0}, [r1,:64], r12
-1:
- vld1.32 {d1}, [r1,:64], r12
- vmul.f32 q3, q0, q1
- vld2.32 {d2[],d3[]},[r2,:64]!
- vld1.32 {d0}, [r1,:64], r12
- vst1.32 {q3}, [r0,:64]!
- subs r3, r3, #2
- bgt 1b
- it lt
- bxlt lr
- vmul.f32 d0, d0, d2
- vst1.32 {d0}, [r0,:64]!
- bx lr
-endfunc
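ff_sbr_hf_g_filt_neon scales one time column of X_high by the per-band gains; r12 = 40*2*4 is the byte stride between rows of X_high[.][40][2] and ixh selects the column. A C model (sketch):

    #include <stdint.h>

    static void sbr_hf_g_filt_ref(float (*Y)[2], const float (*X_high)[40][2],
                                  const float *g_filt, int m_max, intptr_t ixh)
    {
        for (int m = 0; m < m_max; m++) {
            Y[m][0] = X_high[m][ixh][0] * g_filt[m];
            Y[m][1] = X_high[m][ixh][1] * g_filt[m];
        }
    }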
-
-function ff_sbr_hf_gen_neon, export=1
-NOVFP vld1.32 {d1[]}, [sp,:32]
-VFP vdup.32 d1, d0[0]
- vmul.f32 d0, d1, d1
- vld1.32 {d3}, [r2,:64]
- vld1.32 {d2}, [r3,:64]
- vmul.f32 q0, q0, q1
- ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS]
- vtrn.32 d0, d1
- vneg.f32 d18, d1
- vtrn.32 d18, d1
- add r0, r0, r2, lsl #3
- add r1, r1, r2, lsl #3
- sub r1, r1, #2*8
- sub r3, r3, r2
- vld1.32 {q1}, [r1,:128]!
-1:
- vld1.32 {q3}, [r1,:128]!
- vrev64.32 q2, q1
- vmov q8, q3
- vrev64.32 d20, d3
- vrev64.32 d21, d6
- vmla.f32 q3, q1, d0[0]
- vmla.f32 d6, d4, d18
- vmla.f32 d7, d20, d18
- vmla.f32 d6, d3, d0[1]
- vmla.f32 d7, d16, d0[1]
- vmla.f32 d6, d5, d1
- vmla.f32 d7, d21, d1
- vmov q1, q8
- vst1.32 {q3}, [r0,:128]!
- subs r3, r3, #2
- bgt 1b
- bx lr
-endfunc
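ff_sbr_hf_gen_neon runs the SBR high-frequency generator: a second-order complex linear prediction of X_high from X_low, with alpha0 pre-scaled by bw and alpha1 by bw*bw (the q0/q1 setup at the top of the function). A C model (sketch; reference in libavcodec/sbrdsp.c):

    static void sbr_hf_gen_ref(float (*X_high)[2], const float (*X_low)[2],
                               const float alpha0[2], const float alpha1[2],
                               float bw, int start, int end)
    {
        float a0 = alpha0[0] * bw,      a1 = alpha0[1] * bw;
        float b0 = alpha1[0] * bw * bw, b1 = alpha1[1] * bw * bw;

        for (int i = start; i < end; i++) {
            /* complex multiply-accumulate, written out on re/im pairs */
            X_high[i][0] = X_low[i - 2][0] * b0 - X_low[i - 2][1] * b1
                         + X_low[i - 1][0] * a0 - X_low[i - 1][1] * a1
                         + X_low[i][0];
            X_high[i][1] = X_low[i - 2][0] * b1 + X_low[i - 2][1] * b0
                         + X_low[i - 1][0] * a1 + X_low[i - 1][1] * a0
                         + X_low[i][1];
        }
    }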
-
-function ff_sbr_autocorrelate_neon, export=1
- vld1.32 {q0}, [r0,:128]!
- vmov.f32 q1, #0.0
- vmov.f32 q3, #0.0
- vmov.f32 d20, #0.0
- vmul.f32 d21, d1, d1
- vmov q8, q0
- vmov q11, q0
- mov r12, #36
-1:
- vld1.32 {q2}, [r0,:128]!
- vrev64.32 q12, q2
- vmla.f32 q10, q2, q2
- vmla.f32 d2, d1, d4
- vmla.f32 d3, d1, d24
- vmla.f32 d6, d0, d4
- vmla.f32 d7, d0, d24
- vmla.f32 d2, d4, d5
- vmla.f32 d3, d4, d25
- vmla.f32 d6, d1, d5
- vmla.f32 d7, d1, d25
- vmov q0, q2
- subs r12, r12, #2
- bgt 1b
- vld1.32 {q2}, [r0,:128]!
- vrev64.32 q12, q2
- vmla.f32 d2, d1, d4
- vmla.f32 d3, d1, d24
- vmla.f32 d6, d0, d4
- vmla.f32 d7, d0, d24
- vadd.f32 d20, d20, d21
- vrev64.32 d18, d17
- vmla.f32 d6, d1, d5
- vmla.f32 d7, d1, d25
- vmov q0, q1
- vmla.f32 d0, d16, d17
- vmla.f32 d1, d16, d18
- vmla.f32 d2, d4, d5
- vmla.f32 d3, d4, d25
- vneg.f32 s15, s15
- vmov d21, d20
- vpadd.f32 d0, d0, d2
- vpadd.f32 d7, d6, d7
- vtrn.32 d1, d3
- vsub.f32 d6, d1, d3
- vmla.f32 d20, d22, d22
- vmla.f32 d21, d4, d4
- vtrn.32 d0, d6
- vpadd.f32 d20, d20, d21
- vst1.32 {q3}, [r1,:128]!
- vst1.32 {d20[1]}, [r1,:32]
- add r1, r1, #2*4
- vst1.32 {d0}, [r1,:64]
- add r1, r1, #4*4
- vst1.32 {d20[0]}, [r1,:32]
- bx lr
-endfunc
-
-function ff_sbr_hf_apply_noise_0_neon, export=1
- vmov.i32 d3, #0
-.Lhf_apply_noise_0:
- push {r4,lr}
- movrelx r4, X(ff_sbr_noise_table)
- ldr r12, [sp, #12]
- add r3, r3, #1
- bfc r3, #9, #23
- sub r12, r12, #1
-1:
- add lr, r4, r3, lsl #3
- vld2.32 {q0}, [r0,:64]
- vld2.32 {q3}, [lr,:64]
- vld1.32 {d2}, [r1,:64]!
- vld1.32 {d18}, [r2,:64]!
- vceq.f32 d16, d2, #0
- veor d2, d2, d3
- vmov q2, q0
- vmla.f32 d0, d6, d18
- vmla.f32 d1, d7, d18
- vadd.f32 d4, d4, d2
- add r3, r3, #2
- bfc r3, #9, #23
- vbif d0, d4, d16
- vbif d1, d5, d16
- vst2.32 {q0}, [r0,:64]!
- subs r12, r12, #2
- bgt 1b
- blt 2f
- add lr, r4, r3, lsl #3
- vld1.32 {d0}, [r0,:64]
- vld1.32 {d6}, [lr,:64]
- vld1.32 {d2[]}, [r1,:32]!
- vld1.32 {d3[]}, [r2,:32]!
- vceq.f32 d4, d2, #0
- veor d2, d2, d3
- vmov d1, d0
- vmla.f32 d0, d6, d3
- vadd.f32 s2, s2, s4
- vbif d0, d1, d4
- vst1.32 {d0}, [r0,:64]!
-2:
- pop {r4,pc}
-endfunc
-
-function ff_sbr_hf_apply_noise_1_neon, export=1
- ldr r12, [sp]
- push {r4,lr}
- lsl r12, r12, #31
- eor lr, r12, #1<<31
- vmov d3, r12, lr
-.Lhf_apply_noise_1:
- movrelx r4, X(ff_sbr_noise_table)
- ldr r12, [sp, #12]
- add r3, r3, #1
- bfc r3, #9, #23
- sub r12, r12, #1
-1:
- add lr, r4, r3, lsl #3
- vld2.32 {q0}, [r0,:64]
- vld2.32 {q3}, [lr,:64]
- vld1.32 {d2}, [r1,:64]!
- vld1.32 {d18}, [r2,:64]!
- vceq.f32 d16, d2, #0
- veor d2, d2, d3
- vmov q2, q0
- vmla.f32 d0, d6, d18
- vmla.f32 d1, d7, d18
- vadd.f32 d5, d5, d2
- add r3, r3, #2
- bfc r3, #9, #23
- vbif d0, d4, d16
- vbif d1, d5, d16
- vst2.32 {q0}, [r0,:64]!
- subs r12, r12, #2
- bgt 1b
- blt 2f
- add lr, r4, r3, lsl #3
- vld1.32 {d0}, [r0,:64]
- vld1.32 {d6}, [lr,:64]
- vld1.32 {d2[]}, [r1,:32]!
- vld1.32 {d18[]}, [r2,:32]!
- vceq.f32 d4, d2, #0
- veor d2, d2, d3
- vmov d1, d0
- vmla.f32 d0, d6, d18
- vadd.f32 s3, s3, s5
- vbif d0, d1, d4
- vst1.32 {d0}, [r0,:64]!
-2:
- pop {r4,pc}
-endfunc
-
-function ff_sbr_hf_apply_noise_2_neon, export=1
- vmov.i32 d3, #1<<31
- b .Lhf_apply_noise_0
-endfunc
-
-function ff_sbr_hf_apply_noise_3_neon, export=1
- ldr r12, [sp]
- push {r4,lr}
- lsl r12, r12, #31
- eor lr, r12, #1<<31
- vmov d3, lr, r12
- b .Lhf_apply_noise_1
-endfunc
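All four entry points share one loop (.Lhf_apply_noise_0/_1): per band, noise steps through the 512-entry ff_sbr_noise_table as (noise + 1) & 0x1ff (the add/bfc pair), and either the sinusoid s_m[m] or the noise sample scaled by q_filt[m] is added to Y[m]. The variants differ only in the d3 sign mask: _0/_2 add ±s_m to the real part, _1/_3 to the imaginary part with a sign that also alternates from one m to the next. A C sketch folding those options into phi parameters (hypothetical helper, not the FFmpeg reference):

    extern const float ff_sbr_noise_table[512][2];

    static void sbr_hf_apply_noise_ref(float (*Y)[2], const float *s_m,
                                       const float *q_filt, int noise,
                                       float phi_re, float phi_im,
                                       float phi_step, int m_max)
    {
        for (int m = 0; m < m_max; m++) {
            noise = (noise + 1) & 0x1ff;
            if (s_m[m]) {
                Y[m][0] += s_m[m] * phi_re;
                Y[m][1] += s_m[m] * phi_im;
            } else {
                Y[m][0] += q_filt[m] * ff_sbr_noise_table[noise][0];
                Y[m][1] += q_filt[m] * ff_sbr_noise_table[noise][1];
            }
            phi_re *= phi_step;   /* phi_step = -1 alternates, +1 holds */
            phi_im *= phi_step;
        }
    }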
diff --git a/ffmpeg/libavcodec/arm/simple_idct_arm.S b/ffmpeg/libavcodec/arm/simple_idct_arm.S
deleted file mode 100644
index 50d20c9..0000000
--- a/ffmpeg/libavcodec/arm/simple_idct_arm.S
+++ /dev/null
@@ -1,479 +0,0 @@
-/*
- * Copyright (C) 2002 Frederic 'dilb' Boulay
- *
- * Author: Frederic Boulay <dilb@handhelds.org>
- *
- * The function defined in this file is derived from the simple_idct function
- * from the libavcodec library part of the FFmpeg project.
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-/* useful constants for the algorithm */
-#define W1 22725
-#define W2 21407
-#define W3 19266
-#define W4 16383
-#define W5 12873
-#define W6 8867
-#define W7 4520
-#define MASK_MSHW 0xFFFF0000
-
-#define ROW_SHIFT 11
-#define ROW_SHIFT2MSHW (16-11)
-#define COL_SHIFT 20
-#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
-#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
-
-
-function ff_simple_idct_arm, export=1
- @@ void ff_simple_idct_arm(int16_t *block)
- @@ reserve stack for the registers we need (take all of them):
- @@ R0-R3 are scratch regs, so there is no need to save them, but R0 contains
- @@ the pointer to block, so it must not be overwritten before it is saved!!
- @@ R12 is another scratch register, so it does not need saving either
- @@ save all registers
- stmfd sp!, {r4-r11, r14} @ R14 is also called LR
- @@ at this point, R0=block, other registers are free.
- add r14, r0, #112 @ R14=&block[8*7]; start from the last row and work back until row 0, i.e. until R14=block.
- @@ add 2 temporary variables in the stack: R0 and R14
- sub sp, sp, #8 @ allow 2 local variables
- str r0, [sp, #0] @ save block in sp[0]
- @@ stack status
- @@ sp+4 free
- @@ sp+0 R0 (block)
-
-
- @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
-
-
-__row_loop:
- @@ read the row and check whether it is null, almost null, or neither; per the StrongARM specs it is not necessary to optimize the ldr accesses (i.e. to split 32-bit loads into two 16-bit words), and whole-word loads leave more registers usable :)
- ldr r1, [r14, #0] @ R1=(int32)(R14)[0]=ROWr32[0] (current row cast to a 32-bit pointer)
- ldr r2, [r14, #4] @ R2=(int32)(R14)[1]=ROWr32[1]
- ldr r3, [r14, #8] @ R3=ROWr32[2]
- ldr r4, [r14, #12] @ R4=ROWr32[3]
- @@ check whether the words are null: if all of them are, proceed to the next row (branch __end_row_loop);
- @@ if ROWr16[0] is the only one that is not null, handle that special case (branch __almost_empty_row);
- @@ otherwise follow the complete algorithm.
- @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
- @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
- orr r5, r4, r3 @ R5=R4 | R3
- orr r5, r5, r2 @ R5=R4 | R3 | R2
- orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
- beq __end_row_loop
- mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
- ldrsh r6, [r14, #0] @ R6=ROWr16[0]
- orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
- beq __almost_empty_row
-
-@@ __b_evaluation:
- @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
- @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
- @@ R12=__const_ptr_, R14=&block[n]
- @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
-
- @@ MUL16(b0, W1, row[1]);
- @@ MUL16(b1, W3, row[1]);
- @@ MUL16(b2, W5, row[1]);
- @@ MUL16(b3, W7, row[1]);
- @@ MAC16(b0, W3, row[3]);
- @@ MAC16(b1, -W7, row[3]);
- @@ MAC16(b2, -W1, row[3]);
- @@ MAC16(b3, -W5, row[3]);
- ldr r8, =W1 @ R8=W1
- mov r2, r2, asr #16 @ R2=ROWr16[3]
- mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r9, =W3 @ R9=W3
- ldr r10, =W5 @ R10=W5
- mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r11, =W7 @ R11=W7
- mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if null avoid muls
- itttt ne
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- rsbne r2, r2, #0 @ R2=-ROWr16[3]
- mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- it ne
- mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-
- @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
- @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
- @@ R12=__const_ptr_, R14=&block[n]
- @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
- @@ if (temp != 0) {}
- orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
- beq __end_b_evaluation
-
- @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
- @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
- @@ R12=__const_ptr_, R14=&block[n]
- @@ MAC16(b0, W5, row[5]);
- @@ MAC16(b2, W7, row[5]);
- @@ MAC16(b3, W3, row[5]);
- @@ MAC16(b1, -W1, row[5]);
- @@ MAC16(b0, W7, row[7]);
- @@ MAC16(b2, W3, row[7]);
- @@ MAC16(b3, -W1, row[7]);
- @@ MAC16(b1, -W5, row[7]);
- mov r3, r3, asr #16 @ R3=ROWr16[5]
- teq r3, #0 @ if null avoid muls
- it ne
- mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
- mov r4, r4, asr #16 @ R4=ROWr16[7]
- itttt ne
- mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
- mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
- rsbne r3, r3, #0 @ R3=-ROWr16[5]
- mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
- @@ R3 is free now
- teq r4, #0 @ if null avoid muls
- itttt ne
- mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
- mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
- rsbne r4, r4, #0 @ R4=-ROWr16[7]
- mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
- it ne
- mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
- @@ R4 is free now
-__end_b_evaluation:
- @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
- @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
-
-@@ __a_evaluation:
- @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
- @@ a1 = a0 + W6 * row[2];
- @@ a2 = a0 - W6 * row[2];
- @@ a3 = a0 - W2 * row[2];
- @@ a0 = a0 + W2 * row[2];
- ldr r9, =W4 @ R9=W4
- mul r6, r9, r6 @ R6=W4*ROWr16[0]
- ldr r10, =W6 @ R10=W6
- ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
- add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
-
- mul r11, r10, r4 @ R11=W6*ROWr16[2]
- ldr r8, =W2 @ R8=W2
- sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
- @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
- @@ if (temp != 0) {}
- teq r2, #0
- beq __end_bef_a_evaluation
-
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
- mul r11, r8, r4 @ R11=W2*ROWr16[2]
- sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
- add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
-
-
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
-
-
- @@ a0 += W4*row[4]
- @@ a1 -= W4*row[4]
- @@ a2 -= W4*row[4]
- @@ a3 += W4*row[4]
- ldrsh r11, [r14, #8] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
- it ne
- mulne r11, r9, r11 @ R11=W4*ROWr16[4]
- @@ R9 is free now
- ldrsh r9, [r14, #12] @ R9=ROWr16[6]
- itttt ne
- addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
- subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
- subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
- addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
- @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
- itttt ne
- mulne r11, r10, r9 @ R11=W6*ROWr16[6]
- addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
- mulne r10, r8, r9 @ R10=W2*ROWr16[6]
- @@ a0 += W6*row[6];
- @@ a3 -= W6*row[6];
- @@ a1 -= W2*row[6];
- @@ a2 += W2*row[6];
- subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
- itt ne
- subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
- addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
-
-__end_a_evaluation:
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
- @@ row[0] = (a0 + b0) >> ROW_SHIFT;
- @@ row[1] = (a1 + b1) >> ROW_SHIFT;
- @@ row[2] = (a2 + b2) >> ROW_SHIFT;
- @@ row[3] = (a3 + b3) >> ROW_SHIFT;
- @@ row[4] = (a3 - b3) >> ROW_SHIFT;
- @@ row[5] = (a2 - b2) >> ROW_SHIFT;
- @@ row[6] = (a1 - b1) >> ROW_SHIFT;
- @@ row[7] = (a0 - b0) >> ROW_SHIFT;
- add r8, r6, r0 @ R8=a0+b0
- add r9, r2, r1 @ R9=a1+b1
- @@ put 2 16 bits half-words in a 32bits word
- @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
- ldr r10, =MASK_MSHW @ R10=0xFFFF0000
- and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
- mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
- and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
- orr r8, r8, r9
- str r8, [r14, #0]
-
- add r8, r3, r5 @ R8=a2+b2
- add r9, r4, r7 @ R9=a3+b3
- and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
- and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
- orr r8, r8, r9
- str r8, [r14, #4]
-
- sub r8, r4, r7 @ R8=a3-b3
- sub r9, r3, r5 @ R9=a2-b2
- and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
- and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
- orr r8, r8, r9
- str r8, [r14, #8]
-
- sub r8, r2, r1 @ R8=a1-b1
- sub r9, r6, r0 @ R9=a0-b0
- and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
- and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
- orr r8, r8, r9
- str r8, [r14, #12]
-
- bal __end_row_loop
-
-__almost_empty_row:
- @@ the row is empty except for ROWr16[0]; now handle this special case
- @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
- @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
- @@ R8=0xFFFF (temp), R9-R11 free
- mov r8, #0x10000 @ R8=0x10000 (0xFFFF needs 2 steps!); still cheaper than a ldr and its result delay.
- sub r8, r8, #1 @ R8=0xFFFF, ready now.
- and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
- orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
- str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
- str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
- str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
- str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
-
-__end_row_loop:
- @@ at this point, R0-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- ldr r0, [sp, #0] @ R0=block
- teq r0, r14 @ compare current &block[8*n] to block; when block is reached, the loop is finished.
- sub r14, r14, #16
- bne __row_loop
-
-
-
- @@ at this point, R0=block, R1-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- add r14, r0, #14 @ R14=&block[7]; start from the last column and work back until col=0, i.e. R14=block.
-__col_loop:
-
-@@ __b_evaluation2:
- @@ at this point, R0=block (temp), R1-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- @@ proceed with b0-b3 first, followed by a0-a3
- @@ MUL16(b0, W1, col[8x1]);
- @@ MUL16(b1, W3, col[8x1]);
- @@ MUL16(b2, W5, col[8x1]);
- @@ MUL16(b3, W7, col[8x1]);
- @@ MAC16(b0, W3, col[8x3]);
- @@ MAC16(b1, -W7, col[8x3]);
- @@ MAC16(b2, -W1, col[8x3]);
- @@ MAC16(b3, -W5, col[8x3]);
- ldr r8, =W1 @ R8=W1
- ldrsh r7, [r14, #16]
- mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r9, =W3 @ R9=W3
- ldr r10, =W5 @ R10=W5
- mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldr r11, =W7 @ R11=W7
- mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- ldrsh r2, [r14, #48]
- mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
- teq r2, #0 @ if 0, then avoid muls
- itttt ne
- mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- rsbne r2, r2, #0 @ R2=-ROWr16[3]
- mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
- it ne
- mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
-
- @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
- @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
- @@ R12=__const_ptr_, R14=&block[n]
- @@ MAC16(b0, W5, col[5x8]);
- @@ MAC16(b2, W7, col[5x8]);
- @@ MAC16(b3, W3, col[5x8]);
- @@ MAC16(b1, -W1, col[5x8]);
- @@ MAC16(b0, W7, col[7x8]);
- @@ MAC16(b2, W3, col[7x8]);
- @@ MAC16(b3, -W1, col[7x8]);
- @@ MAC16(b1, -W5, col[7x8]);
- ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
- teq r3, #0 @ if 0 then avoid muls
- itttt ne
- mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
- mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
- mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
- rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
- ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
- it ne
- mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
- @@ R3 is free now
- teq r4, #0 @ if 0 then avoid muls
- itttt ne
- mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
- mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
- rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
- mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
- it ne
- mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
- @@ R4 is free now
-@@ __end_b_evaluation2:
- @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
- @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
-
-@@ __a_evaluation2:
- @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
- @@ a1 = a0 + W6 * row[2];
- @@ a2 = a0 - W6 * row[2];
- @@ a3 = a0 - W2 * row[2];
- @@ a0 = a0 + W2 * row[2];
- ldrsh r6, [r14, #0]
- ldr r9, =W4 @ R9=W4
- mul r6, r9, r6 @ R6=W4*ROWr16[0]
- ldr r10, =W6 @ R10=W6
- ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
- add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
- mul r11, r10, r4 @ R11=W6*ROWr16[2]
- ldr r8, =W2 @ R8=W2
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
- sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
- mul r11, r8, r4 @ R11=W2*ROWr16[2]
- sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
- add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
-
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
- @@ a0 += W4*row[4]
- @@ a1 -= W4*row[4]
- @@ a2 -= W4*row[4]
- @@ a3 += W4*row[4]
- ldrsh r11, [r14, #64] @ R11=ROWr16[4]
- teq r11, #0 @ if null avoid muls
- itttt ne
- mulne r11, r9, r11 @ R11=W4*ROWr16[4]
- @@ R9 is free now
- addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
- subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
- subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
- ldrsh r9, [r14, #96] @ R9=ROWr16[6]
- it ne
- addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
- @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
- teq r9, #0 @ if null avoid muls
- itttt ne
- mulne r11, r10, r9 @ R11=W6*ROWr16[6]
- addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
- mulne r10, r8, r9 @ R10=W2*ROWr16[6]
- @@ a0 += W6*row[6];
- @@ a3 -= W6*row[6];
- @@ a1 -= W2*row[6];
- @@ a2 += W2*row[6];
- subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
- itt ne
- subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
- addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
-@@ __end_a_evaluation2:
- @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
- @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
- @@ R12=__const_ptr_, R14=&block[n]
- @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
- @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
- @@ col[16] = ((a2 + b2) >> COL_SHIFT);
- @@ col[24] = ((a3 + b3) >> COL_SHIFT);
- @@ col[32] = ((a3 - b3) >> COL_SHIFT);
- @@ col[40] = ((a2 - b2) >> COL_SHIFT);
- @@ col[48] = ((a1 - b1) >> COL_SHIFT);
- @@ col[56] = ((a0 - b0) >> COL_SHIFT);
- @@@@@ no optimization here @@@@@
- add r8, r6, r0 @ R8=a0+b0
- add r9, r2, r1 @ R9=a1+b1
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #0]
- strh r9, [r14, #16]
- add r8, r3, r5 @ R8=a2+b2
- add r9, r4, r7 @ R9=a3+b3
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #32]
- strh r9, [r14, #48]
- sub r8, r4, r7 @ R8=a3-b3
- sub r9, r3, r5 @ R9=a2-b2
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #64]
- strh r9, [r14, #80]
- sub r8, r2, r1 @ R8=a1-b1
- sub r9, r6, r0 @ R9=a0-b0
- mov r8, r8, asr #COL_SHIFT
- mov r9, r9, asr #COL_SHIFT
- strh r8, [r14, #96]
- strh r9, [r14, #112]
-
-@@ __end_col_loop:
- @@ at this point, R0-R11 (free)
- @@ R12=__const_ptr_, R14=&block[n]
- ldr r0, [sp, #0] @ R0=block
- teq r0, r14 @ compare current &block[n] to block; when block is reached, the loop is finished.
- sub r14, r14, #2
- bne __col_loop
-
-
-
-
-@@ __end_simple_idct_arm:
- @@ restore registers to previous status!
- add sp, sp, #8 @@ the local variables!
- ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
-
-
-
-@@ kind of sub-function, here not to overload the common case.
-__end_bef_a_evaluation:
- add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
- mul r11, r8, r4 @ R11=W2*ROWr16[2]
- sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
- add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
- bal __end_a_evaluation
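For cross-checking the register comments above: the full-row pass this file implements corresponds roughly to the C sketch below (compile together with the W1-W7 and ROW_SHIFT #defines at the top of the file; the zero-row shortcuts are omitted and the helper name is illustrative, not FFmpeg API).

    static void idct_row_sketch(int16_t *row)
    {
        int a0, a1, a2, a3, b0, b1, b2, b3;

        a0 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
        a1 = a0 + W6 * row[2];
        a2 = a0 - W6 * row[2];
        a3 = a0 - W2 * row[2];
        a0 = a0 + W2 * row[2];

        b0 = W1 * row[1] + W3 * row[3];
        b1 = W3 * row[1] - W7 * row[3];
        b2 = W5 * row[1] - W1 * row[3];
        b3 = W7 * row[1] - W5 * row[3];

        /* contributions of row[4..7], skipped above when those words are 0 */
        a0 += W4 * row[4] + W6 * row[6];
        a1 -= W4 * row[4] + W2 * row[6];
        a2 += -W4 * row[4] + W2 * row[6];
        a3 += W4 * row[4] - W6 * row[6];
        b0 += W5 * row[5] + W7 * row[7];
        b1 -= W1 * row[5] + W5 * row[7];
        b2 += W7 * row[5] + W3 * row[7];
        b3 += W3 * row[5] - W1 * row[7];

        row[0] = (a0 + b0) >> ROW_SHIFT;
        row[1] = (a1 + b1) >> ROW_SHIFT;
        row[2] = (a2 + b2) >> ROW_SHIFT;
        row[3] = (a3 + b3) >> ROW_SHIFT;
        row[4] = (a3 - b3) >> ROW_SHIFT;
        row[5] = (a2 - b2) >> ROW_SHIFT;
        row[6] = (a1 - b1) >> ROW_SHIFT;
        row[7] = (a0 - b0) >> ROW_SHIFT;
    }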
diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S
deleted file mode 100644
index d1f10b7..0000000
--- a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define W13 (W1 | (W3 << 16))
-#define W26 (W2 | (W6 << 16))
-#define W57 (W5 | (W7 << 16))
-
-function idct_row_armv5te
- str lr, [sp, #-4]!
-
- ldrd v1, v2, [a1, #8]
- ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
- orrs v1, v1, v2
- itt eq
- cmpeq v1, a4
- cmpeq v1, a3, lsr #16
- beq row_dc_only
-
- mov v1, #(1<<(ROW_SHIFT-1))
- mov ip, #16384
- sub ip, ip, #1 /* ip = W4 */
- smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
- ldr ip, =W26 /* ip = W2 | (W6 << 16) */
- smultb a2, ip, a4
- smulbb lr, ip, a4
- add v2, v1, a2
- sub v3, v1, a2
- sub v4, v1, lr
- add v1, v1, lr
-
- ldr ip, =W13 /* ip = W1 | (W3 << 16) */
- ldr lr, =W57 /* lr = W5 | (W7 << 16) */
- smulbt v5, ip, a3
- smultt v6, lr, a4
- smlatt v5, ip, a4, v5
- smultt a2, ip, a3
- smulbt v7, lr, a3
- sub v6, v6, a2
- smulbt a2, ip, a4
- smultt fp, lr, a3
- sub v7, v7, a2
- smulbt a2, lr, a4
- ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
- sub fp, fp, a2
-
- orrs a2, a3, a4
- beq 1f
-
- smlabt v5, lr, a3, v5
- smlabt v6, ip, a3, v6
- smlatt v5, lr, a4, v5
- smlabt v6, lr, a4, v6
- smlatt v7, lr, a3, v7
- smlatt fp, ip, a3, fp
- smulbt a2, ip, a4
- smlatt v7, ip, a4, v7
- sub fp, fp, a2
-
- ldr ip, =W26 /* ip = W2 | (W6 << 16) */
- mov a2, #16384
- sub a2, a2, #1 /* a2 = W4 */
- smulbb a2, a2, a3 /* a2 = W4*row[4] */
- smultb lr, ip, a4 /* lr = W6*row[6] */
- add v1, v1, a2 /* v1 += W4*row[4] */
- add v1, v1, lr /* v1 += W6*row[6] */
- add v4, v4, a2 /* v4 += W4*row[4] */
- sub v4, v4, lr /* v4 -= W6*row[6] */
- smulbb lr, ip, a4 /* lr = W2*row[6] */
- sub v2, v2, a2 /* v2 -= W4*row[4] */
- sub v2, v2, lr /* v2 -= W2*row[6] */
- sub v3, v3, a2 /* v3 -= W4*row[4] */
- add v3, v3, lr /* v3 += W2*row[6] */
-
-1: add a2, v1, v5
- mov a3, a2, lsr #11
- bic a3, a3, #0x1f0000
- sub a2, v2, v6
- mov a2, a2, lsr #11
- add a3, a3, a2, lsl #16
- add a2, v3, v7
- mov a4, a2, lsr #11
- bic a4, a4, #0x1f0000
- add a2, v4, fp
- mov a2, a2, lsr #11
- add a4, a4, a2, lsl #16
- strd a3, a4, [a1]
-
- sub a2, v4, fp
- mov a3, a2, lsr #11
- bic a3, a3, #0x1f0000
- sub a2, v3, v7
- mov a2, a2, lsr #11
- add a3, a3, a2, lsl #16
- add a2, v2, v6
- mov a4, a2, lsr #11
- bic a4, a4, #0x1f0000
- sub a2, v1, v5
- mov a2, a2, lsr #11
- add a4, a4, a2, lsl #16
- strd a3, a4, [a1, #8]
-
- ldr pc, [sp], #4
-
-row_dc_only:
- orr a3, a3, a3, lsl #16
- bic a3, a3, #0xe000
- mov a3, a3, lsl #3
- mov a4, a3
- strd a3, a4, [a1]
- strd a3, a4, [a1, #8]
-
- ldr pc, [sp], #4
-endfunc
-
- .macro idct_col
- ldr a4, [a1] /* a4 = col[1:0] */
- mov ip, #16384
- sub ip, ip, #1 /* ip = W4 */
-#if 0
- mov v1, #(1<<(COL_SHIFT-1))
- smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
- smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
- ldr a4, [a1, #(16*4)]
-#else
- mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
- add v2, v1, a4, asr #16
- rsb v2, v2, v2, lsl #14
- mov a4, a4, lsl #16
- add v1, v1, a4, asr #16
- ldr a4, [a1, #(16*4)]
- rsb v1, v1, v1, lsl #14
-#endif
-
- smulbb lr, ip, a4
- smulbt a3, ip, a4
- sub v3, v1, lr
- sub v5, v1, lr
- add v7, v1, lr
- add v1, v1, lr
- sub v4, v2, a3
- sub v6, v2, a3
- add fp, v2, a3
- ldr ip, =W26
- ldr a4, [a1, #(16*2)]
- add v2, v2, a3
-
- smulbb lr, ip, a4
- smultb a3, ip, a4
- add v1, v1, lr
- sub v7, v7, lr
- add v3, v3, a3
- sub v5, v5, a3
- smulbt lr, ip, a4
- smultt a3, ip, a4
- add v2, v2, lr
- sub fp, fp, lr
- add v4, v4, a3
- ldr a4, [a1, #(16*6)]
- sub v6, v6, a3
-
- smultb lr, ip, a4
- smulbb a3, ip, a4
- add v1, v1, lr
- sub v7, v7, lr
- sub v3, v3, a3
- add v5, v5, a3
- smultt lr, ip, a4
- smulbt a3, ip, a4
- add v2, v2, lr
- sub fp, fp, lr
- sub v4, v4, a3
- add v6, v6, a3
-
- stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
-
- ldr ip, =W13
- ldr a4, [a1, #(16*1)]
- ldr lr, =W57
- smulbb v1, ip, a4
- smultb v3, ip, a4
- smulbb v5, lr, a4
- smultb v7, lr, a4
- smulbt v2, ip, a4
- smultt v4, ip, a4
- smulbt v6, lr, a4
- smultt fp, lr, a4
- rsb v4, v4, #0
- ldr a4, [a1, #(16*3)]
- rsb v3, v3, #0
-
- smlatb v1, ip, a4, v1
- smlatb v3, lr, a4, v3
- smulbb a3, ip, a4
- smulbb a2, lr, a4
- sub v5, v5, a3
- sub v7, v7, a2
- smlatt v2, ip, a4, v2
- smlatt v4, lr, a4, v4
- smulbt a3, ip, a4
- smulbt a2, lr, a4
- sub v6, v6, a3
- ldr a4, [a1, #(16*5)]
- sub fp, fp, a2
-
- smlabb v1, lr, a4, v1
- smlabb v3, ip, a4, v3
- smlatb v5, lr, a4, v5
- smlatb v7, ip, a4, v7
- smlabt v2, lr, a4, v2
- smlabt v4, ip, a4, v4
- smlatt v6, lr, a4, v6
- ldr a3, [a1, #(16*7)]
- smlatt fp, ip, a4, fp
-
- smlatb v1, lr, a3, v1
- smlabb v3, lr, a3, v3
- smlatb v5, ip, a3, v5
- smulbb a4, ip, a3
- smlatt v2, lr, a3, v2
- sub v7, v7, a4
- smlabt v4, lr, a3, v4
- smulbt a4, ip, a3
- smlatt v6, ip, a3, v6
- sub fp, fp, a4
- .endm
-
-function idct_col_armv5te
- str lr, [sp, #-4]!
-
- idct_col
-
- ldmfd sp!, {a3, a4}
- adds a2, a3, v1
- mov a2, a2, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- add ip, a4, v2
- mov ip, ip, asr #20
- orr a2, a2, ip, lsl #16
- str a2, [a1]
- subs a3, a3, v1
- mov a2, a3, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- sub a4, a4, v2
- mov a4, a4, asr #20
- orr a2, a2, a4, lsl #16
- ldmfd sp!, {a3, a4}
- str a2, [a1, #(16*7)]
-
- subs a2, a3, v3
- mov a2, a2, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- sub ip, a4, v4
- mov ip, ip, asr #20
- orr a2, a2, ip, lsl #16
- str a2, [a1, #(16*1)]
- adds a3, a3, v3
- mov a2, a3, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- add a4, a4, v4
- mov a4, a4, asr #20
- orr a2, a2, a4, lsl #16
- ldmfd sp!, {a3, a4}
- str a2, [a1, #(16*6)]
-
- adds a2, a3, v5
- mov a2, a2, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- add ip, a4, v6
- mov ip, ip, asr #20
- orr a2, a2, ip, lsl #16
- str a2, [a1, #(16*2)]
- subs a3, a3, v5
- mov a2, a3, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- sub a4, a4, v6
- mov a4, a4, asr #20
- orr a2, a2, a4, lsl #16
- ldmfd sp!, {a3, a4}
- str a2, [a1, #(16*5)]
-
- adds a2, a3, v7
- mov a2, a2, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- add ip, a4, fp
- mov ip, ip, asr #20
- orr a2, a2, ip, lsl #16
- str a2, [a1, #(16*3)]
- subs a3, a3, v7
- mov a2, a3, lsr #20
- it mi
- orrmi a2, a2, #0xf000
- sub a4, a4, fp
- mov a4, a4, asr #20
- orr a2, a2, a4, lsl #16
- str a2, [a1, #(16*4)]
-
- ldr pc, [sp], #4
-endfunc
-
-.macro clip dst, src:vararg
- movs \dst, \src
- it mi
- movmi \dst, #0
- cmp \dst, #255
- it gt
- movgt \dst, #255
-.endm
-
-.macro aclip dst, src:vararg
- adds \dst, \src
- it mi
- movmi \dst, #0
- cmp \dst, #255
- it gt
- movgt \dst, #255
-.endm
-
-function idct_col_put_armv5te
- str lr, [sp, #-4]!
-
- idct_col
-
- ldmfd sp!, {a3, a4}
- ldr lr, [sp, #32]
- add a2, a3, v1
- clip a2, a2, asr #20
- add ip, a4, v2
- clip ip, ip, asr #20
- orr a2, a2, ip, lsl #8
- sub a3, a3, v1
- clip a3, a3, asr #20
- sub a4, a4, v2
- clip a4, a4, asr #20
- ldr v1, [sp, #28]
- strh a2, [v1]
- add a2, v1, #2
- str a2, [sp, #28]
- orr a2, a3, a4, lsl #8
- rsb v2, lr, lr, lsl #3
- ldmfd sp!, {a3, a4}
- strh_pre a2, v2, v1
-
- sub a2, a3, v3
- clip a2, a2, asr #20
- sub ip, a4, v4
- clip ip, ip, asr #20
- orr a2, a2, ip, lsl #8
- strh_pre a2, v1, lr
- add a3, a3, v3
- clip a2, a3, asr #20
- add a4, a4, v4
- clip a4, a4, asr #20
- orr a2, a2, a4, lsl #8
- ldmfd sp!, {a3, a4}
- strh_dpre a2, v2, lr
-
- add a2, a3, v5
- clip a2, a2, asr #20
- add ip, a4, v6
- clip ip, ip, asr #20
- orr a2, a2, ip, lsl #8
- strh_pre a2, v1, lr
- sub a3, a3, v5
- clip a2, a3, asr #20
- sub a4, a4, v6
- clip a4, a4, asr #20
- orr a2, a2, a4, lsl #8
- ldmfd sp!, {a3, a4}
- strh_dpre a2, v2, lr
-
- add a2, a3, v7
- clip a2, a2, asr #20
- add ip, a4, fp
- clip ip, ip, asr #20
- orr a2, a2, ip, lsl #8
- strh a2, [v1, lr]
- sub a3, a3, v7
- clip a2, a3, asr #20
- sub a4, a4, fp
- clip a4, a4, asr #20
- orr a2, a2, a4, lsl #8
- strh_dpre a2, v2, lr
-
- ldr pc, [sp], #4
-endfunc
-
-function idct_col_add_armv5te
- str lr, [sp, #-4]!
-
- idct_col
-
- ldr lr, [sp, #36]
-
- ldmfd sp!, {a3, a4}
- ldrh ip, [lr]
- add a2, a3, v1
- sub a3, a3, v1
- and v1, ip, #255
- aclip a2, v1, a2, asr #20
- add v1, a4, v2
- mov v1, v1, asr #20
- aclip v1, v1, ip, lsr #8
- orr a2, a2, v1, lsl #8
- ldr v1, [sp, #32]
- sub a4, a4, v2
- rsb v2, v1, v1, lsl #3
- ldrh_pre ip, v2, lr
- strh a2, [lr]
- and a2, ip, #255
- aclip a3, a2, a3, asr #20
- mov a4, a4, asr #20
- aclip a4, a4, ip, lsr #8
- add a2, lr, #2
- str a2, [sp, #28]
- orr a2, a3, a4, lsl #8
- strh a2, [v2]
-
- ldmfd sp!, {a3, a4}
- ldrh_pre ip, lr, v1
- sub a2, a3, v3
- add a3, a3, v3
- and v3, ip, #255
- aclip a2, v3, a2, asr #20
- sub v3, a4, v4
- mov v3, v3, asr #20
- aclip v3, v3, ip, lsr #8
- orr a2, a2, v3, lsl #8
- add a4, a4, v4
- ldrh_dpre ip, v2, v1
- strh a2, [lr]
- and a2, ip, #255
- aclip a3, a2, a3, asr #20
- mov a4, a4, asr #20
- aclip a4, a4, ip, lsr #8
- orr a2, a3, a4, lsl #8
- strh a2, [v2]
-
- ldmfd sp!, {a3, a4}
- ldrh_pre ip, lr, v1
- add a2, a3, v5
- sub a3, a3, v5
- and v3, ip, #255
- aclip a2, v3, a2, asr #20
- add v3, a4, v6
- mov v3, v3, asr #20
- aclip v3, v3, ip, lsr #8
- orr a2, a2, v3, lsl #8
- sub a4, a4, v6
- ldrh_dpre ip, v2, v1
- strh a2, [lr]
- and a2, ip, #255
- aclip a3, a2, a3, asr #20
- mov a4, a4, asr #20
- aclip a4, a4, ip, lsr #8
- orr a2, a3, a4, lsl #8
- strh a2, [v2]
-
- ldmfd sp!, {a3, a4}
- ldrh_pre ip, lr, v1
- add a2, a3, v7
- sub a3, a3, v7
- and v3, ip, #255
- aclip a2, v3, a2, asr #20
- add v3, a4, fp
- mov v3, v3, asr #20
- aclip v3, v3, ip, lsr #8
- orr a2, a2, v3, lsl #8
- sub a4, a4, fp
- ldrh_dpre ip, v2, v1
- strh a2, [lr]
- and a2, ip, #255
- aclip a3, a2, a3, asr #20
- mov a4, a4, asr #20
- aclip a4, a4, ip, lsr #8
- orr a2, a3, a4, lsl #8
- strh a2, [v2]
-
- ldr pc, [sp], #4
-endfunc
-
-function ff_simple_idct_armv5te, export=1
- stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
-
- sub a1, a1, #(16*7)
-
- bl idct_col_armv5te
- add a1, a1, #4
- bl idct_col_armv5te
- add a1, a1, #4
- bl idct_col_armv5te
- add a1, a1, #4
- bl idct_col_armv5te
-
- ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
-
-function ff_simple_idct_add_armv5te, export=1
- stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
- mov a1, a3
-
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
-
- sub a1, a1, #(16*7)
-
- bl idct_col_add_armv5te
- add a1, a1, #4
- bl idct_col_add_armv5te
- add a1, a1, #4
- bl idct_col_add_armv5te
- add a1, a1, #4
- bl idct_col_add_armv5te
-
- add sp, sp, #8
- ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
-
-function ff_simple_idct_put_armv5te, export=1
- stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
-
- mov a1, a3
-
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
- add a1, a1, #16
- bl idct_row_armv5te
-
- sub a1, a1, #(16*7)
-
- bl idct_col_put_armv5te
- add a1, a1, #4
- bl idct_col_put_armv5te
- add a1, a1, #4
- bl idct_col_put_armv5te
- add a1, a1, #4
- bl idct_col_put_armv5te
-
- add sp, sp, #8
- ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
-endfunc
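A note on the coefficient packing used throughout the file above: W13/W26/W57 put two 16-bit weights into one 32-bit register, and the ARMv5TE smulXY/smlaXY multiplies pick the bottom (b) or top (t) halves. A rough C model of the two variants used most often (function and variable names are illustrative):

    #include <stdint.h>

    /* SMULBB: bottom half * bottom half */
    static int32_t smulbb(int32_t x, int32_t y)
    {
        return (int16_t)x * (int16_t)y;
    }

    /* SMULBT: bottom half of x * top half of y */
    static int32_t smulbt(int32_t x, int32_t y)
    {
        return (int16_t)x * (y >> 16);
    }

    int main(void)
    {
        int32_t w13   = 22725 | (19266 << 16); /* W1 | (W3 << 16) */
        int32_t row10 = 100 | (200 << 16);     /* row[0]=100, row[1]=200 */
        /* "smulbt v5, ip, a3" above computes W1*row[1]: */
        return smulbt(w13, row10) == 22725 * 200 ? 0 : 1;
    }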
diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv6.S b/ffmpeg/libavcodec/arm/simple_idct_armv6.S
deleted file mode 100644
index 79cf5d4..0000000
--- a/ffmpeg/libavcodec/arm/simple_idct_armv6.S
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Simple IDCT
- *
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define W13 (W1 | (W3 << 16))
-#define W26 (W2 | (W6 << 16))
-#define W42 (W4 | (W2 << 16))
-#define W42n ((-W4 & 0xffff) | (-W2 << 16))
-#define W46 (W4 | (W6 << 16))
-#define W57 (W5 | (W7 << 16))
-
-/*
- Compute partial IDCT of single row.
- shift = right-shift amount (used here only for the rounding constant)
- r0 = source address
- r2 = row[2,0] <= 2 cycles
- r3 = row[3,1]
- ip = w42 <= 2 cycles
-
- Output in registers r4--r11
-*/
- .macro idct_row shift
- ldr lr, =W46 /* lr = W4 | (W6 << 16) */
- mov r1, #(1<<(\shift-1))
- smlad r4, r2, ip, r1
- smlsd r7, r2, ip, r1
- ldr ip, =W13 /* ip = W1 | (W3 << 16) */
- ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
- smlad r5, r2, lr, r1
- smlsd r6, r2, lr, r1
-
- smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
- smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
- ldr lr, [r0, #12] /* lr = row[7,5] */
- pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
- pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
- smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
- smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */
- smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
-
- ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */
- smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */
- ldr r2, [r0, #4] /* r2 = row[6,4] */
- smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */
- ldr ip, =W46 /* ip = W4 | (W6 << 16) */
- smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */
-
- smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */
- smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */
- smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */
- smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */
- .endm
-
-/*
- Compute partial IDCT of half row.
- shift = right-shift amount (used here only for the rounding constant)
- r2 = row[2,0]
- r3 = row[3,1]
- ip = w42
-
- Output in registers r4--r11
-*/
- .macro idct_row4 shift
- ldr lr, =W46 /* lr = W4 | (W6 << 16) */
- ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
- mov r1, #(1<<(\shift-1))
- smlad r4, r2, ip, r1
- smlsd r7, r2, ip, r1
- ldr ip, =W13 /* ip = W1 | (W3 << 16) */
- smlad r5, r2, lr, r1
- smlsd r6, r2, lr, r1
- smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
- smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
- pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
- pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
- smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
- smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
- .endm
-
-/*
- Compute final part of IDCT single row without shift.
- Input in registers r4--r11
- Output in registers ip, r4--r6, lr, r8--r10
-*/
- .macro idct_finish
- add ip, r4, r8 /* ip = A0 + B0 */
- sub lr, r4, r8 /* lr = A0 - B0 */
- sub r4, r5, r9 /* r4 = A1 + B1 */
- add r8, r5, r9 /* r8 = A1 - B1 */
- add r5, r6, r10 /* r5 = A2 + B2 */
- sub r9, r6, r10 /* r9 = A2 - B2 */
- add r6, r7, r11 /* r6 = A3 + B3 */
- sub r10,r7, r11 /* r10 = A3 - B3 */
- .endm
-
-/*
- Compute final part of IDCT single row.
- shift = right-shift amount
- Input/output in registers r4--r11
-*/
- .macro idct_finish_shift shift
- add r3, r4, r8 /* r3 = A0 + B0 */
- sub r2, r4, r8 /* r2 = A0 - B0 */
- mov r4, r3, asr #\shift
- mov r8, r2, asr #\shift
-
- sub r3, r5, r9 /* r3 = A1 + B1 */
- add r2, r5, r9 /* r2 = A1 - B1 */
- mov r5, r3, asr #\shift
- mov r9, r2, asr #\shift
-
- add r3, r6, r10 /* r3 = A2 + B2 */
- sub r2, r6, r10 /* r2 = A2 - B2 */
- mov r6, r3, asr #\shift
- mov r10,r2, asr #\shift
-
- add r3, r7, r11 /* r3 = A3 + B3 */
- sub r2, r7, r11 /* r2 = A3 - B3 */
- mov r7, r3, asr #\shift
- mov r11,r2, asr #\shift
- .endm
-
-/*
- Compute final part of IDCT single row, saturating results at 8 bits.
- shift = right-shift amount
- Input/output in registers r4--r11
-*/
- .macro idct_finish_shift_sat shift
- add r3, r4, r8 /* r3 = A0 + B0 */
- sub ip, r4, r8 /* ip = A0 - B0 */
- usat r4, #8, r3, asr #\shift
- usat r8, #8, ip, asr #\shift
-
- sub r3, r5, r9 /* r3 = A1 + B1 */
- add ip, r5, r9 /* ip = A1 - B1 */
- usat r5, #8, r3, asr #\shift
- usat r9, #8, ip, asr #\shift
-
- add r3, r6, r10 /* r3 = A2 + B2 */
- sub ip, r6, r10 /* ip = A2 - B2 */
- usat r6, #8, r3, asr #\shift
- usat r10,#8, ip, asr #\shift
-
- add r3, r7, r11 /* r3 = A3 + B3 */
- sub ip, r7, r11 /* ip = A3 - B3 */
- usat r7, #8, r3, asr #\shift
- usat r11,#8, ip, asr #\shift
- .endm
-
-/*
- Compute IDCT of single row, storing as column.
- r0 = source
- r1 = dest
-*/
-function idct_row_armv6
- push {lr}
-
- ldr lr, [r0, #12] /* lr = row[7,5] */
- ldr ip, [r0, #4] /* ip = row[6,4] */
- ldr r3, [r0, #8] /* r3 = row[3,1] */
- ldr r2, [r0] /* r2 = row[2,0] */
- orrs lr, lr, ip
- itt eq
- cmpeq lr, r3
- cmpeq lr, r2, lsr #16
- beq 1f
- push {r1}
- ldr ip, =W42 /* ip = W4 | (W2 << 16) */
- cmp lr, #0
- beq 2f
-
- idct_row ROW_SHIFT
- b 3f
-
-2: idct_row4 ROW_SHIFT
-
-3: pop {r1}
- idct_finish_shift ROW_SHIFT
-
- strh r4, [r1]
- strh r5, [r1, #(16*2)]
- strh r6, [r1, #(16*4)]
- strh r7, [r1, #(16*6)]
- strh r11,[r1, #(16*1)]
- strh r10,[r1, #(16*3)]
- strh r9, [r1, #(16*5)]
- strh r8, [r1, #(16*7)]
-
- pop {pc}
-
-1: mov r2, r2, lsl #3
- strh r2, [r1]
- strh r2, [r1, #(16*2)]
- strh r2, [r1, #(16*4)]
- strh r2, [r1, #(16*6)]
- strh r2, [r1, #(16*1)]
- strh r2, [r1, #(16*3)]
- strh r2, [r1, #(16*5)]
- strh r2, [r1, #(16*7)]
- pop {pc}
-endfunc
-
-/*
- Compute IDCT of single column, read as row.
- r0 = source
- r1 = dest
-*/
-function idct_col_armv6
- push {r1, lr}
-
- ldr r2, [r0] /* r2 = row[2,0] */
- ldr ip, =W42 /* ip = W4 | (W2 << 16) */
- ldr r3, [r0, #8] /* r3 = row[3,1] */
- idct_row COL_SHIFT
- pop {r1}
- idct_finish_shift COL_SHIFT
-
- strh r4, [r1]
- strh r5, [r1, #(16*1)]
- strh r6, [r1, #(16*2)]
- strh r7, [r1, #(16*3)]
- strh r11,[r1, #(16*4)]
- strh r10,[r1, #(16*5)]
- strh r9, [r1, #(16*6)]
- strh r8, [r1, #(16*7)]
-
- pop {pc}
-endfunc
-
-/*
- Compute IDCT of single column, read as row, store saturated 8-bit.
- r0 = source
- r1 = dest
- r2 = line size
-*/
-function idct_col_put_armv6
- push {r1, r2, lr}
-
- ldr r2, [r0] /* r2 = row[2,0] */
- ldr ip, =W42 /* ip = W4 | (W2 << 16) */
- ldr r3, [r0, #8] /* r3 = row[3,1] */
- idct_row COL_SHIFT
- pop {r1, r2}
- idct_finish_shift_sat COL_SHIFT
-
- strb_post r4, r1, r2
- strb_post r5, r1, r2
- strb_post r6, r1, r2
- strb_post r7, r1, r2
- strb_post r11,r1, r2
- strb_post r10,r1, r2
- strb_post r9, r1, r2
- strb_post r8, r1, r2
-
- sub r1, r1, r2, lsl #3
-
- pop {pc}
-endfunc
-
-/*
- Compute IDCT of single column, read as row, add/store saturated 8-bit.
- r0 = source
- r1 = dest
- r2 = line size
-*/
-function idct_col_add_armv6
- push {r1, r2, lr}
-
- ldr r2, [r0] /* r2 = row[2,0] */
- ldr ip, =W42 /* ip = W4 | (W2 << 16) */
- ldr r3, [r0, #8] /* r3 = row[3,1] */
- idct_row COL_SHIFT
- pop {r1, r2}
- idct_finish
-
- ldrb r3, [r1]
- ldrb r7, [r1, r2]
- ldrb r11,[r1, r2, lsl #2]
- add ip, r3, ip, asr #COL_SHIFT
- usat ip, #8, ip
- add r4, r7, r4, asr #COL_SHIFT
- strb_post ip, r1, r2
- ldrb ip, [r1, r2]
- usat r4, #8, r4
- ldrb r11,[r1, r2, lsl #2]
- add r5, ip, r5, asr #COL_SHIFT
- usat r5, #8, r5
- strb_post r4, r1, r2
- ldrb r3, [r1, r2]
- ldrb ip, [r1, r2, lsl #2]
- strb_post r5, r1, r2
- ldrb r7, [r1, r2]
- ldrb r4, [r1, r2, lsl #2]
- add r6, r3, r6, asr #COL_SHIFT
- usat r6, #8, r6
- add r10,r7, r10,asr #COL_SHIFT
- usat r10,#8, r10
- add r9, r11,r9, asr #COL_SHIFT
- usat r9, #8, r9
- add r8, ip, r8, asr #COL_SHIFT
- usat r8, #8, r8
- add lr, r4, lr, asr #COL_SHIFT
- usat lr, #8, lr
- strb_post r6, r1, r2
- strb_post r10,r1, r2
- strb_post r9, r1, r2
- strb_post r8, r1, r2
- strb_post lr, r1, r2
-
- sub r1, r1, r2, lsl #3
-
- pop {pc}
-endfunc
-
-/*
- Compute 8 IDCT row transforms.
- func = IDCT row->col function
- width = width of columns in bytes
-*/
- .macro idct_rows func width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
- sub r0, r0, #(16*5)
- add r1, r1, #\width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
- add r0, r0, #(16*2)
- add r1, r1, #\width
- bl \func
-
- sub r0, r0, #(16*7)
- .endm
-
-/* void ff_simple_idct_armv6(int16_t *data); */
-function ff_simple_idct_armv6, export=1
- push {r4-r11, lr}
- sub sp, sp, #128
-
- mov r1, sp
- idct_rows idct_row_armv6, 2
- mov r1, r0
- mov r0, sp
- idct_rows idct_col_armv6, 2
-
- add sp, sp, #128
- pop {r4-r11, pc}
-endfunc
-
-/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */
-function ff_simple_idct_add_armv6, export=1
- push {r0, r1, r4-r11, lr}
- sub sp, sp, #128
-
- mov r0, r2
- mov r1, sp
- idct_rows idct_row_armv6, 2
- mov r0, sp
- ldr r1, [sp, #128]
- ldr r2, [sp, #(128+4)]
- idct_rows idct_col_add_armv6, 1
-
- add sp, sp, #(128+8)
- pop {r4-r11, pc}
-endfunc
-
-/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */
-function ff_simple_idct_put_armv6, export=1
- push {r0, r1, r4-r11, lr}
- sub sp, sp, #128
-
- mov r0, r2
- mov r1, sp
- idct_rows idct_row_armv6, 2
- mov r0, sp
- ldr r1, [sp, #128]
- ldr r2, [sp, #(128+4)]
- idct_rows idct_col_put_armv6, 1
-
- add sp, sp, #(128+8)
- pop {r4-r11, pc}
-endfunc
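The ARMv6 variant above leans on the dual 16x16 multiply-accumulate instructions (SMLAD/SMLSD and friends), which consume the same packed pairs the rows are loaded as (row[2,0], row[3,1]). Rough C models, plus the mapping for the first two instructions of idct_row (names are illustrative):

    #include <stdint.h>

    /* SMLAD: acc + bottom*bottom + top*top */
    static int32_t smlad(int32_t x, int32_t y, int32_t acc)
    {
        return acc + (int16_t)x * (int16_t)y + (x >> 16) * (y >> 16);
    }

    /* SMLSD: acc + bottom*bottom - top*top */
    static int32_t smlsd(int32_t x, int32_t y, int32_t acc)
    {
        return acc + (int16_t)x * (int16_t)y - (x >> 16) * (y >> 16);
    }

    /* With r2 = row[0] | (row[2] << 16) and ip = W42 = W4 | (W2 << 16):
       smlad r4, r2, ip, r1  ->  A0 = round + W4*row[0] + W2*row[2]
       smlsd r7, r2, ip, r1  ->  A3 = round + W4*row[0] - W2*row[2] */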
diff --git a/ffmpeg/libavcodec/arm/simple_idct_neon.S b/ffmpeg/libavcodec/arm/simple_idct_neon.S
deleted file mode 100644
index c3e573c..0000000
--- a/ffmpeg/libavcodec/arm/simple_idct_neon.S
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * ARM NEON IDCT
- *
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * Based on Simple IDCT
- * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
-#define W4c ((1<<(COL_SHIFT-1))/W4)
-#define ROW_SHIFT 11
-#define COL_SHIFT 20
-
-#define w1 d0[0]
-#define w2 d0[1]
-#define w3 d0[2]
-#define w4 d0[3]
-#define w5 d1[0]
-#define w6 d1[1]
-#define w7 d1[2]
-#define w4c d1[3]
-
- .macro idct_col4_top
- vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */
- vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */
- vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
- vadd.i32 q11, q15, q7
- vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
- vadd.i32 q12, q15, q8
- vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
- vsub.i32 q13, q15, q8
- vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
- vsub.i32 q14, q15, q7
-
- vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
- vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
- vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
- vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
- .endm
-
- .text
- .align 6
-
-function idct_row4_pld_neon
- pld [r0]
- add r3, r0, r1, lsl #2
- pld [r0, r1]
- pld [r0, r1, lsl #1]
-A pld [r3, -r1]
- pld [r3]
- pld [r3, r1]
- add r3, r3, r1, lsl #1
- pld [r3]
- pld [r3, r1]
-endfunc
-
-function idct_row4_neon
- vmov.i32 q15, #(1<<(ROW_SHIFT-1))
- vld1.64 {d2-d5}, [r2,:128]!
- vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
- vld1.64 {d6,d7}, [r2,:128]!
- vorr d10, d3, d5
- vld1.64 {d8,d9}, [r2,:128]!
- add r2, r2, #-64
-
- vorr d11, d7, d9
- vorr d10, d10, d11
- vmov r3, r4, d10
-
- idct_col4_top
-
- orrs r3, r3, r4
- beq 1f
-
- vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
- vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
- vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
- vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
- vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
- vadd.i32 q11, q11, q7
- vsub.i32 q12, q12, q7
- vsub.i32 q13, q13, q7
- vadd.i32 q14, q14, q7
- vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
- vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
- vmlal.s16 q9, d9, w7
- vmlsl.s16 q10, d9, w5
- vmlal.s16 q5, d9, w3
- vmlsl.s16 q6, d9, w1
- vadd.i32 q11, q11, q7
- vsub.i32 q12, q12, q8
- vadd.i32 q13, q13, q8
- vsub.i32 q14, q14, q7
-
-1: vadd.i32 q3, q11, q9
- vadd.i32 q4, q12, q10
- vshrn.i32 d2, q3, #ROW_SHIFT
- vshrn.i32 d4, q4, #ROW_SHIFT
- vadd.i32 q7, q13, q5
- vadd.i32 q8, q14, q6
- vtrn.16 d2, d4
- vshrn.i32 d6, q7, #ROW_SHIFT
- vshrn.i32 d8, q8, #ROW_SHIFT
- vsub.i32 q14, q14, q6
- vsub.i32 q11, q11, q9
- vtrn.16 d6, d8
- vsub.i32 q13, q13, q5
- vshrn.i32 d3, q14, #ROW_SHIFT
- vtrn.32 d2, d6
- vsub.i32 q12, q12, q10
- vtrn.32 d4, d8
- vshrn.i32 d5, q13, #ROW_SHIFT
- vshrn.i32 d7, q12, #ROW_SHIFT
- vshrn.i32 d9, q11, #ROW_SHIFT
-
- vtrn.16 d3, d5
- vtrn.16 d7, d9
- vtrn.32 d3, d7
- vtrn.32 d5, d9
-
- vst1.64 {d2-d5}, [r2,:128]!
- vst1.64 {d6-d9}, [r2,:128]!
-
- bx lr
-endfunc
-
-function idct_col4_neon
- mov ip, #16
- vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
- vdup.16 d30, w4c
- vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */
- vadd.i16 d30, d30, d2
- vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */
- vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<(COL_SHIFT-1))/W4) */
- vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */
-
- ldrd r4, r5, [r2]
- ldrd r6, r7, [r2, #16]
- orrs r4, r4, r5
-
- idct_col4_top
- it eq
- addeq r2, r2, #16
- beq 1f
-
- vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */
- vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
- vadd.i32 q11, q11, q7
- vsub.i32 q12, q12, q7
- vsub.i32 q13, q13, q7
- vadd.i32 q14, q14, q7
-
-1: orrs r6, r6, r7
- ldrd r4, r5, [r2, #16]
- it eq
- addeq r2, r2, #16
- beq 2f
-
- vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */
- vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
- vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
- vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
- vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
-
-2: orrs r4, r4, r5
- ldrd r4, r5, [r2, #16]
- it eq
- addeq r2, r2, #16
- beq 3f
-
- vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */
- vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
- vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
- vadd.i32 q11, q11, q7
- vsub.i32 q14, q14, q7
- vsub.i32 q12, q12, q8
- vadd.i32 q13, q13, q8
-
-3: orrs r4, r4, r5
- it eq
- addeq r2, r2, #16
- beq 4f
-
- vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
- vmlal.s16 q9, d9, w7
- vmlsl.s16 q10, d9, w5
- vmlal.s16 q5, d9, w3
- vmlsl.s16 q6, d9, w1
-
-4: vaddhn.i32 d2, q11, q9
- vaddhn.i32 d3, q12, q10
- vaddhn.i32 d4, q13, q5
- vaddhn.i32 d5, q14, q6
- vsubhn.i32 d9, q11, q9
- vsubhn.i32 d8, q12, q10
- vsubhn.i32 d7, q13, q5
- vsubhn.i32 d6, q14, q6
-
- bx lr
-endfunc
-
- .align 6
-
-function idct_col4_st8_neon
- vqshrun.s16 d2, q1, #COL_SHIFT-16
- vqshrun.s16 d3, q2, #COL_SHIFT-16
- vqshrun.s16 d4, q3, #COL_SHIFT-16
- vqshrun.s16 d5, q4, #COL_SHIFT-16
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r0,:32], r1
- vst1.32 {d5[0]}, [r0,:32], r1
- vst1.32 {d5[1]}, [r0,:32], r1
-
- bx lr
-endfunc
-
-const idct_coeff_neon, align=4
- .short W1, W2, W3, W4, W5, W6, W7, W4c
-endconst
-
- .macro idct_start data
- push {r4-r7, lr}
- pld [\data]
- pld [\data, #64]
- vpush {d8-d15}
- movrel r3, idct_coeff_neon
- vld1.64 {d0,d1}, [r3,:128]
- .endm
-
- .macro idct_end
- vpop {d8-d15}
- pop {r4-r7, pc}
- .endm
-
-/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */
-function ff_simple_idct_put_neon, export=1
- idct_start r2
-
- bl idct_row4_pld_neon
- bl idct_row4_neon
- add r2, r2, #-128
- bl idct_col4_neon
- bl idct_col4_st8_neon
- sub r0, r0, r1, lsl #3
- add r0, r0, #4
- add r2, r2, #-120
- bl idct_col4_neon
- bl idct_col4_st8_neon
-
- idct_end
-endfunc
-
- .align 6
-
-function idct_col4_add8_neon
- mov ip, r0
-
- vld1.32 {d10[0]}, [r0,:32], r1
- vshr.s16 q1, q1, #COL_SHIFT-16
- vld1.32 {d10[1]}, [r0,:32], r1
- vshr.s16 q2, q2, #COL_SHIFT-16
- vld1.32 {d11[0]}, [r0,:32], r1
- vshr.s16 q3, q3, #COL_SHIFT-16
- vld1.32 {d11[1]}, [r0,:32], r1
- vshr.s16 q4, q4, #COL_SHIFT-16
- vld1.32 {d12[0]}, [r0,:32], r1
- vaddw.u8 q1, q1, d10
- vld1.32 {d12[1]}, [r0,:32], r1
- vaddw.u8 q2, q2, d11
- vld1.32 {d13[0]}, [r0,:32], r1
- vqmovun.s16 d2, q1
- vld1.32 {d13[1]}, [r0,:32], r1
- vaddw.u8 q3, q3, d12
- vst1.32 {d2[0]}, [ip,:32], r1
- vqmovun.s16 d3, q2
- vst1.32 {d2[1]}, [ip,:32], r1
- vaddw.u8 q4, q4, d13
- vst1.32 {d3[0]}, [ip,:32], r1
- vqmovun.s16 d4, q3
- vst1.32 {d3[1]}, [ip,:32], r1
- vqmovun.s16 d5, q4
- vst1.32 {d4[0]}, [ip,:32], r1
- vst1.32 {d4[1]}, [ip,:32], r1
- vst1.32 {d5[0]}, [ip,:32], r1
- vst1.32 {d5[1]}, [ip,:32], r1
-
- bx lr
-endfunc
-
-/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */
-function ff_simple_idct_add_neon, export=1
- idct_start r2
-
- bl idct_row4_pld_neon
- bl idct_row4_neon
- add r2, r2, #-128
- bl idct_col4_neon
- bl idct_col4_add8_neon
- sub r0, r0, r1, lsl #3
- add r0, r0, #4
- add r2, r2, #-120
- bl idct_col4_neon
- bl idct_col4_add8_neon
-
- idct_end
-endfunc
-
- .align 6
-
-function idct_col4_st16_neon
- mov ip, #16
-
- vshr.s16 q1, q1, #COL_SHIFT-16
- vshr.s16 q2, q2, #COL_SHIFT-16
- vst1.64 {d2}, [r2,:64], ip
- vshr.s16 q3, q3, #COL_SHIFT-16
- vst1.64 {d3}, [r2,:64], ip
- vshr.s16 q4, q4, #COL_SHIFT-16
- vst1.64 {d4}, [r2,:64], ip
- vst1.64 {d5}, [r2,:64], ip
- vst1.64 {d6}, [r2,:64], ip
- vst1.64 {d7}, [r2,:64], ip
- vst1.64 {d8}, [r2,:64], ip
- vst1.64 {d9}, [r2,:64], ip
-
- bx lr
-endfunc
-
-/* void ff_simple_idct_neon(int16_t *data); */
-function ff_simple_idct_neon, export=1
- idct_start r0
-
- mov r2, r0
- bl idct_row4_neon
- bl idct_row4_neon
- add r2, r2, #-128
- bl idct_col4_neon
- add r2, r2, #-128
- bl idct_col4_st16_neon
- add r2, r2, #-120
- bl idct_col4_neon
- add r2, r2, #-128
- bl idct_col4_st16_neon
-
- idct_end
-endfunc
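One detail worth calling out in the NEON column pass above: COL_SHIFT (20) is applied in two stages. vaddhn/vsubhn narrow the 32-bit accumulators by taking their high 16 bits (a free >>16), and the final vqshrun applies the remaining COL_SHIFT-16 bits with unsigned saturation. Per output pixel, in rough C (a sketch; the helper name is illustrative):

    #include <stdint.h>

    static uint8_t col_pixel_sketch(int32_t a, int32_t b)
    {
        int16_t narrowed = (int16_t)((a + b) >> 16); /* vaddhn.i32 */
        int32_t v = narrowed >> (20 - 16);           /* vqshrun #COL_SHIFT-16 */
        if (v < 0)   v = 0;                          /* unsigned saturation */
        if (v > 255) v = 255;
        return (uint8_t)v;
    }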
diff --git a/ffmpeg/libavcodec/arm/synth_filter_neon.S b/ffmpeg/libavcodec/arm/synth_filter_neon.S
deleted file mode 100644
index 5417be7..0000000
--- a/ffmpeg/libavcodec/arm/synth_filter_neon.S
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-function ff_synth_filter_float_neon, export=1
- push {r3-r11,lr}
-
- ldr r4, [r2] @ synth_buf_offset
- add r1, r1, r4, lsl #2 @ synth_buf
- sub r12, r4, #32
- bfc r12, #9, #23
- bic r4, r4, #63
- str r12, [r2]
-
- ldr r2, [sp, #12*4] @ in
- mov r9, r1 @ synth_buf
-
-VFP vpush {d0}
- bl X(ff_imdct_half_neon)
-VFP vpop {d0}
- pop {r3}
-
- ldr r5, [sp, #9*4] @ window
- ldr r2, [sp, #10*4] @ out
-NOVFP vldr s0, [sp, #12*4] @ scale
- add r8, r9, #12*4
-
- mov lr, #64*4
- mov r1, #4
-1:
- add r10, r9, #16*4 @ synth_buf
- add r11, r8, #16*4
- add r0, r5, #16*4 @ window
- add r6, r5, #32*4
- add r7, r5, #48*4
-
- vld1.32 {q10}, [r3,:128] @ a
- add r3, r3, #16*4
- vld1.32 {q1}, [r3,:128] @ b
- vmov.f32 q2, #0.0 @ c
- vmov.f32 q3, #0.0 @ d
-
- mov r12, #512
-2:
- vld1.32 {q9}, [r8, :128], lr
- vrev64.32 q9, q9
- vld1.32 {q8}, [r5, :128], lr
- vmls.f32 d20, d16, d19
- vld1.32 {q11}, [r0, :128], lr
- vmls.f32 d21, d17, d18
- vld1.32 {q12}, [r9, :128], lr
- vmla.f32 d2, d22, d24
- vld1.32 {q8}, [r6, :128], lr
- vmla.f32 d3, d23, d25
- vld1.32 {q9}, [r10,:128], lr
- vmla.f32 d4, d16, d18
- vld1.32 {q12}, [r11,:128], lr
- vmla.f32 d5, d17, d19
- vrev64.32 q12, q12
- vld1.32 {q11}, [r7, :128], lr
- vmla.f32 d6, d22, d25
- vmla.f32 d7, d23, d24
- subs r12, r12, #64
- beq 3f
- cmp r12, r4
- bne 2b
- sub r8, r8, #512*4
- sub r9, r9, #512*4
- sub r10, r10, #512*4
- sub r11, r11, #512*4
- b 2b
-3:
- vmul.f32 q8, q10, d0[0]
- vmul.f32 q9, q1, d0[0]
- vst1.32 {q3}, [r3,:128]
- sub r3, r3, #16*4
- vst1.32 {q2}, [r3,:128]
- vst1.32 {q8}, [r2,:128]
- add r2, r2, #16*4
- vst1.32 {q9}, [r2,:128]
-
- subs r1, r1, #1
- it eq
- popeq {r4-r11,pc}
-
- cmp r4, #0
- itt eq
- subeq r8, r8, #512*4
- subeq r9, r9, #512*4
- sub r5, r5, #512*4
- sub r2, r2, #12*4 @ out
- add r3, r3, #4*4 @ synth_buf2
- add r5, r5, #4*4 @ window
- add r9, r9, #4*4 @ synth_buf
- sub r8, r8, #4*4 @ synth_buf
- b 1b
-endfunc
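The bfc trick at the top of the function above (sub r12, r4, #32 / bfc r12, #9, #23) is the synth buffer's ring-buffer arithmetic: clearing bits 9-31 is a mask with 511, so the stored offset advances by -32 modulo 512. In C terms (a sketch):

    /* offset update performed by the first few instructions above */
    static void update_offset_sketch(int *synth_buf_offset)
    {
        *synth_buf_offset = (*synth_buf_offset - 32) & 511;
    }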
diff --git a/ffmpeg/libavcodec/arm/videodsp_arm.h b/ffmpeg/libavcodec/arm/videodsp_arm.h
deleted file mode 100644
index 112cbb8..0000000
--- a/ffmpeg/libavcodec/arm/videodsp_arm.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_VIDEODSP_ARM_H
-#define AVCODEC_ARM_VIDEODSP_ARM_H
-
-#include "libavcodec/avcodec.h"
-#include "libavcodec/videodsp.h"
-
-void ff_videodsp_init_armv5te(VideoDSPContext* ctx, int bpc);
-
-#endif /* AVCODEC_ARM_VIDEODSP_ARM_H */
diff --git a/ffmpeg/libavcodec/arm/videodsp_armv5te.S b/ffmpeg/libavcodec/arm/videodsp_armv5te.S
deleted file mode 100644
index 48a6c3b..0000000
--- a/ffmpeg/libavcodec/arm/videodsp_armv5te.S
+++ /dev/null
@@ -1,31 +0,0 @@
-@
-@ ARMv5te optimized DSP utils
-@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
-@
-@ This file is part of FFmpeg
-@
-@ FFmpeg is free software; you can redistribute it and/or
-@ modify it under the terms of the GNU Lesser General Public
-@ License as published by the Free Software Foundation; either
-@ version 2.1 of the License, or (at your option) any later version.
-@
-@ FFmpeg is distributed in the hope that it will be useful,
-@ but WITHOUT ANY WARRANTY; without even the implied warranty of
-@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-@ Lesser General Public License for more details.
-@
-@ You should have received a copy of the GNU Lesser General Public
-@ License along with FFmpeg; if not, write to the Free Software
-@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-@
-
-#include "config.h"
-#include "libavutil/arm/asm.S"
-
-function ff_prefetch_arm, export=1
- subs r2, r2, #1
- pld [r0]
- add r0, r0, r1
- bne ff_prefetch_arm
- bx lr
-endfunc
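ff_prefetch_arm above is a plain prefetch loop: one pld per line, stepping by the stride, h times. A C sketch of the same behaviour, with GCC's __builtin_prefetch standing in for pld:

    #include <stddef.h>
    #include <stdint.h>

    static void prefetch_sketch(const uint8_t *mem, ptrdiff_t stride, int h)
    {
        while (h--) {
            __builtin_prefetch(mem); /* pld [r0] */
            mem += stride;           /* add r0, r0, r1 */
        }
    }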
diff --git a/ffmpeg/libavcodec/arm/videodsp_init_arm.c b/ffmpeg/libavcodec/arm/videodsp_init_arm.c
deleted file mode 100644
index a89abb2..0000000
--- a/ffmpeg/libavcodec/arm/videodsp_init_arm.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (C) 2012 Ronald S. Bultje
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/videodsp.h"
-#include "videodsp_arm.h"
-
-av_cold void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc)
-{
- int cpu_flags = av_get_cpu_flags();
- if (have_armv5te(cpu_flags)) ff_videodsp_init_armv5te(ctx, bpc);
-}
diff --git a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c
deleted file mode 100644
index 1ea1f34..0000000
--- a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (C) 2012 Ronald S. Bultje
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/videodsp.h"
-#include "videodsp_arm.h"
-
-void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
-
-av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
-{
-#if HAVE_ARMV5TE_EXTERNAL
- ctx->prefetch = ff_prefetch_arm;
-#endif
-}
diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c
deleted file mode 100644
index f4b3d80..0000000
--- a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/vorbisdsp.h"
-
-void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
- intptr_t blocksize);
-
-av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S
deleted file mode 100644
index 79ce54f..0000000
--- a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * ARM NEON optimised DSP functions
- * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
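-@ With t = ang ^ signbit(mag), each element is updated as:
-@   ang >  0:  ang' = mag - t,  mag' = mag
-@   ang <= 0:  ang' = mag,      mag' = mag + t
-@ The loop below is software-pipelined, handling four floats per iteration.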
-function ff_vorbis_inverse_coupling_neon, export=1
- vmov.i32 q10, #1<<31
- subs r2, r2, #4
- mov r3, r0
- mov r12, r1
- beq 3f
-
- vld1.32 {d24-d25},[r1,:128]!
- vld1.32 {d22-d23},[r0,:128]!
- vcle.s32 q8, q12, #0
- vand q9, q11, q10
- veor q12, q12, q9
- vand q2, q12, q8
- vbic q3, q12, q8
- vadd.f32 q12, q11, q2
- vsub.f32 q11, q11, q3
-1: vld1.32 {d2-d3}, [r1,:128]!
- vld1.32 {d0-d1}, [r0,:128]!
- vcle.s32 q8, q1, #0
- vand q9, q0, q10
- veor q1, q1, q9
- vst1.32 {d24-d25},[r3, :128]!
- vst1.32 {d22-d23},[r12,:128]!
- vand q2, q1, q8
- vbic q3, q1, q8
- vadd.f32 q1, q0, q2
- vsub.f32 q0, q0, q3
- subs r2, r2, #8
- ble 2f
- vld1.32 {d24-d25},[r1,:128]!
- vld1.32 {d22-d23},[r0,:128]!
- vcle.s32 q8, q12, #0
- vand q9, q11, q10
- veor q12, q12, q9
- vst1.32 {d2-d3}, [r3, :128]!
- vst1.32 {d0-d1}, [r12,:128]!
- vand q2, q12, q8
- vbic q3, q12, q8
- vadd.f32 q12, q11, q2
- vsub.f32 q11, q11, q3
- b 1b
-
-2: vst1.32 {d2-d3}, [r3, :128]!
- vst1.32 {d0-d1}, [r12,:128]!
- it lt
- bxlt lr
-
-3: vld1.32 {d2-d3}, [r1,:128]
- vld1.32 {d0-d1}, [r0,:128]
- vcle.s32 q8, q1, #0
- vand q9, q0, q10
- veor q1, q1, q9
- vand q2, q1, q8
- vbic q3, q1, q8
- vadd.f32 q1, q0, q2
- vsub.f32 q0, q0, q3
- vst1.32 {d2-d3}, [r0,:128]!
- vst1.32 {d0-d1}, [r1,:128]!
- bx lr
-endfunc
diff --git a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c
deleted file mode 100644
index 5af795b..0000000
--- a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/dsputil.h"
-#include "libavcodec/vp3dsp.h"
-
-void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
-void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data);
-
-void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
-void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
-
-av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
-{
- int cpu_flags = av_get_cpu_flags();
-
- if (have_neon(cpu_flags)) {
- c->idct_put = ff_vp3_idct_put_neon;
- c->idct_add = ff_vp3_idct_add_neon;
- c->idct_dc_add = ff_vp3_idct_dc_add_neon;
- c->v_loop_filter = ff_vp3_v_loop_filter_neon;
- c->h_loop_filter = ff_vp3_h_loop_filter_neon;
- }
-}
diff --git a/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/ffmpeg/libavcodec/arm/vp3dsp_neon.S
deleted file mode 100644
index f133905..0000000
--- a/ffmpeg/libavcodec/arm/vp3dsp_neon.S
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Copyright (c) 2009 David Conrad
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-const vp3_idct_constants, align=4
-.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
-endconst
-
-#define xC1S7 d0[0]
-#define xC2S6 d0[1]
-#define xC3S5 d0[2]
-#define xC4S4 d0[3]
-#define xC5S3 d1[0]
-#define xC6S2 d1[1]
-#define xC7S1 d1[2]
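-
-@ The xCnSm constants above are round(cos(n*pi/16) * (1 << 16)), the Q16
-@ fixed-point DCT cosines; e.g. xC4S4 = round(sqrt(1/2) * 65536) = 46341.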
-
-.macro vp3_loop_filter
- vsubl.u8 q3, d18, d17
- vsubl.u8 q2, d16, d19
- vadd.i16 q1, q3, q3
- vadd.i16 q2, q2, q3
- vadd.i16 q0, q1, q2
- vrshr.s16 q0, q0, #3
- vmovl.u8 q9, d18
- vdup.u16 q15, r2
-
- vabs.s16 q1, q0
- vshr.s16 q0, q0, #15
- vqsub.u16 q2, q15, q1
- vqsub.u16 q3, q2, q1
- vsub.i16 q1, q2, q3
- veor q1, q1, q0
- vsub.i16 q0, q1, q0
-
- vaddw.u8 q2, q0, d17
- vsub.i16 q3, q9, q0
- vqmovun.s16 d0, q2
- vqmovun.s16 d1, q3
-.endm
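-
-@ vp3_loop_filter computes f = (p1 - q1 + 3*(q0 - p0) + 4) >> 3 per column
-@ (p1,p0,q0,q1 in d16-d19) and replaces the bounding_values[] lookup with
-@ saturating arithmetic: with r2 = 2*filter_limit, |f| passes through below
-@ filter_limit and tapers to zero at 2*filter_limit.  The filtered pixels
-@ p0+f and q0-f are left in d0 and d1.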
-
-function ff_vp3_v_loop_filter_neon, export=1
- sub ip, r0, r1
- sub r0, r0, r1, lsl #1
- vld1.64 {d16}, [r0,:64], r1
- vld1.64 {d17}, [r0,:64], r1
- vld1.64 {d18}, [r0,:64], r1
- vld1.64 {d19}, [r0,:64], r1
- ldrb r2, [r2, #129*4]
-
- vp3_loop_filter
-
- vst1.64 {d0}, [ip,:64], r1
- vst1.64 {d1}, [ip,:64], r1
- bx lr
-endfunc
-
-function ff_vp3_h_loop_filter_neon, export=1
- sub ip, r0, #1
- sub r0, r0, #2
- vld1.32 {d16[]}, [r0], r1
- vld1.32 {d17[]}, [r0], r1
- vld1.32 {d18[]}, [r0], r1
- vld1.32 {d19[]}, [r0], r1
- vld1.32 {d16[1]}, [r0], r1
- vld1.32 {d17[1]}, [r0], r1
- vld1.32 {d18[1]}, [r0], r1
- vld1.32 {d19[1]}, [r0], r1
- ldrb r2, [r2, #129*4]
-
- vtrn.8 d16, d17
- vtrn.8 d18, d19
- vtrn.16 d16, d18
- vtrn.16 d17, d19
-
- vp3_loop_filter
-
- vtrn.8 d0, d1
-
- vst1.16 {d0[0]}, [ip], r1
- vst1.16 {d1[0]}, [ip], r1
- vst1.16 {d0[1]}, [ip], r1
- vst1.16 {d1[1]}, [ip], r1
- vst1.16 {d0[2]}, [ip], r1
- vst1.16 {d1[2]}, [ip], r1
- vst1.16 {d0[3]}, [ip], r1
- vst1.16 {d1[3]}, [ip], r1
- bx lr
-endfunc
-
-
-function vp3_idct_start_neon
- vpush {d8-d15}
- vmov.i16 q4, #0
- vmov.i16 q5, #0
- movrel r3, vp3_idct_constants
- vld1.64 {d0-d1}, [r3,:128]
- vld1.64 {d16-d19}, [r2,:128]
- vst1.64 {q4-q5}, [r2,:128]!
- vld1.64 {d20-d23}, [r2,:128]
- vst1.64 {q4-q5}, [r2,:128]!
- vld1.64 {d24-d27}, [r2,:128]
- vst1.64 {q4-q5}, [r2,:128]!
- vadd.s16 q1, q8, q12
- vsub.s16 q8, q8, q12
- vld1.64 {d28-d31}, [r2,:128]
- vst1.64 {q4-q5}, [r2,:128]!
-
-vp3_idct_core_neon:
- vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
- vmull.s16 q3, d19, xC1S7
- vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16
- vmull.s16 q5, d3, xC4S4
- vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16
- vmull.s16 q7, d17, xC4S4
- vshrn.s32 d4, q2, #16
- vshrn.s32 d5, q3, #16
- vshrn.s32 d6, q4, #16
- vshrn.s32 d7, q5, #16
- vshrn.s32 d8, q6, #16
- vshrn.s32 d9, q7, #16
- vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4
- vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4
- vadd.s16 q1, q2, q9 // ip[1] * C1
-
- vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16
- vmull.s16 q3, d31, xC1S7
- vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16
- vmull.s16 q5, d31, xC7S1
- vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16
- vmull.s16 q7, d19, xC7S1
- vshrn.s32 d4, q2, #16
- vshrn.s32 d5, q3, #16
- vshrn.s32 d6, q4, #16 // ip[7] * C7
- vshrn.s32 d7, q5, #16
- vshrn.s32 d8, q6, #16 // ip[1] * C7
- vshrn.s32 d9, q7, #16
- vadd.s16 q2, q2, q15 // ip[7] * C1
- vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7
- vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1
-
- vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16
- vmull.s16 q3, d23, xC5S3
- vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16
- vmull.s16 q5, d23, xC3S5
- vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16
- vmull.s16 q7, d27, xC5S3
- vshrn.s32 d4, q2, #16
- vshrn.s32 d5, q3, #16
- vshrn.s32 d6, q4, #16
- vshrn.s32 d7, q5, #16
- vshrn.s32 d8, q6, #16
- vshrn.s32 d9, q7, #16
- vadd.s16 q3, q3, q11 // ip[3] * C3
- vadd.s16 q4, q4, q13 // ip[5] * C5
- vadd.s16 q1, q2, q11 // ip[3] * C5
- vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5
-
- vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16
- vmull.s16 q3, d27, xC3S5
- vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16
- vmull.s16 q5, d21, xC2S6
- vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16
- vmull.s16 q7, d29, xC6S2
- vshrn.s32 d4, q2, #16
- vshrn.s32 d5, q3, #16
- vshrn.s32 d6, q4, #16
- vshrn.s32 d7, q5, #16
- vshrn.s32 d8, q6, #16 // ip[6] * C6
- vshrn.s32 d9, q7, #16
- vadd.s16 q2, q2, q13 // ip[5] * C3
- vadd.s16 q3, q3, q10 // ip[2] * C2
- vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5
- vsub.s16 q1, q9, q11 // (A - C)
- vadd.s16 q11, q9, q11 // Cd = A + C
- vsub.s16 q9, q15, q13 // (B - D)
- vadd.s16 q13, q15, q13 // Dd = B + D
- vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6
-
- vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16
- vmull.s16 q3, d3, xC4S4
- vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16
- vmull.s16 q5, d29, xC2S6
- vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16
- vmull.s16 q7, d21, xC6S2
- vshrn.s32 d4, q2, #16
- vshrn.s32 d5, q3, #16
- vshrn.s32 d6, q4, #16
- vshrn.s32 d7, q5, #16
- vshrn.s32 d8, q6, #16 // ip[2] * C6
- vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16
- vmull.s16 q6, d19, xC4S4
- vshrn.s32 d9, q7, #16
- vadd.s16 q3, q3, q14 // ip[6] * C2
- vadd.s16 q10, q1, q2 // Ad = (A - C) * C4
- vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2
- bx lr
-endfunc
-
-.macro VP3_IDCT_END type
-function vp3_idct_end_\type\()_neon
-.ifc \type, col
- vdup.16 q0, r3
- vadd.s16 q12, q12, q0
- vadd.s16 q8, q8, q0
-.endif
-
- vshrn.s32 d2, q5, #16
- vshrn.s32 d3, q6, #16
- vadd.s16 q2, q12, q15 // Gd = E + G
- vadd.s16 q9, q1, q9 // (B - D) * C4
- vsub.s16 q12, q12, q15 // Ed = E - G
- vsub.s16 q3, q8, q10 // Fd = F - Ad
- vadd.s16 q10, q8, q10 // Add = F + Ad
- vadd.s16 q4, q9, q14 // Hd = Bd + H
- vsub.s16 q14, q9, q14 // Bdd = Bd - H
- vadd.s16 q8, q2, q11 // [0] = Gd + Cd
- vsub.s16 q15, q2, q11 // [7] = Gd - Cd
- vadd.s16 q9, q10, q4 // [1] = Add + Hd
- vsub.s16 q10, q10, q4 // [2] = Add - Hd
- vadd.s16 q11, q12, q13 // [3] = Ed + Dd
- vsub.s16 q12, q12, q13 // [4] = Ed - Dd
-.ifc \type, row
- vtrn.16 q8, q9
-.endif
- vadd.s16 q13, q3, q14 // [5] = Fd + Bdd
- vsub.s16 q14, q3, q14 // [6] = Fd - Bdd
-
-.ifc \type, row
- // 8x8 transpose
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vswp d17, d24
- vswp d19, d26
- vadd.s16 q1, q8, q12
- vswp d21, d28
- vsub.s16 q8, q8, q12
- vswp d23, d30
-.endif
- bx lr
-endfunc
-.endm
-
-VP3_IDCT_END row
-VP3_IDCT_END col
-
-function ff_vp3_idct_put_neon, export=1
- mov ip, lr
- bl vp3_idct_start_neon
- bl vp3_idct_end_row_neon
- mov r3, #8
- add r3, r3, #2048 // convert signed pixel to unsigned
- bl vp3_idct_core_neon
- bl vp3_idct_end_col_neon
- mov lr, ip
- vpop {d8-d15}
-
- vqshrun.s16 d0, q8, #4
- vqshrun.s16 d1, q9, #4
- vqshrun.s16 d2, q10, #4
- vqshrun.s16 d3, q11, #4
- vst1.64 {d0}, [r0,:64], r1
- vqshrun.s16 d4, q12, #4
- vst1.64 {d1}, [r0,:64], r1
- vqshrun.s16 d5, q13, #4
- vst1.64 {d2}, [r0,:64], r1
- vqshrun.s16 d6, q14, #4
- vst1.64 {d3}, [r0,:64], r1
- vqshrun.s16 d7, q15, #4
- vst1.64 {d4}, [r0,:64], r1
- vst1.64 {d5}, [r0,:64], r1
- vst1.64 {d6}, [r0,:64], r1
- vst1.64 {d7}, [r0,:64], r1
- bx lr
-endfunc
-
-function ff_vp3_idct_add_neon, export=1
- mov ip, lr
- bl vp3_idct_start_neon
- bl vp3_idct_end_row_neon
- mov r3, #8
- bl vp3_idct_core_neon
- bl vp3_idct_end_col_neon
- mov lr, ip
- vpop {d8-d15}
- mov r2, r0
-
- vld1.64 {d0}, [r0,:64], r1
- vshr.s16 q8, q8, #4
- vld1.64 {d1}, [r0,:64], r1
- vshr.s16 q9, q9, #4
- vld1.64 {d2}, [r0,:64], r1
- vaddw.u8 q8, q8, d0
- vld1.64 {d3}, [r0,:64], r1
- vaddw.u8 q9, q9, d1
- vld1.64 {d4}, [r0,:64], r1
- vshr.s16 q10, q10, #4
- vld1.64 {d5}, [r0,:64], r1
- vshr.s16 q11, q11, #4
- vld1.64 {d6}, [r0,:64], r1
- vqmovun.s16 d0, q8
- vld1.64 {d7}, [r0,:64], r1
- vqmovun.s16 d1, q9
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vshr.s16 q12, q12, #4
- vshr.s16 q13, q13, #4
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vshr.s16 q14, q14, #4
- vshr.s16 q15, q15, #4
- vst1.64 {d0}, [r2,:64], r1
- vqmovun.s16 d4, q12
- vst1.64 {d1}, [r2,:64], r1
- vqmovun.s16 d5, q13
- vst1.64 {d2}, [r2,:64], r1
- vaddw.u8 q14, q14, d6
- vst1.64 {d3}, [r2,:64], r1
- vaddw.u8 q15, q15, d7
- vst1.64 {d4}, [r2,:64], r1
- vqmovun.s16 d6, q14
- vst1.64 {d5}, [r2,:64], r1
- vqmovun.s16 d7, q15
- vst1.64 {d6}, [r2,:64], r1
- vst1.64 {d7}, [r2,:64], r1
- bx lr
-endfunc
-
-function ff_vp3_idct_dc_add_neon, export=1
- ldrsh r12, [r2]
- mov r3, r0
- add r12, r12, #15
- vdup.16 q15, r12
- mov r12, #0
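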
- strh r12, [r2]
- vshr.s16 q15, q15, #5
-
- vld1.8 {d0}, [r0,:64], r1
- vld1.8 {d1}, [r0,:64], r1
- vld1.8 {d2}, [r0,:64], r1
- vaddw.u8 q8, q15, d0
- vld1.8 {d3}, [r0,:64], r1
- vaddw.u8 q9, q15, d1
- vld1.8 {d4}, [r0,:64], r1
- vaddw.u8 q10, q15, d2
- vld1.8 {d5}, [r0,:64], r1
- vaddw.u8 q11, q15, d3
- vld1.8 {d6}, [r0,:64], r1
- vaddw.u8 q12, q15, d4
- vld1.8 {d7}, [r0,:64], r1
- vaddw.u8 q13, q15, d5
- vqmovun.s16 d0, q8
- vaddw.u8 q14, q15, d6
- vqmovun.s16 d1, q9
- vaddw.u8 q15, q15, d7
- vqmovun.s16 d2, q10
- vst1.8 {d0}, [r3,:64], r1
- vqmovun.s16 d3, q11
- vst1.8 {d1}, [r3,:64], r1
- vqmovun.s16 d4, q12
- vst1.8 {d2}, [r3,:64], r1
- vqmovun.s16 d5, q13
- vst1.8 {d3}, [r3,:64], r1
- vqmovun.s16 d6, q14
- vst1.8 {d4}, [r3,:64], r1
- vqmovun.s16 d7, q15
- vst1.8 {d5}, [r3,:64], r1
- vst1.8 {d6}, [r3,:64], r1
- vst1.8 {d7}, [r3,:64], r1
- bx lr
-endfunc
diff --git a/ffmpeg/libavcodec/arm/vp56_arith.h b/ffmpeg/libavcodec/arm/vp56_arith.h
deleted file mode 100644
index feb1247..0000000
--- a/ffmpeg/libavcodec/arm/vp56_arith.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_VP56_ARITH_H
-#define AVCODEC_ARM_VP56_ARITH_H
-
-#if CONFIG_THUMB
-# define A(x)
-# define T(x) x
-#else
-# define A(x) x
-# define T(x)
-#endif
-
-#if CONFIG_THUMB || defined __clang__
-# define L(x)
-# define U(x) x
-#else
-# define L(x) x
-# define U(x)
-#endif
-
-#if HAVE_ARMV6_INLINE
-
-#define vp56_rac_get_prob vp56_rac_get_prob_armv6
-static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
-{
- unsigned shift = ff_vp56_norm_shift[c->high];
- unsigned code_word = c->code_word << shift;
- unsigned high = c->high << shift;
- unsigned bit;
-
- __asm__ ("adds %3, %3, %0 \n"
- "itt cs \n"
- "cmpcs %7, %4 \n"
- L("ldrcsh %2, [%4], #2 \n")
- U("ldrhcs %2, [%4], #2 \n")
- "rsb %0, %6, #256 \n"
- "smlabb %0, %5, %6, %0 \n"
- T("itttt cs \n")
- "rev16cs %2, %2 \n"
- T("lslcs %2, %2, %3 \n")
- T("orrcs %1, %1, %2 \n")
- A("orrcs %1, %1, %2, lsl %3 \n")
- "subcs %3, %3, #16 \n"
- "lsr %0, %0, #8 \n"
- "cmp %1, %0, lsl #16 \n"
- "ittte ge \n"
- "subge %1, %1, %0, lsl #16 \n"
- "subge %0, %5, %0 \n"
- "movge %2, #1 \n"
- "movlt %2, #0 \n"
- : "=&r"(c->high), "=&r"(c->code_word), "=&r"(bit),
- "+&r"(c->bits), "+&r"(c->buffer)
- : "r"(high), "r"(pr), "r"(c->end - 1),
- "0"(shift), "1"(code_word)
- : "cc");
-
- return bit;
-}
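-
-/* Equivalent scalar logic, as a sketch (the asm also folds in the refill
- * guarded by the cmpcs against c->end):
- *
- *     low = 1 + (((high - 1) * pr) >> 8);      // rsb + smlabb + lsr
- *     bit = code_word >= (low << 16);
- *     c->high      = bit ? high - low : low;
- *     c->code_word = bit ? code_word - (low << 16) : code_word;
- */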
-
-#define vp56_rac_get_prob_branchy vp56_rac_get_prob_branchy_armv6
-static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
-{
- unsigned shift = ff_vp56_norm_shift[c->high];
- unsigned code_word = c->code_word << shift;
- unsigned high = c->high << shift;
- unsigned low;
- unsigned tmp;
-
- __asm__ ("adds %3, %3, %0 \n"
- "itt cs \n"
- "cmpcs %7, %4 \n"
- L("ldrcsh %2, [%4], #2 \n")
- U("ldrhcs %2, [%4], #2 \n")
- "rsb %0, %6, #256 \n"
- "smlabb %0, %5, %6, %0 \n"
- T("itttt cs \n")
- "rev16cs %2, %2 \n"
- T("lslcs %2, %2, %3 \n")
- T("orrcs %1, %1, %2 \n")
- A("orrcs %1, %1, %2, lsl %3 \n")
- "subcs %3, %3, #16 \n"
- "lsr %0, %0, #8 \n"
- "lsl %2, %0, #16 \n"
- : "=&r"(low), "+&r"(code_word), "=&r"(tmp),
- "+&r"(c->bits), "+&r"(c->buffer)
- : "r"(high), "r"(pr), "r"(c->end - 1), "0"(shift)
- : "cc");
-
- if (code_word >= tmp) {
- c->high = high - low;
- c->code_word = code_word - tmp;
- return 1;
- }
-
- c->high = low;
- c->code_word = code_word;
- return 0;
-}
-
-#endif
-
-#endif /* AVCODEC_ARM_VP56_ARITH_H */
diff --git a/ffmpeg/libavcodec/arm/vp8.h b/ffmpeg/libavcodec/arm/vp8.h
deleted file mode 100644
index ddaa120..0000000
--- a/ffmpeg/libavcodec/arm/vp8.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_VP8_H
-#define AVCODEC_ARM_VP8_H
-
-#include <stdint.h>
-
-#include "config.h"
-#include "libavcodec/vp56.h"
-#include "libavcodec/vp8.h"
-
-#if HAVE_ARMV6_EXTERNAL
-#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6
-int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, int16_t block[16],
- uint8_t probs[8][3][NUM_DCT_TOKENS-1],
- int i, uint8_t *token_prob, int16_t qmul[2]);
-#endif
-
-#endif /* AVCODEC_ARM_VP8_H */
diff --git a/ffmpeg/libavcodec/arm/vp8_armv6.S b/ffmpeg/libavcodec/arm/vp8_armv6.S
deleted file mode 100644
index e7d25a4..0000000
--- a/ffmpeg/libavcodec/arm/vp8_armv6.S
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (C) 2010 Mans Rullgard
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-.macro rac_get_prob h, bs, buf, cw, pr, t0, t1
- adds \bs, \bs, \t0
- lsl \cw, \cw, \t0
- lsl \t0, \h, \t0
- rsb \h, \pr, #256
- it cs
- ldrhcs \t1, [\buf], #2
- smlabb \h, \t0, \pr, \h
-T itttt cs
- rev16cs \t1, \t1
-A orrcs \cw, \cw, \t1, lsl \bs
-T lslcs \t1, \t1, \bs
-T orrcs \cw, \cw, \t1
- subcs \bs, \bs, #16
- lsr \h, \h, #8
- cmp \cw, \h, lsl #16
- itt ge
- subge \cw, \cw, \h, lsl #16
- subge \h, \t0, \h
-.endm
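-
-@ On exit from rac_get_prob the decoded bit is in the flags (GE = 1, LT = 0)
-@ and \h/\cw hold the renormalised range and code word.  \t0 must contain
-@ ff_vp56_norm_shift[\h] on entry; callers reload it before every use.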
-
-.macro rac_get_128 h, bs, buf, cw, t0, t1
- adds \bs, \bs, \t0
- lsl \cw, \cw, \t0
- lsl \t0, \h, \t0
- it cs
- ldrhcs \t1, [\buf], #2
- mov \h, #128
- it cs
- rev16cs \t1, \t1
- add \h, \h, \t0, lsl #7
-A orrcs \cw, \cw, \t1, lsl \bs
-T ittt cs
-T lslcs \t1, \t1, \bs
-T orrcs \cw, \cw, \t1
- subcs \bs, \bs, #16
- lsr \h, \h, #8
- cmp \cw, \h, lsl #16
- itt ge
- subge \cw, \cw, \h, lsl #16
- subge \h, \t0, \h
-.endm
-
-function ff_decode_block_coeffs_armv6, export=1
- push {r0,r1,r4-r11,lr}
- movrelx lr, X(ff_vp56_norm_shift)
- ldrd r4, r5, [sp, #44] @ token_prob, qmul
- cmp r3, #0
- ldr r11, [r5]
- ldm r0, {r5-r7} @ high, bits, buf
- it ne
- pkhtbne r11, r11, r11, asr #16
- ldr r8, [r0, #16] @ code_word
-0:
- ldrb r9, [lr, r5]
- add r3, r3, #1
- ldrb r0, [r4, #1]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- blt 2f
-
- ldrb r9, [lr, r5]
- ldrb r0, [r4, #2]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- ldrb r9, [lr, r5]
- bge 3f
-
- add r4, r3, r3, lsl #5
- sxth r12, r11
- add r4, r4, r2
- adds r6, r6, r9
- add r4, r4, #11
- lsl r8, r8, r9
- it cs
- ldrhcs r10, [r7], #2
- lsl r9, r5, r9
- mov r5, #128
- it cs
- rev16cs r10, r10
- add r5, r5, r9, lsl #7
-T ittt cs
-T lslcs r10, r10, r6
-T orrcs r8, r8, r10
-A orrcs r8, r8, r10, lsl r6
- subcs r6, r6, #16
- lsr r5, r5, #8
- cmp r8, r5, lsl #16
- movrel r10, zigzag_scan-1
- itt ge
- subge r8, r8, r5, lsl #16
- subge r5, r9, r5
- ldrb r10, [r10, r3]
- it ge
- rsbge r12, r12, #0
- cmp r3, #16
- strh r12, [r1, r10]
- bge 6f
-5:
- ldrb r9, [lr, r5]
- ldrb r0, [r4]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- pkhtb r11, r11, r11, asr #16
- bge 0b
-
-6:
- ldr r0, [sp]
- ldr r9, [r0, #12]
- cmp r7, r9
- it hi
- movhi r7, r9
- stm r0, {r5-r7} @ high, bits, buf
- str r8, [r0, #16] @ code_word
-
- add sp, sp, #8
- mov r0, r3
- pop {r4-r11,pc}
-2:
- add r4, r3, r3, lsl #5
- cmp r3, #16
- add r4, r4, r2
- pkhtb r11, r11, r11, asr #16
- bne 0b
- b 6b
-3:
- ldrb r0, [r4, #3]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- ldrb r9, [lr, r5]
- bge 1f
-
- mov r12, #2
- ldrb r0, [r4, #4]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r12, #1
- ldrb r9, [lr, r5]
- blt 4f
- ldrb r0, [r4, #5]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r12, #1
- ldrb r9, [lr, r5]
- b 4f
-1:
- ldrb r0, [r4, #6]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- ldrb r9, [lr, r5]
- bge 3f
-
- ldrb r0, [r4, #7]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- ldrb r9, [lr, r5]
- bge 2f
-
- mov r12, #5
- mov r0, #159
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r12, r12, #1
- ldrb r9, [lr, r5]
- b 4f
-2:
- mov r12, #7
- mov r0, #165
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r12, r12, #2
- ldrb r9, [lr, r5]
- mov r0, #145
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r12, r12, #1
- ldrb r9, [lr, r5]
- b 4f
-3:
- ldrb r0, [r4, #8]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- it ge
- addge r4, r4, #1
- ldrb r9, [lr, r5]
- ite ge
- movge r12, #2
- movlt r12, #0
- ldrb r0, [r4, #9]
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- mov r9, #8
- it ge
- addge r12, r12, #1
- movrelx r4, X(ff_vp8_dct_cat_prob), r1
- lsl r9, r9, r12
- ldr r4, [r4, r12, lsl #2]
- add r12, r9, #3
- mov r1, #0
- ldrb r0, [r4], #1
-1:
- ldrb r9, [lr, r5]
- lsl r1, r1, #1
- rac_get_prob r5, r6, r7, r8, r0, r9, r10
- ldrb r0, [r4], #1
- it ge
- addge r1, r1, #1
- cmp r0, #0
- bne 1b
- ldrb r9, [lr, r5]
- add r12, r12, r1
- ldr r1, [sp, #4]
-4:
- add r4, r3, r3, lsl #5
- add r4, r4, r2
- add r4, r4, #22
- rac_get_128 r5, r6, r7, r8, r9, r10
- it ge
- rsbge r12, r12, #0
- smulbb r12, r12, r11
- movrel r9, zigzag_scan-1
- ldrb r9, [r9, r3]
- cmp r3, #16
- strh r12, [r1, r9]
- bge 6b
- b 5b
-endfunc
-
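-@ Byte offsets (2 * the 4x4 zigzag position) used directly by strh into the
-@ int16_t block; the code indexes "zigzag_scan-1" because r3 has already
-@ been incremented past the current coefficient.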
-const zigzag_scan
- .byte 0, 2, 8, 16
- .byte 10, 4, 6, 12
- .byte 18, 24, 26, 20
- .byte 14, 22, 28, 30
-endconst
diff --git a/ffmpeg/libavcodec/arm/vp8dsp.h b/ffmpeg/libavcodec/arm/vp8dsp.h
deleted file mode 100644
index 6041ef1..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef AVCODEC_ARM_VP8DSP_H
-#define AVCODEC_ARM_VP8DSP_H
-
-#include "libavcodec/vp8dsp.h"
-
-void ff_vp8dsp_init_armv6(VP8DSPContext *dsp);
-void ff_vp8dsp_init_neon(VP8DSPContext *dsp);
-
-#define VP8_LF_Y(hv, inner, opt) \
- void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \
- ptrdiff_t stride, \
- int flim_E, int flim_I, \
- int hev_thresh)
-
-#define VP8_LF_UV(hv, inner, opt) \
- void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \
- uint8_t *dstV, \
- ptrdiff_t stride, \
- int flim_E, int flim_I, \
- int hev_thresh)
-
-#define VP8_LF_SIMPLE(hv, opt) \
- void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
- ptrdiff_t stride, \
- int flim)
-
-#define VP8_LF_HV(inner, opt) \
- VP8_LF_Y(h, inner, opt); \
- VP8_LF_Y(v, inner, opt); \
- VP8_LF_UV(h, inner, opt); \
- VP8_LF_UV(v, inner, opt)
-
-#define VP8_LF(opt) \
- VP8_LF_HV(, opt); \
- VP8_LF_HV(_inner, opt); \
- VP8_LF_SIMPLE(h, opt); \
- VP8_LF_SIMPLE(v, opt)
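-
-/* For example, VP8_LF(neon) declares, among others:
- *   void ff_vp8_v_loop_filter16_neon(uint8_t *, ptrdiff_t, int, int, int);
- *   void ff_vp8_h_loop_filter16_simple_neon(uint8_t *, ptrdiff_t, int);
- */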
-
-#define VP8_MC(n, opt) \
- void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \
- uint8_t *src, ptrdiff_t srcstride, \
- int h, int x, int y)
-
-#define VP8_EPEL(w, opt) \
- VP8_MC(pixels ## w, opt); \
- VP8_MC(epel ## w ## _h4, opt); \
- VP8_MC(epel ## w ## _h6, opt); \
- VP8_MC(epel ## w ## _v4, opt); \
- VP8_MC(epel ## w ## _h4v4, opt); \
- VP8_MC(epel ## w ## _h6v4, opt); \
- VP8_MC(epel ## w ## _v6, opt); \
- VP8_MC(epel ## w ## _h4v6, opt); \
- VP8_MC(epel ## w ## _h6v6, opt)
-
-#define VP8_BILIN(w, opt) \
- VP8_MC(bilin ## w ## _h, opt); \
- VP8_MC(bilin ## w ## _v, opt); \
- VP8_MC(bilin ## w ## _hv, opt)
-
-#endif /* AVCODEC_ARM_VP8DSP_H */
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S
deleted file mode 100644
index a14b188..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S
+++ /dev/null
@@ -1,1634 +0,0 @@
-/*
- * VP8 ARMv6 optimisations
- *
- * Copyright (c) 2010 Google Inc.
- * Copyright (c) 2010 Rob Clark <rob@ti.com>
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
- * This code was partially ported from libvpx, which uses this license:
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- *
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * * Neither the name of Google nor the names of its contributors may
- * be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#include "libavutil/arm/asm.S"
-
-@ idct
-
-@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
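-@ (inverse 4x4 Walsh-Hadamard of dc[]: each output is scaled as (x + 3) >> 3
-@ and stored to block[i][j][0], i.e. at 32-byte steps, while dc[] is zeroed)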
-function ff_vp8_luma_dc_wht_armv6, export=1
- push {r4-r10, lr}
-
- ldm r1, {r2-r9}
- mov r10, #0
- mov lr, #0
- uadd16 r12, r2, r8 @ t0[0,1]
- usub16 r2, r2, r8 @ t3[0,1]
- stm r1!, {r10, lr}
- uadd16 r8, r4, r6 @ t1[0,1]
- usub16 r4, r4, r6 @ t2[0,1]
- stm r1!, {r10, lr}
- uadd16 r6, r12, r8 @ dc0[0,1]
- usub16 r12, r12, r8 @ dc2[0,1]
- stm r1!, {r10, lr}
- uadd16 r8, r2, r4 @ dc1[0,1]
- usub16 r2, r2, r4 @ dc3[0,1]
- stm r1!, {r10, lr}
-
- uadd16 lr, r3, r9 @ t0[2,3]
- usub16 r3, r3, r9 @ t3[2,3]
- uadd16 r9, r5, r7 @ t1[2,3]
- usub16 r5, r5, r7 @ t2[2,3]
-
- uadd16 r7, lr, r9 @ dc0[2,3]
- usub16 lr, lr, r9 @ dc2[2,3]
- uadd16 r9, r3, r5 @ dc1[2,3]
- usub16 r3, r3, r5 @ dc3[2,3]
-
- mov r1, #3
- orr r1, r1, #0x30000 @ 3 | 3 (round)
-
- pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0]
- pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1]
- pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0]
- pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1]
- pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2]
- uadd16 r4, r4, r1
- uadd16 r5, r5, r1
- pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3]
- pkhbt r2, lr, r3, lsl #16 @ dc{2,3}[2]
- pkhtb lr, r3, lr, asr #16 @ dc{2,3}[3]
-
- uadd16 r9, r4, r7 @ t0[0,1]
- uadd16 r3, r5, lr @ t0[2,3]
- usub16 r4, r4, r7 @ t3[0,1]
- usub16 r5, r5, lr @ t3[2,3]
- uadd16 r7, r6, r8 @ t1[0,1]
- uadd16 lr, r12, r2 @ t1[2,3]
- usub16 r6, r6, r8 @ t2[0,1]
- usub16 r12, r12, r2 @ t2[2,3]
-
- uadd16 r8, r9, r7 @ block[0,1][0]
- uadd16 r2, r3, lr @ block[2,3][0]
- usub16 r9, r9, r7 @ block[0,1][2]
- usub16 r3, r3, lr @ block[2,3][2]
- uadd16 r7, r4, r6 @ block[0,1][1]
- uadd16 lr, r5, r12 @ block[2,3][1]
- usub16 r4, r4, r6 @ block[0,1][3]
- usub16 r5, r5, r12 @ block[2,3][3]
-
-#if HAVE_ARMV6T2_EXTERNAL
- sbfx r6, r8, #3, #13
- sbfx r12, r7, #3, #13
- sbfx r1, r9, #3, #13
- sbfx r10, r4, #3, #13
-#else
- sxth r6, r8
- sxth r12, r7
- sxth r1, r9
- sxth r10, r4
- asr r6, #3 @ block[0][0]
- asr r12, #3 @ block[0][1]
- asr r1, #3 @ block[0][2]
- asr r10, #3 @ block[0][3]
-#endif
-
- strh r6, [r0], #32
- asr r8, r8, #19 @ block[1][0]
- strh r12, [r0], #32
- asr r7, r7, #19 @ block[1][1]
- strh r1, [r0], #32
- asr r9, r9, #19 @ block[1][2]
- strh r10, [r0], #32
- asr r4, r4, #19 @ block[1][3]
- strh r8, [r0], #32
- asr r6, r2, #19 @ block[3][0]
- strh r7, [r0], #32
- asr r12, lr, #19 @ block[3][1]
- strh r9, [r0], #32
- asr r1, r3, #19 @ block[3][2]
- strh r4, [r0], #32
- asr r10, r5, #19 @ block[3][3]
-
-#if HAVE_ARMV6T2_EXTERNAL
- sbfx r2, r2, #3, #13
- sbfx lr, lr, #3, #13
- sbfx r3, r3, #3, #13
- sbfx r5, r5, #3, #13
-#else
- sxth r2, r2
- sxth lr, lr
- sxth r3, r3
- sxth r5, r5
- asr r2, #3 @ block[2][0]
- asr lr, #3 @ block[2][1]
- asr r3, #3 @ block[2][2]
- asr r5, #3 @ block[2][3]
-#endif
-
- strh r2, [r0], #32
- strh lr, [r0], #32
- strh r3, [r0], #32
- strh r5, [r0], #32
- strh r6, [r0], #32
- strh r12, [r0], #32
- strh r1, [r0], #32
- strh r10, [r0], #32
-
- pop {r4-r10, pc}
-endfunc
-
-@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
-function ff_vp8_luma_dc_wht_dc_armv6, export=1
- ldrsh r2, [r1]
- mov r3, #0
- add r2, r2, #3
- strh r3, [r1]
- asr r2, r2, #3
- .rept 16
- strh r2, [r0], #32
- .endr
- bx lr
-endfunc
-
-@ void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride)
-function ff_vp8_idct_add_armv6, export=1
- push {r4-r12, lr}
- sub sp, sp, #32
-
- movw r3, #20091 @ cospi8sqrt2minus1
- movw r4, #35468 @ sinpi8sqrt2
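-    @ 20091 = round((cos(pi/8)*sqrt(2) - 1) * 2^16),
-    @ 35468 = round( sin(pi/8)*sqrt(2)      * 2^16)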
- mov r5, sp
-1:
- ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0]
- ldr lr, [r1, #16] @ i9 | i8 = block2[1] | block2[0]
- ldr r12, [r1, #24] @ i13 | i12 = block3[1] | block3[0]
-
- smulwt r9, r3, r6 @ ip[5] * cospi8sqrt2minus1
- smulwb r7, r3, r6 @ ip[4] * cospi8sqrt2minus1
- smulwt r10, r4, r6 @ ip[5] * sinpi8sqrt2
- smulwb r8, r4, r6 @ ip[4] * sinpi8sqrt2
- pkhbt r7, r7, r9, lsl #16 @ 5c | 4c
- smulwt r11, r3, r12 @ ip[13] * cospi8sqrt2minus1
- pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half
- uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half
- smulwb r9, r3, r12 @ ip[12] * cospi8sqrt2minus1
- smulwt r7, r4, r12 @ ip[13] * sinpi8sqrt2
- smulwb r10, r4, r12 @ ip[12] * sinpi8sqrt2
-
- pkhbt r9, r9, r11, lsl #16 @ 13c | 12c
- ldr r11, [r1] @ i1 | i0
- pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half
- uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 2nd half
- uadd16 r6, r6, r10 @ d = t3
- uadd16 r10, r11, lr @ a = t0
- usub16 r7, r8, r7 @ c = t2
- usub16 r8, r11, lr @ b = t1
- uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0]
- usub16 r10, r10, r6 @ a-d = tmp{0,1}[3]
- uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1]
- usub16 r7, r8, r7 @ b-c = tmp{0,1}[2]
- mov r8, #0
- cmp sp, r5
- str r6, [r5, #8] @ o5 | o4
- str r7, [r5, #16] @ o9 | o8
- str r10, [r5, #24] @ o13 | o12
- str r9, [r5], #4 @ o1 | o0
- str r8, [r1, #8]
- str r8, [r1, #16]
- str r8, [r1, #24]
- str r8, [r1], #4
- beq 1b
-
- mov r5, #2
-2:
- pop {r1, r6, r12, lr}
- smulwt r9, r3, r12 @ ip[5] * cospi8sqrt2minus1
- smulwt r7, r3, r1 @ ip[1] * cospi8sqrt2minus1
- smulwt r10, r4, r12 @ ip[5] * sinpi8sqrt2
- smulwt r8, r4, r1 @ ip[1] * sinpi8sqrt2
- pkhbt r11, r1, r12, lsl #16 @ i4 | i0 = t0/t1 first half
- pkhtb r1, r12, r1, asr #16 @ i5 | i1
- pkhbt r7, r7, r9, lsl #16 @ 5c | 1c
- pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = t2 first half
- pkhbt r9, r6, lr, lsl #16 @ i6 | i2 = t0/t1 second half
- pkhtb r12, lr, r6, asr #16 @ i7 | i3
- uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = t3 first half
- uadd16 r10, r11, r9 @ a = t0
- usub16 r9, r11, r9 @ b = t1
- smulwt r7, r3, r12 @ ip[7] * cospi8sqrt2minus1
- smulwb lr, r3, r12 @ ip[3] * cospi8sqrt2minus1
- smulwt r11, r4, r12 @ ip[7] * sinpi8sqrt2
- smulwb r6, r4, r12 @ ip[3] * sinpi8sqrt2
- subs r5, r5, #1
- pkhbt r7, lr, r7, lsl #16 @ 7c | 3c
- pkhbt r11, r6, r11, lsl #16 @ 7s | 3s = t3 second half
- mov r6, #0x4
- orr r6, r6, #0x40000
- uadd16 r12, r7, r12 @ 7c+7 | 3c+3 = t2 second half
- uadd16 r10, r10, r6 @ t0 + 4
- uadd16 r9, r9, r6 @ t1 + 4
- usub16 lr, r8, r12 @ c (o5 | o1) = t2
- uadd16 r12, r11, r1 @ d (o7 | o3) = t3
- usub16 r1, r9, lr @ b-c = dst{0,1}[2]
- uadd16 r7, r10, r12 @ a+d = dst{0,1}[0]
- usub16 r12, r10, r12 @ a-d = dst{0,1}[3]
- uadd16 r10, r9, lr @ b+c = dst{0,1}[1]
-
- asr lr, r1, #3 @ o[1][2]
- asr r9, r12, #3 @ o[1][3]
- pkhtb r8, lr, r7, asr #19 @ o[1][0,2]
- pkhtb r11, r9, r10, asr #19 @ o[1][1,3]
- ldr lr, [r0]
- sxth r12, r12
- ldr r9, [r0, r2]
- sxth r1, r1
-#if HAVE_ARMV6T2_EXTERNAL
- sbfx r7, r7, #3, #13
- sbfx r10, r10, #3, #13
-#else
- sxth r7, r7
- sxth r10, r10
- asr r7, #3 @ o[0][0]
- asr r10, #3 @ o[0][1]
-#endif
- pkhbt r7, r7, r1, lsl #13 @ o[0][0,2]
- pkhbt r10, r10, r12, lsl #13 @ o[0][1,3]
-
- uxtab16 r7, r7, lr
- uxtab16 r10, r10, lr, ror #8
- uxtab16 r8, r8, r9
- uxtab16 r11, r11, r9, ror #8
- usat16 r7, #8, r7
- usat16 r10, #8, r10
- usat16 r8, #8, r8
- usat16 r11, #8, r11
- orr r7, r7, r10, lsl #8
- orr r8, r8, r11, lsl #8
- str r8, [r0, r2]
- str_post r7, r0, r2, lsl #1
-
- bne 2b
-
- pop {r4-r12, pc}
-endfunc
-
-@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride)
-function ff_vp8_idct_dc_add_armv6, export=1
- push {r4-r6, lr}
- add r6, r0, r2, lsl #1
- ldrsh r3, [r1]
- mov r4, #0
- add r3, r3, #4
- strh r4, [r1], #32
- asr r3, #3
- ldr r5, [r0]
- ldr r4, [r0, r2]
- pkhbt r3, r3, r3, lsl #16
- uxtab16 lr, r3, r5 @ a1+2 | a1+0
- uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1
- uxtab16 r12, r3, r4
- uxtab16 r4, r3, r4, ror #8
- usat16 lr, #8, lr
- usat16 r5, #8, r5
- usat16 r12, #8, r12
- usat16 r4, #8, r4
- orr lr, lr, r5, lsl #8
- ldr r5, [r6]
- orr r12, r12, r4, lsl #8
- ldr r4, [r6, r2]
- str lr, [r0]
- uxtab16 lr, r3, r5
- str r12, [r0, r2]
- uxtab16 r5, r3, r5, ror #8
- uxtab16 r12, r3, r4
- uxtab16 r4, r3, r4, ror #8
- usat16 lr, #8, lr
- usat16 r5, #8, r5
- usat16 r12, #8, r12
- usat16 r4, #8, r4
- orr lr, lr, r5, lsl #8
- orr r12, r12, r4, lsl #8
- str lr, [r6]
- str r12, [r6, r2]
- pop {r4-r6, pc}
-endfunc
-
-@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride)
-function ff_vp8_idct_dc_add4uv_armv6, export=1
- push {r4, lr}
-
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, r2, lsl #2
- sub r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
-
- pop {r4, pc}
-endfunc
-
-@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride)
-function ff_vp8_idct_dc_add4y_armv6, export=1
- push {r4, lr}
-
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
- add r0, r0, #4
- bl ff_vp8_idct_dc_add_armv6
-
- pop {r4, pc}
-endfunc
-
-@ loopfilter
-
-.macro transpose o3, o2, o1, o0, i0, i1, i2, i3
- uxtb16 \o1, \i1 @ xx 12 xx 10
- uxtb16 \o0, \i0 @ xx 02 xx 00
- uxtb16 \o3, \i3 @ xx 32 xx 30
- uxtb16 \o2, \i2 @ xx 22 xx 20
- orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00
- orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20
-
- uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11
- uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31
- uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01
- uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21
- orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01
- orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21
-
- pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02
- pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00
- pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03
- pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01
-.endm
-
-.macro simple_filter
- uqsub8 r7, r3, r6 @ p1 - q1
- uqsub8 r8, r6, r3 @ q1 - p1
- uqsub8 r10, r4, r5 @ p0 - q0
- uqsub8 r9, r5, r4 @ q0 - p0
- orr r7, r7, r8 @ abs(p1 - q1)
- orr r9, r9, r10 @ abs(p0 - q0)
- uhadd8 r7, r7, lr @ abs(p1 - q2) >> 1
- uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2
- uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1-q1)/2
- mvn r8, #0
- usub8 r10, r12, r7 @ compare to flimit
- sel r10, r8, lr @ filter mask: F or 0
- cmp r10, #0
- beq 2f
-
- eor r3, r3, r2 @ ps1
- eor r6, r6, r2 @ qs1
- eor r4, r4, r2 @ ps0
- eor r5, r5, r2 @ qs0
-
- qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
- qsub8 r6, r5, r4 @ q0 - p0
- qadd8 r3, r3, r6 @ += q0 - p0
- lsr r7, r2, #5 @ 0x04040404
- qadd8 r3, r3, r6 @ += q0 - p0
- sub r9, r7, r2, lsr #7 @ 0x03030303
- qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)
- and r3, r3, r10 @ vp8_filter &= mask
-
- qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3
- qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4
-
- shadd8 r9, r9, lr
- shadd8 r3, r3, lr
- shadd8 r9, r9, lr
- shadd8 r3, r3, lr
- shadd8 r9, r9, lr @ Filter2 >>= 3
- shadd8 r3, r3, lr @ Filter1 >>= 3
-
- qadd8 r4, r4, r9 @ u = p0 + Filter2
- qsub8 r5, r5, r3 @ u = q0 - Filter1
- eor r4, r4, r2 @ *op0 = u ^ 0x80
- eor r5, r5, r2 @ *oq0 = u ^ 0x80
-.endm
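-
-@ simple_filter implements the VP8 simple loop filter on 0x80-biased pixels:
-@ where 2*|p0-q0| + |p1-q1|/2 <= flim, it forms f = clamp(p1-q1) + 3*(q0-p0)
-@ and updates p0 += clamp(f+3)>>3, q0 -= clamp(f+4)>>3, all with signed
-@ saturation (the shadd8-with-zero triples are a per-byte arithmetic >>3).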
-
-@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
-function ff_vp8_v_loop_filter16_simple_armv6, export=1
- push {r4-r11, lr}
-
- orr r2, r2, r2, lsl #16
- mov r11, #4
- mov lr, #0
- orr r12, r2, r2, lsl #8
- mov32 r2, 0x80808080
-1:
- ldr_nreg r3, r0, r1, lsl #1 @ p1
- ldr_nreg r4, r0, r1 @ p0
- ldr r5, [r0] @ q0
- ldr r6, [r0, r1] @ q1
- simple_filter
-T sub r7, r0, r1
- str r5, [r0] @ oq0
-A str r4, [r0, -r1] @ op0
-T str r4, [r7]
-2:
- subs r11, r11, #1
- add r0, r0, #4
- bne 1b
-
- pop {r4-r11, pc}
-endfunc
-
-.macro filter_mask_p
- uqsub8 r6, r9, r10 @ p3 - p2
- uqsub8 r7, r10, r9 @ p2 - p3
- uqsub8 r8, r10, r11 @ p2 - p1
- uqsub8 r10, r11, r10 @ p1 - p2
- orr r6, r6, r7 @ abs(p3-p2)
- orr r8, r8, r10 @ abs(p2-p1)
- uqsub8 lr, r6, r2 @ compare to limit
- uqsub8 r8, r8, r2 @ compare to limit
- uqsub8 r6, r11, r12 @ p1 - p0
- orr lr, lr, r8
- uqsub8 r7, r12, r11 @ p0 - p1
- orr r6, r6, r7 @ abs(p1-p0)
- uqsub8 r7, r6, r2 @ compare to limit
- uqsub8 r8, r6, r3 @ compare to thresh
- orr lr, lr, r7
-.endm
-
-.macro filter_mask_pq
- uqsub8 r6, r11, r10 @ p1 - q1
- uqsub8 r7, r10, r11 @ q1 - p1
- uqsub8 r11, r12, r9 @ p0 - q0
- uqsub8 r12, r9, r12 @ q0 - p0
- orr r6, r6, r7 @ abs(p1-q1)
- orr r12, r11, r12 @ abs(p0-q0)
- mov32 r7, 0x7f7f7f7f
- uqadd8 r12, r12, r12 @ abs(p0-q0) * 2
- and r6, r7, r6, lsr #1 @ abs(p1-q1) / 2
- uqadd8 r12, r12, r6 @ abs(p0-q0) * 2 + abs(p1-q1)/2
-.endm
-
-.macro filter_mask_v
- filter_mask_p
-
- ldr r10, [r0, r1] @ q1
- ldr_post r9, r0, r1, lsl #1 @ q0
-
- filter_mask_pq
-
- ldr r11, [r0] @ q2
-
- uqsub8 r7, r9, r10 @ q0 - q1
- uqsub8 r6, r10, r9 @ q1 - q0
- uqsub8 r12, r12, r4 @ compare to flimit
- uqsub8 r9, r11, r10 @ q2 - q1
- uqsub8 r10, r10, r11 @ q1 - q2
- orr lr, lr, r12
- ldr r12, [r0, r1] @ q3
- orr r6, r7, r6 @ abs(q1-q0)
- orr r10, r9, r10 @ abs(q2-q1)
- uqsub8 r9, r12, r11 @ q3 - q2
- uqsub8 r11, r11, r12 @ q2 - q3
- uqsub8 r7, r6, r2 @ compare to limit
- uqsub8 r10, r10, r2 @ compare to limit
- uqsub8 r6, r6, r3 @ compare to thresh
- orr r9, r9, r11 @ abs(q3-q2)
- orr lr, lr, r7
- orr lr, lr, r10
- uqsub8 r9, r9, r2 @ compare to limit
- orr lr, lr, r9
-
- mov r12, #0
- usub8 lr, r12, lr
- mvn r11, #0
- sel lr, r11, r12 @ filter mask
- sub r0, r0, r1, lsl #1
-.endm
-
-.macro filter_mask_h
- transpose r12, r11, r10, r9, r6, r7, r8, lr
-
- filter_mask_p
-
- stm sp, {r8, r11, r12, lr}
- sub r0, r0, r1, lsl #2
- add r0, r0, #4
-
- ldr r7, [r0, r1]
- ldr_post r6, r0, r1, lsl #1
- ldr lr, [r0, r1]
- ldr r8, [r0]
-
- transpose r12, r11, r10, r9, r6, r7, r8, lr
-
- uqsub8 r8, r12, r11 @ q3 - q2
- uqsub8 lr, r11, r12 @ q2 - q3
- uqsub8 r7, r9, r10 @ q0 - q1
- uqsub8 r6, r10, r9 @ q1 - q0
- uqsub8 r12, r11, r10 @ q2 - q1
- uqsub8 r11, r10, r11 @ q1 - q2
- orr r8, r8, lr @ abs(q3-q2)
- orr r6, r7, r6 @ abs(q1-q0)
- orr r11, r12, r11 @ abs(q2-q1)
- ldr lr, [sp, #12] @ load back (f)limit accumulator
- uqsub8 r8, r8, r2 @ compare to limit
- uqsub8 r7, r6, r2 @ compare to limit
- uqsub8 r11, r11, r2 @ compare to limit
- orr lr, lr, r8
- uqsub8 r8, r6, r3 @ compare to thresh
- orr lr, lr, r7
- ldr r12, [sp, #8] @ p1
- orr lr, lr, r11
-
- ldr r11, [sp, #4] @ p0
-
- filter_mask_pq
-
- mov r10, #0
- uqsub8 r12, r12, r4 @ compare to flimit
- mvn r11, #0
- orr lr, lr, r12
- usub8 lr, r10, lr
- sel lr, r11, r10 @ filter mask
-.endm
-
-.macro filter inner
- mov32 r12, 0x80808080
- eor r11, r7, r12 @ ps1
- eor r8, r8, r12 @ ps0
- eor r9, r9, r12 @ qs0
- eor r10, r10, r12 @ qs1
-
- stm sp, {r8-r11}
-
- qsub8 r7, r11, r10 @ vp8_signed_char_clamp(ps1-qs1)
- qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
- .if \inner
- and r7, r7, r6 @ vp8_filter &= hev
- .endif
- qadd8 r7, r7, r8
- lsr r10, r12, #5 @ 0x04040404
- qadd8 r7, r7, r8
- sub r9, r10, r12, lsr #7 @ 0x03030303
- qadd8 r7, r7, r8
-
- and r7, r7, lr @ vp8_filter &= mask
- .if !\inner
- mov r12, r7 @ Filter2
- and r7, r7, r6 @ Filter2 &= hev
- .endif
- qadd8 lr, r7, r9 @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
- qadd8 r7, r7, r10 @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)
-
- mov r9, #0
- shadd8 lr, lr, r9 @ Filter2 >>= 3
- shadd8 r7, r7, r9 @ Filter1 >>= 3
- shadd8 lr, lr, r9
- shadd8 r7, r7, r9
- shadd8 lr, lr, r9 @ Filter2
- shadd8 r7, r7, r9 @ Filter1
-.endm
-
-.macro filter_v inner
- orr r10, r6, r8 @ calculate vp8_hevmask
- ldr_nreg r7, r0, r1, lsl #1 @ p1
- usub8 r10, r12, r10
- ldr_nreg r8, r0, r1 @ p0
- sel r6, r12, r11 @ obtain vp8_hevmask
- ldr r9, [r0] @ q0
- ldr r10, [r0, r1] @ q1
- filter \inner
-.endm
-
-.macro filter_h inner
- orr r9, r6, r8
- usub8 r9, r12, r9
- sel r6, r12, r11 @ hev mask
-
- stm sp, {r6, lr}
-
- ldr_nreg r12, r0, r1, lsl #1
- ldr_nreg r11, r0, r1
- ldr r6, [r0]
- ldr lr, [r0, r1]
-
- transpose r10, r9, r8, r7, r12, r11, r6, lr
-
- ldm sp, {r6, lr}
- filter \inner
-.endm
-
-.macro filter_inner
- ldm sp, {r8, r9}
- lsr r10, r10, #2 @ 0x01010101
- qadd8 r8, r8, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)
- mov lr, #0
- qsub8 r9, r9, r7 @ u = vp8_signed_char_clamp(qs0 - Filter1)
- sadd8 r7, r7, r10 @ vp8_filter += 1
- ldr r10, [sp, #8] @ qs1
- shadd8 r7, r7, lr @ vp8_filter >>= 1
- eor r8, r8, r12 @ *op0 = u ^ 0x80
- bic r7, r7, r6 @ vp8_filter &= ~hev
- qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
- eor r9, r9, r12 @ *oq0 = u ^ 0x80
- qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
- eor r11, r11, r12 @ *op1 = u ^ 0x80
- eor r10, r10, r12 @ *oq1 = u ^ 0x80
-.endm
-
-.macro filter_x c0
- mov lr, \c0
- mov r7, #63
-
- sxtb16 r6, r12
- sxtb16 r10, r12, ror #8
- smlabb r8, r6, lr, r7
- smlatb r6, r6, lr, r7
- smlabb r7, r10, lr, r7
- smultb r10, r10, lr
- ssat r8, #8, r8, asr #7
- ssat r6, #8, r6, asr #7
- add r10, r10, #63
- ssat r7, #8, r7, asr #7
- ssat r10, #8, r10, asr #7
-
- pkhbt r6, r8, r6, lsl #16
- pkhbt r10, r7, r10, lsl #16
- uxtb16 r6, r6
- uxtb16 r10, r10
-
- mov32 lr, 0x80808080
-
- orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
- qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
- qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
- eor r8, r8, lr @ *oq0 = s ^ 0x80
- eor r10, r10, lr @ *op0 = s ^ 0x80
-.endm
-
-.macro filter_1
- ldm sp, {r8, r9}
- qadd8 r11, r8, lr
- qsub8 r9, r9, r7
- bic r12, r12, r6 @ vp8_filter &= ~hev
- filter_x #27
-.endm
-
-.macro filter_2
- ldr r9, [sp, #8] @ qs1
- ldr r11, [sp, #12] @ ps1
- filter_x #18
-.endm
-
-.macro filter_3
- eor r9, r9, lr
- eor r11, r11, lr
- filter_x #9
-.endm
-
-function vp8_v_loop_filter_inner_armv6
- mov r5, #4
- sub sp, sp, #16
-
- orr r2, r2, r2, lsl #16
- orr r3, r3, r3, lsl #16
- orr r6, r6, r6, lsl #16
- orr r4, r2, r2, lsl #8 @ flimE
- orr r2, r3, r3, lsl #8 @ flimI
- orr r3, r6, r6, lsl #8 @ thresh
-1:
- sub r0, r0, r1, lsl #2
- ldr r10, [r0, r1] @ p2
- ldr_post r9, r0, r1, lsl #1 @ p3
- ldr r12, [r0, r1] @ p0
- ldr_post r11, r0, r1, lsl #1 @ p1
-
- filter_mask_v
- cmp lr, #0
- beq 2f
- filter_v inner=1
- filter_inner
-
-A str r11, [r0, -r1, lsl #1] @ op1
-A str r8, [r0, -r1] @ op0
-T sub r0, r0, r1, lsl #1
-T str r8, [r0, r1]
-T str_post r11, r0, r1, lsl #1
- str r9, [r0] @ oq0
- str r10, [r0, r1] @ oq1
-2:
- add r0, r0, #4
- cmp r5, #3
- it eq
- ldreq r0, [sp, #16]
- subs r5, r5, #1
- bne 1b
-
- add sp, sp, #16
- pop {r0, r4-r11, pc}
-endfunc
-
-function ff_vp8_v_loop_filter16_inner_armv6, export=1
- push {r4-r11, lr}
- add r12, r0, #8
- push {r12}
- ldr r6, [sp, #40]
- orr r2, r2, r2, lsl #16
- b vp8_v_loop_filter_inner_armv6
-endfunc
-
-function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
- push {r1, r4-r11, lr}
- mov r1, r2
- orr r2, r3, r3, lsl #16
- ldr r3, [sp, #40]
- ldr r6, [sp, #44]
- b vp8_v_loop_filter_inner_armv6
-endfunc
-
-function vp8_v_loop_filter_armv6
- mov r5, #4
- sub sp, sp, #16
-
- orr r3, r3, r3, lsl #16
- orr r6, r6, r6, lsl #16
- orr r4, r2, r2, lsl #8 @ flimE
- orr r2, r3, r3, lsl #8 @ flimI
- orr r3, r6, r6, lsl #8 @ thresh
-1:
- sub r0, r0, r1, lsl #2
- ldr r10, [r0, r1] @ p2
- ldr_post r9, r0, r1, lsl #1 @ p3
- ldr r12, [r0, r1] @ p0
- ldr_post r11, r0, r1, lsl #1 @ p1
-
- filter_mask_v
- cmp lr, #0
- beq 2f
-
- filter_v inner=0
- filter_1
-
- str r8, [r0] @ *oq0
-A str r10, [r0, -r1] @ *op0
-T sub r0, r0, r1, lsl #1
-T str r10, [r0, r1]
-
- filter_2
-
-A str r10, [r0, -r1, lsl #1] @ *op1
-T str_post r10, r0, r1, lsl #1
- str r8, [r0, r1] @ *oq1
-
- ldr r9, [r0, r1, lsl #1] @ q2
- add r0, r0, r1
-A ldr r11, [r0, -r1, lsl #2] @ p2
-T ldr_dpre r11, r0, r1, lsl #2
-
- filter_3
-
-A str r10, [r0, -r1, lsl #2] @ *op2
-T str_post r10, r0, r1, lsl #2
- str r8, [r0, r1] @ *oq2
- sub r0, r0, r1
-2:
- add r0, r0, #4
- cmp r5, #3
- it eq
- ldreq r0, [sp, #16]
- subs r5, r5, #1
- bne 1b
-
- add sp, sp, #16
- pop {r0, r4-r11, pc}
-endfunc
-
-function ff_vp8_v_loop_filter16_armv6, export=1
- push {r4-r11, lr}
- add r12, r0, #8
- push {r12}
- ldr r6, [sp, #40]
- orr r2, r2, r2, lsl #16
- b vp8_v_loop_filter_armv6
-endfunc
-
-function ff_vp8_v_loop_filter8uv_armv6, export=1
- push {r1, r4-r11, lr}
- mov r1, r2
- orr r2, r3, r3, lsl #16
- ldr r3, [sp, #40]
- ldr r6, [sp, #44]
- b vp8_v_loop_filter_armv6
-endfunc
-
-@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
-function ff_vp8_h_loop_filter16_simple_armv6, export=1
- push {r4-r11, lr}
- orr r12, r2, r2, lsl #16
- mov32 r2, 0x80808080
- orr r12, r12, r12, lsl #8
-
- mov lr, #0
- mov r11, #4
-1:
- sub r0, r0, #2
- ldr r8, [r0, r1]
- ldr_post r7, r0, r1, lsl #1
- ldr r10, [r0, r1]
- ldr_post r9, r0, r1, lsl #1
- add r0, r0, #2
- transpose r6, r5, r4, r3, r7, r8, r9, r10
- simple_filter
- sub r0, r0, r1, lsl #2
- sub r0, r0, #1
-
- uxtb16 r6, r4
- uxtb16 r8, r5
- uxtb16 r7, r4, ror #8
- uxtb16 r9, r5, ror #8
- orr r6, r6, r8, lsl #8
- orr r7, r7, r9, lsl #8
- lsr r4, r6, #16
- lsr r5, r7, #16
-
- strh_post r6, r0, r1
- strh_post r7, r0, r1
- strh_post r4, r0, r1
- strh_post r5, r0, r1
- add r0, r0, #1
-2:
- subs r11, r11, #1
- bne 1b
-
- pop {r4-r11, pc}
-endfunc
-
-function vp8_h_loop_filter_inner_armv6
- mov r5, #4
- sub sp, sp, #16
-
- orr r3, r3, r3, lsl #16
- orr r9, r9, r9, lsl #16
- orr r4, r2, r2, lsl #8 @ flimE
- orr r2, r3, r3, lsl #8 @ flimI
- orr r3, r9, r9, lsl #8 @ thresh
- sub r0, r0, #4
-1:
- ldr r7, [r0, r1]
- ldr_post r6, r0, r1, lsl #1
- ldr lr, [r0, r1]
- ldr_post r8, r0, r1, lsl #1
-
- filter_mask_h
-
- cmp lr, #0
- sub r0, r0, #2
- beq 2f
-
- ldr r6, [sp]
-
- filter_h inner=1
- filter_inner
-
- transpose lr, r12, r7, r6, r11, r8, r9, r10
-
-A str r6, [r0, -r1, lsl #1]
-A str r7, [r0, -r1]
-T sub r0, r0, r1, lsl #1
-T str r7, [r0, r1]
-T str_post r6, r0, r1, lsl #1
- str r12, [r0]
- str lr, [r0, r1]
-2:
- sub r0, r0, #2
- add r0, r0, r1, lsl #1
- cmp r5, #3
- it eq
- ldreq r0, [sp, #16]
- subs r5, r5, #1
- bne 1b
-
- add sp, sp, #16
- pop {r0, r4-r11, pc}
-endfunc
-
-function ff_vp8_h_loop_filter16_inner_armv6, export=1
- push {r4-r11, lr}
- add r12, r0, r1, lsl #3
- sub r12, r12, #4
- push {r12}
- ldr r9, [sp, #40]
- orr r2, r2, r2, lsl #16
- b vp8_h_loop_filter_inner_armv6
-endfunc
-
-function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
- sub r1, r1, #4
- push {r1, r4-r11, lr}
- mov r1, r2
- orr r2, r3, r3, lsl #16
- ldr r3, [sp, #40]
- ldr r9, [sp, #44]
- b vp8_h_loop_filter_inner_armv6
-endfunc
-
-function vp8_h_loop_filter_armv6
- mov r5, #4
- sub sp, sp, #16
-
- orr r3, r3, r3, lsl #16
- orr r9, r9, r9, lsl #16
- orr r4, r2, r2, lsl #8 @ flimE
- orr r2, r3, r3, lsl #8 @ flimI
- orr r3, r9, r9, lsl #8 @ thresh
-1:
- sub r0, r0, #4
- ldr r7, [r0, r1]
- ldr_post r6, r0, r1, lsl #1
- ldr lr, [r0, r1]
- ldr_post r8, r0, r1, lsl #1
-
- filter_mask_h
- cmp lr, #0
- it eq
- addeq r0, r0, r1, lsl #1
- beq 2f
-
- ldr r6, [sp]
- sub r0, r0, #2
-
- filter_h inner=0
- filter_1
-
- sub r0, r0, r1, lsl #1
- uxtb16 r6, r10
- uxtb16 r7, r8
- uxtb16 r10, r10, ror #8
- uxtb16 r8, r8, ror #8
- orr r6, r6, r7, lsl #8
- orr r10, r10, r8, lsl #8
- lsr r7, r6, #16
- lsr r8, r10, #16
-
- add r0, r0, #1
- strh_post r6, r0, r1
- strh_post r10, r0, r1
- strh_post r7, r0, r1
- strh_post r8, r0, r1
-
- filter_2
-
- sub r0, r0, r1, lsl #2
- add r0, r0, #3
-
- ldrb r11, [r0, #-5] @ p2 for 1/7th difference
- strb r10, [r0, #-4] @ op1
- strb r8, [r0, #-1] @ oq1
- ldrb_post r9, r0, r1 @ q2 for 1/7th difference
-
- lsr r10, r10, #8
- lsr r8, r8, #8
-
- ldrb r6, [r0, #-5]
- strb r10, [r0, #-4]
- strb r8, [r0, #-1]
- ldrb_post r7, r0, r1
-
- lsr r10, r10, #8
- lsr r8, r8, #8
- orr r11, r11, r6, lsl #8
- orr r9, r9, r7, lsl #8
-
- ldrb r6, [r0, #-5]
- strb r10, [r0, #-4]
- strb r8, [r0, #-1]
- ldrb_post r7, r0, r1
-
- lsr r10, r10, #8
- lsr r8, r8, #8
- orr r11, r11, r6, lsl #16
- orr r9, r9, r7, lsl #16
-
- ldrb r6, [r0, #-5]
- strb r10, [r0, #-4]
- strb r8, [r0, #-1]
- ldrb_post r7, r0, r1
- orr r11, r11, r6, lsl #24
- orr r9, r9, r7, lsl #24
-
- filter_3
-
- sub r0, r0, r1, lsl #2
- strb r10, [r0, #-5]
- strb_post r8, r0, r1
- lsr r10, r10, #8
- lsr r8, r8, #8
- strb r10, [r0, #-5]
- strb_post r8, r0, r1
- lsr r10, r10, #8
- lsr r8, r8, #8
- strb r10, [r0, #-5]
- strb_post r8, r0, r1
- lsr r10, r10, #8
- lsr r8, r8, #8
- strb r10, [r0, #-5]
- strb_post r8, r0, r1
-
- sub r0, r0, #2
-2:
- cmp r5, #3
- it eq
- ldreq r0, [sp, #16]
- subs r5, r5, #1
- bne 1b
-
- add sp, sp, #16
- pop {r0, r4-r11, pc}
-endfunc
-
-function ff_vp8_h_loop_filter16_armv6, export=1
- push {r4-r11, lr}
- add r12, r0, r1, lsl #3
- push {r12}
- ldr r9, [sp, #40]
- orr r2, r2, r2, lsl #16
- b vp8_h_loop_filter_armv6
-endfunc
-
-function ff_vp8_h_loop_filter8uv_armv6, export=1
- push {r1, r4-r11, lr}
- mov r1, r2
- orr r2, r3, r3, lsl #16
- ldr r3, [sp, #40]
- ldr r9, [sp, #44]
- b vp8_h_loop_filter_armv6
-endfunc
-
-.ltorg
-
-@ MC
-
-@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
-@ int srcstride, int h, int mx, int my)
-function ff_put_vp8_pixels16_armv6, export=1
- push {r4-r11}
- ldr r12, [sp, #32] @ h
-1:
- subs r12, r12, #2
- ldr r5, [r2, #4]
- ldr r6, [r2, #8]
- ldr r7, [r2, #12]
- ldr_post r4, r2, r3
- ldr r9, [r2, #4]
- ldr r10, [r2, #8]
- ldr r11, [r2, #12]
- ldr_post r8, r2, r3
- strd r6, r7, [r0, #8]
- strd_post r4, r5, r0, r1
- strd r10, r11, [r0, #8]
- strd_post r8, r9, r0, r1
- bgt 1b
- pop {r4-r11}
- bx lr
-endfunc
-
-@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
-@ int srcstride, int h, int mx, int my)
-function ff_put_vp8_pixels8_armv6, export=1
- push {r4-r11}
- ldr r12, [sp, #32] @ h
-1:
- subs r12, r12, #4
- ldr r5, [r2, #4]
- ldr_post r4, r2, r3
- ldr r7, [r2, #4]
- ldr_post r6, r2, r3
- ldr r9, [r2, #4]
- ldr_post r8, r2, r3
- ldr r11, [r2, #4]
- ldr_post r10, r2, r3
- strd_post r4, r5, r0, r1
- strd_post r6, r7, r0, r1
- strd_post r8, r9, r0, r1
- strd_post r10, r11, r0, r1
- bgt 1b
- pop {r4-r11}
- bx lr
-endfunc
-
-@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
-@ int srcstride, int h, int mx, int my)
-function ff_put_vp8_pixels4_armv6, export=1
- ldr r12, [sp, #0] @ h
- push {r4-r6,lr}
-1:
- subs r12, r12, #4
- ldr_post r4, r2, r3
- ldr_post r5, r2, r3
- ldr_post r6, r2, r3
- ldr_post lr, r2, r3
- str_post r4, r0, r1
- str_post r5, r0, r1
- str_post r6, r0, r1
- str_post lr, r0, r1
- bgt 1b
- pop {r4-r6,pc}
-endfunc
-
-@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so 16-bit
-@ arithmetic can be used to apply the filters
-const sixtap_filters_13245600, align=4
- .short 2, 108, -11, 36, -8, 1, 0, 0
- .short 3, 77, -16, 77, -16, 3, 0, 0
- .short 1, 36, -8, 108, -11, 2, 0, 0
-endconst
-
-const fourtap_filters_1324, align=4
- .short -6, 12, 123, -1
- .short -9, 50, 93, -6
- .short -6, 93, 50, -9
- .short -1, 123, 12, -6
-endconst
-
-.macro vp8_mc_1 name, size, hv
-function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1
- sub r1, r1, #\size
- mov r12, sp @ r12 -> stacked h, mx, my
- push {r1, r4-r11, lr}
- ldm r12, {r5-r7} @ h, mx, my
- mov r4, #\size
- stm r12, {r4, r5} @ rewrite as block_width, height
- orr r12, r6, r7 @ filter index (the unused one of mx/my is 0)
- b vp8_put_\name\()_\hv\()_armv6 + 4 @ + 4 skips the target's push; we pushed the same set
-endfunc
-.endm
-
-vp8_mc_1 epel, 16, h6
-vp8_mc_1 epel, 16, v6
-vp8_mc_1 epel, 8, h6
-vp8_mc_1 epel, 8, v6
-vp8_mc_1 epel, 8, h4
-vp8_mc_1 epel, 8, v4
-vp8_mc_1 epel, 4, h6
-vp8_mc_1 epel, 4, v6
-vp8_mc_1 epel, 4, h4
-vp8_mc_1 epel, 4, v4
-
-vp8_mc_1 bilin, 16, h
-vp8_mc_1 bilin, 16, v
-vp8_mc_1 bilin, 8, h
-vp8_mc_1 bilin, 8, v
-vp8_mc_1 bilin, 4, h
-vp8_mc_1 bilin, 4, v
-
-/* True relational expressions have the value -1 in the GNU assembler,
- +1 in Apple's. */
-#ifdef __APPLE__
-# define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1)
-#else
-# define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
-#endif
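-/* e.g. size=16, ytaps=6 on GNU as: 16 * (8 - 8*(-1) + 5) = 16 * 21 = 336,
-   i.e. room for the 16-wide, h + 5 row output of the first pass */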
-
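-@ 2-pass MC: the h pass filters h + ytaps - 1 rows into a stack scratch
-@ buffer at sp + 16, then the v pass filters that buffer into the real dst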
-.macro vp8_mc_hv name, size, h, v, ytaps
-function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
- push {r0, r1, r4, lr}
- add r0, sp, #16
- sub sp, sp, #TMPSIZE+16
- ldm r0, {r0, r12}
- mov r4, #\size
- add lr, r0, #\ytaps-1
- .if \ytaps > 2
- sub r2, r2, r3, lsl #\ytaps >> 1 & 1
- .endif
- stm sp, {r4, lr}
- add r0, sp, #16
- mov r1, #0
- bl vp8_put_\name\()_\h\()_armv6
- add r0, sp, #TMPSIZE+16
- ldr lr, [sp, #TMPSIZE+16+16]
- ldm r0, {r0, r1}
- mov r3, #\size
- ldr r12, [sp, #TMPSIZE+16+16+8]
- str lr, [sp, #4]
- add r2, sp, #16 + \size * (\ytaps / 2 - 1)
- sub r1, r1, #\size
- bl vp8_put_\name\()_\v\()_armv6
- add sp, sp, #TMPSIZE+16+8
- pop {r4, pc}
-endfunc
-.endm
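-@ e.g. epel8_h6v6: TMPSIZE = 168 bytes holds the 8 x (h + 5) first-pass rows;
-@ the caller's h and my are reloaded from sp + TMPSIZE + 32 and + 40 above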
-
-vp8_mc_hv epel, 16, h6, v6, 6
-vp8_mc_hv epel, 8, h6, v6, 6
-vp8_mc_hv epel, 8, h4, v6, 6
-vp8_mc_hv epel, 8, h6, v4, 4
-vp8_mc_hv epel, 8, h4, v4, 4
-vp8_mc_hv epel, 4, h6, v6, 6
-vp8_mc_hv epel, 4, h4, v6, 6
-vp8_mc_hv epel, 4, h6, v4, 4
-vp8_mc_hv epel, 4, h4, v4, 4
-
-vp8_mc_hv bilin, 16, h, v, 2
-vp8_mc_hv bilin, 8, h, v, 2
-vp8_mc_hv bilin, 4, h, v, 2
-
-.macro sat4 r0, r1, r2, r3
- asr \r0, \r0, #7 @ drop the 128 filter scale
- asr \r1, \r1, #7
- pkhbt \r0, \r0, \r2, lsl #9 @ low half \r0 >> 7, high half \r2 >> 7
- pkhbt \r1, \r1, \r3, lsl #9
- usat16 \r0, #8, \r0 @ clamp both halfwords to 0..255
- usat16 \r1, #8, \r1
- orr \r0, \r0, \r1, lsl #8 @ interleave into four packed pixels
-.endm
-
-@ Calling convention for the inner MC functions:
-@ r0 dst
-@ r1 dst_stride - block_width
-@ r2 src
-@ r3 src_stride
-@ r4 block_width
-@ r12 filter_index
-@ [sp] block_width
-@ [sp+4] height
-@ [sp+8] scratch
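-@ vp8_mc_1 and vp8_mc_hv above set up exactly this layout before branching
-@ into the functions below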
-
-function vp8_put_epel_h6_armv6
- push {r1, r4-r11, lr}
- sub r2, r2, #2
- movrel lr, sixtap_filters_13245600 - 16
- add lr, lr, r12, lsl #3
- sub r3, r3, r4
- str r3, [sp, #48]
- ldm lr, {r1, r3, lr}
-1:
- ldr r7, [r2, #5] @ src[5-8]
- ldr r6, [r2, #2] @ src[2-5]
- ldr r5, [r2], #4 @ src[0-3]
-
- pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6]
- uxtb16 r9, r6, ror #8 @ src[5] | src[3]
- uxtb16 r6, r6 @ src[4] | src[2]
- uxtb16 r8, r5, ror #8 @ src[3] | src[1]
- uxtb16 r11, r7, ror #8 @ src[8] | src[7]
- uxtb16 r7, r7 @ src[7] | src[6]
- uxtb16 r5, r5 @ src[2] | src[0]
-
- mov r10, #0x40
- smlad r5, r5, r1, r10 @ filter[0][0]
- smlad r11, r11, lr, r10 @ filter[3][2]
- smlad r12, r7, lr, r10 @ filter[2][2]
- smlad r10, r8, r1, r10 @ filter[1][0]
- smlad r5, r8, r3, r5 @ filter[0][1]
- smlad r11, r9, r1, r11 @ filter[3][0]
- smlad r12, r9, r3, r12 @ filter[2][1]
- pkhtb r9, r9, r6, asr #16 @ src[5] | src[4]
- smlad r10, r6, r3, r10 @ filter[1][1]
- pkhbt r7, r9, r7, lsl #16 @ src[6] | src[4]
- smlad r5, r9, lr, r5 @ filter[0][2]
- pkhtb r8, r7, r9, asr #16 @ src[6] | src[5]
- smlad r11, r7, r3, r11 @ filter[3][1]
- smlad r9, r8, lr, r10 @ filter[1][2]
- smlad r7, r6, r1, r12 @ filter[2][0]
-
- subs r4, r4, #4
-
- sat4 r5, r9, r7, r11
- str r5, [r0], #4
-
- bne 1b
-
- add r4, sp, #40
- ldm r4, {r4, r5, r12} @ width, rows left, src_stride - width
- ldr r6, [sp] @ dst_stride - width
- subs r5, r5, #1
- add r2, r2, r12
- str r5, [sp, #44]
- add r0, r0, r6
-
- bne 1b
-
- pop {r1, r4-r11, pc}
-endfunc
-
-function vp8_put_epel_v6_armv6
- push {r1, r4-r11, lr}
- movrel lr, sixtap_filters_13245600 - 16
- add lr, lr, r12, lsl #3
- str r3, [sp, #48]
-1:
- add r1, r3, r3, lsl #1 @ stride * 3
- ldr_nreg r5, r2, r3 @ src[0,1,2,3 + stride * 1]
- ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3]
- ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4]
- ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5]
-
- uxtb16 r9, r5, ror #8 @ src[3 + s*1] | src[1 + s*1]
- uxtb16 r10, r6, ror #8 @ src[3 + s*3] | src[1 + s*3]
- uxtb16 r11, r7, ror #8 @ src[3 + s*4] | src[1 + s*4]
- uxtb16 r12, r8, ror #8 @ src[3 + s*5] | src[1 + s*5]
- uxtb16 r5, r5 @ src[2 + s*1] | src[0 + s*1]
- uxtb16 r6, r6 @ src[2 + s*3] | src[0 + s*3]
- uxtb16 r7, r7 @ src[2 + s*4] | src[0 + s*4]
- uxtb16 r8, r8 @ src[2 + s*5] | src[0 + s*5]
- pkhbt r1, r9, r10, lsl #16 @ src[1 + s*3] | src[1 + s*1]
- pkhtb r9, r10, r9, asr #16 @ src[3 + s*3] | src[3 + s*1]
- pkhbt r10, r11, r12, lsl #16 @ src[1 + s*5] | src[1 + s*4]
- pkhtb r11, r12, r11, asr #16 @ src[3 + s*5] | src[3 + s*4]
- pkhbt r12, r5, r6, lsl #16 @ src[0 + s*3] | src[0 + s*1]
- pkhtb r5, r6, r5, asr #16 @ src[2 + s*3] | src[2 + s*1]
- pkhbt r6, r7, r8, lsl #16 @ src[0 + s*5] | src[0 + s*4]
- pkhtb r7, r8, r7, asr #16 @ src[2 + s*5] | src[2 + s*4]
-
- ldr r8, [lr, #4]
- mov r3, #0x40
- smlad r12, r12, r8, r3 @ filter[0][1]
- smlad r1, r1, r8, r3 @ filter[1][1]
- smlad r5, r5, r8, r3 @ filter[2][1]
- smlad r9, r9, r8, r3 @ filter[3][1]
- ldr r8, [lr, #8]
- ldr r3, [sp, #48]
- smlad r12, r6, r8, r12 @ filter[0][2]
- smlad r1, r10, r8, r1 @ filter[1][2]
- ldr_nreg r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0]
- ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2]
- smlad r5, r7, r8, r5 @ filter[2][2]
- smlad r9, r11, r8, r9 @ filter[3][2]
-
- uxtb16 r7, r6, ror #8 @ src[3 + s*0] | src[1 + s*0]
- uxtb16 r11, r10, ror #8 @ src[3 + s*2] | src[1 + s*2]
- uxtb16 r6, r6 @ src[2 + s*0] | src[0 + s*0]
- uxtb16 r10, r10 @ src[2 + s*2] | src[0 + s*2]
-
- pkhbt r8, r7, r11, lsl #16 @ src[1 + s*2] | src[1 + s*0]
- pkhtb r7, r11, r7, asr #16 @ src[3 + s*2] | src[3 + s*0]
- pkhbt r11, r6, r10, lsl #16 @ src[0 + s*2] | src[0 + s*0]
- pkhtb r6, r10, r6, asr #16 @ src[2 + s*2] | src[2 + s*0]
-
- ldr r10, [lr]
- subs r4, r4, #4
- smlad r12, r11, r10, r12 @ filter[0][0]
- smlad r1, r8, r10, r1 @ filter[1][0]
- smlad r5, r6, r10, r5 @ filter[2][0]
- smlad r9, r7, r10, r9 @ filter[3][0]
-
- sat4 r12, r1, r5, r9
- str r12, [r0], #4
-
- bne 1b
-
- ldrd r4, r5, [sp, #40]
- ldr r6, [sp]
- subs r5, r5, #1
- sub r2, r2, r4
- str r5, [sp, #44]
- add r0, r0, r6
- add r2, r2, r3
-
- bne 1b
-
- pop {r1, r4-r11, pc}
-endfunc
-
-function vp8_put_epel_h4_armv6
- push {r1, r4-r11, lr}
- subs r2, r2, #1
- movrel lr, fourtap_filters_1324 - 4
- add lr, lr, r12, lsl #2
- sub r3, r3, r4
- ldm lr, {r5, r6}
- ldr lr, [sp, #44]
-1:
- ldr r9, [r2, #3]
- ldr r8, [r2, #2]
- ldr r7, [r2], #4
-
- uxtb16 r9, r9, ror #8 @ src[6] | src[4]
- uxtb16 r10, r8, ror #8 @ src[5] | src[3]
- uxtb16 r8, r8 @ src[4] | src[2]
- uxtb16 r11, r7, ror #8 @ src[3] | src[1]
- uxtb16 r7, r7 @ src[2] | src[0]
-
- mov r12, #0x40
- smlad r9, r9, r6, r12 @ filter[3][1]
- smlad r7, r7, r5, r12 @ filter[0][0]
- smlad r9, r10, r5, r9 @ filter[3][0]
- smlad r10, r10, r6, r12 @ filter[2][1]
- smlad r12, r11, r5, r12 @ filter[1][0]
- smlad r7, r11, r6, r7 @ filter[0][1]
- smlad r10, r8, r5, r10 @ filter[2][0]
- smlad r12, r8, r6, r12 @ filter[1][1]
-
- subs r4, r4, #4
-
- sat4 r7, r12, r10, r9
- str r7, [r0], #4
-
- bne 1b
-
- subs lr, lr, #1
- ldr r4, [sp, #40]
- add r2, r2, r3
- add r0, r0, r1
-
- bne 1b
-
- pop {r1, r4-r11, pc}
-endfunc
-
-function vp8_put_epel_v4_armv6
- push {r1, r4-r11, lr}
- movrel lr, fourtap_filters_1324 - 4
- add lr, lr, r12, lsl #2
- ldm lr, {r5, r6}
- str r3, [sp, #48]
-1:
- ldr lr, [r2, r3, lsl #1]
- ldr r12, [r2, r3]
- ldr_nreg r7, r2, r3
- ldr r11, [r2], #4
-
- uxtb16 r8, lr, ror #8 @ src[3 + s*3] | src[1 + s*3]
- uxtb16 r9, r12, ror #8 @ src[3 + s*2] | src[1 + s*2]
- uxtb16 r3, r7, ror #8 @ src[3 + s*0] | src[1 + s*0]
- uxtb16 r1, r11, ror #8 @ src[3 + s*1] | src[1 + s*1]
- uxtb16 lr, lr @ src[2 + s*3] | src[0 + s*3]
- uxtb16 r12, r12 @ src[2 + s*2] | src[0 + s*2]
- uxtb16 r7, r7 @ src[2 + s*0] | src[0 + s*0]
- uxtb16 r11, r11 @ src[2 + s*1] | src[0 + s*1]
- pkhbt r10, r1, r8, lsl #16 @ src[1 + s*3] | src[1 + s*1]
- pkhtb r1, r8, r1, asr #16 @ src[3 + s*3] | src[3 + s*1]
- pkhbt r8, r3, r9, lsl #16 @ src[1 + s*2] | src[1 + s*0]
- pkhtb r3, r9, r3, asr #16 @ src[3 + s*2] | src[3 + s*0]
- pkhbt r9, r11, lr, lsl #16 @ src[0 + s*3] | src[0 + s*1]
- pkhtb r11, lr, r11, asr #16 @ src[2 + s*3] | src[2 + s*1]
- pkhbt lr, r7, r12, lsl #16 @ src[0 + s*2] | src[0 + s*0]
- pkhtb r7, r12, r7, asr #16 @ src[2 + s*2] | src[2 + s*0]
-
- mov r12, #0x40
- smlad r9, r9, r6, r12 @ filter[0][1]
- smlad r10, r10, r6, r12 @ filter[1][1]
- smlad r11, r11, r6, r12 @ filter[2][1]
- smlad r1, r1, r6, r12 @ filter[3][1]
- smlad r9, lr, r5, r9 @ filter[0][0]
- smlad r10, r8, r5, r10 @ filter[1][0]
- smlad r11, r7, r5, r11 @ filter[2][0]
- smlad r1, r3, r5, r1 @ filter[3][0]
-
- subs r4, r4, #4
- ldr r3, [sp, #48]
-
- sat4 r9, r10, r11, r1
- str r9, [r0], #4
-
- bne 1b
-
- ldr r4, [sp, #40]
- ldr r12, [sp, #44]
- add r2, r2, r3
- ldr r9, [sp, #0]
- subs r12, r12, #1
- sub r2, r2, r4
- str r12, [sp, #44]
- add r0, r0, r9
-
- bne 1b
-
- pop {r1, r4-r11, pc}
-endfunc
-
-function vp8_put_bilin_h_armv6
- push {r1, r4-r11, lr}
- rsb r5, r12, r12, lsl #16 @ (mx << 16) - mx
- ldr r12, [sp, #44] @ h
- sub r3, r3, r4
- add r5, r5, #8 @ (mx << 16) | (8 - mx), the two bilinear weights
-1:
- ldrb r6, [r2], #1
- ldrb r7, [r2], #1
- ldrb r8, [r2], #1
- ldrb r9, [r2], #1
- ldrb lr, [r2]
-
- pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0]
- pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1]
- pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2]
- pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3]
-
- mov r10, #4
- smlad r6, r6, r5, r10
- smlad r7, r7, r5, r10
- smlad r8, r8, r5, r10
- smlad r9, r9, r5, r10
-
- subs r4, r4, #4
-
- asr r6, #3 @ (a*(8-mx) + b*mx + 4) >> 3
- asr r7, #3
- pkhbt r6, r6, r8, lsl #13 @ as in sat4 but no clamp needed:
- pkhbt r7, r7, r9, lsl #13 @ weights are >= 0 and sum to 8
- orr r6, r6, r7, lsl #8 @ pack four pixels
- str r6, [r0], #4
-
- bne 1b
-
- ldr r4, [sp, #40]
- subs r12, r12, #1
- add r2, r2, r3
- add r0, r0, r1
-
- bne 1b
-
- pop {r1, r4-r11, pc}
-endfunc
-
-function vp8_put_bilin_v_armv6
- push {r1, r4-r11, lr}
- rsb r5, r12, r12, lsl #16 @ (my << 16) - my
- ldr r12, [sp, #44] @ h
- add r5, r5, #8 @ (my << 16) | (8 - my)
-1:
- ldrb r10, [r2, r3]
- ldrb r6, [r2], #1
- ldrb r11, [r2, r3]
- ldrb r7, [r2], #1
- ldrb lr, [r2, r3]
- ldrb r8, [r2], #1
- ldrb r9, [r2, r3]
- pkhbt r6, r6, r10, lsl #16
- ldrb r10, [r2], #1
- pkhbt r7, r7, r11, lsl #16
- pkhbt r8, r8, lr, lsl #16
- pkhbt r9, r10, r9, lsl #16
-
- mov r10, #4
- smlad r6, r6, r5, r10
- smlad r7, r7, r5, r10
- smlad r8, r8, r5, r10
- smlad r9, r9, r5, r10
-
- subs r4, r4, #4
-
- asr r6, #3
- asr r7, #3
- pkhbt r6, r6, r8, lsl #13
- pkhbt r7, r7, r9, lsl #13
- orr r6, r6, r7, lsl #8
- str r6, [r0], #4
-
- bne 1b
-
- ldr r4, [sp, #40]
- subs r12, r12, #1
- add r2, r2, r3
- add r0, r0, r1
- sub r2, r2, r4
-
- bne 1b
- pop {r1, r4-r11, pc}
-endfunc
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c
deleted file mode 100644
index d360ae3..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/vp8dsp.h"
-#include "vp8dsp.h"
-
-av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
-{
- int cpu_flags = av_get_cpu_flags();
-
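-    /* both inits may run; ff_vp8dsp_init_neon() runs last, so on a NEON
-     * core its pointers override the armv6 ones */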
- if (have_armv6(cpu_flags))
- ff_vp8dsp_init_armv6(dsp);
- if (have_neon(cpu_flags))
- ff_vp8dsp_init_neon(dsp);
-}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c
deleted file mode 100644
index 563268e..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/vp8dsp.h"
-#include "vp8dsp.h"
-
-void ff_vp8_luma_dc_wht_armv6(int16_t block[4][4][16], int16_t dc[16]);
-void ff_vp8_luma_dc_wht_dc_armv6(int16_t block[4][4][16], int16_t dc[16]);
-
-void ff_vp8_idct_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add4y_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add4uv_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
-
-VP8_LF(armv6);
-
-VP8_EPEL(16, armv6);
-VP8_EPEL(8, armv6);
-VP8_EPEL(4, armv6);
-
-VP8_BILIN(16, armv6);
-VP8_BILIN(8, armv6);
-VP8_BILIN(4, armv6);
-
-av_cold void ff_vp8dsp_init_armv6(VP8DSPContext *dsp)
-{
- dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_armv6;
- dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6;
-
- dsp->vp8_idct_add = ff_vp8_idct_add_armv6;
- dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_armv6;
- dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_armv6;
- dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_armv6;
-
- dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_armv6;
- dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_armv6;
- dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_armv6;
- dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_armv6;
-
- dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_armv6;
- dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_armv6;
- dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_armv6;
- dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_armv6;
-
- dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_armv6;
- dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_armv6;
-
- dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
- dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_armv6;
- dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_armv6;
- dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_armv6;
-
- dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
- dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_armv6;
- dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_armv6;
- dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_armv6;
- dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_armv6;
- dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_armv6;
- dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_armv6;
- dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_armv6;
- dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_armv6;
-
- dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
- dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_armv6;
- dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_armv6;
- dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_armv6;
- dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_armv6;
- dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_armv6;
- dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_armv6;
- dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_armv6;
- dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_armv6;
-
- dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_armv6;
-
- dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_armv6;
-
- dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_armv6;
- dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_armv6;
-}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
deleted file mode 100644
index ae045a6..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/attributes.h"
-#include "libavcodec/vp8dsp.h"
-#include "vp8dsp.h"
-
-void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
-
-void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
-void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
-
-VP8_LF(neon);
-
-VP8_EPEL(16, neon);
-VP8_EPEL(8, neon);
-VP8_EPEL(4, neon);
-
-VP8_BILIN(16, neon);
-VP8_BILIN(8, neon);
-VP8_BILIN(4, neon);
-
-av_cold void ff_vp8dsp_init_neon(VP8DSPContext *dsp)
-{
- dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
-
- dsp->vp8_idct_add = ff_vp8_idct_add_neon;
- dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
- dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
- dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
-
- dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
- dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
- dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
- dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
-
- dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
- dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
- dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
- dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
-
- dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
- dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
-
- dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
- dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
- dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
- dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
-
- dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
- dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
- dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
- dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
- dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
- dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
- dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
- dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
- dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
-
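-    /* no NEON 4x4 copy: [2][0][0] in this and the bilinear table keeps the
-     * pointer installed by the armv6 (or C) init */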
- dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
- dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
- dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
- dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
- dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
- dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
- dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
- dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
-
- dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
-
- dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
-
- dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
- dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
-}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_neon.S b/ffmpeg/libavcodec/arm/vp8dsp_neon.S
deleted file mode 100644
index 436b340..0000000
--- a/ffmpeg/libavcodec/arm/vp8dsp_neon.S
+++ /dev/null
@@ -1,1876 +0,0 @@
-/*
- * VP8 NEON optimisations
- *
- * Copyright (c) 2010 Rob Clark <rob@ti.com>
- * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-#include "neon.S"
-
-function ff_vp8_luma_dc_wht_neon, export=1
- vld1.16 {q0-q1}, [r1,:128]
- vmov.i16 q15, #0
-
- vadd.i16 d4, d0, d3
- vadd.i16 d6, d1, d2
- vst1.16 {q15}, [r1,:128]!
- vsub.i16 d7, d1, d2
- vsub.i16 d5, d0, d3
- vst1.16 {q15}, [r1,:128]
- vadd.i16 q0, q2, q3
- vsub.i16 q1, q2, q3
-
- vmov.i16 q8, #3 @ rounding bias for the final >> 3
-
- vtrn.32 d0, d2
- vtrn.32 d1, d3
- vtrn.16 d0, d1
- vtrn.16 d2, d3
-
- vadd.i16 d0, d0, d16 @ bias added once propagates to all four outputs
-
- vadd.i16 d4, d0, d3
- vadd.i16 d6, d1, d2
- vsub.i16 d7, d1, d2
- vsub.i16 d5, d0, d3
- vadd.i16 q0, q2, q3
- vsub.i16 q1, q2, q3
-
- vshr.s16 q0, q0, #3
- vshr.s16 q1, q1, #3
-
- mov r3, #32
- vst1.16 {d0[0]}, [r0,:16], r3
- vst1.16 {d1[0]}, [r0,:16], r3
- vst1.16 {d2[0]}, [r0,:16], r3
- vst1.16 {d3[0]}, [r0,:16], r3
- vst1.16 {d0[1]}, [r0,:16], r3
- vst1.16 {d1[1]}, [r0,:16], r3
- vst1.16 {d2[1]}, [r0,:16], r3
- vst1.16 {d3[1]}, [r0,:16], r3
- vst1.16 {d0[2]}, [r0,:16], r3
- vst1.16 {d1[2]}, [r0,:16], r3
- vst1.16 {d2[2]}, [r0,:16], r3
- vst1.16 {d3[2]}, [r0,:16], r3
- vst1.16 {d0[3]}, [r0,:16], r3
- vst1.16 {d1[3]}, [r0,:16], r3
- vst1.16 {d2[3]}, [r0,:16], r3
- vst1.16 {d3[3]}, [r0,:16], r3
-
- bx lr
-endfunc
-
-function ff_vp8_idct_add_neon, export=1
- vld1.16 {q0-q1}, [r1,:128]
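- @ Q16 transform constants: 20091 ~ (sqrt(2)*cos(pi/8) - 1) << 16 and
- @ 35468 ~ sqrt(2)*sin(pi/8) << 16, the latter halved since vqdmulh doubles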
- movw r3, #20091
- movt r3, #35468/2
- vdup.32 d4, r3
-
- vmull.s16 q12, d1, d4[0]
- vmull.s16 q13, d3, d4[0]
- vqdmulh.s16 d20, d1, d4[1]
- vqdmulh.s16 d23, d3, d4[1]
- vshrn.s32 d21, q12, #16
- vshrn.s32 d22, q13, #16
- vadd.s16 d21, d21, d1
- vadd.s16 d22, d22, d3
-
- vadd.s16 d16, d0, d2
- vsub.s16 d17, d0, d2
- vadd.s16 d18, d21, d23
- vsub.s16 d19, d20, d22
- vadd.s16 q0, q8, q9
- vsub.s16 q1, q8, q9
-
- vtrn.32 d0, d3
- vtrn.32 d1, d2
- vtrn.16 d0, d1
- vtrn.16 d3, d2
-
- vmov.i16 q15, #0
- vmull.s16 q12, d1, d4[0]
- vst1.16 {q15}, [r1,:128]!
- vmull.s16 q13, d2, d4[0]
- vst1.16 {q15}, [r1,:128]
- vqdmulh.s16 d21, d1, d4[1]
- vqdmulh.s16 d23, d2, d4[1]
- vshrn.s32 d20, q12, #16
- vshrn.s32 d22, q13, #16
- vadd.i16 d20, d20, d1
- vadd.i16 d22, d22, d2
-
- vadd.i16 d16, d0, d3
- vsub.i16 d17, d0, d3
- vadd.i16 d18, d20, d23
- vld1.32 {d20[]}, [r0,:32], r2
- vsub.i16 d19, d21, d22
- vld1.32 {d22[]}, [r0,:32], r2
- vadd.s16 q0, q8, q9
- vld1.32 {d23[]}, [r0,:32], r2
- vsub.s16 q1, q8, q9
- vld1.32 {d21[]}, [r0,:32], r2
- vrshr.s16 q0, q0, #3
- vtrn.32 q10, q11
- vrshr.s16 q1, q1, #3
-
- sub r0, r0, r2, lsl #2
-
- vtrn.32 d0, d3
- vtrn.32 d1, d2
- vtrn.16 d0, d1
- vtrn.16 d3, d2
-
- vaddw.u8 q0, q0, d20
- vaddw.u8 q1, q1, d21
- vqmovun.s16 d0, q0
- vqmovun.s16 d1, q1
-
- vst1.32 {d0[0]}, [r0,:32], r2
- vst1.32 {d0[1]}, [r0,:32], r2
- vst1.32 {d1[1]}, [r0,:32], r2
- vst1.32 {d1[0]}, [r0,:32], r2
-
- bx lr
-endfunc
-
-function ff_vp8_idct_dc_add_neon, export=1
- mov r3, #0
- ldrsh r12, [r1]
- strh r3, [r1]
- vdup.16 q1, r12
- vrshr.s16 q1, q1, #3
- vld1.32 {d0[]}, [r0,:32], r2
- vld1.32 {d1[]}, [r0,:32], r2
- vld1.32 {d0[1]}, [r0,:32], r2
- vld1.32 {d1[1]}, [r0,:32], r2
- vaddw.u8 q2, q1, d0
- vaddw.u8 q3, q1, d1
- sub r0, r0, r2, lsl #2
- vqmovun.s16 d0, q2
- vqmovun.s16 d1, q3
- vst1.32 {d0[0]}, [r0,:32], r2
- vst1.32 {d1[0]}, [r0,:32], r2
- vst1.32 {d0[1]}, [r0,:32], r2
- vst1.32 {d1[1]}, [r0,:32], r2
- bx lr
-endfunc
-
-function ff_vp8_idct_dc_add4uv_neon, export=1
- vmov.i16 d0, #0
- mov r3, #32
- vld1.16 {d16[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d17[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d18[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d19[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- mov r3, r0
- vrshr.s16 q8, q8, #3 @ dc >>= 3
- vld1.8 {d0}, [r0,:64], r2
- vrshr.s16 q9, q9, #3
- vld1.8 {d1}, [r0,:64], r2
- vaddw.u8 q10, q8, d0
- vld1.8 {d2}, [r0,:64], r2
- vaddw.u8 q0, q8, d1
- vld1.8 {d3}, [r0,:64], r2
- vaddw.u8 q11, q8, d2
- vld1.8 {d4}, [r0,:64], r2
- vaddw.u8 q1, q8, d3
- vld1.8 {d5}, [r0,:64], r2
- vaddw.u8 q12, q9, d4
- vld1.8 {d6}, [r0,:64], r2
- vaddw.u8 q2, q9, d5
- vld1.8 {d7}, [r0,:64], r2
- vaddw.u8 q13, q9, d6
- vqmovun.s16 d20, q10
- vaddw.u8 q3, q9, d7
- vqmovun.s16 d21, q0
- vqmovun.s16 d22, q11
- vst1.8 {d20}, [r3,:64], r2
- vqmovun.s16 d23, q1
- vst1.8 {d21}, [r3,:64], r2
- vqmovun.s16 d24, q12
- vst1.8 {d22}, [r3,:64], r2
- vqmovun.s16 d25, q2
- vst1.8 {d23}, [r3,:64], r2
- vqmovun.s16 d26, q13
- vst1.8 {d24}, [r3,:64], r2
- vqmovun.s16 d27, q3
- vst1.8 {d25}, [r3,:64], r2
- vst1.8 {d26}, [r3,:64], r2
- vst1.8 {d27}, [r3,:64], r2
-
- bx lr
-endfunc
-
-function ff_vp8_idct_dc_add4y_neon, export=1
- vmov.i16 d0, #0
- mov r3, #32
- vld1.16 {d16[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d17[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d18[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vld1.16 {d19[]}, [r1,:16]
- vst1.16 {d0[0]}, [r1,:16], r3
- vrshr.s16 q8, q8, #3 @ dc >>= 3
- vld1.8 {q0}, [r0,:128], r2
- vrshr.s16 q9, q9, #3
- vld1.8 {q1}, [r0,:128], r2
- vaddw.u8 q10, q8, d0
- vld1.8 {q2}, [r0,:128], r2
- vaddw.u8 q0, q9, d1
- vld1.8 {q3}, [r0,:128], r2
- vaddw.u8 q11, q8, d2
- vaddw.u8 q1, q9, d3
- vaddw.u8 q12, q8, d4
- vaddw.u8 q2, q9, d5
- vaddw.u8 q13, q8, d6
- vaddw.u8 q3, q9, d7
- sub r0, r0, r2, lsl #2
- vqmovun.s16 d20, q10
- vqmovun.s16 d21, q0
- vqmovun.s16 d22, q11
- vqmovun.s16 d23, q1
- vqmovun.s16 d24, q12
- vst1.8 {q10}, [r0,:128], r2
- vqmovun.s16 d25, q2
- vst1.8 {q11}, [r0,:128], r2
- vqmovun.s16 d26, q13
- vst1.8 {q12}, [r0,:128], r2
- vqmovun.s16 d27, q3
- vst1.8 {q13}, [r0,:128], r2
-
- bx lr
-endfunc
-
-@ Register layout:
-@ P3..Q3 -> q0..q7
-@ flim_E -> q14
-@ flim_I -> q15
-@ hev_thresh -> r12
-@
-.macro vp8_loop_filter, inner=0, simple=0
- .if \simple
- vabd.u8 q9, q3, q4 @ abs(P0-Q0)
- vabd.u8 q15, q2, q5 @ abs(P1-Q1)
- vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
- vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
- vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
- vmov.i8 q13, #0x80
- vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
- .else
- @ calculate hev and normal_limit:
- vabd.u8 q12, q2, q3 @ abs(P1-P0)
- vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
- vabd.u8 q10, q0, q1 @ abs(P3-P2)
- vabd.u8 q11, q1, q2 @ abs(P2-P1)
- vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
- vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
- vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
- vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
- vand q8, q8, q9
- vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
- vand q8, q8, q11
- vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
- vand q8, q8, q10
- vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
- vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
- vabd.u8 q9, q3, q4 @ abs(P0-Q0)
- vabd.u8 q15, q2, q5 @ abs(P1-Q1)
- vand q8, q8, q10
- vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
- vand q8, q8, q11
- vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
- vdup.8 q15, r12 @ hev_thresh
- vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
- vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
- vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
- vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
- vand q8, q8, q11
- vmov.i8 q13, #0x80
- vorr q9, q12, q14
- .endif
-
- @ at this point:
- @ q8: normal_limit
- @ q9: hev
-
- @ convert to signed value:
- veor q3, q3, q13 @ PS0 = P0 ^ 0x80
- veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
-
- vmov.i16 q12, #3
- vsubl.s8 q10, d8, d6 @ QS0 - PS0
- vsubl.s8 q11, d9, d7 @ (widened to 16bit)
- veor q2, q2, q13 @ PS1 = P1 ^ 0x80
- veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
- vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
- vmul.i16 q11, q11, q12
-
- vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
- vmov.i8 q14, #4
- vmov.i8 q15, #3
- .if \inner
- vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
- .endif
- vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
- vaddw.s8 q11, q11, d25
- vqmovn.s16 d20, q10 @ narrow result back into q10
- vqmovn.s16 d21, q11
- .if !\inner && !\simple
- veor q1, q1, q13 @ PS2 = P2 ^ 0x80
- veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
- .endif
- vand q10, q10, q8 @ w &= normal_limit
-
- @ registers used at this point..
- @ q0 -> P3 (don't corrupt)
- @ q1-q6 -> PS2-QS2
- @ q7 -> Q3 (don't corrupt)
- @ q9 -> hev
- @ q10 -> w
- @ q13 -> #0x80
- @ q14 -> #4
- @ q15 -> #3
- @ q8, q11, q12 -> unused
-
- @ filter_common: is4tap==1
- @ c1 = clamp(w + 4) >> 3;
- @ c2 = clamp(w + 3) >> 3;
- @ Q0 = s2u(QS0 - c1);
- @ P0 = s2u(PS0 + c2);
-
- .if \simple
- vqadd.s8 q11, q10, q14 @ c1 = clamp(w+4)
- vqadd.s8 q12, q10, q15 @ c2 = clamp(w+3)
- vshr.s8 q11, q11, #3 @ c1 >>= 3
- vshr.s8 q12, q12, #3 @ c2 >>= 3
- vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
- vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
- veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
- veor q3, q3, q13 @ P0 = PS0 ^ 0x80
- veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
- veor q2, q2, q13 @ P1 = PS1 ^ 0x80
- .elseif \inner
- @ the !is4tap case of filter_common, only used for inner blocks
- @ c3 = ((c1&~hev) + 1) >> 1;
- @ Q1 = s2u(QS1 - c3);
- @ P1 = s2u(PS1 + c3);
- vqadd.s8 q11, q10, q14 @ c1 = clamp(w+4)
- vqadd.s8 q12, q10, q15 @ c2 = clamp(w+3)
- vshr.s8 q11, q11, #3 @ c1 >>= 3
- vshr.s8 q12, q12, #3 @ c2 >>= 3
- vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
- vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
- vbic q11, q11, q9 @ c1 & ~hev
- veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
- vrshr.s8 q11, q11, #1 @ c3 >>= 1
- veor q3, q3, q13 @ P0 = PS0 ^ 0x80
- vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
- vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
- veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
- veor q2, q2, q13 @ P1 = PS1 ^ 0x80
- .else
- vand q12, q10, q9 @ w & hev
- vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
- vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
- vshr.s8 q11, q11, #3 @ c1 >>= 3
- vshr.s8 q12, q12, #3 @ c2 >>= 3
- vbic q10, q10, q9 @ w &= ~hev
- vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
- vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
-
- @ filter_mbedge:
- @ a = clamp((27*w + 63) >> 7);
- @ Q0 = s2u(QS0 - a);
- @ P0 = s2u(PS0 + a);
- @ a = clamp((18*w + 63) >> 7);
- @ Q1 = s2u(QS1 - a);
- @ P1 = s2u(PS1 + a);
- @ a = clamp((9*w + 63) >> 7);
- @ Q2 = s2u(QS2 - a);
- @ P2 = s2u(PS2 + a);
- vmov.i16 q9, #63
- vshll.s8 q14, d20, #3
- vshll.s8 q15, d21, #3
- vaddw.s8 q14, q14, d20
- vaddw.s8 q15, q15, d21
- vadd.s16 q8, q9, q14
- vadd.s16 q9, q9, q15 @ 9*w + 63
- vadd.s16 q11, q8, q14
- vadd.s16 q12, q9, q15 @ 18*w + 63
- vadd.s16 q14, q11, q14
- vadd.s16 q15, q12, q15 @ 27*w + 63
- vqshrn.s16 d16, q8, #7
- vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
- vqshrn.s16 d22, q11, #7
- vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
- vqshrn.s16 d28, q14, #7
- vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
- vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
- vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
- vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
- vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
- vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
- vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
- veor q3, q3, q13 @ P0 = PS0 ^ 0x80
- veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
- veor q2, q2, q13 @ P1 = PS1 ^ 0x80
- veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
- veor q1, q1, q13 @ P2 = PS2 ^ 0x80
- veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
- .endif
-.endm
-
-.macro vp8_v_loop_filter16 name, inner=0, simple=0
-function ff_vp8_v_loop_filter16\name\()_neon, export=1
- vpush {q4-q7}
- sub r0, r0, r1, lsl #1+!\simple
-
- @ Load pixels:
- .if !\simple
- ldr r12, [sp, #64] @ hev_thresh
- vld1.8 {q0}, [r0,:128], r1 @ P3
- vld1.8 {q1}, [r0,:128], r1 @ P2
- .endif
- vld1.8 {q2}, [r0,:128], r1 @ P1
- vld1.8 {q3}, [r0,:128], r1 @ P0
- vld1.8 {q4}, [r0,:128], r1 @ Q0
- vld1.8 {q5}, [r0,:128], r1 @ Q1
- .if !\simple
- vld1.8 {q6}, [r0,:128], r1 @ Q2
- vld1.8 {q7}, [r0,:128] @ Q3
- vdup.8 q15, r3 @ flim_I
- .endif
- vdup.8 q14, r2 @ flim_E
-
- vp8_loop_filter inner=\inner, simple=\simple
-
- @ back up to P2: dst -= stride * 6
- sub r0, r0, r1, lsl #2
- .if !\simple
- sub r0, r0, r1, lsl #1
-
- @ Store pixels:
- vst1.8 {q1}, [r0,:128], r1 @ P2
- .endif
- vst1.8 {q2}, [r0,:128], r1 @ P1
- vst1.8 {q3}, [r0,:128], r1 @ P0
- vst1.8 {q4}, [r0,:128], r1 @ Q0
- vst1.8 {q5}, [r0,:128], r1 @ Q1
- .if !\simple
- vst1.8 {q6}, [r0,:128] @ Q2
- .endif
-
- vpop {q4-q7}
- bx lr
-endfunc
-.endm
-
-vp8_v_loop_filter16
-vp8_v_loop_filter16 _inner, inner=1
-vp8_v_loop_filter16 _simple, simple=1
-
-.macro vp8_v_loop_filter8uv name, inner=0
-function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
- vpush {q4-q7}
- sub r0, r0, r2, lsl #2
- sub r1, r1, r2, lsl #2
- ldr r12, [sp, #64] @ flim_I
-
- @ Load pixels:
- vld1.8 {d0}, [r0,:64], r2 @ P3
- vld1.8 {d1}, [r1,:64], r2 @ P3
- vld1.8 {d2}, [r0,:64], r2 @ P2
- vld1.8 {d3}, [r1,:64], r2 @ P2
- vld1.8 {d4}, [r0,:64], r2 @ P1
- vld1.8 {d5}, [r1,:64], r2 @ P1
- vld1.8 {d6}, [r0,:64], r2 @ P0
- vld1.8 {d7}, [r1,:64], r2 @ P0
- vld1.8 {d8}, [r0,:64], r2 @ Q0
- vld1.8 {d9}, [r1,:64], r2 @ Q0
- vld1.8 {d10}, [r0,:64], r2 @ Q1
- vld1.8 {d11}, [r1,:64], r2 @ Q1
- vld1.8 {d12}, [r0,:64], r2 @ Q2
- vld1.8 {d13}, [r1,:64], r2 @ Q2
- vld1.8 {d14}, [r0,:64] @ Q3
- vld1.8 {d15}, [r1,:64] @ Q3
-
- vdup.8 q14, r3 @ flim_E
- vdup.8 q15, r12 @ flim_I
- ldr r12, [sp, #68] @ hev_thresh
-
- vp8_loop_filter inner=\inner
-
- @ back up to P2: u,v -= stride * 6
- sub r0, r0, r2, lsl #2
- sub r1, r1, r2, lsl #2
- sub r0, r0, r2, lsl #1
- sub r1, r1, r2, lsl #1
-
- @ Store pixels:
- vst1.8 {d2}, [r0,:64], r2 @ P2
- vst1.8 {d3}, [r1,:64], r2 @ P2
- vst1.8 {d4}, [r0,:64], r2 @ P1
- vst1.8 {d5}, [r1,:64], r2 @ P1
- vst1.8 {d6}, [r0,:64], r2 @ P0
- vst1.8 {d7}, [r1,:64], r2 @ P0
- vst1.8 {d8}, [r0,:64], r2 @ Q0
- vst1.8 {d9}, [r1,:64], r2 @ Q0
- vst1.8 {d10}, [r0,:64], r2 @ Q1
- vst1.8 {d11}, [r1,:64], r2 @ Q1
- vst1.8 {d12}, [r0,:64] @ Q2
- vst1.8 {d13}, [r1,:64] @ Q2
-
- vpop {q4-q7}
- bx lr
-endfunc
-.endm
-
-vp8_v_loop_filter8uv
-vp8_v_loop_filter8uv _inner, inner=1
-
-.macro vp8_h_loop_filter16 name, inner=0, simple=0
-function ff_vp8_h_loop_filter16\name\()_neon, export=1
- vpush {q4-q7}
- sub r0, r0, #4
- .if !\simple
- ldr r12, [sp, #64] @ hev_thresh
- .endif
-
- @ Load pixels:
- vld1.8 {d0}, [r0], r1 @ load first 8-line src data
- vld1.8 {d2}, [r0], r1
- vld1.8 {d4}, [r0], r1
- vld1.8 {d6}, [r0], r1
- vld1.8 {d8}, [r0], r1
- vld1.8 {d10}, [r0], r1
- vld1.8 {d12}, [r0], r1
- vld1.8 {d14}, [r0], r1
- vld1.8 {d1}, [r0], r1 @ load second 8-line src data
- vld1.8 {d3}, [r0], r1
- vld1.8 {d5}, [r0], r1
- vld1.8 {d7}, [r0], r1
- vld1.8 {d9}, [r0], r1
- vld1.8 {d11}, [r0], r1
- vld1.8 {d13}, [r0], r1
- vld1.8 {d15}, [r0], r1
-
- transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
-
- vdup.8 q14, r2 @ flim_E
- .if !\simple
- vdup.8 q15, r3 @ flim_I
- .endif
-
- vp8_loop_filter inner=\inner, simple=\simple
-
- sub r0, r0, r1, lsl #4 @ backup 16 rows
-
- transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
-
- @ Store pixels:
- vst1.8 {d0}, [r0], r1
- vst1.8 {d2}, [r0], r1
- vst1.8 {d4}, [r0], r1
- vst1.8 {d6}, [r0], r1
- vst1.8 {d8}, [r0], r1
- vst1.8 {d10}, [r0], r1
- vst1.8 {d12}, [r0], r1
- vst1.8 {d14}, [r0], r1
- vst1.8 {d1}, [r0], r1
- vst1.8 {d3}, [r0], r1
- vst1.8 {d5}, [r0], r1
- vst1.8 {d7}, [r0], r1
- vst1.8 {d9}, [r0], r1
- vst1.8 {d11}, [r0], r1
- vst1.8 {d13}, [r0], r1
- vst1.8 {d15}, [r0]
-
- vpop {q4-q7}
- bx lr
-endfunc
-.endm
-
-vp8_h_loop_filter16
-vp8_h_loop_filter16 _inner, inner=1
-vp8_h_loop_filter16 _simple, simple=1
-
-.macro vp8_h_loop_filter8uv name, inner=0
-function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
- vpush {q4-q7}
- sub r0, r0, #4
- sub r1, r1, #4
- ldr r12, [sp, #64] @ flim_I
-
- @ Load pixels:
- vld1.8 {d0}, [r0], r2 @ load u
- vld1.8 {d1}, [r1], r2 @ load v
- vld1.8 {d2}, [r0], r2
- vld1.8 {d3}, [r1], r2
- vld1.8 {d4}, [r0], r2
- vld1.8 {d5}, [r1], r2
- vld1.8 {d6}, [r0], r2
- vld1.8 {d7}, [r1], r2
- vld1.8 {d8}, [r0], r2
- vld1.8 {d9}, [r1], r2
- vld1.8 {d10}, [r0], r2
- vld1.8 {d11}, [r1], r2
- vld1.8 {d12}, [r0], r2
- vld1.8 {d13}, [r1], r2
- vld1.8 {d14}, [r0], r2
- vld1.8 {d15}, [r1], r2
-
- transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
-
- vdup.8 q14, r3 @ flim_E
- vdup.8 q15, r12 @ flim_I
- ldr r12, [sp, #68] @ hev_thresh
-
- vp8_loop_filter inner=\inner
-
- sub r0, r0, r2, lsl #3 @ backup u 8 rows
- sub r1, r1, r2, lsl #3 @ backup v 8 rows
-
- transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
-
- @ Store pixels:
- vst1.8 {d0}, [r0], r2
- vst1.8 {d1}, [r1], r2
- vst1.8 {d2}, [r0], r2
- vst1.8 {d3}, [r1], r2
- vst1.8 {d4}, [r0], r2
- vst1.8 {d5}, [r1], r2
- vst1.8 {d6}, [r0], r2
- vst1.8 {d7}, [r1], r2
- vst1.8 {d8}, [r0], r2
- vst1.8 {d9}, [r1], r2
- vst1.8 {d10}, [r0], r2
- vst1.8 {d11}, [r1], r2
- vst1.8 {d12}, [r0], r2
- vst1.8 {d13}, [r1], r2
- vst1.8 {d14}, [r0]
- vst1.8 {d15}, [r1]
-
- vpop {q4-q7}
- bx lr
-endfunc
-.endm
-
-vp8_h_loop_filter8uv
-vp8_h_loop_filter8uv _inner, inner=1
-
-function ff_put_vp8_pixels16_neon, export=1
- ldr r12, [sp, #0] @ h
-1:
- subs r12, r12, #4
- vld1.8 {q0}, [r2], r3
- vld1.8 {q1}, [r2], r3
- vld1.8 {q2}, [r2], r3
- vld1.8 {q3}, [r2], r3
- vst1.8 {q0}, [r0,:128], r1
- vst1.8 {q1}, [r0,:128], r1
- vst1.8 {q2}, [r0,:128], r1
- vst1.8 {q3}, [r0,:128], r1
- bgt 1b
- bx lr
-endfunc
-
-function ff_put_vp8_pixels8_neon, export=1
- ldr r12, [sp, #0] @ h
-1:
- subs r12, r12, #4
- vld1.8 {d0}, [r2], r3
- vld1.8 {d1}, [r2], r3
- vld1.8 {d2}, [r2], r3
- vld1.8 {d3}, [r2], r3
- vst1.8 {d0}, [r0,:64], r1
- vst1.8 {d1}, [r0,:64], r1
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- bgt 1b
- bx lr
-endfunc
-
-/* 4/6-tap 8th-pel MC */
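-/* the six 16-bit taps live in d0[0..3] and d1[0..1]; each macro widens the
-   source bytes, accumulates mul/mls/mla against those lanes and narrows
-   back with vqrshrun #7 (the 128 scale) */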
-
-.macro vp8_epel8_h6 d, a, b
- vext.8 d27, \a, \b, #1
- vmovl.u8 q8, \a
- vext.8 d28, \a, \b, #2
- vmovl.u8 q9, d27
- vext.8 d29, \a, \b, #3
- vmovl.u8 q10, d28
- vext.8 d30, \a, \b, #4
- vmovl.u8 q11, d29
- vext.8 d31, \a, \b, #5
- vmovl.u8 q12, d30
- vmul.u16 q10, q10, d0[2]
- vmovl.u8 q13, d31
- vmul.u16 q11, q11, d0[3]
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q11, q12, d1[0]
- vmla.u16 q10, q8, d0[0]
- vmla.u16 q11, q13, d1[1]
- vqadd.s16 q11, q10, q11
- vqrshrun.s16 \d, q11, #7
-.endm
-
-.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
- vext.8 q14, \q0, \q1, #3
- vext.8 q15, \q0, \q1, #4
- vmovl.u8 q11, d28
- vmovl.u8 q14, d29
- vext.8 q3, \q0, \q1, #2
- vmovl.u8 q12, d30
- vmovl.u8 q15, d31
- vext.8 q8, \q0, \q1, #1
- vmovl.u8 q10, d6
- vmovl.u8 q3, d7
- vext.8 q2, \q0, \q1, #5
- vmovl.u8 q13, d4
- vmovl.u8 q2, d5
- vmovl.u8 q9, d16
- vmovl.u8 q8, d17
- vmul.u16 q11, q11, d0[3]
- vmul.u16 q10, q10, d0[2]
- vmul.u16 q3, q3, d0[2]
- vmul.u16 q14, q14, d0[3]
- vmls.u16 q11, q12, d1[0]
- vmovl.u8 q12, \s0
- vmovl.u8 q1, \s1
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q3, q8, d0[1]
- vmls.u16 q14, q15, d1[0]
- vmla.u16 q10, q12, d0[0]
- vmla.u16 q11, q13, d1[1]
- vmla.u16 q3, q1, d0[0]
- vmla.u16 q14, q2, d1[1]
- vqadd.s16 q11, q10, q11
- vqadd.s16 q14, q3, q14
- vqrshrun.s16 \d0, q11, #7
- vqrshrun.s16 \d1, q14, #7
-.endm
-
-.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
- vmovl.u8 q10, \s2
- vmovl.u8 q11, \s3
- vmovl.u8 q9, \s1
- vmovl.u8 q12, \s4
- vmovl.u8 q8, \s0
- vmovl.u8 q13, \s5
- vmul.u16 q10, q10, d0[2]
- vmul.u16 q11, q11, d0[3]
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q11, q12, d1[0]
- vmla.u16 q10, q8, d0[0]
- vmla.u16 q11, q13, d1[1]
- vqadd.s16 q11, q10, q11
- vqrshrun.s16 \d0, q11, #7
-.endm
-
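-@ the _y2 variants compute two vertically adjacent output rows per call,
-@ sharing the loads of the overlapping source lines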
-.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
- vmovl.u8 q10, \s0
- vmovl.u8 q11, \s3
- vmovl.u8 q14, \s6
- vmovl.u8 q9, \s1
- vmovl.u8 q12, \s4
- vmovl.u8 q8, \s2
- vmovl.u8 q13, \s5
- vmul.u16 q10, q10, d0[0]
- vmul.u16 q15, q11, d0[3]
- vmul.u16 q11, q11, d0[2]
- vmul.u16 q14, q14, d1[1]
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q15, q12, d1[0]
- vmls.u16 q11, q8, d0[1]
- vmls.u16 q14, q13, d1[0]
- vmla.u16 q10, q8, d0[2]
- vmla.u16 q15, q13, d1[1]
- vmla.u16 q11, q9, d0[0]
- vmla.u16 q14, q12, d0[3]
- vqadd.s16 q15, q10, q15
- vqadd.s16 q14, q11, q14
- vqrshrun.s16 \d0, q15, #7
- vqrshrun.s16 \d1, q14, #7
-.endm
-
-.macro vp8_epel8_h4 d, a, b
- vext.8 d28, \a, \b, #1
- vmovl.u8 q9, \a
- vext.8 d29, \a, \b, #2
- vmovl.u8 q10, d28
- vext.8 d30, \a, \b, #3
- vmovl.u8 q11, d29
- vmovl.u8 q12, d30
- vmul.u16 q10, q10, d0[2]
- vmul.u16 q11, q11, d0[3]
- vmls.u16 q10, q9, d0[1]
- vmls.u16 q11, q12, d1[0]
- vqadd.s16 q11, q10, q11
- vqrshrun.s16 \d, q11, #7
-.endm
-
-.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
- vmovl.u8 q9, \s0
- vmovl.u8 q10, \s1
- vmovl.u8 q11, \s2
- vmovl.u8 q12, \s3
- vmovl.u8 q13, \s4
- vmul.u16 q8, q10, d0[2]
- vmul.u16 q14, q11, d0[3]
- vmul.u16 q11, q11, d0[2]
- vmul.u16 q15, q12, d0[3]
- vmls.u16 q8, q9, d0[1]
- vmls.u16 q14, q12, d1[0]
- vmls.u16 q11, q10, d0[1]
- vmls.u16 q15, q13, d1[0]
- vqadd.s16 q8, q8, q14
- vqadd.s16 q11, q11, q15
- vqrshrun.s16 \d0, q8, #7
- vqrshrun.s16 \d1, q11, #7
-.endm
-
-function ff_put_vp8_epel16_v6_neon, export=1
- sub r2, r2, r3, lsl #1
- push {r4,lr}
- vpush {d8-d15}
-
- ldr r4, [sp, #80] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #72] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2-d3}, [r2], r3
- vld1.8 {d4-d5}, [r2], r3
- vld1.8 {d6-d7}, [r2], r3
- vld1.8 {d8-d9}, [r2], r3
- vld1.8 {d10-d11},[r2], r3
- vld1.8 {d12-d13},[r2], r3
- vld1.8 {d14-d15},[r2]
- sub r2, r2, r3, lsl #2
-
- vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
- vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
-
- vst1.8 {d2-d3}, [r0,:128], r1
- vst1.8 {d4-d5}, [r0,:128], r1
- subs r12, r12, #2
- bne 1b
-
- vpop {d8-d15}
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel16_h6_neon, export=1
- sub r2, r2, #2
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2-d4}, [r2], r3
-
- vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
-
- vst1.8 {d2-d3}, [r0,:128], r1
- subs r12, r12, #1
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel16_h6v6_neon, export=1
- sub r2, r2, r3, lsl #1
- sub r2, r2, #2
- push {r4,lr}
- vpush {d8-d9}
-
- @ first pass (horizontal):
- ldr r4, [sp, #28] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #24] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #336+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #5 @ h pass covers h + 6 - 1 rows
- bic lr, lr, #15 @ 16-byte align the scratch buffer
-1:
- vld1.8 {d2,d3,d4}, [r2], r3
-
- vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
-
- vst1.8 {d2-d3}, [lr,:128]!
- subs r12, r12, #1
- bne 1b
-
- @ second pass (vertical):
- ldr r4, [sp, #336+16+32] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #336+16+24] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d5}, [lr,:128]!
- vld1.8 {d6-d9}, [lr,:128]!
- vld1.8 {d28-d31},[lr,:128]
- sub lr, lr, #48
-
- vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
- vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
-
- vst1.8 {d2-d3}, [r0,:128], r1
- subs r12, r12, #1
- bne 2b
-
- add sp, sp, #336+16
- vpop {d8-d9}
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_v6_neon, export=1
- sub r2, r2, r3, lsl #1
- push {r4,lr}
-
- ldr r4, [sp, #16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2}, [r2], r3
- vld1.8 {d3}, [r2], r3
- vld1.8 {d4}, [r2], r3
- vld1.8 {d5}, [r2], r3
- vld1.8 {d6}, [r2], r3
- vld1.8 {d7}, [r2], r3
- vld1.8 {d28}, [r2]
-
- sub r2, r2, r3, lsl #2
-
- vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h6_neon, export=1
- sub r2, r2, #2
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h6 d2, d2, d3
-
- vst1.8 {d2}, [r0,:64], r1
- subs r12, r12, #1
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h6v6_neon, export=1
- sub r2, r2, r3, lsl #1
- sub r2, r2, #2
- push {r4,lr}
-
- @ first pass (horizontal):
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #168+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #5
- bic lr, lr, #15
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h6 d2, d2, d3
-
- vst1.8 {d2}, [lr,:64]!
- subs r12, r12, #1
- bne 1b
-
- @ second pass (vertical):
- ldr r4, [sp, #168+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #168+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d5}, [lr,:128]!
- vld1.8 {d6-d7}, [lr,:128]!
- vld1.8 {d30}, [lr,:64]
- sub lr, lr, #32
-
- vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 2b
-
- add sp, sp, #168+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_v4_neon, export=1
- sub r2, r2, r3
- push {r4,lr}
-
- ldr r4, [sp, #16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2}, [r2], r3
- vld1.8 {d3}, [r2], r3
- vld1.8 {d4}, [r2], r3
- vld1.8 {d5}, [r2], r3
- vld1.8 {d6}, [r2]
- sub r2, r2, r3, lsl #1
-
- vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h4_neon, export=1
- sub r2, r2, #1
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h4 d2, d2, d3
-
- vst1.8 {d2}, [r0,:64], r1
- subs r12, r12, #1
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h4v4_neon, export=1
- sub r2, r2, r3
- sub r2, r2, #1
- push {r4,lr}
-
- @ first pass (horizontal):
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #168+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #3
- bic lr, lr, #15
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h4 d2, d2, d3
-
- vst1.8 {d2}, [lr,:64]!
- subs r12, r12, #1
- bne 1b
-
- @ second pass (vertical):
- ldr r4, [sp, #168+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #168+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d5}, [lr,:128]!
- vld1.8 {d6}, [lr,:64]
- sub lr, lr, #16
-
- vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 2b
-
- add sp, sp, #168+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h6v4_neon, export=1
- sub r2, r2, r3
- sub r2, r2, #2
- push {r4,lr}
-
- @ first pass (horizontal):
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #168+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #3
- bic lr, lr, #15
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h6 d2, d2, d3
-
- vst1.8 {d2}, [lr,:64]!
- subs r12, r12, #1
- bne 1b
-
- @ second pass (vertical):
- ldr r4, [sp, #168+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #168+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d5}, [lr,:128]!
- vld1.8 {d6}, [lr,:64]
- sub lr, lr, #16
-
- vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 2b
-
- add sp, sp, #168+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel8_h4v6_neon, export=1
- sub r2, r2, r3, lsl #1
- sub r2, r2, #1
- push {r4,lr}
-
- @ first pass (horizontal):
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #168+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #5
- bic lr, lr, #15
-1:
- vld1.8 {d2,d3}, [r2], r3
-
- vp8_epel8_h4 d2, d2, d3
-
- vst1.8 {d2}, [lr,:64]!
- subs r12, r12, #1
- bne 1b
-
- @ second pass (vertical):
- ldr r4, [sp, #168+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #168+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d5}, [lr,:128]!
- vld1.8 {d6-d7}, [lr,:128]!
- vld1.8 {d30}, [lr,:64]
- sub lr, lr, #32
-
- vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
-
- vst1.8 {d2}, [r0,:64], r1
- vst1.8 {d3}, [r0,:64], r1
- subs r12, r12, #2
- bne 2b
-
- add sp, sp, #168+16
- pop {r4,pc}
-endfunc
-
-.ltorg
-
-function ff_put_vp8_epel4_v6_neon, export=1
- sub r2, r2, r3, lsl #1
- push {r4,lr}
-
- ldr r4, [sp, #16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.32 {d2[]}, [r2], r3
- vld1.32 {d3[]}, [r2], r3
- vld1.32 {d4[]}, [r2], r3
- vld1.32 {d5[]}, [r2], r3
- vld1.32 {d6[]}, [r2], r3
- vld1.32 {d7[]}, [r2], r3
- vld1.32 {d28[]}, [r2]
- sub r2, r2, r3, lsl #2
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d3[1]}, [r2], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d5[1]}, [r2], r3
- vld1.32 {d6[1]}, [r2], r3
- vld1.32 {d7[1]}, [r2], r3
- vld1.32 {d28[1]}, [r2]
- sub r2, r2, r3, lsl #2
-
- vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
-
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h6_neon, export=1
- sub r2, r2, #2
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {q1}, [r2], r3
- vp8_epel8_h6 d2, d2, d3
- vst1.32 {d2[0]}, [r0,:32], r1
- subs r12, r12, #1
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h6v6_neon, export=1
- sub r2, r2, r3, lsl #1
- sub r2, r2, #2
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #52+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #5
- bic lr, lr, #15
-1:
- vld1.8 {q1}, [r2], r3
- vp8_epel8_h6 d2, d2, d3
- vst1.32 {d2[0]}, [lr,:32]!
- subs r12, r12, #1
- bne 1b
-
- ldr r4, [sp, #52+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #52+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d3}, [lr,:128]!
- vld1.8 {d6}, [lr,:64]!
- vld1.32 {d28[]}, [lr,:32]
- sub lr, lr, #16
- vld1.8 {d4-d5}, [lr]!
- vld1.8 {d7}, [lr,:64]!
- vld1.32 {d28[1]}, [lr,:32]
- sub lr, lr, #16
- vtrn.32 q1, q2
- vtrn.32 d6, d7
- vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 2b
-
- add sp, sp, #52+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h4v6_neon, export=1
- sub r2, r2, r3, lsl #1
- sub r2, r2, #1
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #52+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #5
- bic lr, lr, #15
-1:
- vld1.8 {d2}, [r2], r3
- vp8_epel8_h4 d2, d2, d2
- vst1.32 {d2[0]}, [lr,:32]!
- subs r12, r12, #1
- bne 1b
-
- ldr r4, [sp, #52+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #52+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d3}, [lr,:128]!
- vld1.8 {d6}, [lr,:64]!
- vld1.32 {d28[]}, [lr,:32]
- sub lr, lr, #16
- vld1.8 {d4-d5}, [lr]!
- vld1.8 {d7}, [lr,:64]!
- vld1.32 {d28[1]}, [lr,:32]
- sub lr, lr, #16
- vtrn.32 q1, q2
- vtrn.32 d6, d7
- vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 2b
-
- add sp, sp, #52+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h6v4_neon, export=1
- sub r2, r2, r3
- sub r2, r2, #2
- push {r4,lr}
-
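- @ first pass (horizontal):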
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #44+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #3
- bic lr, lr, #15
-1:
- vld1.8 {q1}, [r2], r3
- vp8_epel8_h6 d2, d2, d3
- vst1.32 {d2[0]}, [lr,:32]!
- subs r12, r12, #1
- bne 1b
-
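- @ second pass (vertical):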
- ldr r4, [sp, #44+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #44+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d3}, [lr,:128]!
- vld1.32 {d6[]}, [lr,:32]
- sub lr, lr, #8
- vld1.8 {d4-d5}, [lr]!
- vld1.32 {d6[1]}, [lr,:32]
- sub lr, lr, #8
- vtrn.32 q1, q2
- vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 2b
-
- add sp, sp, #44+16
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h4_neon, export=1
- sub r2, r2, #1
- push {r4,lr}
-
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.8 {d2}, [r2], r3
- vp8_epel8_h4 d2, d2, d2
- vst1.32 {d2[0]}, [r0,:32], r1
- subs r12, r12, #1
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_v4_neon, export=1
- sub r2, r2, r3
- push {r4,lr}
-
- ldr r4, [sp, #16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- vld1.16 {q0}, [r4,:128]
-1:
- vld1.32 {d2[]}, [r2], r3
- vld1.32 {d3[]}, [r2], r3
- vld1.32 {d4[]}, [r2], r3
- vld1.32 {d5[]}, [r2], r3
- vld1.32 {d6[]}, [r2]
- sub r2, r2, r3, lsl #1
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d3[1]}, [r2], r3
- vld1.32 {d4[1]}, [r2], r3
- vld1.32 {d5[1]}, [r2], r3
- vld1.32 {d6[1]}, [r2]
- sub r2, r2, r3, lsl #1
-
- vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
-
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 1b
-
- pop {r4,pc}
-endfunc
-
-function ff_put_vp8_epel4_h4v4_neon, export=1
- sub r2, r2, r3
- sub r2, r2, #1
- push {r4,lr}
-
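- @ first pass (horizontal):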
- ldr r4, [sp, #12] @ mx
- movrel lr, subpel_filters-16
- ldr r12, [sp, #8] @ h
- add r4, lr, r4, lsl #4
- sub sp, sp, #44+16
- vld1.16 {q0}, [r4,:128]
- add lr, sp, #15
- add r12, r12, #3
- bic lr, lr, #15
-1:
- vld1.8 {d2}, [r2], r3
- vp8_epel8_h4 d2, d2, d3
- vst1.32 {d2[0]}, [lr,:32]!
- subs r12, r12, #1
- bne 1b
-
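- @ second pass (vertical):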
- ldr r4, [sp, #44+16+16] @ my
- movrel lr, subpel_filters-16
- ldr r12, [sp, #44+16+8] @ h
- add r4, lr, r4, lsl #4
- add lr, sp, #15
- vld1.16 {q0}, [r4,:128]
- bic lr, lr, #15
-2:
- vld1.8 {d2-d3}, [lr,:128]!
- vld1.32 {d6[]}, [lr,:32]
- sub lr, lr, #8
- vld1.8 {d4-d5}, [lr]!
- vld1.32 {d6[1]}, [lr,:32]
- sub lr, lr, #8
- vtrn.32 q1, q2
- vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
- vst1.32 {d2[0]}, [r0,:32], r1
- vst1.32 {d3[0]}, [r0,:32], r1
- vst1.32 {d2[1]}, [r0,:32], r1
- vst1.32 {d3[1]}, [r0,:32], r1
- subs r12, r12, #4
- bne 2b
-
- add sp, sp, #44+16
- pop {r4,pc}
-endfunc
-
-@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16-bit
-@ arithmetic can be used to apply the filters
-const subpel_filters, align=4
- .short 0, 6, 123, 12, 1, 0, 0, 0
- .short 2, 11, 108, 36, 8, 1, 0, 0
- .short 0, 9, 93, 50, 6, 0, 0, 0
- .short 3, 16, 77, 77, 16, 3, 0, 0
- .short 0, 6, 50, 93, 9, 0, 0, 0
- .short 1, 8, 36, 108, 11, 2, 0, 0
- .short 0, 1, 12, 123, 6, 0, 0, 0
-endconst
-
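The table above stores tap magnitudes only; the epel macros apply the VP8 sign
pattern (+, -, +, +, -, +) via multiply-accumulate and multiply-subtract, so
each row of signed taps sums to 128 and the 0x7f80 bound in the note is simply
128 * 255. A minimal scalar sketch of the horizontal 6-tap filter under those
assumptions (the names epel_h6_row_ref and clip_uint8 are hypothetical, and
the rounding is the (sum + 64) >> 7 saturating narrowing that the NEON macros
are assumed to perform):

    #include <stdint.h>

    /* subpel_filters from above, with the VP8 sign pattern applied */
    static const int16_t vp8_filters_signed[7][6] = {
        { 0,  -6, 123,  12,  -1, 0 },
        { 2, -11, 108,  36,  -8, 1 },
        { 0,  -9,  93,  50,  -6, 0 },
        { 3, -16,  77,  77, -16, 3 },
        { 0,  -6,  50,  93,  -9, 0 },
        { 1,  -8,  36, 108, -11, 2 },
        { 0,  -1,  12, 123,  -6, 0 },
    };

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* one row of the horizontal 6-tap filter; mx in 1..7 selects the row */
    static void epel_h6_row_ref(uint8_t *dst, const uint8_t *src,
                                int w, int mx)
    {
        const int16_t *f = vp8_filters_signed[mx - 1];
        for (int x = 0; x < w; x++) {
            /* taps sum to 128, so 128 * 255 = 0x7f80 bounds the sum */
            int sum = f[0] * src[x - 2] + f[1] * src[x - 1] + f[2] * src[x]
                    + f[3] * src[x + 1] + f[4] * src[x + 2] + f[5] * src[x + 3];
            /* round and narrow back to 8 bits */
            dst[x] = clip_uint8((sum + 64) >> 7);
        }
    }
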
-/* Bilinear MC */
-
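All of the bilinear functions below compute the same two-tap blend: mx (or my)
is broadcast into d0 and 8-mx into d1, each output pixel is
(8-mx)*a + mx*b, and vrshrn #3 rounds and scales back down by 8. A minimal
scalar sketch of the horizontal case, with the hypothetical name bilin_h_ref:

    #include <stddef.h>
    #include <stdint.h>

    static void bilin_h_ref(uint8_t *dst, ptrdiff_t dst_stride,
                            const uint8_t *src, ptrdiff_t src_stride,
                            int w, int h, int mx)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                /* two-tap blend with rounding, matching vrshrn.u16 #3 */
                dst[x] = ((8 - mx) * src[x] + mx * src[x + 1] + 4) >> 3;
            dst += dst_stride;
            src += src_stride;
        }
    }
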
-function ff_put_vp8_bilin16_h_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
-1:
- subs r12, r12, #2
- vld1.8 {d2-d4}, [r2], r3
- vext.8 q2, q1, q2, #1
- vmull.u8 q8, d2, d1
- vmlal.u8 q8, d4, d0
- vld1.8 {d18-d20}, [r2], r3
- vmull.u8 q3, d3, d1
- vmlal.u8 q3, d5, d0
- vext.8 q10, q9, q10, #1
- vmull.u8 q11, d18, d1
- vmlal.u8 q11, d20, d0
- vmull.u8 q12, d19, d1
- vmlal.u8 q12, d21, d0
- vrshrn.u16 d4, q8, #3
- vrshrn.u16 d5, q3, #3
- vrshrn.u16 d6, q11, #3
- vrshrn.u16 d7, q12, #3
- vst1.8 {q2}, [r0,:128], r1
- vst1.8 {q3}, [r0,:128], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin16_v_neon, export=1
- push {lr}
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
- vld1.8 {q1}, [r2], r3
-1:
- subs r12, r12, #2
- vld1.8 {q2}, [r2], r3
- vmull.u8 q3, d2, d1
- vmlal.u8 q3, d4, d0
- vmull.u8 q8, d3, d1
- vmlal.u8 q8, d5, d0
- vld1.8 {q1}, [r2], r3
- vmull.u8 q9, d4, d1
- vmlal.u8 q9, d2, d0
- vmull.u8 q10, d5, d1
- vmlal.u8 q10, d3, d0
- vrshrn.u16 d4, q3, #3
- vrshrn.u16 d5, q8, #3
- vrshrn.u16 d6, q9, #3
- vrshrn.u16 d7, q10, #3
- vst1.8 {q2}, [r0,:128], r1
- vst1.8 {q3}, [r0,:128], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin16_hv_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d2, lr
- vdup.8 d3, r12
- ldr r12, [sp, #4] @ h
-
- vld1.8 {d4-d6}, [r2], r3
- vext.8 q3, q2, q3, #1
- vmull.u8 q8, d4, d1
- vmlal.u8 q8, d6, d0
- vmull.u8 q9, d5, d1
- vmlal.u8 q9, d7, d0
- vrshrn.u16 d4, q8, #3
- vrshrn.u16 d5, q9, #3
-1:
- subs r12, r12, #2
- vld1.8 {d18-d20}, [r2], r3
- vext.8 q10, q9, q10, #1
- vmull.u8 q11, d18, d1
- vmlal.u8 q11, d20, d0
- vld1.8 {d26-d28}, [r2], r3
- vmull.u8 q12, d19, d1
- vmlal.u8 q12, d21, d0
- vext.8 q14, q13, q14, #1
- vmull.u8 q8, d26, d1
- vmlal.u8 q8, d28, d0
- vmull.u8 q9, d27, d1
- vmlal.u8 q9, d29, d0
- vrshrn.u16 d6, q11, #3
- vrshrn.u16 d7, q12, #3
- vmull.u8 q12, d4, d3
- vmlal.u8 q12, d6, d2
- vmull.u8 q15, d5, d3
- vmlal.u8 q15, d7, d2
- vrshrn.u16 d4, q8, #3
- vrshrn.u16 d5, q9, #3
- vmull.u8 q10, d6, d3
- vmlal.u8 q10, d4, d2
- vmull.u8 q11, d7, d3
- vmlal.u8 q11, d5, d2
- vrshrn.u16 d24, q12, #3
- vrshrn.u16 d25, q15, #3
- vst1.8 {q12}, [r0,:128], r1
- vrshrn.u16 d20, q10, #3
- vrshrn.u16 d21, q11, #3
- vst1.8 {q10}, [r0,:128], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin8_h_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
-1:
- subs r12, r12, #2
- vld1.8 {q1}, [r2], r3
- vext.8 d3, d2, d3, #1
- vmull.u8 q2, d2, d1
- vmlal.u8 q2, d3, d0
- vld1.8 {q3}, [r2], r3
- vext.8 d7, d6, d7, #1
- vmull.u8 q8, d6, d1
- vmlal.u8 q8, d7, d0
- vrshrn.u16 d4, q2, #3
- vrshrn.u16 d16, q8, #3
- vst1.8 {d4}, [r0,:64], r1
- vst1.8 {d16}, [r0,:64], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin8_v_neon, export=1
- push {lr}
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
- vld1.8 {d2}, [r2], r3
-1:
- subs r12, r12, #2
- vld1.8 {d3}, [r2], r3
- vmull.u8 q2, d2, d1
- vmlal.u8 q2, d3, d0
- vld1.8 {d2}, [r2], r3
- vmull.u8 q3, d3, d1
- vmlal.u8 q3, d2, d0
- vrshrn.u16 d4, q2, #3
- vrshrn.u16 d6, q3, #3
- vst1.8 {d4}, [r0,:64], r1
- vst1.8 {d6}, [r0,:64], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin8_hv_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d2, lr
- vdup.8 d3, r12
- ldr r12, [sp, #4] @ h
-
- vld1.8 {q2}, [r2], r3
- vext.8 d5, d4, d5, #1
- vmull.u8 q9, d4, d1
- vmlal.u8 q9, d5, d0
- vrshrn.u16 d22, q9, #3
-1:
- subs r12, r12, #2
- vld1.8 {q3}, [r2], r3
- vext.8 d7, d6, d7, #1
- vmull.u8 q8, d6, d1
- vmlal.u8 q8, d7, d0
- vld1.8 {q2}, [r2], r3
- vext.8 d5, d4, d5, #1
- vmull.u8 q9, d4, d1
- vmlal.u8 q9, d5, d0
- vrshrn.u16 d16, q8, #3
- vmull.u8 q10, d22, d3
- vmlal.u8 q10, d16, d2
- vrshrn.u16 d22, q9, #3
- vmull.u8 q12, d16, d3
- vmlal.u8 q12, d22, d2
- vrshrn.u16 d20, q10, #3
- vst1.8 {d20}, [r0,:64], r1
- vrshrn.u16 d23, q12, #3
- vst1.8 {d23}, [r0,:64], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin4_h_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
-1:
- subs r12, r12, #2
- vld1.8 {d2}, [r2], r3
- vext.8 d3, d2, d3, #1
- vld1.8 {d6}, [r2], r3
- vext.8 d7, d6, d7, #1
- vtrn.32 q1, q3
- vmull.u8 q2, d2, d1
- vmlal.u8 q2, d3, d0
- vrshrn.u16 d4, q2, #3
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r0,:32], r1
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin4_v_neon, export=1
- push {lr}
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr r12, [sp, #4] @ h
- vld1.32 {d2[]}, [r2], r3
-1:
- vld1.32 {d3[]}, [r2]
- vld1.32 {d2[1]}, [r2], r3
- vld1.32 {d3[1]}, [r2], r3
- vmull.u8 q2, d2, d1
- vmlal.u8 q2, d3, d0
- vtrn.32 d3, d2
- vrshrn.u16 d4, q2, #3
- vst1.32 {d4[0]}, [r0,:32], r1
- vst1.32 {d4[1]}, [r0,:32], r1
- subs r12, r12, #2
- bgt 1b
-
- pop {pc}
-endfunc
-
-function ff_put_vp8_bilin4_hv_neon, export=1
- push {lr}
- ldr lr, [sp, #8] @ mx
- rsb r12, lr, #8
- vdup.8 d0, lr
- vdup.8 d1, r12
- ldr lr, [sp, #12] @ my
- rsb r12, lr, #8
- vdup.8 d2, lr
- vdup.8 d3, r12
- ldr r12, [sp, #4] @ h
-
- vld1.8 {d4}, [r2], r3
- vext.8 d5, d4, d4, #1
- vmull.u8 q9, d4, d1
- vmlal.u8 q9, d5, d0
- vrshrn.u16 d22, q9, #3
-1:
- subs r12, r12, #2
- vld1.8 {d6}, [r2], r3
- vext.8 d7, d6, d6, #1
- vld1.8 {d4}, [r2], r3
- vext.8 d5, d4, d4, #1
- vtrn.32 q3, q2
- vmull.u8 q8, d6, d1
- vmlal.u8 q8, d7, d0
- vrshrn.u16 d16, q8, #3
- vmull.u8 q10, d16, d2
- vtrn.32 d22, d16
- vmlal.u8 q10, d22, d3
- vrev64.32 d22, d16
- vrshrn.u16 d20, q10, #3
- vst1.32 {d20[0]}, [r0,:32], r1
- vst1.32 {d20[1]}, [r0,:32], r1
- bgt 1b
-
- pop {pc}
-endfunc