author    Tim Redfern <tim@eclectronics.org>    2013-09-05 17:57:22 +0100
committer Tim Redfern <tim@eclectronics.org>    2013-09-05 17:57:22 +0100
commit    8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch)
tree      3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavcodec/arm
parent    741fb4b9e135cfb161a749db88713229038577bb (diff)
making act segmenter
Diffstat (limited to 'ffmpeg/libavcodec/arm')
-rw-r--r--  ffmpeg/libavcodec/arm/Makefile | 116
-rw-r--r--  ffmpeg/libavcodec/arm/aac.h | 143
-rw-r--r--  ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c | 57
-rw-r--r--  ffmpeg/libavcodec/arm/aacpsdsp_neon.S | 272
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_arm.S | 36
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_armv6.S | 84
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_init_arm.c | 70
-rw-r--r--  ffmpeg/libavcodec/arm/ac3dsp_neon.S | 154
-rw-r--r--  ffmpeg/libavcodec/arm/asm-offsets.h | 39
-rw-r--r--  ffmpeg/libavcodec/arm/dca.h | 105
-rw-r--r--  ffmpeg/libavcodec/arm/dcadsp_init_arm.c | 36
-rw-r--r--  ffmpeg/libavcodec/arm/dcadsp_neon.S | 61
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_arm.S | 125
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_arm.h | 32
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_armv6.S | 381
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_arm.c | 86
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_armv5te.c | 37
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_armv6.c | 85
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_init_neon.c | 81
-rw-r--r--  ffmpeg/libavcodec/arm/dsputil_neon.S | 209
-rw-r--r--  ffmpeg/libavcodec/arm/fft_fixed_init_arm.c | 48
-rw-r--r--  ffmpeg/libavcodec/arm/fft_fixed_neon.S | 261
-rw-r--r--  ffmpeg/libavcodec/arm/fft_init_arm.c | 77
-rw-r--r--  ffmpeg/libavcodec/arm/fft_neon.S | 375
-rw-r--r--  ffmpeg/libavcodec/arm/flacdsp_arm.S | 146
-rw-r--r--  ffmpeg/libavcodec/arm/flacdsp_init_arm.c | 32
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_init_arm.c | 52
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_neon.S | 392
-rw-r--r--  ffmpeg/libavcodec/arm/fmtconvert_vfp.S | 78
-rw-r--r--  ffmpeg/libavcodec/arm/h264chroma_init_arm.c | 51
-rw-r--r--  ffmpeg/libavcodec/arm/h264cmc_neon.S | 400
-rw-r--r--  ffmpeg/libavcodec/arm/h264dsp_init_arm.c | 111
-rw-r--r--  ffmpeg/libavcodec/arm/h264dsp_neon.S | 541
-rw-r--r--  ffmpeg/libavcodec/arm/h264idct_neon.S | 413
-rw-r--r--  ffmpeg/libavcodec/arm/h264pred_init_arm.c | 92
-rw-r--r--  ffmpeg/libavcodec/arm/h264pred_neon.S | 359
-rw-r--r--  ffmpeg/libavcodec/arm/h264qpel_init_arm.c | 171
-rw-r--r--  ffmpeg/libavcodec/arm/h264qpel_neon.S | 955
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_arm.S | 611
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_arm.h | 29
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_armv6.S | 259
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_arm.c | 68
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c | 66
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_init_neon.c | 86
-rw-r--r--  ffmpeg/libavcodec/arm/hpeldsp_neon.S | 410
-rw-r--r--  ffmpeg/libavcodec/arm/int_neon.S | 92
-rw-r--r--  ffmpeg/libavcodec/arm/jrevdct_arm.S | 383
-rw-r--r--  ffmpeg/libavcodec/arm/mathops.h | 108
-rw-r--r--  ffmpeg/libavcodec/arm/mdct_fixed_neon.S | 193
-rw-r--r--  ffmpeg/libavcodec/arm/mdct_neon.S | 301
-rw-r--r--  ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S | 143
-rw-r--r--  ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c | 38
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_arm.c | 52
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_arm.h | 26
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_armv5te.c | 102
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S | 114
-rw-r--r--  ffmpeg/libavcodec/arm/mpegvideo_neon.S | 107
-rw-r--r--  ffmpeg/libavcodec/arm/neon.S | 59
-rw-r--r--  ffmpeg/libavcodec/arm/rdft_neon.S | 150
-rw-r--r--  ffmpeg/libavcodec/arm/rv34dsp_init_arm.c | 46
-rw-r--r--  ffmpeg/libavcodec/arm/rv34dsp_neon.S | 156
-rw-r--r--  ffmpeg/libavcodec/arm/rv40dsp_init_arm.c | 148
-rw-r--r--  ffmpeg/libavcodec/arm/rv40dsp_neon.S | 920
-rw-r--r--  ffmpeg/libavcodec/arm/sbrdsp_init_arm.c | 73
-rw-r--r--  ffmpeg/libavcodec/arm/sbrdsp_neon.S | 411
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_arm.S | 479
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_armv5te.S | 620
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_armv6.S | 425
-rw-r--r--  ffmpeg/libavcodec/arm/simple_idct_neon.S | 375
-rw-r--r--  ffmpeg/libavcodec/arm/synth_filter_neon.S | 115
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_arm.h | 29
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_armv5te.S | 31
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_init_arm.c | 30
-rw-r--r--  ffmpeg/libavcodec/arm/videodsp_init_armv5te.c | 33
-rw-r--r--  ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c | 37
-rw-r--r--  ffmpeg/libavcodec/arm/vorbisdsp_neon.S | 83
-rw-r--r--  ffmpeg/libavcodec/arm/vp3dsp_init_arm.c | 45
-rw-r--r--  ffmpeg/libavcodec/arm/vp3dsp_neon.S | 395
-rw-r--r--  ffmpeg/libavcodec/arm/vp56_arith.h | 121
-rw-r--r--  ffmpeg/libavcodec/arm/vp56dsp_init_arm.c | 39
-rw-r--r--  ffmpeg/libavcodec/arm/vp56dsp_neon.S | 121
-rw-r--r--  ffmpeg/libavcodec/arm/vp8.h | 35
-rw-r--r--  ffmpeg/libavcodec/arm/vp8_armv6.S | 248
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp.h | 78
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_armv6.S | 1634
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_arm.c | 34
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c | 120
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_init_neon.c | 116
-rw-r--r--  ffmpeg/libavcodec/arm/vp8dsp_neon.S | 1867
89 files changed, 19024 insertions, 0 deletions
diff --git a/ffmpeg/libavcodec/arm/Makefile b/ffmpeg/libavcodec/arm/Makefile
new file mode 100644
index 0000000..011404c
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/Makefile
@@ -0,0 +1,116 @@
+ARCH_HEADERS = mathops.h
+
+OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_init_arm.o \
+ arm/ac3dsp_arm.o
+
+OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_init_arm.o \
+ arm/aacpsdsp_init_arm.o
+
+OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_init_arm.o \
+
+ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
+
+OBJS-$(CONFIG_FLAC_DECODER) += arm/flacdsp_init_arm.o \
+ arm/flacdsp_arm.o \
+
+OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
+
+OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
+OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_init_arm.o
+OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o
+OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_init_arm.o
+OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_arm.o
+ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
+ arm/vp8dsp_init_armv6.o \
+ arm/vp8dsp_armv6.o
+
+OBJS-$(CONFIG_H264CHROMA) += arm/h264chroma_init_arm.o
+OBJS-$(CONFIG_H264DSP) += arm/h264dsp_init_arm.o
+OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
+OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
+
+OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_arm.o \
+ arm/hpeldsp_init_arm.o
+
+OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_init_arm.o
+OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_init_arm.o \
+ arm/rv40dsp_init_arm.o \
+
+OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o \
+
+OBJS += arm/dsputil_init_arm.o \
+ arm/dsputil_arm.o \
+ arm/fft_init_arm.o \
+ arm/fft_fixed_init_arm.o \
+ arm/fmtconvert_init_arm.o \
+ arm/jrevdct_arm.o \
+ arm/simple_idct_arm.o \
+
+ARMV5TE-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_armv5te.o \
+ arm/mpegvideo_armv5te_s.o \
+
+ARMV5TE-OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_armv5te.o \
+ arm/videodsp_armv5te.o \
+
+ARMV5TE-OBJS += arm/dsputil_init_armv5te.o \
+ arm/simple_idct_armv5te.o \
+
+ARMV6-OBJS += arm/dsputil_init_armv6.o \
+ arm/dsputil_armv6.o \
+ arm/simple_idct_armv6.o \
+
+ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_armv6.o \
+ arm/hpeldsp_init_armv6.o
+
+VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
+
+NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
+ arm/fft_fixed_neon.o \
+
+NEON-OBJS-$(CONFIG_MDCT) += arm/mdct_neon.o \
+ arm/mdct_fixed_neon.o \
+
+NEON-OBJS-$(CONFIG_RDFT) += arm/rdft_neon.o \
+
+NEON-OBJS-$(CONFIG_H264CHROMA) += arm/h264cmc_neon.o
+NEON-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_neon.o \
+ arm/h264idct_neon.o \
+
+NEON-OBJS-$(CONFIG_H264PRED) += arm/h264pred_neon.o \
+
+NEON-OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_neon.o \
+
+NEON-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_neon.o \
+ arm/hpeldsp_init_neon.o
+
+NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
+
+NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/sbrdsp_neon.o \
+ arm/aacpsdsp_neon.o
+
+NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
+ arm/synth_filter_neon.o \
+
+NEON-OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_neon.o
+NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
+NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
+ arm/rv40dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VORBIS_DECODER) += arm/vorbisdsp_neon.o
+
+NEON-OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_neon.o
+
+NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VP6_DECODER) += arm/vp56dsp_neon.o \
+
+NEON-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8dsp_init_neon.o \
+ arm/vp8dsp_neon.o
+
+NEON-OBJS += arm/dsputil_init_neon.o \
+ arm/dsputil_neon.o \
+ arm/fmtconvert_neon.o \
+ arm/int_neon.o \
+ arm/simple_idct_neon.o \
diff --git a/ffmpeg/libavcodec/arm/aac.h b/ffmpeg/libavcodec/arm/aac.h
new file mode 100644
index 0000000..cafa881
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/aac.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_AAC_H
+#define AVCODEC_ARM_AAC_H
+
+#include "config.h"
+
+#if HAVE_NEON_INLINE
+
+#define VMUL2 VMUL2
+static inline float *VMUL2(float *dst, const float *v, unsigned idx,
+ const float *scale)
+{
+ unsigned v0, v1;
+ __asm__ ("ubfx %0, %6, #0, #4 \n\t"
+ "ubfx %1, %6, #4, #4 \n\t"
+ "ldr %0, [%5, %0, lsl #2] \n\t"
+ "ldr %1, [%5, %1, lsl #2] \n\t"
+ "vld1.32 {d1[]}, [%7,:32] \n\t"
+ "vmov d0, %0, %1 \n\t"
+ "vmul.f32 d0, d0, d1 \n\t"
+ "vst1.32 {d0}, [%2,:64]! \n\t"
+ : "=&r"(v0), "=&r"(v1), "+r"(dst), "=m"(dst[0]), "=m"(dst[1])
+ : "r"(v), "r"(idx), "r"(scale)
+ : "d0", "d1");
+ return dst;
+}
+
+#define VMUL4 VMUL4
+static inline float *VMUL4(float *dst, const float *v, unsigned idx,
+ const float *scale)
+{
+ unsigned v0, v1, v2, v3;
+ __asm__ ("ubfx %0, %10, #0, #2 \n\t"
+ "ubfx %1, %10, #2, #2 \n\t"
+ "ldr %0, [%9, %0, lsl #2] \n\t"
+ "ubfx %2, %10, #4, #2 \n\t"
+ "ldr %1, [%9, %1, lsl #2] \n\t"
+ "ubfx %3, %10, #6, #2 \n\t"
+ "ldr %2, [%9, %2, lsl #2] \n\t"
+ "vmov d0, %0, %1 \n\t"
+ "ldr %3, [%9, %3, lsl #2] \n\t"
+ "vld1.32 {d2[],d3[]},[%11,:32] \n\t"
+ "vmov d1, %2, %3 \n\t"
+ "vmul.f32 q0, q0, q1 \n\t"
+ "vst1.32 {q0}, [%4,:128]! \n\t"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
+ "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
+ : "r"(v), "r"(idx), "r"(scale)
+ : "d0", "d1", "d2", "d3");
+ return dst;
+}
+
+#define VMUL2S VMUL2S
+static inline float *VMUL2S(float *dst, const float *v, unsigned idx,
+ unsigned sign, const float *scale)
+{
+ unsigned v0, v1, v2, v3;
+ __asm__ ("ubfx %0, %8, #0, #4 \n\t"
+ "ubfx %1, %8, #4, #4 \n\t"
+ "ldr %0, [%7, %0, lsl #2] \n\t"
+ "lsl %2, %10, #30 \n\t"
+ "ldr %1, [%7, %1, lsl #2] \n\t"
+ "lsl %3, %10, #31 \n\t"
+ "vmov d0, %0, %1 \n\t"
+ "bic %2, %2, #1<<30 \n\t"
+ "vld1.32 {d1[]}, [%9,:32] \n\t"
+ "vmov d2, %2, %3 \n\t"
+ "veor d0, d0, d2 \n\t"
+ "vmul.f32 d0, d0, d1 \n\t"
+ "vst1.32 {d0}, [%4,:64]! \n\t"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
+ "=m"(dst[0]), "=m"(dst[1])
+ : "r"(v), "r"(idx), "r"(scale), "r"(sign)
+ : "d0", "d1", "d2");
+ return dst;
+}
+
+#define VMUL4S VMUL4S
+static inline float *VMUL4S(float *dst, const float *v, unsigned idx,
+ unsigned sign, const float *scale)
+{
+ unsigned v0, v1, v2, v3, nz;
+ __asm__ ("vld1.32 {d2[],d3[]},[%13,:32] \n\t"
+ "ubfx %0, %12, #0, #2 \n\t"
+ "ubfx %1, %12, #2, #2 \n\t"
+ "ldr %0, [%11,%0, lsl #2] \n\t"
+ "ubfx %2, %12, #4, #2 \n\t"
+ "ldr %1, [%11,%1, lsl #2] \n\t"
+ "ubfx %3, %12, #6, #2 \n\t"
+ "ldr %2, [%11,%2, lsl #2] \n\t"
+ "vmov d0, %0, %1 \n\t"
+ "ldr %3, [%11,%3, lsl #2] \n\t"
+ "lsr %6, %12, #12 \n\t"
+ "rbit %6, %6 \n\t"
+ "vmov d1, %2, %3 \n\t"
+ "lsls %6, %6, #1 \n\t"
+ "and %0, %5, #1<<31 \n\t"
+ "it cs \n\t"
+ "lslcs %5, %5, #1 \n\t"
+ "lsls %6, %6, #1 \n\t"
+ "and %1, %5, #1<<31 \n\t"
+ "it cs \n\t"
+ "lslcs %5, %5, #1 \n\t"
+ "lsls %6, %6, #1 \n\t"
+ "and %2, %5, #1<<31 \n\t"
+ "it cs \n\t"
+ "lslcs %5, %5, #1 \n\t"
+ "vmov d4, %0, %1 \n\t"
+ "and %3, %5, #1<<31 \n\t"
+ "vmov d5, %2, %3 \n\t"
+ "veor q0, q0, q2 \n\t"
+ "vmul.f32 q0, q0, q1 \n\t"
+ "vst1.32 {q0}, [%4,:128]! \n\t"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2), "=&r"(v3), "+r"(dst),
+ "+r"(sign), "=r"(nz),
+ "=m"(dst[0]), "=m"(dst[1]), "=m"(dst[2]), "=m"(dst[3])
+ : "r"(v), "r"(idx), "r"(scale)
+ : "cc", "d0", "d1", "d2", "d3", "d4", "d5");
+ return dst;
+}
+
+#endif /* HAVE_NEON_INLINE */
+
+#endif /* AVCODEC_ARM_AAC_H */
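The VMUL2/VMUL4(S) helpers above override the AAC decoder's generic codebook dequantization with NEON inline assembly. A plain-C sketch of what VMUL2 computes, inferred from reading the asm rather than taken from the decoder's own fallback: two 4-bit indices are unpacked from idx, the selected codebook entries are scaled by *scale, and dst is advanced past the two stores.

/* Hedged scalar sketch of the VMUL2 helper defined above. */
static inline float *vmul2_sketch(float *dst, const float *v, unsigned idx,
                                  const float *scale)
{
    float s = *scale;
    *dst++ = v[idx        & 15] * s;   /* ubfx #0,#4 + ldr [v, idx, lsl #2] */
    *dst++ = v[(idx >> 4) & 15] * s;   /* ubfx #4,#4 + ldr                  */
    return dst;                        /* the "!" post-increment on the vst */
}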
diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
new file mode 100644
index 0000000..6326376
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/aacpsdsp_init_arm.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/aacpsdsp.h"
+
+void ff_ps_add_squares_neon(float *dst, const float (*src)[2], int n);
+void ff_ps_mul_pair_single_neon(float (*dst)[2], float (*src0)[2],
+ float *src1, int n);
+void ff_ps_hybrid_analysis_neon(float (*out)[2], float (*in)[2],
+ const float (*filter)[8][2],
+ int stride, int n);
+void ff_ps_hybrid_analysis_ileave_neon(float (*out)[32][2], float L[2][38][64],
+ int i, int len);
+void ff_ps_hybrid_synthesis_deint_neon(float out[2][38][64], float (*in)[32][2],
+ int i, int len);
+void ff_ps_decorrelate_neon(float (*out)[2], float (*delay)[2],
+ float (*ap_delay)[PS_QMF_TIME_SLOTS+PS_MAX_AP_DELAY][2],
+ const float phi_fract[2], float (*Q_fract)[2],
+ const float *transient_gain, float g_decay_slope,
+ int len);
+void ff_ps_stereo_interpolate_neon(float (*l)[2], float (*r)[2],
+ float h[2][4], float h_step[2][4],
+ int len);
+
+av_cold void ff_psdsp_init_arm(PSDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->add_squares = ff_ps_add_squares_neon;
+ s->mul_pair_single = ff_ps_mul_pair_single_neon;
+ s->hybrid_synthesis_deint = ff_ps_hybrid_synthesis_deint_neon;
+ s->hybrid_analysis = ff_ps_hybrid_analysis_neon;
+ s->stereo_interpolate[0] = ff_ps_stereo_interpolate_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/aacpsdsp_neon.S b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
new file mode 100644
index 0000000..fb00900
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/aacpsdsp_neon.S
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_ps_add_squares_neon, export=1
+ mov r3, r0
+ sub r2, r2, #4
+ vld1.32 {q0}, [r1,:128]!
+ vmul.f32 q0, q0, q0
+ vld1.32 {q2}, [r1,:128]!
+ vmul.f32 q2, q2, q2
+ vld1.32 {q1}, [r0,:128]!
+1:
+ vpadd.f32 d6, d0, d1
+ vld1.32 {q0}, [r1,:128]!
+ vpadd.f32 d7, d4, d5
+ vmul.f32 q0, q0, q0
+ vld1.32 {q2}, [r1,:128]!
+ vadd.f32 q3, q1, q3
+ vld1.32 {q1}, [r0,:128]!
+ vmul.f32 q2, q2, q2
+ vst1.32 {q3}, [r3,:128]!
+ subs r2, r2, #4
+ bgt 1b
+ vpadd.f32 d6, d0, d1
+ vpadd.f32 d7, d4, d5
+ vadd.f32 q1, q1, q3
+ vst1.32 {q1}, [r3,:128]!
+ bx lr
+endfunc
+
+function ff_ps_mul_pair_single_neon, export=1
+ sub r3, r3, #4
+ tst r1, #8
+ bne 2f
+ vld1.32 {q0}, [r1,:128]!
+1:
+ vld1.32 {q3}, [r2,:128]!
+ vmul.f32 d4, d0, d6[0]
+ vmul.f32 d5, d1, d6[1]
+ vld1.32 {q1}, [r1,:128]!
+ vmul.f32 d6, d2, d7[0]
+ vmul.f32 d7, d3, d7[1]
+ vld1.32 {q0}, [r1,:128]!
+ vst1.32 {q2,q3}, [r0,:128]!
+ subs r3, r3, #4
+ bgt 1b
+ vld1.32 {q3}, [r2,:128]!
+ vmul.f32 d4, d0, d6[0]
+ vmul.f32 d5, d1, d6[1]
+ vld1.32 {q1}, [r1,:128]!
+ vmul.f32 d6, d2, d7[0]
+ vmul.f32 d7, d3, d7[1]
+ vst1.32 {q2,q3}, [r0,:128]!
+ bx lr
+2:
+ vld1.32 {d0}, [r1,:64]!
+ vld1.32 {d1,d2}, [r1,:128]!
+1:
+ vld1.32 {q3}, [r2,:128]!
+ vmul.f32 d4, d0, d6[0]
+ vmul.f32 d5, d1, d6[1]
+ vld1.32 {d0,d1}, [r1,:128]!
+ vmul.f32 d6, d2, d7[0]
+ vmul.f32 d7, d0, d7[1]
+ vmov d0, d1
+ vld1.32 {d1,d2}, [r1,:128]!
+ vst1.32 {q2,q3}, [r0,:128]!
+ subs r3, r3, #4
+ bgt 1b
+ vld1.32 {q3}, [r2,:128]!
+ vmul.f32 d4, d0, d6[0]
+ vmul.f32 d5, d1, d6[1]
+ vld1.32 {d0}, [r1,:64]!
+ vmul.f32 d6, d2, d7[0]
+ vmul.f32 d7, d0, d7[1]
+ vst1.32 {q2,q3}, [r0,:128]!
+ bx lr
+endfunc
+
+function ff_ps_hybrid_synthesis_deint_neon, export=1
+ push {r4-r8,lr}
+ add r0, r0, r2, lsl #2
+ add r1, r1, r2, lsl #5+1+2
+ rsb r2, r2, #64
+ mov r5, #64*4
+ mov lr, r0
+ add r4, r0, #38*64*4
+ mov r12, r3
+2:
+ vld1.32 {d0,d1}, [r1,:128]!
+ vst1.32 {d0[0]}, [lr,:32], r5
+ vst1.32 {d0[1]}, [r4,:32], r5
+ vst1.32 {d1[0]}, [lr,:32], r5
+ vst1.32 {d1[1]}, [r4,:32], r5
+ subs r12, r12, #2
+ bgt 2b
+ add r0, r0, #4
+ sub r2, r2, #1
+ tst r2, #2
+ bne 6f
+1:
+ mov lr, r0
+ add r4, r0, #38*64*4
+ add r6, r1, # 32*2*4
+ add r7, r1, #2*32*2*4
+ add r8, r1, #3*32*2*4
+ mov r12, r3
+2:
+ vld1.32 {d0,d1}, [r1,:128]!
+ vld1.32 {d2,d3}, [r6,:128]!
+ vld1.32 {d4,d5}, [r7,:128]!
+ vld1.32 {d6,d7}, [r8,:128]!
+ vst4.32 {d0[0],d2[0],d4[0],d6[0]}, [lr,:128], r5
+ vst4.32 {d0[1],d2[1],d4[1],d6[1]}, [r4,:128], r5
+ vst4.32 {d1[0],d3[0],d5[0],d7[0]}, [lr,:128], r5
+ vst4.32 {d1[1],d3[1],d5[1],d7[1]}, [r4,:128], r5
+ subs r12, r12, #2
+ bgt 2b
+ add r0, r0, #16
+ add r1, r1, #3*32*2*4
+ subs r2, r2, #4
+ bgt 1b
+ pop {r4-r8,pc}
+6:
+ mov lr, r0
+ add r4, r0, #38*64*4
+ add r6, r1, #32*2*4
+ mov r12, r3
+2:
+ vld1.32 {d0,d1}, [r1,:128]!
+ vld1.32 {d2,d3}, [r6,:128]!
+ vst2.32 {d0[0],d2[0]}, [lr,:64], r5
+ vst2.32 {d0[1],d2[1]}, [r4,:64], r5
+ vst2.32 {d1[0],d3[0]}, [lr,:64], r5
+ vst2.32 {d1[1],d3[1]}, [r4,:64], r5
+ subs r12, r12, #2
+ bgt 2b
+ add r0, r0, #8
+ add r1, r1, #32*2*4
+ sub r2, r2, #2
+ b 1b
+endfunc
+
+function ff_ps_hybrid_analysis_neon, export=1
+ vldm r1, {d19-d31}
+ ldr r12, [sp]
+ lsl r3, r3, #3
+ vadd.f32 d16, d19, d31
+ vadd.f32 d17, d20, d30
+ vsub.f32 d18, d19, d31
+ vsub.f32 d19, d20, d30
+ vsub.f32 d0, d21, d29
+ vsub.f32 d1, d22, d28
+ vadd.f32 d2, d21, d29
+ vadd.f32 d3, d22, d28
+ vadd.f32 d20, d23, d27
+ vadd.f32 d21, d24, d26
+ vsub.f32 d22, d23, d27
+ vsub.f32 d23, d24, d26
+ vmov.i32 d6, #1<<31
+ vmov.i32 d7, #0
+ vmov.f32 q14, #0.0
+ vmov.f32 q15, #0.0
+ vtrn.32 d6, d7
+ vrev64.32 q9, q9
+ vrev64.32 q0, q0
+ vrev64.32 q11, q11
+ veor q9, q9, q3
+ veor q0, q0, q3
+ veor q11, q11, q3
+ vld1.32 {q13}, [r2,:128]!
+ vtrn.32 q8, q9
+ vtrn.32 q1, q0
+ vtrn.32 q10, q11
+ sub r12, r12, #1
+ vmla.f32 q14, q8, q13
+ vld1.32 {q2}, [r2,:128]!
+ vmla.f32 q15, q9, q13
+1:
+ vmla.f32 q14, q1, q2
+ vld1.32 {q13}, [r2,:128]!
+ vmla.f32 q15, q0, q2
+ vmla.f32 q14, q10, q13
+ vld1.32 {q2}, [r2,:128]!
+ vmla.f32 q15, q11, q13
+ vld1.32 {q13}, [r2,:128]!
+ vadd.f32 d6, d28, d29
+ vadd.f32 d7, d30, d31
+ vmov.f32 q14, #0.0
+ vmov.f32 q15, #0.0
+ vmla.f32 q14, q8, q13
+ vpadd.f32 d6, d6, d7
+ vmla.f32 q15, q9, q13
+ vmla.f32 d6, d25, d4[0]
+ vld1.32 {q2}, [r2,:128]!
+ vst1.32 {d6}, [r0,:64], r3
+ subs r12, r12, #1
+ bgt 1b
+ vmla.f32 q14, q1, q2
+ vld1.32 {q13}, [r2,:128]!
+ vmla.f32 q15, q0, q2
+ vmla.f32 q14, q10, q13
+ vld1.32 {q2}, [r2,:128]!
+ vmla.f32 q15, q11, q13
+ vadd.f32 d6, d28, d29
+ vadd.f32 d7, d30, d31
+ vpadd.f32 d6, d6, d7
+ vmla.f32 d6, d25, d4[0]
+ vst1.32 {d6}, [r0,:64], r3
+ bx lr
+endfunc
+
+function ff_ps_stereo_interpolate_neon, export=1
+ vld1.32 {q0}, [r2]
+ vld1.32 {q14}, [r3]
+ vadd.f32 q15, q14, q14
+ mov r2, r0
+ mov r3, r1
+ ldr r12, [sp]
+ vadd.f32 q1, q0, q14
+ vadd.f32 q0, q0, q15
+ vld1.32 {q2}, [r0,:64]!
+ vld1.32 {q3}, [r1,:64]!
+ subs r12, r12, #1
+ beq 2f
+1:
+ vmul.f32 d16, d4, d2[0]
+ vmul.f32 d17, d5, d0[0]
+ vmul.f32 d18, d4, d2[1]
+ vmul.f32 d19, d5, d0[1]
+ vmla.f32 d16, d6, d3[0]
+ vmla.f32 d17, d7, d1[0]
+ vmla.f32 d18, d6, d3[1]
+ vmla.f32 d19, d7, d1[1]
+ vadd.f32 q1, q1, q15
+ vadd.f32 q0, q0, q15
+ vld1.32 {q2}, [r0,:64]!
+ vld1.32 {q3}, [r1,:64]!
+ vst1.32 {q8}, [r2,:64]!
+ vst1.32 {q9}, [r3,:64]!
+ subs r12, r12, #2
+ bgt 1b
+ it lt
+ bxlt lr
+2:
+ vmul.f32 d16, d4, d2[0]
+ vmul.f32 d18, d4, d2[1]
+ vmla.f32 d16, d6, d3[0]
+ vmla.f32 d18, d6, d3[1]
+ vst1.32 {d16}, [r2,:64]!
+ vst1.32 {d18}, [r3,:64]!
+ bx lr
+endfunc
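The two simplest routines in this file map onto short scalar loops. The sketches below are inferred from the NEON code above (illustrations, not the reference implementations): ff_ps_add_squares_neon accumulates the squared magnitude of each complex sample into dst, and ff_ps_mul_pair_single_neon scales each complex pair in src0 by the matching real value in src1.

static void ps_add_squares_sketch(float *dst, const float (*src)[2], int n)
{
    for (int i = 0; i < n; i++)                 /* vmul + vpadd + vadd      */
        dst[i] += src[i][0] * src[i][0] + src[i][1] * src[i][1];
}

static void ps_mul_pair_single_sketch(float (*dst)[2], float (*src0)[2],
                                      float *src1, int n)
{
    for (int i = 0; i < n; i++) {               /* vmul.f32 dN, dM, dK[x]   */
        dst[i][0] = src0[i][0] * src1[i];
        dst[i][1] = src0[i][1] * src1[i];
    }
}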
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_arm.S b/ffmpeg/libavcodec/arm/ac3dsp_arm.S
new file mode 100644
index 0000000..ed8eb37
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/ac3dsp_arm.S
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_ac3_update_bap_counts_arm, export=1
+ push {lr}
+ ldrb lr, [r1], #1
+1:
+ lsl r3, lr, #1
+ ldrh r12, [r0, r3]
+ subs r2, r2, #1
+ it gt
+ ldrbgt lr, [r1], #1
+ add r12, r12, #1
+ strh r12, [r0, r3]
+ bgt 1b
+ pop {pc}
+endfunc
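ff_ac3_update_bap_counts_arm above is a histogram update: each coefficient's bit-allocation pointer selects which mantissa counter to increment. A scalar sketch, inferred from the ldrb/ldrh/strh sequence:

#include <stdint.h>

/* Scalar sketch of the loop above (illustration, not the C reference). */
static void update_bap_counts_sketch(uint16_t mant_cnt[16], uint8_t *bap,
                                     int len)
{
    for (int i = 0; i < len; i++)
        mant_cnt[bap[i]]++;
}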
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_armv6.S b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
new file mode 100644
index 0000000..2028d0b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/ac3dsp_armv6.S
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_ac3_bit_alloc_calc_bap_armv6, export=1
+ ldr r12, [sp]
+ cmp r12, #-960
+ beq 4f
+ push {r4-r11,lr}
+ add r5, sp, #40
+ movrelx r4, X(ff_ac3_bin_to_band_tab), r11
+ movrelx lr, X(ff_ac3_band_start_tab)
+ ldm r5, {r5-r7}
+ ldrb r4, [r4, r2]
+ add r1, r1, r2, lsl #1 @ psd + start
+ add r0, r0, r4, lsl #1 @ mask + band
+ add r4, r4, lr
+ add r7, r7, r2 @ bap + start
+1:
+ ldrsh r9, [r0], #2 @ mask[band]
+ mov r8, #0xff0
+ sub r9, r9, r12 @ - snr_offset
+ ldrb r10, [r4, #1]! @ band_start_tab[++band]
+ subs r9, r9, r5 @ - floor
+ it lt
+ movlt r9, #0
+ cmp r10, r3 @ - end
+ and r9, r9, r8, lsl #1 @ & 0x1fe0
+ ite gt
+ subgt r8, r3, r2
+ suble r8, r10, r2
+ mov r2, r10
+ add r9, r9, r5 @ + floor => m
+ tst r8, #1
+ add r11, r7, r8
+ bne 3f
+ b 5f
+2:
+ ldrsh r8, [r1], #2
+ ldrsh lr, [r1], #2
+ sub r8, r8, r9
+ sub lr, lr, r9
+ usat r8, #6, r8, asr #5 @ address
+ usat lr, #6, lr, asr #5
+ ldrb r8, [r6, r8] @ bap_tab[address]
+ ldrb lr, [r6, lr]
+ strb r8, [r7], #1 @ bap[bin]
+ strb lr, [r7], #1
+5: cmp r7, r11
+ blo 2b
+ cmp r3, r10
+ bgt 1b
+ pop {r4-r11,pc}
+3:
+ ldrsh r8, [r1], #2 @ psd[bin]
+ sub r8, r8, r9 @ - m
+ usat r8, #6, r8, asr #5 @ address
+ ldrb r8, [r6, r8] @ bap_tab[address]
+ strb r8, [r7], #1 @ bap[bin]
+ b 5b
+4:
+ ldr r0, [sp, #12]
+ mov r1, #0
+ mov r2, #256
+ b X(memset)
+endfunc
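ff_ac3_bit_alloc_calc_bap_armv6 walks the banded masking curve and quantizes each PSD bin into a bit-allocation pointer via bap_tab. The sketch below is a hedged reconstruction from the asm (the two extern tables are the ones referenced by the movrelx instructions, and the clamp mirrors the usat #6, asr #5 step); it is not a copy of the C reference.

#include <stdint.h>
#include <string.h>

extern const uint8_t ff_ac3_bin_to_band_tab[];   /* looked up with ldrb above */
extern const uint8_t ff_ac3_band_start_tab[];

static void bit_alloc_calc_bap_sketch(int16_t *mask, int16_t *psd,
                                      int start, int end,
                                      int snr_offset, int floor,
                                      const uint8_t *bap_tab, uint8_t *bap)
{
    if (snr_offset == -960) {            /* special case handled at label 4: */
        memset(bap, 0, 256);
        return;
    }
    int bin      = start;
    int band     = ff_ac3_bin_to_band_tab[start];
    int band_end = start;
    do {
        int m = mask[band] - snr_offset - floor;
        if (m < 0)
            m = 0;                       /* "movlt r9, #0"                   */
        m = (m & 0x1FE0) + floor;        /* "and ... lsl #1" then "+ floor"  */
        band_end = ff_ac3_band_start_tab[++band];
        if (band_end > end)
            band_end = end;
        for (; bin < band_end; bin++) {
            int address = (psd[bin] - m) >> 5;
            if (address < 0)  address = 0;      /* usat #6, asr #5           */
            if (address > 63) address = 63;
            bap[bin] = bap_tab[address];
        }
    } while (end > band_end);
}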
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
new file mode 100644
index 0000000..ffe0747
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/ac3dsp_init_arm.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/ac3dsp.h"
+#include "config.h"
+
+void ff_ac3_exponent_min_neon(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
+int ff_ac3_max_msb_abs_int16_neon(const int16_t *src, int len);
+void ff_ac3_lshift_int16_neon(int16_t *src, unsigned len, unsigned shift);
+void ff_ac3_rshift_int32_neon(int32_t *src, unsigned len, unsigned shift);
+void ff_float_to_fixed24_neon(int32_t *dst, const float *src, unsigned int len);
+void ff_ac3_extract_exponents_neon(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_ac3_sum_square_butterfly_int32_neon(int64_t sum[4],
+ const int32_t *coef0,
+ const int32_t *coef1,
+ int len);
+void ff_ac3_sum_square_butterfly_float_neon(float sum[4],
+ const float *coef0,
+ const float *coef1,
+ int len);
+
+void ff_ac3_bit_alloc_calc_bap_armv6(int16_t *mask, int16_t *psd,
+ int start, int end,
+ int snr_offset, int floor,
+ const uint8_t *bap_tab, uint8_t *bap);
+
+void ff_ac3_update_bap_counts_arm(uint16_t mant_cnt[16], uint8_t *bap, int len);
+
+av_cold void ff_ac3dsp_init_arm(AC3DSPContext *c, int bit_exact)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ c->update_bap_counts = ff_ac3_update_bap_counts_arm;
+
+ if (have_armv6(cpu_flags)) {
+ c->bit_alloc_calc_bap = ff_ac3_bit_alloc_calc_bap_armv6;
+ }
+
+ if (have_neon(cpu_flags)) {
+ c->ac3_exponent_min = ff_ac3_exponent_min_neon;
+ c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_neon;
+ c->ac3_lshift_int16 = ff_ac3_lshift_int16_neon;
+ c->ac3_rshift_int32 = ff_ac3_rshift_int32_neon;
+ c->float_to_fixed24 = ff_float_to_fixed24_neon;
+ c->extract_exponents = ff_ac3_extract_exponents_neon;
+ c->sum_square_butterfly_int32 = ff_ac3_sum_square_butterfly_int32_neon;
+ c->sum_square_butterfly_float = ff_ac3_sum_square_butterfly_float_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/ac3dsp_neon.S b/ffmpeg/libavcodec/arm/ac3dsp_neon.S
new file mode 100644
index 0000000..42f35e3
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/ac3dsp_neon.S
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_ac3_max_msb_abs_int16_neon, export=1
+ vmov.i16 q0, #0
+ vmov.i16 q2, #0
+1: vld1.16 {q1}, [r0,:128]!
+ vabs.s16 q1, q1
+ vld1.16 {q3}, [r0,:128]!
+ vabs.s16 q3, q3
+ vorr q0, q0, q1
+ vorr q2, q2, q3
+ subs r1, r1, #16
+ bgt 1b
+ vorr q0, q0, q2
+ vorr d0, d0, d1
+ vpmax.u16 d0, d0, d0
+ vpmax.u16 d0, d0, d0
+ vmov.u16 r0, d0[0]
+ bx lr
+endfunc
+
+function ff_ac3_exponent_min_neon, export=1
+ cmp r1, #0
+ it eq
+ bxeq lr
+ push {lr}
+ mov r12, #256
+1:
+ vld1.8 {q0}, [r0,:128]
+ mov lr, r1
+ add r3, r0, #256
+2: vld1.8 {q1}, [r3,:128], r12
+ subs lr, lr, #1
+ vmin.u8 q0, q0, q1
+ bgt 2b
+ subs r2, r2, #16
+ vst1.8 {q0}, [r0,:128]!
+ bgt 1b
+ pop {pc}
+endfunc
+
+function ff_ac3_lshift_int16_neon, export=1
+ vdup.16 q0, r2
+1: vld1.16 {q1}, [r0,:128]
+ vshl.s16 q1, q1, q0
+ vst1.16 {q1}, [r0,:128]!
+ subs r1, r1, #8
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_ac3_rshift_int32_neon, export=1
+ rsb r2, r2, #0
+ vdup.32 q0, r2
+1: vld1.32 {q1}, [r0,:128]
+ vshl.s32 q1, q1, q0
+ vst1.32 {q1}, [r0,:128]!
+ subs r1, r1, #4
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_float_to_fixed24_neon, export=1
+1: vld1.32 {q0-q1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #24
+ vld1.32 {q2-q3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #24
+ vcvt.s32.f32 q2, q2, #24
+ vst1.32 {q0-q1}, [r0,:128]!
+ vcvt.s32.f32 q3, q3, #24
+ vst1.32 {q2-q3}, [r0,:128]!
+ subs r2, r2, #16
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_ac3_extract_exponents_neon, export=1
+ vmov.i32 q15, #8
+1:
+ vld1.32 {q0}, [r1,:128]!
+ vabs.s32 q1, q0
+ vclz.i32 q3, q1
+ vsub.i32 q3, q3, q15
+ vmovn.i32 d6, q3
+ vmovn.i16 d6, q3
+ vst1.32 {d6[0]}, [r0,:32]!
+ subs r2, r2, #4
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_int32_neon, export=1
+ vmov.i64 q0, #0
+ vmov.i64 q1, #0
+ vmov.i64 q2, #0
+ vmov.i64 q3, #0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.s32 d18, d16, d17
+ vsub.s32 d19, d16, d17
+ vmlal.s32 q0, d16, d16
+ vmlal.s32 q1, d17, d17
+ vmlal.s32 q2, d18, d18
+ vmlal.s32 q3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vadd.s64 d0, d0, d1
+ vadd.s64 d1, d2, d3
+ vadd.s64 d2, d4, d5
+ vadd.s64 d3, d6, d7
+ vst1.64 {q0-q1}, [r0]
+ bx lr
+endfunc
+
+function ff_ac3_sum_square_butterfly_float_neon, export=1
+ vmov.f32 q0, #0.0
+ vmov.f32 q1, #0.0
+1:
+ vld1.32 {d16}, [r1]!
+ vld1.32 {d17}, [r2]!
+ vadd.f32 d18, d16, d17
+ vsub.f32 d19, d16, d17
+ vmla.f32 d0, d16, d16
+ vmla.f32 d1, d17, d17
+ vmla.f32 d2, d18, d18
+ vmla.f32 d3, d19, d19
+ subs r3, r3, #2
+ bgt 1b
+ vpadd.f32 d0, d0, d1
+ vpadd.f32 d1, d2, d3
+ vst1.32 {q0}, [r0]
+ bx lr
+endfunc
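Two of the routines above have straightforward scalar meanings. ff_float_to_fixed24_neon relies on vcvt.s32.f32 with 24 fractional bits, i.e. roughly a multiply by 2^24 with truncation, and ff_ac3_exponent_min_neon keeps, for each of the 256 coefficient slots, the smallest exponent across the reused blocks (hence the fixed 256-byte stride in the asm). Hedged scalar sketches:

#include <stdint.h>

static void float_to_fixed24_sketch(int32_t *dst, const float *src,
                                    unsigned int len)
{
    for (unsigned int i = 0; i < len; i++)
        dst[i] = (int32_t)(src[i] * (1 << 24));   /* vcvt.s32.f32 ..., #24 */
}

static void ac3_exponent_min_sketch(uint8_t *exp, int num_reuse_blocks,
                                    int nb_coefs)
{
    for (int i = 0; i < nb_coefs; i++) {
        uint8_t min_exp = exp[i];
        for (int blk = 1; blk <= num_reuse_blocks; blk++) {
            uint8_t e = exp[blk * 256 + i];       /* 256-byte block stride */
            if (e < min_exp)
                min_exp = e;
        }
        exp[i] = min_exp;
    }
}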
diff --git a/ffmpeg/libavcodec/arm/asm-offsets.h b/ffmpeg/libavcodec/arm/asm-offsets.h
new file mode 100644
index 0000000..5cfc5cb
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/asm-offsets.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_ASM_OFFSETS_H
+#define AVCODEC_ARM_ASM_OFFSETS_H
+
+#ifndef __ASSEMBLER__
+#include <stddef.h>
+#define CHK_OFFS(s, m, o) struct check_##o { \
+ int x_##o[offsetof(s, m) == o? 1: -1]; \
+ }
+#endif
+
+/* MpegEncContext */
+#define Y_DC_SCALE 0xa8
+#define C_DC_SCALE 0xac
+#define AC_PRED 0xb0
+#define BLOCK_LAST_INDEX 0xb4
+#define H263_AIC 0xe4
+#define INTER_SCANTAB_RASTER_END 0x12c
+
+#endif /* AVCODEC_ARM_ASM_OFFSETS_H */
diff --git a/ffmpeg/libavcodec/arm/dca.h b/ffmpeg/libavcodec/arm/dca.h
new file mode 100644
index 0000000..2cfd18a
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dca.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_DCA_H
+#define AVCODEC_ARM_DCA_H
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavcodec/mathops.h"
+
+#if HAVE_ARMV6_INLINE && AV_GCC_VERSION_AT_LEAST(4,4) && !CONFIG_THUMB
+
+#define decode_blockcodes decode_blockcodes
+static inline int decode_blockcodes(int code1, int code2, int levels,
+ int *values)
+{
+ int v0, v1, v2, v3, v4, v5;
+
+ __asm__ ("smmul %8, %14, %18 \n"
+ "smmul %11, %15, %18 \n"
+ "smlabb %14, %8, %17, %14 \n"
+ "smlabb %15, %11, %17, %15 \n"
+ "smmul %9, %8, %18 \n"
+ "smmul %12, %11, %18 \n"
+ "sub %14, %14, %16, lsr #1 \n"
+ "sub %15, %15, %16, lsr #1 \n"
+ "smlabb %8, %9, %17, %8 \n"
+ "smlabb %11, %12, %17, %11 \n"
+ "smmul %10, %9, %18 \n"
+ "smmul %13, %12, %18 \n"
+ "str %14, %0 \n"
+ "str %15, %4 \n"
+ "sub %8, %8, %16, lsr #1 \n"
+ "sub %11, %11, %16, lsr #1 \n"
+ "smlabb %9, %10, %17, %9 \n"
+ "smlabb %12, %13, %17, %12 \n"
+ "smmul %14, %10, %18 \n"
+ "smmul %15, %13, %18 \n"
+ "str %8, %1 \n"
+ "str %11, %5 \n"
+ "sub %9, %9, %16, lsr #1 \n"
+ "sub %12, %12, %16, lsr #1 \n"
+ "smlabb %10, %14, %17, %10 \n"
+ "smlabb %13, %15, %17, %13 \n"
+ "str %9, %2 \n"
+ "str %12, %6 \n"
+ "sub %10, %10, %16, lsr #1 \n"
+ "sub %13, %13, %16, lsr #1 \n"
+ "str %10, %3 \n"
+ "str %13, %7 \n"
+ : "=m"(values[0]), "=m"(values[1]),
+ "=m"(values[2]), "=m"(values[3]),
+ "=m"(values[4]), "=m"(values[5]),
+ "=m"(values[6]), "=m"(values[7]),
+ "=&r"(v0), "=&r"(v1), "=&r"(v2),
+ "=&r"(v3), "=&r"(v4), "=&r"(v5),
+ "+&r"(code1), "+&r"(code2)
+ : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+
+ return code1 | code2;
+}
+
+#endif
+
+#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
+
+#define int8x8_fmul_int32 int8x8_fmul_int32
+static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
+{
+ __asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
+ "vld1.8 {d0}, [%1,:64] \n"
+ "vmovl.s8 q0, d0 \n"
+ "vmovl.s16 q1, d1 \n"
+ "vmovl.s16 q0, d0 \n"
+ "vcvt.f32.s32 q0, q0 \n"
+ "vcvt.f32.s32 q1, q1 \n"
+ "vmul.f32 q0, q0, %y2 \n"
+ "vmul.f32 q1, q1, %y2 \n"
+ "vst1.32 {q0-q1}, [%m0,:128] \n"
+ : "=Um"(*(float (*)[8])dst)
+ : "r"(src), "x"(scale)
+ : "d0", "d1", "d2", "d3");
+}
+
+#endif
+
+#endif /* AVCODEC_ARM_DCA_H */
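In int8x8_fmul_int32 above, the first vcvt converts the integer scale with 4 fractional bits, so the effective multiplier is scale / 16. A scalar sketch inferred from the asm:

#include <stdint.h>

static void int8x8_fmul_int32_sketch(float *dst, const int8_t *src, int scale)
{
    float fscale = scale / 16.0f;      /* vcvt.f32.s32 %2, %2, #4       */
    for (int i = 0; i < 8; i++)        /* vmovl.s8 / vmovl.s16 widening */
        dst[i] = src[i] * fscale;
}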
diff --git a/ffmpeg/libavcodec/arm/dcadsp_init_arm.c b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
new file mode 100644
index 0000000..56568e0
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dcadsp_init_arm.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/dcadsp.h"
+
+void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
+ int decifactor, float scale);
+
+av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ s->lfe_fir = ff_dca_lfe_fir_neon;
+}
diff --git a/ffmpeg/libavcodec/arm/dcadsp_neon.S b/ffmpeg/libavcodec/arm/dcadsp_neon.S
new file mode 100644
index 0000000..6a6c77a
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dcadsp_neon.S
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_dca_lfe_fir_neon, export=1
+ push {r4-r6,lr}
+
+ add r4, r0, r3, lsl #2 @ out2
+ add r5, r2, #256*4-16 @ cf1
+ sub r1, r1, #12
+ cmp r3, #32
+ ite eq
+ moveq r6, #256/32
+ movne r6, #256/64
+NOVFP vldr s0, [sp, #16] @ scale
+ mov lr, #-16
+1:
+ vmov.f32 q2, #0.0 @ v0
+ vmov.f32 q3, #0.0 @ v1
+ mov r12, r6
+2:
+ vld1.32 {q8}, [r2,:128]! @ cf0
+ vld1.32 {q9}, [r5,:128], lr @ cf1
+ vld1.32 {q1}, [r1], lr @ in
+ subs r12, r12, #4
+ vrev64.32 q10, q8
+ vmla.f32 q3, q1, q9
+ vmla.f32 d4, d2, d21
+ vmla.f32 d5, d3, d20
+ bne 2b
+
+ add r1, r1, r6, lsl #2
+ subs r3, r3, #1
+ vadd.f32 d4, d4, d5
+ vadd.f32 d6, d6, d7
+ vpadd.f32 d4, d4, d6
+ vmul.f32 d5, d4, d0[0]
+ vst1.32 {d5[0]}, [r0,:32]!
+ vst1.32 {d5[1]}, [r4,:32]!
+ bne 1b
+
+ pop {r4-r6,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.S b/ffmpeg/libavcodec/arm/dsputil_arm.S
new file mode 100644
index 0000000..586a833
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_arm.S
@@ -0,0 +1,125 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+#if !HAVE_ARMV5TE_EXTERNAL
+#define pld @
+#endif
+
+ .align 5
+@ void ff_add_pixels_clamped_arm(int16_t *block, uint8_t *dest, int stride)
+function ff_add_pixels_clamped_arm, export=1
+ push {r4-r10}
+ mov r10, #8
+1:
+ ldr r4, [r1] /* load dest */
+ /* block[0] and block[1]*/
+ ldrsh r5, [r0]
+ ldrsh r7, [r0, #2]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r6, r5
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ it ne
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ it ne
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #4] /* moved form [A] */
+ orr r9, r9, r8, lsl #8
+ /* block[2] and block[3] */
+ /* [A] */
+ ldrsh r7, [r0, #6]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ it ne
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ it ne
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ ldr r4, [r1, #4] /* moved form [B] */
+ orr r9, r9, r8, lsl #24
+ /* store dest */
+ ldrsh r5, [r0, #8] /* moved form [C] */
+ str r9, [r1]
+
+ /* load dest */
+ /* [B] */
+ /* block[4] and block[5] */
+ /* [C] */
+ ldrsh r7, [r0, #10]
+ and r6, r4, #0xFF
+ and r8, r4, #0xFF00
+ add r6, r6, r5
+ add r8, r7, r8, lsr #8
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ it ne
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ it ne
+ movne r8, r7, lsr #24
+ mov r9, r6
+ ldrsh r5, [r0, #12] /* moved from [D] */
+ orr r9, r9, r8, lsl #8
+ /* block[6] and block[7] */
+ /* [D] */
+ ldrsh r7, [r0, #14]
+ and r6, r4, #0xFF0000
+ and r8, r4, #0xFF000000
+ add r6, r5, r6, lsr #16
+ add r8, r7, r8, lsr #24
+ mvn r5, r5
+ mvn r7, r7
+ tst r6, #0x100
+ it ne
+ movne r6, r5, lsr #24
+ tst r8, #0x100
+ it ne
+ movne r8, r7, lsr #24
+ orr r9, r9, r6, lsl #16
+ add r0, r0, #16 /* moved from [E] */
+ orr r9, r9, r8, lsl #24
+ subs r10, r10, #1 /* moved from [F] */
+ /* store dest */
+ str r9, [r1, #4]
+
+ /* [E] */
+ /* [F] */
+ add r1, r1, r2
+ bne 1b
+
+ pop {r4-r10}
+ bx lr
+endfunc
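The ARMv4 routine above processes an 8x8 block four pixels per 32-bit word, adding 16-bit residuals to the destination bytes and clamping to 0..255. A scalar sketch of the same operation (an illustration, not FFmpeg's C version):

#include <stdint.h>

static void add_pixels_clamped_sketch(const int16_t *block, uint8_t *dest,
                                      int line_size)
{
    for (int y = 0; y < 8; y++) {
        for (int x = 0; x < 8; x++) {
            int v = dest[x] + block[y * 8 + x];
            if (v < 0)   v = 0;        /* the tst #0x100 / movne paths above */
            if (v > 255) v = 255;
            dest[x] = v;
        }
        dest += line_size;             /* "add r1, r1, r2" per row */
    }
}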
diff --git a/ffmpeg/libavcodec/arm/dsputil_arm.h b/ffmpeg/libavcodec/arm/dsputil_arm.h
new file mode 100644
index 0000000..b7b5bdc
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_arm.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_DSPUTIL_H
+#define AVCODEC_ARM_DSPUTIL_H
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/dsputil.h"
+
+void ff_dsputil_init_armv5te(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_armv6(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_vfp(DSPContext* c, AVCodecContext *avctx);
+void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx);
+
+#endif /* AVCODEC_ARM_DSPUTIL_H */
diff --git a/ffmpeg/libavcodec/arm/dsputil_armv6.S b/ffmpeg/libavcodec/arm/dsputil_armv6.S
new file mode 100644
index 0000000..6ec238b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_armv6.S
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_add_pixels_clamped_armv6, export=1
+ push {r4-r8,lr}
+ mov r3, #8
+1:
+ ldm r0!, {r4,r5,r12,lr}
+ ldrd r6, r7, [r1]
+ pkhbt r8, r4, r5, lsl #16
+ pkhtb r5, r5, r4, asr #16
+ pkhbt r4, r12, lr, lsl #16
+ pkhtb lr, lr, r12, asr #16
+ pld [r1, r2]
+ uxtab16 r8, r8, r6
+ uxtab16 r5, r5, r6, ror #8
+ uxtab16 r4, r4, r7
+ uxtab16 lr, lr, r7, ror #8
+ usat16 r8, #8, r8
+ usat16 r5, #8, r5
+ usat16 r4, #8, r4
+ usat16 lr, #8, lr
+ orr r6, r8, r5, lsl #8
+ orr r7, r4, lr, lsl #8
+ subs r3, r3, #1
+ strd_post r6, r7, r1, r2
+ bgt 1b
+ pop {r4-r8,pc}
+endfunc
+
+function ff_get_pixels_armv6, export=1
+ pld [r1, r2]
+ push {r4-r8, lr}
+ mov lr, #8
+1:
+ ldrd_post r4, r5, r1, r2
+ subs lr, lr, #1
+ uxtb16 r6, r4
+ uxtb16 r4, r4, ror #8
+ uxtb16 r12, r5
+ uxtb16 r8, r5, ror #8
+ pld [r1, r2]
+ pkhbt r5, r6, r4, lsl #16
+ pkhtb r6, r4, r6, asr #16
+ pkhbt r7, r12, r8, lsl #16
+ pkhtb r12, r8, r12, asr #16
+ stm r0!, {r5,r6,r7,r12}
+ bgt 1b
+
+ pop {r4-r8, pc}
+endfunc
+
+function ff_diff_pixels_armv6, export=1
+ pld [r1, r3]
+ pld [r2, r3]
+ push {r4-r9, lr}
+ mov lr, #8
+1:
+ ldrd_post r4, r5, r1, r3
+ ldrd_post r6, r7, r2, r3
+ uxtb16 r8, r4
+ uxtb16 r4, r4, ror #8
+ uxtb16 r9, r6
+ uxtb16 r6, r6, ror #8
+ pld [r1, r3]
+ ssub16 r9, r8, r9
+ ssub16 r6, r4, r6
+ uxtb16 r8, r5
+ uxtb16 r5, r5, ror #8
+ pld [r2, r3]
+ pkhbt r4, r9, r6, lsl #16
+ pkhtb r6, r6, r9, asr #16
+ uxtb16 r9, r7
+ uxtb16 r7, r7, ror #8
+ ssub16 r9, r8, r9
+ ssub16 r5, r5, r7
+ subs lr, lr, #1
+ pkhbt r8, r9, r5, lsl #16
+ pkhtb r9, r5, r9, asr #16
+ stm r0!, {r4,r6,r8,r9}
+ bgt 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_pix_abs16_armv6, export=1
+ ldr r0, [sp]
+ push {r4-r9, lr}
+ mov r12, #0
+ mov lr, #0
+ ldm r1, {r4-r7}
+ ldr r8, [r2]
+1:
+ ldr r9, [r2, #4]
+ pld [r1, r3]
+ usada8 r12, r4, r8, r12
+ ldr r8, [r2, #8]
+ pld [r2, r3]
+ usada8 lr, r5, r9, lr
+ ldr r9, [r2, #12]
+ usada8 r12, r6, r8, r12
+ subs r0, r0, #1
+ usada8 lr, r7, r9, lr
+ beq 2f
+ add r1, r1, r3
+ ldm r1, {r4-r7}
+ add r2, r2, r3
+ ldr r8, [r2]
+ b 1b
+2:
+ add r0, r12, lr
+ pop {r4-r9, pc}
+endfunc
+
+function ff_pix_abs16_x2_armv6, export=1
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ mov r0, #0
+ mov lr, #1
+ orr lr, lr, lr, lsl #8
+ orr lr, lr, lr, lsl #16
+1:
+ ldr r8, [r2]
+ ldr r9, [r2, #4]
+ lsr r10, r8, #8
+ ldr r4, [r1]
+ lsr r6, r9, #8
+ orr r10, r10, r9, lsl #24
+ ldr r5, [r2, #8]
+ eor r11, r8, r10
+ uhadd8 r7, r8, r10
+ orr r6, r6, r5, lsl #24
+ and r11, r11, lr
+ uadd8 r7, r7, r11
+ ldr r8, [r1, #4]
+ usada8 r0, r4, r7, r0
+ eor r7, r9, r6
+ lsr r10, r5, #8
+ and r7, r7, lr
+ uhadd8 r4, r9, r6
+ ldr r6, [r2, #12]
+ uadd8 r4, r4, r7
+ pld [r1, r3]
+ orr r10, r10, r6, lsl #24
+ usada8 r0, r8, r4, r0
+ ldr r4, [r1, #8]
+ eor r11, r5, r10
+ ldrb r7, [r2, #16]
+ and r11, r11, lr
+ uhadd8 r8, r5, r10
+ ldr r5, [r1, #12]
+ uadd8 r8, r8, r11
+ pld [r2, r3]
+ lsr r10, r6, #8
+ usada8 r0, r4, r8, r0
+ orr r10, r10, r7, lsl #24
+ subs r12, r12, #1
+ eor r11, r6, r10
+ add r1, r1, r3
+ uhadd8 r9, r6, r10
+ and r11, r11, lr
+ uadd8 r9, r9, r11
+ add r2, r2, r3
+ usada8 r0, r5, r9, r0
+ bgt 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+.macro usad_y2 p0, p1, p2, p3, n0, n1, n2, n3
+ ldr \n0, [r2]
+ eor \n1, \p0, \n0
+ uhadd8 \p0, \p0, \n0
+ and \n1, \n1, lr
+ ldr \n2, [r1]
+ uadd8 \p0, \p0, \n1
+ ldr \n1, [r2, #4]
+ usada8 r0, \p0, \n2, r0
+ pld [r1, r3]
+ eor \n3, \p1, \n1
+ uhadd8 \p1, \p1, \n1
+ and \n3, \n3, lr
+ ldr \p0, [r1, #4]
+ uadd8 \p1, \p1, \n3
+ ldr \n2, [r2, #8]
+ usada8 r0, \p1, \p0, r0
+ pld [r2, r3]
+ eor \p0, \p2, \n2
+ uhadd8 \p2, \p2, \n2
+ and \p0, \p0, lr
+ ldr \p1, [r1, #8]
+ uadd8 \p2, \p2, \p0
+ ldr \n3, [r2, #12]
+ usada8 r0, \p2, \p1, r0
+ eor \p1, \p3, \n3
+ uhadd8 \p3, \p3, \n3
+ and \p1, \p1, lr
+ ldr \p0, [r1, #12]
+ uadd8 \p3, \p3, \p1
+ add r1, r1, r3
+ usada8 r0, \p3, \p0, r0
+ add r2, r2, r3
+.endm
+
+function ff_pix_abs16_y2_armv6, export=1
+ pld [r1]
+ pld [r2]
+ ldr r12, [sp]
+ push {r4-r11, lr}
+ mov r0, #0
+ mov lr, #1
+ orr lr, lr, lr, lsl #8
+ orr lr, lr, lr, lsl #16
+ ldr r4, [r2]
+ ldr r5, [r2, #4]
+ ldr r6, [r2, #8]
+ ldr r7, [r2, #12]
+ add r2, r2, r3
+1:
+ usad_y2 r4, r5, r6, r7, r8, r9, r10, r11
+ subs r12, r12, #2
+ usad_y2 r8, r9, r10, r11, r4, r5, r6, r7
+ bgt 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+function ff_pix_abs8_armv6, export=1
+ pld [r2, r3]
+ ldr r12, [sp]
+ push {r4-r9, lr}
+ mov r0, #0
+ mov lr, #0
+ ldrd_post r4, r5, r1, r3
+1:
+ subs r12, r12, #2
+ ldr r7, [r2, #4]
+ ldr_post r6, r2, r3
+ ldrd_post r8, r9, r1, r3
+ usada8 r0, r4, r6, r0
+ pld [r2, r3]
+ usada8 lr, r5, r7, lr
+ ldr r7, [r2, #4]
+ ldr_post r6, r2, r3
+ beq 2f
+ ldrd_post r4, r5, r1, r3
+ usada8 r0, r8, r6, r0
+ pld [r2, r3]
+ usada8 lr, r9, r7, lr
+ b 1b
+2:
+ usada8 r0, r8, r6, r0
+ usada8 lr, r9, r7, lr
+ add r0, r0, lr
+ pop {r4-r9, pc}
+endfunc
+
+function ff_sse16_armv6, export=1
+ ldr r12, [sp]
+ push {r4-r9, lr}
+ mov r0, #0
+1:
+ ldrd r4, r5, [r1]
+ ldr r8, [r2]
+ uxtb16 lr, r4
+ uxtb16 r4, r4, ror #8
+ uxtb16 r9, r8
+ uxtb16 r8, r8, ror #8
+ ldr r7, [r2, #4]
+ usub16 lr, lr, r9
+ usub16 r4, r4, r8
+ smlad r0, lr, lr, r0
+ uxtb16 r6, r5
+ uxtb16 lr, r5, ror #8
+ uxtb16 r8, r7
+ uxtb16 r9, r7, ror #8
+ smlad r0, r4, r4, r0
+ ldrd r4, r5, [r1, #8]
+ usub16 r6, r6, r8
+ usub16 r8, lr, r9
+ ldr r7, [r2, #8]
+ smlad r0, r6, r6, r0
+ uxtb16 lr, r4
+ uxtb16 r4, r4, ror #8
+ uxtb16 r9, r7
+ uxtb16 r7, r7, ror #8
+ smlad r0, r8, r8, r0
+ ldr r8, [r2, #12]
+ usub16 lr, lr, r9
+ usub16 r4, r4, r7
+ smlad r0, lr, lr, r0
+ uxtb16 r6, r5
+ uxtb16 r5, r5, ror #8
+ uxtb16 r9, r8
+ uxtb16 r8, r8, ror #8
+ smlad r0, r4, r4, r0
+ usub16 r6, r6, r9
+ usub16 r5, r5, r8
+ smlad r0, r6, r6, r0
+ add r1, r1, r3
+ add r2, r2, r3
+ subs r12, r12, #1
+ smlad r0, r5, r5, r0
+ bgt 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_pix_norm1_armv6, export=1
+ push {r4-r6, lr}
+ mov r12, #16
+ mov lr, #0
+1:
+ ldm r0, {r2-r5}
+ uxtb16 r6, r2
+ uxtb16 r2, r2, ror #8
+ smlad lr, r6, r6, lr
+ uxtb16 r6, r3
+ smlad lr, r2, r2, lr
+ uxtb16 r3, r3, ror #8
+ smlad lr, r6, r6, lr
+ uxtb16 r6, r4
+ smlad lr, r3, r3, lr
+ uxtb16 r4, r4, ror #8
+ smlad lr, r6, r6, lr
+ uxtb16 r6, r5
+ smlad lr, r4, r4, lr
+ uxtb16 r5, r5, ror #8
+ smlad lr, r6, r6, lr
+ subs r12, r12, #1
+ add r0, r0, r1
+ smlad lr, r5, r5, lr
+ bgt 1b
+
+ mov r0, lr
+ pop {r4-r6, pc}
+endfunc
+
+function ff_pix_sum_armv6, export=1
+ push {r4-r7, lr}
+ mov r12, #16
+ mov r2, #0
+ mov r3, #0
+ mov lr, #0
+ ldr r4, [r0]
+1:
+ subs r12, r12, #1
+ ldr r5, [r0, #4]
+ usada8 r2, r4, lr, r2
+ ldr r6, [r0, #8]
+ usada8 r3, r5, lr, r3
+ ldr r7, [r0, #12]
+ usada8 r2, r6, lr, r2
+ beq 2f
+ ldr_pre r4, r0, r1
+ usada8 r3, r7, lr, r3
+ bgt 1b
+2:
+ usada8 r3, r7, lr, r3
+ add r0, r2, r3
+ pop {r4-r7, pc}
+endfunc
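Most of this file is built around usada8, which accumulates four absolute byte differences per instruction. As a point of reference, ff_pix_abs16_armv6 computes a plain sum of absolute differences over a 16-pixel-wide block of height h; a scalar sketch follows (illustration only; the unused context pointer is kept to mirror the declared prototype):

#include <stdint.h>
#include <stdlib.h>

static int pix_abs16_sketch(void *s, const uint8_t *blk1, const uint8_t *blk2,
                            int line_size, int h)
{
    int sum = 0;
    (void)s;                            /* unused, as in the asm version  */
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++)    /* four usada8 per row in the asm */
            sum += abs(blk1[x] - blk2[x]);
        blk1 += line_size;
        blk2 += line_size;
    }
    return sum;
}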
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_arm.c b/ffmpeg/libavcodec/arm/dsputil_init_arm.c
new file mode 100644
index 0000000..68991fa
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_init_arm.c
@@ -0,0 +1,86 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "dsputil_arm.h"
+
+void ff_j_rev_dct_arm(int16_t *data);
+void ff_simple_idct_arm(int16_t *data);
+
+/* XXX: local hack */
+static void (*ff_put_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+static void (*ff_add_pixels_clamped)(const int16_t *block, uint8_t *pixels, int line_size);
+
+void ff_add_pixels_clamped_arm(const int16_t *block, uint8_t *dest,
+ int line_size);
+
+/* XXX: those functions should be suppressed ASAP when all IDCTs are
+ converted */
+static void j_rev_dct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_j_rev_dct_arm (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void j_rev_dct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_j_rev_dct_arm (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_arm_put(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_simple_idct_arm (block);
+ ff_put_pixels_clamped(block, dest, line_size);
+}
+static void simple_idct_arm_add(uint8_t *dest, int line_size, int16_t *block)
+{
+ ff_simple_idct_arm (block);
+ ff_add_pixels_clamped(block, dest, line_size);
+}
+
+av_cold void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ ff_put_pixels_clamped = c->put_pixels_clamped;
+ ff_add_pixels_clamped = c->add_pixels_clamped;
+
+ if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
+ if(avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_ARM){
+ c->idct_put = j_rev_dct_arm_put;
+ c->idct_add = j_rev_dct_arm_add;
+ c->idct = ff_j_rev_dct_arm;
+ c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+ } else if (avctx->idct_algo == FF_IDCT_SIMPLEARM){
+ c->idct_put = simple_idct_arm_put;
+ c->idct_add = simple_idct_arm_add;
+ c->idct = ff_simple_idct_arm;
+ c->idct_permutation_type = FF_NO_IDCT_PERM;
+ }
+ }
+
+ c->add_pixels_clamped = ff_add_pixels_clamped_arm;
+
+ if (have_armv5te(cpu_flags)) ff_dsputil_init_armv5te(c, avctx);
+ if (have_armv6(cpu_flags)) ff_dsputil_init_armv6(c, avctx);
+ if (have_neon(cpu_flags)) ff_dsputil_init_neon(c, avctx);
+}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
new file mode 100644
index 0000000..841fbfa
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_init_armv5te.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_arm.h"
+
+void ff_simple_idct_armv5te(int16_t *data);
+void ff_simple_idct_put_armv5te(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_armv5te(uint8_t *dest, int line_size, int16_t *data);
+
+av_cold void ff_dsputil_init_armv5te(DSPContext *c, AVCodecContext *avctx)
+{
+ if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEARMV5TE)) {
+ c->idct_put = ff_simple_idct_put_armv5te;
+ c->idct_add = ff_simple_idct_add_armv5te;
+ c->idct = ff_simple_idct_armv5te;
+ c->idct_permutation_type = FF_NO_IDCT_PERM;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_armv6.c b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
new file mode 100644
index 0000000..8f38302
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_init_armv6.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "dsputil_arm.h"
+
+void ff_simple_idct_armv6(int16_t *data);
+void ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data);
+
+void ff_add_pixels_clamped_armv6(const int16_t *block,
+ uint8_t *restrict pixels,
+ int line_size);
+
+void ff_get_pixels_armv6(int16_t *block, const uint8_t *pixels, int stride);
+void ff_diff_pixels_armv6(int16_t *block, const uint8_t *s1,
+ const uint8_t *s2, int stride);
+
+int ff_pix_abs16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+ int line_size, int h);
+int ff_pix_abs16_x2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+ int line_size, int h);
+int ff_pix_abs16_y2_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+ int line_size, int h);
+
+int ff_pix_abs8_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+ int line_size, int h);
+
+int ff_sse16_armv6(void *s, uint8_t *blk1, uint8_t *blk2,
+ int line_size, int h);
+
+int ff_pix_norm1_armv6(uint8_t *pix, int line_size);
+int ff_pix_sum_armv6(uint8_t *pix, int line_size);
+
+av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx)
+{
+ const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+
+ if (!avctx->lowres && avctx->bits_per_raw_sample <= 8 &&
+ (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLEARMV6)) {
+ c->idct_put = ff_simple_idct_put_armv6;
+ c->idct_add = ff_simple_idct_add_armv6;
+ c->idct = ff_simple_idct_armv6;
+ c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
+ }
+
+ if (!high_bit_depth)
+ c->get_pixels = ff_get_pixels_armv6;
+ c->add_pixels_clamped = ff_add_pixels_clamped_armv6;
+ c->diff_pixels = ff_diff_pixels_armv6;
+
+ c->pix_abs[0][0] = ff_pix_abs16_armv6;
+ c->pix_abs[0][1] = ff_pix_abs16_x2_armv6;
+ c->pix_abs[0][2] = ff_pix_abs16_y2_armv6;
+
+ c->pix_abs[1][0] = ff_pix_abs8_armv6;
+
+ c->sad[0] = ff_pix_abs16_armv6;
+ c->sad[1] = ff_pix_abs8_armv6;
+
+ c->sse[0] = ff_sse16_armv6;
+
+ c->pix_norm1 = ff_pix_norm1_armv6;
+ c->pix_sum = ff_pix_sum_armv6;
+}
diff --git a/ffmpeg/libavcodec/arm/dsputil_init_neon.c b/ffmpeg/libavcodec/arm/dsputil_init_neon.c
new file mode 100644
index 0000000..6d19af7
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_init_neon.c
@@ -0,0 +1,81 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "dsputil_arm.h"
+
+void ff_simple_idct_neon(int16_t *data);
+void ff_simple_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_simple_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+
+void ff_clear_block_neon(int16_t *block);
+void ff_clear_blocks_neon(int16_t *blocks);
+
+void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, int);
+
+void ff_vector_clipf_neon(float *dst, const float *src, float min, float max,
+ int len);
+void ff_vector_clip_int32_neon(int32_t *dst, const int32_t *src, int32_t min,
+ int32_t max, unsigned int len);
+
+int32_t ff_scalarproduct_int16_neon(const int16_t *v1, const int16_t *v2, int len);
+int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
+ const int16_t *v3, int len, int mul);
+
+void ff_apply_window_int16_neon(int16_t *dst, const int16_t *src,
+ const int16_t *window, unsigned n);
+
+av_cold void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
+{
+ const int high_bit_depth = avctx->bits_per_raw_sample > 8;
+
+ if (!avctx->lowres && avctx->bits_per_raw_sample <= 8) {
+ if (avctx->idct_algo == FF_IDCT_AUTO ||
+ avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+ c->idct_put = ff_simple_idct_put_neon;
+ c->idct_add = ff_simple_idct_add_neon;
+ c->idct = ff_simple_idct_neon;
+ c->idct_permutation_type = FF_PARTTRANS_IDCT_PERM;
+ }
+ }
+
+ if (!high_bit_depth) {
+ c->clear_block = ff_clear_block_neon;
+ c->clear_blocks = ff_clear_blocks_neon;
+ }
+
+ c->add_pixels_clamped = ff_add_pixels_clamped_neon;
+ c->put_pixels_clamped = ff_put_pixels_clamped_neon;
+ c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+
+ c->vector_clipf = ff_vector_clipf_neon;
+ c->vector_clip_int32 = ff_vector_clip_int32_neon;
+
+ c->scalarproduct_int16 = ff_scalarproduct_int16_neon;
+ c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+
+ c->apply_window_int16 = ff_apply_window_int16_neon;
+}
diff --git a/ffmpeg/libavcodec/arm/dsputil_neon.S b/ffmpeg/libavcodec/arm/dsputil_neon.S
new file mode 100644
index 0000000..307e122
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/dsputil_neon.S
@@ -0,0 +1,209 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_clear_block_neon, export=1
+ vmov.i16 q0, #0
+ .rept 8
+ vst1.16 {q0}, [r0,:128]!
+ .endr
+ bx lr
+endfunc
+
+function ff_clear_blocks_neon, export=1
+ vmov.i16 q0, #0
+ .rept 8*6
+ vst1.16 {q0}, [r0,:128]!
+ .endr
+ bx lr
+endfunc
+
+function ff_put_pixels_clamped_neon, export=1
+ vld1.16 {d16-d19}, [r0,:128]!
+ vqmovun.s16 d0, q8
+ vld1.16 {d20-d23}, [r0,:128]!
+ vqmovun.s16 d1, q9
+ vld1.16 {d24-d27}, [r0,:128]!
+ vqmovun.s16 d2, q10
+ vld1.16 {d28-d31}, [r0,:128]!
+ vqmovun.s16 d3, q11
+ vst1.8 {d0}, [r1,:64], r2
+ vqmovun.s16 d4, q12
+ vst1.8 {d1}, [r1,:64], r2
+ vqmovun.s16 d5, q13
+ vst1.8 {d2}, [r1,:64], r2
+ vqmovun.s16 d6, q14
+ vst1.8 {d3}, [r1,:64], r2
+ vqmovun.s16 d7, q15
+ vst1.8 {d4}, [r1,:64], r2
+ vst1.8 {d5}, [r1,:64], r2
+ vst1.8 {d6}, [r1,:64], r2
+ vst1.8 {d7}, [r1,:64], r2
+ bx lr
+endfunc
+
+function ff_put_signed_pixels_clamped_neon, export=1
+ vmov.u8 d31, #128
+ vld1.16 {d16-d17}, [r0,:128]!
+ vqmovn.s16 d0, q8
+ vld1.16 {d18-d19}, [r0,:128]!
+ vqmovn.s16 d1, q9
+ vld1.16 {d16-d17}, [r0,:128]!
+ vqmovn.s16 d2, q8
+ vld1.16 {d18-d19}, [r0,:128]!
+ vadd.u8 d0, d0, d31
+ vld1.16 {d20-d21}, [r0,:128]!
+ vadd.u8 d1, d1, d31
+ vld1.16 {d22-d23}, [r0,:128]!
+ vadd.u8 d2, d2, d31
+ vst1.8 {d0}, [r1,:64], r2
+ vqmovn.s16 d3, q9
+ vst1.8 {d1}, [r1,:64], r2
+ vqmovn.s16 d4, q10
+ vst1.8 {d2}, [r1,:64], r2
+ vqmovn.s16 d5, q11
+ vld1.16 {d24-d25}, [r0,:128]!
+ vadd.u8 d3, d3, d31
+ vld1.16 {d26-d27}, [r0,:128]!
+ vadd.u8 d4, d4, d31
+ vadd.u8 d5, d5, d31
+ vst1.8 {d3}, [r1,:64], r2
+ vqmovn.s16 d6, q12
+ vst1.8 {d4}, [r1,:64], r2
+ vqmovn.s16 d7, q13
+ vst1.8 {d5}, [r1,:64], r2
+ vadd.u8 d6, d6, d31
+ vadd.u8 d7, d7, d31
+ vst1.8 {d6}, [r1,:64], r2
+ vst1.8 {d7}, [r1,:64], r2
+ bx lr
+endfunc
+
+function ff_add_pixels_clamped_neon, export=1
+ mov r3, r1
+ vld1.8 {d16}, [r1,:64], r2
+ vld1.16 {d0-d1}, [r0,:128]!
+ vaddw.u8 q0, q0, d16
+ vld1.8 {d17}, [r1,:64], r2
+ vld1.16 {d2-d3}, [r0,:128]!
+ vqmovun.s16 d0, q0
+ vld1.8 {d18}, [r1,:64], r2
+ vaddw.u8 q1, q1, d17
+ vld1.16 {d4-d5}, [r0,:128]!
+ vaddw.u8 q2, q2, d18
+ vst1.8 {d0}, [r3,:64], r2
+ vqmovun.s16 d2, q1
+ vld1.8 {d19}, [r1,:64], r2
+ vld1.16 {d6-d7}, [r0,:128]!
+ vaddw.u8 q3, q3, d19
+ vqmovun.s16 d4, q2
+ vst1.8 {d2}, [r3,:64], r2
+ vld1.8 {d16}, [r1,:64], r2
+ vqmovun.s16 d6, q3
+ vld1.16 {d0-d1}, [r0,:128]!
+ vaddw.u8 q0, q0, d16
+ vst1.8 {d4}, [r3,:64], r2
+ vld1.8 {d17}, [r1,:64], r2
+ vld1.16 {d2-d3}, [r0,:128]!
+ vaddw.u8 q1, q1, d17
+ vst1.8 {d6}, [r3,:64], r2
+ vqmovun.s16 d0, q0
+ vld1.8 {d18}, [r1,:64], r2
+ vld1.16 {d4-d5}, [r0,:128]!
+ vaddw.u8 q2, q2, d18
+ vst1.8 {d0}, [r3,:64], r2
+ vqmovun.s16 d2, q1
+ vld1.8 {d19}, [r1,:64], r2
+ vqmovun.s16 d4, q2
+ vld1.16 {d6-d7}, [r0,:128]!
+ vaddw.u8 q3, q3, d19
+ vst1.8 {d2}, [r3,:64], r2
+ vqmovun.s16 d6, q3
+ vst1.8 {d4}, [r3,:64], r2
+ vst1.8 {d6}, [r3,:64], r2
+ bx lr
+endfunc
+
+function ff_vector_clipf_neon, export=1
+VFP vdup.32 q1, d0[1]
+VFP vdup.32 q0, d0[0]
+NOVFP vdup.32 q0, r2
+NOVFP vdup.32 q1, r3
+NOVFP ldr r2, [sp]
+ vld1.f32 {q2},[r1,:128]!
+ vmin.f32 q10, q2, q1
+ vld1.f32 {q3},[r1,:128]!
+ vmin.f32 q11, q3, q1
+1: vmax.f32 q8, q10, q0
+ vmax.f32 q9, q11, q0
+ subs r2, r2, #8
+ beq 2f
+ vld1.f32 {q2},[r1,:128]!
+ vmin.f32 q10, q2, q1
+ vld1.f32 {q3},[r1,:128]!
+ vmin.f32 q11, q3, q1
+ vst1.f32 {q8},[r0,:128]!
+ vst1.f32 {q9},[r0,:128]!
+ b 1b
+2: vst1.f32 {q8},[r0,:128]!
+ vst1.f32 {q9},[r0,:128]!
+ bx lr
+endfunc
+
+function ff_apply_window_int16_neon, export=1
+ push {r4,lr}
+ add r4, r1, r3, lsl #1
+ add lr, r0, r3, lsl #1
+ sub r4, r4, #16
+ sub lr, lr, #16
+ mov r12, #-16
+1:
+ vld1.16 {q0}, [r1,:128]!
+ vld1.16 {q2}, [r2,:128]!
+ vld1.16 {q1}, [r4,:128], r12
+ vrev64.16 q3, q2
+ vqrdmulh.s16 q0, q0, q2
+ vqrdmulh.s16 d2, d2, d7
+ vqrdmulh.s16 d3, d3, d6
+ vst1.16 {q0}, [r0,:128]!
+ vst1.16 {q1}, [lr,:128], r12
+ subs r3, r3, #16
+ bgt 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_vector_clip_int32_neon, export=1
+ vdup.32 q0, r2
+ vdup.32 q1, r3
+ ldr r2, [sp]
+1:
+ vld1.32 {q2-q3}, [r1,:128]!
+ vmin.s32 q2, q2, q1
+ vmin.s32 q3, q3, q1
+ vmax.s32 q2, q2, q0
+ vmax.s32 q3, q3, q0
+ vst1.32 {q2-q3}, [r0,:128]!
+ subs r2, r2, #8
+ bgt 1b
+ bx lr
+endfunc
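
For reference, the contract implemented by ff_vector_clip_int32_neon above can be stated in plain C as follows; this is an illustrative restatement, not the FFmpeg C fallback. Because the NEON loop consumes eight elements per iteration, len is assumed to be a multiple of eight.

#include <stdint.h>

/* Plain C restatement of the clipping loop above: clamp every 32-bit value
 * of src into [min, max]. The NEON version handles 8 values per iteration. */
static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
                                  int32_t min, int32_t max, unsigned int len)
{
    for (unsigned int i = 0; i < len; i++) {
        int32_t v = src[i];
        if (v < min) v = min;
        if (v > max) v = max;
        dst[i] = v;
    }
}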
diff --git a/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
new file mode 100644
index 0000000..ef098f4
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fft_fixed_init_arm.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+
+#define CONFIG_FFT_FLOAT 0
+#include "libavcodec/fft.h"
+
+void ff_fft_fixed_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_mdct_fixed_calc_neon(FFTContext *s, FFTSample *o, const FFTSample *i);
+void ff_mdct_fixed_calcw_neon(FFTContext *s, FFTDouble *o, const FFTSample *i);
+
+av_cold void ff_fft_fixed_init_arm(FFTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
+#if CONFIG_FFT
+ s->fft_calc = ff_fft_fixed_calc_neon;
+#endif
+
+#if CONFIG_MDCT
+ if (!s->inverse && s->nbits >= 3) {
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+ s->mdct_calc = ff_mdct_fixed_calc_neon;
+ s->mdct_calcw = ff_mdct_fixed_calcw_neon;
+ }
+#endif
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/fft_fixed_neon.S b/ffmpeg/libavcodec/arm/fft_fixed_neon.S
new file mode 100644
index 0000000..fa33eac
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fft_fixed_neon.S
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro bflies d0, d1, r0, r1
+ vrev64.32 \r0, \d1 @ t5, t6, t1, t2
+ vhsub.s16 \r1, \d1, \r0 @ t1-t5, t2-t6, t5-t1, t6-t2
+ vhadd.s16 \r0, \d1, \r0 @ t1+t5, t2+t6, t5+t1, t6+t2
+ vext.16 \r1, \r1, \r1, #1 @ t2-t6, t5-t1, t6-t2, t1-t5
+ vtrn.32 \r0, \r1 @ t1+t5, t2+t6, t2-t6, t5-t1
+ @ t5, t6, t4, t3
+ vhsub.s16 \d1, \d0, \r0
+ vhadd.s16 \d0, \d0, \r0
+.endm
+
+.macro transform01 q0, q1, d3, c0, c1, r0, w0, w1
+ vrev32.16 \r0, \d3
+ vmull.s16 \w0, \d3, \c0
+ vmlal.s16 \w0, \r0, \c1
+ vshrn.s32 \d3, \w0, #15
+ bflies \q0, \q1, \w0, \w1
+.endm
+
+.macro transform2 d0, d1, d2, d3, q0, q1, c0, c1, c2, c3, \
+ r0, r1, w0, w1
+ vrev32.16 \r0, \d1
+ vrev32.16 \r1, \d3
+ vmull.s16 \w0, \d1, \c0
+ vmlal.s16 \w0, \r0, \c1
+ vmull.s16 \w1, \d3, \c2
+ vmlal.s16 \w1, \r1, \c3
+ vshrn.s32 \d1, \w0, #15
+ vshrn.s32 \d3, \w1, #15
+ bflies \q0, \q1, \w0, \w1
+.endm
+
+.macro fft4 d0, d1, r0, r1
+ vhsub.s16 \r0, \d0, \d1 @ t3, t4, t8, t7
+ vhsub.s16 \r1, \d1, \d0
+ vhadd.s16 \d0, \d0, \d1 @ t1, t2, t6, t5
+ vmov.i64 \d1, #0xffff00000000
+ vbit \r0, \r1, \d1
+ vrev64.16 \r1, \r0 @ t7, t8, t4, t3
+ vtrn.32 \r0, \r1 @ t3, t4, t7, t8
+ vtrn.32 \d0, \r0 @ t1, t2, t3, t4, t6, t5, t8, t7
+ vhsub.s16 \d1, \d0, \r0 @ r2, i2, r3, i1
+ vhadd.s16 \d0, \d0, \r0 @ r0, i0, r1, i3
+.endm
+
+.macro fft8 d0, d1, d2, d3, q0, q1, c0, c1, r0, r1, w0, w1
+ fft4 \d0, \d1, \r0, \r1
+ vtrn.32 \d0, \d1 @ z0, z2, z1, z3
+ vhadd.s16 \r0, \d2, \d3 @ t1, t2, t3, t4
+ vhsub.s16 \d3, \d2, \d3 @ z5, z7
+ vmov \d2, \r0
+ transform01 \q0, \q1, \d3, \c0, \c1, \r0, \w0, \w1
+.endm
+
+function fft4_neon
+ vld1.16 {d0-d1}, [r0]
+ fft4 d0, d1, d2, d3
+ vst1.16 {d0-d1}, [r0]
+ bx lr
+endfunc
+
+function fft8_neon
+ vld1.16 {d0-d3}, [r0,:128]
+ movrel r1, coefs
+ vld1.16 {d30}, [r1,:64]
+ vdup.16 d31, d30[0]
+ fft8 d0, d1, d2, d3, q0, q1, d31, d30, d20, d21, q8, q9
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+ vst1.16 {d0-d3}, [r0,:128]
+ bx lr
+endfunc
+
+function fft16_neon
+ vld1.16 {d0-d3}, [r0,:128]!
+ vld1.16 {d4-d7}, [r0,:128]
+ movrel r1, coefs
+ sub r0, r0, #32
+ vld1.16 {d28-d31},[r1,:128]
+ vdup.16 d31, d28[0]
+ fft8 d0, d1, d2, d3, q0, q1, d31, d28, d20, d21, q8, q9
+ vswp d5, d6
+ fft4 q2, q3, q8, q9
+ vswp d5, d6
+ vtrn.32 q0, q1 @ z0, z4, z2, z6, z1, z5, z3, z7
+ vtrn.32 q2, q3 @ z8, z12,z10,z14,z9, z13,z11,z15
+ vswp d1, d2
+ vdup.16 d31, d28[0]
+ transform01 q0, q2, d5, d31, d28, d20, q8, q9
+ vdup.16 d26, d29[0]
+ vdup.16 d27, d30[0]
+ transform2 d2, d6, d3, d7, q1, q3, d26, d30, d27, d29, \
+ d20, d21, q8, q9
+ vtrn.32 q0, q1
+ vtrn.32 q2, q3
+ vst1.16 {d0-d3}, [r0,:128]!
+ vst1.16 {d4-d7}, [r0,:128]
+ bx lr
+endfunc
+
+function fft_pass_neon
+ push {r4,lr}
+ movrel lr, coefs+24
+ vld1.16 {d30}, [lr,:64]
+ lsl r12, r2, #3
+ vmov d31, d30
+ add r3, r1, r2, lsl #2
+ mov lr, #-8
+ sub r3, r3, #2
+ mov r4, r0
+ vld1.16 {d27[]}, [r3,:16]
+ sub r3, r3, #6
+ vld1.16 {q0}, [r4,:128], r12
+ vld1.16 {q1}, [r4,:128], r12
+ vld1.16 {q2}, [r4,:128], r12
+ vld1.16 {q3}, [r4,:128], r12
+ vld1.16 {d28}, [r1,:64]!
+ vld1.16 {d29}, [r3,:64], lr
+ vswp d1, d2
+ vswp d5, d6
+ vtrn.32 d0, d1
+ vtrn.32 d4, d5
+ vdup.16 d25, d28[1]
+ vmul.s16 d27, d27, d31
+ transform01 q0, q2, d5, d25, d27, d20, q8, q9
+ b 2f
+1:
+ mov r4, r0
+ vdup.16 d26, d29[0]
+ vld1.16 {q0}, [r4,:128], r12
+ vld1.16 {q1}, [r4,:128], r12
+ vld1.16 {q2}, [r4,:128], r12
+ vld1.16 {q3}, [r4,:128], r12
+ vld1.16 {d28}, [r1,:64]!
+ vld1.16 {d29}, [r3,:64], lr
+ vswp d1, d2
+ vswp d5, d6
+ vtrn.32 d0, d1
+ vtrn.32 d4, d5
+ vdup.16 d24, d28[0]
+ vdup.16 d25, d28[1]
+ vdup.16 d27, d29[3]
+ vmul.s16 q13, q13, q15
+ transform2 d0, d4, d1, d5, q0, q2, d24, d26, d25, d27, \
+ d16, d17, q9, q10
+2:
+ vtrn.32 d2, d3
+ vtrn.32 d6, d7
+ vdup.16 d24, d28[2]
+ vdup.16 d26, d29[2]
+ vdup.16 d25, d28[3]
+ vdup.16 d27, d29[1]
+ vmul.s16 q13, q13, q15
+ transform2 d2, d6, d3, d7, q1, q3, d24, d26, d25, d27, \
+ d16, d17, q9, q10
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+ vswp d1, d2
+ vswp d5, d6
+ mov r4, r0
+ vst1.16 {q0}, [r4,:128], r12
+ vst1.16 {q1}, [r4,:128], r12
+ vst1.16 {q2}, [r4,:128], r12
+ vst1.16 {q3}, [r4,:128], r12
+ add r0, r0, #16
+ subs r2, r2, #2
+ bgt 1b
+ pop {r4,pc}
+endfunc
+
+#define F_SQRT1_2 23170
+#define F_COS_16_1 30274
+#define F_COS_16_3 12540
+
+const coefs, align=4
+ .short F_SQRT1_2, -F_SQRT1_2, -F_SQRT1_2, F_SQRT1_2
+ .short F_COS_16_1,-F_COS_16_1,-F_COS_16_1, F_COS_16_1
+ .short F_COS_16_3,-F_COS_16_3,-F_COS_16_3, F_COS_16_3
+ .short 1, -1, -1, 1
+endconst
+
+.macro def_fft n, n2, n4
+function fft\n\()_neon
+ push {r4, lr}
+ mov r4, r0
+ bl fft\n2\()_neon
+ add r0, r4, #\n4*2*4
+ bl fft\n4\()_neon
+ add r0, r4, #\n4*3*4
+ bl fft\n4\()_neon
+ mov r0, r4
+ pop {r4, lr}
+ movrelx r1, X(ff_cos_\n\()_fixed)
+ mov r2, #\n4/2
+ b fft_pass_neon
+endfunc
+.endm
+
+ def_fft 32, 16, 8
+ def_fft 64, 32, 16
+ def_fft 128, 64, 32
+ def_fft 256, 128, 64
+ def_fft 512, 256, 128
+ def_fft 1024, 512, 256
+ def_fft 2048, 1024, 512
+ def_fft 4096, 2048, 1024
+ def_fft 8192, 4096, 2048
+ def_fft 16384, 8192, 4096
+ def_fft 32768, 16384, 8192
+ def_fft 65536, 32768, 16384
+
+function ff_fft_fixed_calc_neon, export=1
+ ldr r2, [r0]
+ sub r2, r2, #2
+ movrel r3, fft_fixed_tab_neon
+ ldr r3, [r3, r2, lsl #2]
+ mov r0, r1
+ bx r3
+endfunc
+
+const fft_fixed_tab_neon
+ .word fft4_neon
+ .word fft8_neon
+ .word fft16_neon
+ .word fft32_neon
+ .word fft64_neon
+ .word fft128_neon
+ .word fft256_neon
+ .word fft512_neon
+ .word fft1024_neon
+ .word fft2048_neon
+ .word fft4096_neon
+ .word fft8192_neon
+ .word fft16384_neon
+ .word fft32768_neon
+ .word fft65536_neon
+endconst
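
ff_fft_fixed_calc_neon above is only a dispatcher: it reads the transform size (nbits) from the start of the context, subtracts two, and tail-jumps through fft_fixed_tab_neon, whose first entry is the 4-point kernel. A rough C picture of that dispatch is sketched below; the struct layout and all names are assumptions made purely for illustration.

/* Illustrative sketch of the table dispatch performed above; fft4 .. fft65536
 * stand for the size-specialised kernels defined earlier in this file. */
typedef void (*fft_fn)(void *z);

struct fft_ctx { int nbits; /* other fields omitted */ };

static void fft4(void *z)  { (void)z; /* 4-point base case  */ }
static void fft8(void *z)  { (void)z; /* 8-point base case  */ }
static void fft16(void *z) { (void)z; /* 16-point base case */ }
/* ... entries continue up to the 65536-point kernel ... */

static const fft_fn fft_tab[] = { fft4, fft8, fft16 /* , ... */ };

static void fft_fixed_calc(const struct fft_ctx *s, void *z)
{
    fft_tab[s->nbits - 2](z);   /* nbits == 2 selects the 4-point kernel */
}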
diff --git a/ffmpeg/libavcodec/arm/fft_init_arm.c b/ffmpeg/libavcodec/arm/fft_init_arm.c
new file mode 100644
index 0000000..8c98abc
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fft_init_arm.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/fft.h"
+#include "libavcodec/rdft.h"
+#include "libavcodec/synth_filter.h"
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+
+void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
+
+av_cold void ff_fft_init_arm(FFTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+#if CONFIG_FFT
+ s->fft_permute = ff_fft_permute_neon;
+ s->fft_calc = ff_fft_calc_neon;
+#endif
+#if CONFIG_MDCT
+ s->imdct_calc = ff_imdct_calc_neon;
+ s->imdct_half = ff_imdct_half_neon;
+ s->mdct_calc = ff_mdct_calc_neon;
+ s->mdct_permutation = FF_MDCT_PERM_INTERLEAVE;
+#endif
+ }
+}
+
+#if CONFIG_RDFT
+av_cold void ff_rdft_init_arm(RDFTContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ s->rdft_calc = ff_rdft_calc_neon;
+}
+#endif
+
+#if CONFIG_DCA_DECODER
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_neon;
+}
+#endif
diff --git a/ffmpeg/libavcodec/arm/fft_neon.S b/ffmpeg/libavcodec/arm/fft_neon.S
new file mode 100644
index 0000000..8b9ae2a
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fft_neon.S
@@ -0,0 +1,375 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ *
+ * This algorithm (though not any of the implementation details) is
+ * based on libdjbfft by D. J. Bernstein.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+
+function fft4_neon
+ vld1.32 {d0-d3}, [r0,:128]
+
+ vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2
+ vsub.f32 d6, d0, d1 @ r0-r1,i0-i1
+ vsub.f32 d7, d16, d17 @ r3-r2,i2-i3
+ vadd.f32 d4, d0, d1 @ r0+r1,i0+i1
+ vadd.f32 d5, d2, d3 @ i2+i3,r2+r3
+ vadd.f32 d1, d6, d7
+ vsub.f32 d3, d6, d7
+ vadd.f32 d0, d4, d5
+ vsub.f32 d2, d4, d5
+
+ vst1.32 {d0-d3}, [r0,:128]
+
+ bx lr
+endfunc
+
+function fft8_neon
+ mov r1, r0
+ vld1.32 {d0-d3}, [r1,:128]!
+ vld1.32 {d16-d19}, [r1,:128]
+
+ movw r2, #0x04f3 @ sqrt(1/2)
+ movt r2, #0x3f35
+ eor r3, r2, #1<<31
+ vdup.32 d31, r2
+
+ vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2
+ vadd.f32 d4, d16, d17 @ r4+r5,i4+i5
+ vmov d28, r3, r2
+ vadd.f32 d5, d18, d19 @ r6+r7,i6+i7
+ vsub.f32 d17, d16, d17 @ r4-r5,i4-i5
+ vsub.f32 d19, d18, d19 @ r6-r7,i6-i7
+ vrev64.32 d29, d28
+ vadd.f32 d20, d0, d1 @ r0+r1,i0+i1
+ vadd.f32 d21, d2, d3 @ r2+r3,i2+i3
+ vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w
+ vext.32 q3, q2, q2, #1
+ vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w
+ vsub.f32 d23, d22, d23 @ i2-i3,r3-r2
+ vsub.f32 d22, d0, d1 @ r0-r1,i0-i1
+ vmul.f32 d24, d17, d31 @ a2r*w,a2i*w
+ vmul.f32 d25, d19, d31 @ a3r*w,a3i*w
+ vadd.f32 d0, d20, d21
+ vsub.f32 d2, d20, d21
+ vadd.f32 d1, d22, d23
+ vrev64.32 q13, q13
+ vsub.f32 d3, d22, d23
+ vsub.f32 d6, d6, d7
+ vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2
+ vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6
+ vadd.f32 d7, d4, d5
+ vsub.f32 d18, d2, d6
+ vext.32 q13, q12, q12, #1
+ vadd.f32 d2, d2, d6
+ vsub.f32 d16, d0, d7
+ vadd.f32 d5, d25, d24
+ vsub.f32 d4, d26, d27
+ vadd.f32 d0, d0, d7
+ vsub.f32 d17, d1, d5
+ vsub.f32 d19, d3, d4
+ vadd.f32 d3, d3, d4
+ vadd.f32 d1, d1, d5
+
+ vst1.32 {d16-d19}, [r1,:128]
+ vst1.32 {d0-d3}, [r0,:128]
+
+ bx lr
+endfunc
+
+function fft16_neon
+ movrel r1, mppm
+ vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
+ pld [r0, #32]
+ vld1.32 {d2-d3}, [r1,:128]
+ vext.32 q13, q9, q9, #1
+ vld1.32 {d22-d25}, [r0,:128]! @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
+ vadd.f32 d4, d16, d17
+ vsub.f32 d5, d16, d17
+ vadd.f32 d18, d18, d19
+ vsub.f32 d19, d26, d27
+
+ vadd.f32 d20, d22, d23
+ vsub.f32 d22, d22, d23
+ vsub.f32 d23, d24, d25
+ vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1}
+ vadd.f32 d21, d24, d25
+ vmul.f32 d24, d22, d2
+ vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3}
+ vmul.f32 d25, d23, d3
+ vuzp.32 d16, d17 @ {r0,r1,i0,i1}
+ vmul.f32 q1, q11, d2[1]
+ vuzp.32 d18, d19 @ {r2,r3,i2,i3}
+ vrev64.32 q12, q12
+ vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6}
+ vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
+ vzip.32 q10, q11
+ vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
+ vadd.f32 d0, d22, d20
+ vadd.f32 d1, d21, d23
+ vsub.f32 d2, d21, d23
+ vsub.f32 d3, d22, d20
+ sub r0, r0, #96
+ vext.32 q13, q13, q13, #1
+ vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5}
+ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
+ vext.32 q15, q15, q15, #1
+ vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7}
+ vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
+ vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3}
+ vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
+ vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6}
+ vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a}
+ movrelx r2, X(ff_cos_16)
+ vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8}
+ vrev64.32 d1, d1
+ vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a}
+ vrev64.32 d3, d3
+ movrel r3, pmmp
+ vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
+ vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
+ vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9}
+ vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13}
+ vld1.32 {d4-d5}, [r2,:64]
+ vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11}
+ vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15}
+ vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
+ vld1.32 {d6-d7}, [r3,:128]
+ vrev64.32 q1, q14
+ vmul.f32 q14, q14, d4[1]
+ vmul.f32 q1, q1, q3
+ vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a}
+ vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
+ vzip.32 q12, q14
+ vadd.f32 d0, d28, d24
+ vadd.f32 d1, d25, d29
+ vsub.f32 d2, d25, d29
+ vsub.f32 d3, d28, d24
+ vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9}
+ vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1}
+ vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13}
+ mov r1, #32
+ vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5}
+ vrev64.32 q0, q13
+ vmul.f32 q13, q13, d5[0]
+ vrev64.32 q1, q15
+ vmul.f32 q15, q15, d5[1]
+ vst2.32 {d16-d17},[r0,:128], r1
+ vmul.f32 q0, q0, q3
+ vst2.32 {d20-d21},[r0,:128], r1
+ vmul.f32 q1, q1, q3
+ vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6}
+ vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a}
+ vst2.32 {d24-d25},[r0,:128], r1
+ vst2.32 {d28-d29},[r0,:128]
+ vzip.32 q13, q15
+ sub r0, r0, #80
+ vadd.f32 d0, d30, d26
+ vadd.f32 d1, d27, d31
+ vsub.f32 d2, d27, d31
+ vsub.f32 d3, d30, d26
+ vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11}
+ vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3}
+ vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15}
+ vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7}
+ vst2.32 {d18-d19},[r0,:128], r1
+ vst2.32 {d22-d23},[r0,:128], r1
+ vst2.32 {d26-d27},[r0,:128], r1
+ vst2.32 {d30-d31},[r0,:128]
+ bx lr
+endfunc
+
+function fft_pass_neon
+ push {r4-r6,lr}
+ mov r6, r2 @ n
+ lsl r5, r2, #3 @ 2 * n * sizeof FFTSample
+ lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex
+ lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex
+ add r3, r2, r4
+ add r4, r4, r0 @ &z[o1]
+ add r2, r2, r0 @ &z[o2]
+ add r3, r3, r0 @ &z[o3]
+ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
+ movrel r12, pmmp
+ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
+ add r5, r5, r1 @ wim
+ vld1.32 {d6-d7}, [r12,:128] @ pmmp
+ vswp d21, d22
+ vld1.32 {d4}, [r1,:64]! @ {wre[0],wre[1]}
+ sub r5, r5, #4 @ wim--
+ vrev64.32 q1, q11
+ vmul.f32 q11, q11, d4[1]
+ vmul.f32 q1, q1, q3
+ vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1]
+ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
+ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
+ sub r6, r6, #1 @ n--
+ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
+ vzip.32 q10, q11
+ vadd.f32 d0, d22, d20
+ vadd.f32 d1, d21, d23
+ vsub.f32 d2, d21, d23
+ vsub.f32 d3, d22, d20
+ vsub.f32 q10, q8, q0
+ vadd.f32 q8, q8, q0
+ vsub.f32 q11, q9, q1
+ vadd.f32 q9, q9, q1
+ vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]}
+ vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]}
+ vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]}
+ vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]}
+ sub r5, r5, #8 @ wim -= 2
+1:
+ vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]}
+ vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]}
+ vswp d21, d22
+ vld1.32 {d4}, [r1]! @ {wre[0],wre[1]}
+ vrev64.32 q0, q10
+ vmul.f32 q10, q10, d4[0]
+ vrev64.32 q1, q11
+ vmul.f32 q11, q11, d4[1]
+ vld1.32 {d5}, [r5] @ {wim[-1],wim[0]}
+ vmul.f32 q0, q0, q3
+ sub r5, r5, #8 @ wim -= 2
+ vmul.f32 q1, q1, q3
+ vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6}
+ vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a}
+ vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]}
+ subs r6, r6, #1 @ n--
+ vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]}
+ vzip.32 q10, q11
+ vadd.f32 d0, d22, d20
+ vadd.f32 d1, d21, d23
+ vsub.f32 d2, d21, d23
+ vsub.f32 d3, d22, d20
+ vsub.f32 q10, q8, q0
+ vadd.f32 q8, q8, q0
+ vsub.f32 q11, q9, q1
+ vadd.f32 q9, q9, q1
+ vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]}
+ vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]}
+ vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]}
+ vst2.32 {d18-d19}, [r4,:128]! @ {z[o1],z[o1+1]}
+ bne 1b
+
+ pop {r4-r6,pc}
+endfunc
+
+.macro def_fft n, n2, n4
+ .align 6
+function fft\n\()_neon
+ push {r4, lr}
+ mov r4, r0
+ bl fft\n2\()_neon
+ add r0, r4, #\n4*2*8
+ bl fft\n4\()_neon
+ add r0, r4, #\n4*3*8
+ bl fft\n4\()_neon
+ mov r0, r4
+ pop {r4, lr}
+ movrelx r1, X(ff_cos_\n)
+ mov r2, #\n4/2
+ b fft_pass_neon
+endfunc
+.endm
+
+ def_fft 32, 16, 8
+ def_fft 64, 32, 16
+ def_fft 128, 64, 32
+ def_fft 256, 128, 64
+ def_fft 512, 256, 128
+ def_fft 1024, 512, 256
+ def_fft 2048, 1024, 512
+ def_fft 4096, 2048, 1024
+ def_fft 8192, 4096, 2048
+ def_fft 16384, 8192, 4096
+ def_fft 32768, 16384, 8192
+ def_fft 65536, 32768, 16384
+
+function ff_fft_calc_neon, export=1
+ ldr r2, [r0]
+ sub r2, r2, #2
+ movrel r3, fft_tab_neon
+ ldr r3, [r3, r2, lsl #2]
+ mov r0, r1
+ bx r3
+endfunc
+
+function ff_fft_permute_neon, export=1
+ push {r4,lr}
+ mov r12, #1
+ ldr r2, [r0] @ nbits
+ ldr r3, [r0, #12] @ tmp_buf
+ ldr r0, [r0, #8] @ revtab
+ lsl r12, r12, r2
+ mov r2, r12
+1:
+ vld1.32 {d0-d1}, [r1,:128]!
+ ldr r4, [r0], #4
+ uxth lr, r4
+ uxth r4, r4, ror #16
+ add lr, r3, lr, lsl #3
+ add r4, r3, r4, lsl #3
+ vst1.32 {d0}, [lr,:64]
+ vst1.32 {d1}, [r4,:64]
+ subs r12, r12, #2
+ bgt 1b
+
+ sub r1, r1, r2, lsl #3
+1:
+ vld1.32 {d0-d3}, [r3,:128]!
+ vst1.32 {d0-d3}, [r1,:128]!
+ subs r2, r2, #4
+ bgt 1b
+
+ pop {r4,pc}
+endfunc
+
+const fft_tab_neon
+ .word fft4_neon
+ .word fft8_neon
+ .word fft16_neon
+ .word fft32_neon
+ .word fft64_neon
+ .word fft128_neon
+ .word fft256_neon
+ .word fft512_neon
+ .word fft1024_neon
+ .word fft2048_neon
+ .word fft4096_neon
+ .word fft8192_neon
+ .word fft16384_neon
+ .word fft32768_neon
+ .word fft65536_neon
+endconst
+
+const pmmp, align=4
+ .float +1.0, -1.0, -1.0, +1.0
+endconst
+
+const mppm, align=4
+ .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+endconst
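
The def_fft macro above encodes the split-radix recursion used by both this file and the fixed-point variant: an N-point transform is computed as one N/2-point transform on the first half of the buffer, two N/4-point transforms on the remaining quarters, and a final combining pass (fft_pass_neon) driven by the ff_cos_N twiddle table for that size. A rough C outline of that control flow, with the butterfly arithmetic omitted, is sketched below; the function names are illustrative only.

/* Control-flow sketch of def_fft above (butterfly maths omitted).
 * z holds n complex values; each size uses its own ff_cos_<n> table. */
typedef struct { float re, im; } cplx;

static void fft_base_case(cplx *z, int n) { (void)z; (void)n; /* fft4/8/16 */ }
static void fft_pass(cplx *z, const float *cos_tab, int pairs) {
    (void)z; (void)cos_tab; (void)pairs; /* corresponds to fft_pass_neon */
}
static const float *cos_table_for(int n) { (void)n; return 0; /* ff_cos_<n> */ }

static void fft_recursive(cplx *z, int n)
{
    if (n <= 16) {                        /* handled by fft4/fft8/fft16 above */
        fft_base_case(z, n);
        return;
    }
    fft_recursive(z,             n / 2);  /* bl fft<n/2>_neon, first half     */
    fft_recursive(z + n / 2,     n / 4);  /* bl fft<n/4>_neon, third quarter  */
    fft_recursive(z + 3 * n / 4, n / 4);  /* bl fft<n/4>_neon, fourth quarter */
    fft_pass(z, cos_table_for(n), n / 8); /* b fft_pass_neon with n/4/2 pairs */
}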
diff --git a/ffmpeg/libavcodec/arm/flacdsp_arm.S b/ffmpeg/libavcodec/arm/flacdsp_arm.S
new file mode 100644
index 0000000..f8861c5
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/flacdsp_arm.S
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function flac_lpc_16_1_arm
+ ldr r12, [sp]
+ push {r4, lr}
+ ldr r1, [r1]
+ subs r12, r12, #2
+ ldr lr, [r0], #4
+ beq 2f
+ it lt
+ poplt {r4, pc}
+1:
+ mul r4, lr, r1
+ ldm r0, {r2, lr}
+ add_sh r2, r2, r4, asr r3
+ mul r4, r2, r1
+ subs r12, r12, #2
+ add_sh lr, lr, r4, asr r3
+ stm r0!, {r2, lr}
+ bgt 1b
+ it lt
+ poplt {r4, pc}
+2:
+ mul r4, lr, r1
+ ldr r2, [r0]
+ add_sh r2, r2, r4, asr r3
+ str r2, [r0]
+ pop {r4, pc}
+endfunc
+
+function flac_lpc_16_2_arm
+ ldr r12, [sp]
+ subs r12, r12, r2
+ it le
+ bxle lr
+
+ push {r4-r9, lr}
+ ldm r0!, {r6, r7}
+ ldm r1, {r8, r9}
+ subs r12, r12, #1
+ beq 2f
+1:
+ mul r4, r6, r8
+ mul r5, r7, r8
+ mla r4, r7, r9, r4
+ ldm r0, {r6, r7}
+ add_sh r6, r6, r4, asr r3
+ mla r5, r6, r9, r5
+ add_sh r7, r7, r5, asr r3
+ stm r0!, {r6, r7}
+ subs r12, r12, #2
+ bgt 1b
+ it lt
+ poplt {r4-r9, pc}
+2:
+ mul r4, r6, r8
+ mla r4, r7, r9, r4
+ ldr r5, [r0]
+ add_sh r5, r5, r4, asr r3
+ str r5, [r0]
+ pop {r4-r9, pc}
+endfunc
+
+function ff_flac_lpc_16_arm, export=1
+ cmp r2, #2
+ blt flac_lpc_16_1_arm
+ beq flac_lpc_16_2_arm
+
+ ldr r12, [sp]
+ subs r12, r12, r2
+ it le
+ bxle lr
+
+ push {r4-r9, lr}
+
+ subs r12, r12, #1
+ beq 3f
+1:
+ sub lr, r2, #2
+ mov r4, #0
+ mov r5, #0
+
+ ldr r7, [r0], #4
+ ldr r9, [r1], #4
+2:
+ mla r4, r7, r9, r4
+ ldm r0!, {r6, r7}
+ mla r5, r6, r9, r5
+ ldm r1!, {r8, r9}
+ mla r4, r6, r8, r4
+ subs lr, lr, #2
+ mla r5, r7, r8, r5
+ bgt 2b
+ blt 6f
+
+ mla r4, r7, r9, r4
+ ldr r7, [r0], #4
+ mla r5, r7, r9, r5
+ ldr r9, [r1], #4
+6:
+ mla r4, r7, r9, r4
+ ldm r0, {r6, r7}
+ add_sh r6, r6, r4, asr r3
+ mla r5, r6, r9, r5
+ add_sh r7, r7, r5, asr r3
+ stm r0!, {r6, r7}
+ sub r0, r0, r2, lsl #2
+ sub r1, r1, r2, lsl #2
+
+ subs r12, r12, #2
+ bgt 1b
+ it lt
+ poplt {r4-r9, pc}
+3:
+ mov r4, #0
+4:
+ ldr r5, [r1], #4
+ ldr r6, [r0], #4
+ mla r4, r5, r6, r4
+ subs r2, r2, #1
+ bgt 4b
+ ldr r5, [r0]
+ add_sh r5, r5, r4, asr r3
+ str r5, [r0]
+ pop {r4-r9, pc}
+endfunc
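
All three code paths above (order 1, order 2, and the general case) evaluate the same LPC recurrence, unrolled two output samples at a time with 32-bit accumulation. The sketch below restates that recurrence in plain, non-unrolled C as a reading aid; it is not the FFmpeg reference implementation, and the coefficient/sample pairing shown (coeffs[0] against the oldest sample in the prediction window) simply follows the assembly above.

#include <stdint.h>

/* Non-unrolled restatement of the recurrence computed above. Accumulation is
 * done modulo 2^32, matching the 32-bit mul/mla chain in the assembly. */
static void flac_lpc_16_ref(int32_t *samples, const int coeffs[32],
                            int order, int qlevel, int len)
{
    for (int i = order; i < len; i++) {
        uint32_t sum = 0;
        for (int j = 0; j < order; j++)
            sum += (uint32_t)coeffs[j] * (uint32_t)samples[i - order + j];
        samples[i] += (int32_t)sum >> qlevel;  /* arithmetic shift, as add_sh */
    }
}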
diff --git a/ffmpeg/libavcodec/arm/flacdsp_init_arm.c b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
new file mode 100644
index 0000000..9b93942
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/flacdsp_init_arm.c
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/flacdsp.h"
+#include "config.h"
+
+void ff_flac_lpc_16_arm(int32_t *samples, const int coeffs[32], int order,
+ int qlevel, int len);
+
+av_cold void ff_flacdsp_init_arm(FLACDSPContext *c, enum AVSampleFormat fmt,
+ int bps)
+{
+ if (bps <= 16)
+ c->lpc = ff_flac_lpc_16_arm;
+}
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
new file mode 100644
index 0000000..1d99c97
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fmtconvert_init_arm.c
@@ -0,0 +1,52 @@
+/*
+ * ARM optimized Format Conversion Utils
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
+ float mul, int len);
+
+void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
+void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
+
+void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
+
+av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
+ c->float_to_int16 = ff_float_to_int16_vfp;
+ }
+
+ if (have_neon(cpu_flags)) {
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_neon;
+
+ if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
+ c->float_to_int16 = ff_float_to_int16_neon;
+ c->float_to_int16_interleave = ff_float_to_int16_interleave_neon;
+ }
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_neon.S b/ffmpeg/libavcodec/arm/fmtconvert_neon.S
new file mode 100644
index 0000000..55d070e
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fmtconvert_neon.S
@@ -0,0 +1,392 @@
+/*
+ * ARM NEON optimised Format Conversion Utils
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+function ff_float_to_int16_neon, export=1
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vshrn.s32 d4, q8, #16
+ vld1.64 {d0-d1}, [r1,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vshrn.s32 d5, q9, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vld1.64 {d16-d17},[r1,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r1,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vld1.64 {d0-d1}, [r1,:128]!
+ vshrn.s32 d4, q8, #16
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r1,:128]!
+ vshrn.s32 d5, q9, #16
+ vcvt.s32.f32 q1, q1, #16
+ vshrn.s32 d6, q0, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ vshrn.s32 d7, q1, #16
+ vst1.64 {d6-d7}, [r0,:128]!
+ bx lr
+3: vshrn.s32 d4, q8, #16
+ vshrn.s32 d5, q9, #16
+ vst1.64 {d4-d5}, [r0,:128]!
+ bx lr
+endfunc
+
+function ff_float_to_int16_interleave_neon, export=1
+ cmp r3, #2
+ itt lt
+ ldrlt r1, [r1]
+ blt ff_float_to_int16_neon
+ bne 4f
+
+ ldr r3, [r1]
+ ldr r1, [r1, #4]
+
+ subs r2, r2, #8
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q8, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q9, q1, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 3f
+ bics ip, r2, #15
+ beq 2f
+1: subs ip, ip, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q10, q8, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vld1.64 {d16-d17},[r3,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d25},[r0,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r3,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r1,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r1,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d26-d27},[r0,:128]!
+ bne 1b
+ ands r2, r2, #15
+ beq 3f
+2: vsri.32 q10, q8, #16
+ vld1.64 {d0-d1}, [r3,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r3,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vld1.64 {d24-d25},[r1,:128]!
+ vcvt.s32.f32 q12, q12, #16
+ vsri.32 q11, q9, #16
+ vld1.64 {d26-d27},[r1,:128]!
+ vcvt.s32.f32 q13, q13, #16
+ vst1.64 {d20-d21},[r0,:128]!
+ vsri.32 q12, q0, #16
+ vst1.64 {d22-d23},[r0,:128]!
+ vsri.32 q13, q1, #16
+ vst1.64 {d24-d27},[r0,:128]!
+ bx lr
+3: vsri.32 q10, q8, #16
+ vsri.32 q11, q9, #16
+ vst1.64 {d20-d23},[r0,:128]!
+ bx lr
+
+4: push {r4-r8,lr}
+ cmp r3, #4
+ lsl ip, r3, #1
+ blt 4f
+
+ @ 4 channels
+5: ldmia r1!, {r4-r7}
+ mov lr, r2
+ mov r8, r0
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #8
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 q9, q8, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 q11, q10, #16
+ vld1.64 {d4-d5}, [r6,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vzip.32 d18, d22
+ vld1.64 {d6-d7}, [r7,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vzip.32 d19, d23
+ vst1.64 {d18}, [r8], ip
+ vsri.32 q1, q0, #16
+ vst1.64 {d22}, [r8], ip
+ vsri.32 q3, q2, #16
+ vst1.64 {d19}, [r8], ip
+ vzip.32 d2, d6
+ vst1.64 {d23}, [r8], ip
+ vzip.32 d3, d7
+ beq 7f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.64 {d2}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.64 {d6}, [r8], ip
+ vld1.64 {d20-d21},[r6,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.64 {d3}, [r8], ip
+ vld1.64 {d22-d23},[r7,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.64 {d7}, [r8], ip
+ b 6b
+7: vst1.64 {d2}, [r8], ip
+ vst1.64 {d6}, [r8], ip
+ vst1.64 {d3}, [r8], ip
+ vst1.64 {d7}, [r8], ip
+ subs r3, r3, #4
+ it eq
+ popeq {r4-r8,pc}
+ cmp r3, #4
+ add r0, r0, #8
+ bge 5b
+
+ @ 2 channels
+4: cmp r3, #2
+ blt 4f
+ ldmia r1!, {r4-r5}
+ mov lr, r2
+ mov r8, r0
+ tst lr, #8
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ beq 6f
+ subs lr, lr, #8
+ beq 7f
+ vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+6: subs lr, lr, #16
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vsri.32 d18, d16, #16
+ vld1.64 {d2-d3}, [r5,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ vsri.32 d19, d17, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r5,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vsri.32 d2, d0, #16
+ vst1.32 {d19[1]}, [r8], ip
+ vsri.32 d3, d1, #16
+ vst1.32 {d22[0]}, [r8], ip
+ vsri.32 d6, d4, #16
+ vst1.32 {d22[1]}, [r8], ip
+ vsri.32 d7, d5, #16
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+ beq 6f
+ vld1.64 {d16-d17},[r4,:128]!
+ vcvt.s32.f32 q8, q8, #16
+ vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vld1.64 {d18-d19},[r5,:128]!
+ vcvt.s32.f32 q9, q9, #16
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vld1.64 {d20-d21},[r4,:128]!
+ vcvt.s32.f32 q10, q10, #16
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vld1.64 {d22-d23},[r5,:128]!
+ vcvt.s32.f32 q11, q11, #16
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ bgt 6b
+6: vst1.32 {d2[0]}, [r8], ip
+ vst1.32 {d2[1]}, [r8], ip
+ vst1.32 {d3[0]}, [r8], ip
+ vst1.32 {d3[1]}, [r8], ip
+ vst1.32 {d6[0]}, [r8], ip
+ vst1.32 {d6[1]}, [r8], ip
+ vst1.32 {d7[0]}, [r8], ip
+ vst1.32 {d7[1]}, [r8], ip
+ b 8f
+7: vsri.32 d18, d16, #16
+ vsri.32 d19, d17, #16
+ vst1.32 {d18[0]}, [r8], ip
+ vsri.32 d22, d20, #16
+ vst1.32 {d18[1]}, [r8], ip
+ vsri.32 d23, d21, #16
+ vst1.32 {d19[0]}, [r8], ip
+ vst1.32 {d19[1]}, [r8], ip
+ vst1.32 {d22[0]}, [r8], ip
+ vst1.32 {d22[1]}, [r8], ip
+ vst1.32 {d23[0]}, [r8], ip
+ vst1.32 {d23[1]}, [r8], ip
+8: subs r3, r3, #2
+ add r0, r0, #4
+ it eq
+ popeq {r4-r8,pc}
+
+ @ 1 channel
+4: ldr r4, [r1],#4
+ tst r2, #8
+ mov lr, r2
+ mov r5, r0
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ bne 8f
+6: subs lr, lr, #16
+ vld1.64 {d4-d5}, [r4,:128]!
+ vcvt.s32.f32 q2, q2, #16
+ vld1.64 {d6-d7}, [r4,:128]!
+ vcvt.s32.f32 q3, q3, #16
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ beq 7f
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+7: vst1.16 {d4[1]}, [r5,:16], ip
+ vst1.16 {d4[3]}, [r5,:16], ip
+ vst1.16 {d5[1]}, [r5,:16], ip
+ vst1.16 {d5[3]}, [r5,:16], ip
+ vst1.16 {d6[1]}, [r5,:16], ip
+ vst1.16 {d6[3]}, [r5,:16], ip
+ vst1.16 {d7[1]}, [r5,:16], ip
+ vst1.16 {d7[3]}, [r5,:16], ip
+ bgt 6b
+ pop {r4-r8,pc}
+8: subs lr, lr, #8
+ vst1.16 {d0[1]}, [r5,:16], ip
+ vst1.16 {d0[3]}, [r5,:16], ip
+ vst1.16 {d1[1]}, [r5,:16], ip
+ vst1.16 {d1[3]}, [r5,:16], ip
+ vst1.16 {d2[1]}, [r5,:16], ip
+ vst1.16 {d2[3]}, [r5,:16], ip
+ vst1.16 {d3[1]}, [r5,:16], ip
+ vst1.16 {d3[3]}, [r5,:16], ip
+ it eq
+ popeq {r4-r8,pc}
+ vld1.64 {d0-d1}, [r4,:128]!
+ vcvt.s32.f32 q0, q0, #16
+ vld1.64 {d2-d3}, [r4,:128]!
+ vcvt.s32.f32 q1, q1, #16
+ b 6b
+endfunc
+
+function ff_int32_to_float_fmul_scalar_neon, export=1
+VFP vdup.32 q0, d0[0]
+VFP len .req r2
+NOVFP vdup.32 q0, r2
+NOVFP len .req r3
+
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+1: subs len, len, #8
+ pld [r1, #16]
+ vmul.f32 q9, q3, q0
+ vmul.f32 q10, q8, q0
+ beq 2f
+ vld1.32 {q1},[r1,:128]!
+ vcvt.f32.s32 q3, q1
+ vld1.32 {q2},[r1,:128]!
+ vcvt.f32.s32 q8, q2
+ vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ b 1b
+2: vst1.32 {q9}, [r0,:128]!
+ vst1.32 {q10},[r0,:128]!
+ bx lr
+ .unreq len
+endfunc
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
new file mode 100644
index 0000000..7b012bc
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+/**
+ * ARM VFP optimized float to int16 conversion.
+ * Assumes that len is a positive multiple of 8, that the destination
+ * buffer is at least 4-byte aligned (8-byte alignment is better for
+ * performance), and that the data is little-endian.
+ */
+@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
+function ff_float_to_int16_vfp, export=1
+ push {r4-r8,lr}
+ vpush {d8-d11}
+ vldmia r1!, {s16-s23}
+ vcvt.s32.f32 s0, s16
+ vcvt.s32.f32 s1, s17
+ vcvt.s32.f32 s2, s18
+ vcvt.s32.f32 s3, s19
+ vcvt.s32.f32 s4, s20
+ vcvt.s32.f32 s5, s21
+ vcvt.s32.f32 s6, s22
+ vcvt.s32.f32 s7, s23
+1:
+ subs r2, r2, #8
+ vmov r3, r4, s0, s1
+ vmov r5, r6, s2, s3
+ vmov r7, r8, s4, s5
+ vmov ip, lr, s6, s7
+ it gt
+ vldmiagt r1!, {s16-s23}
+ ssat r4, #16, r4
+ ssat r3, #16, r3
+ ssat r6, #16, r6
+ ssat r5, #16, r5
+ pkhbt r3, r3, r4, lsl #16
+ pkhbt r4, r5, r6, lsl #16
+ itttt gt
+ vcvtgt.s32.f32 s0, s16
+ vcvtgt.s32.f32 s1, s17
+ vcvtgt.s32.f32 s2, s18
+ vcvtgt.s32.f32 s3, s19
+ itttt gt
+ vcvtgt.s32.f32 s4, s20
+ vcvtgt.s32.f32 s5, s21
+ vcvtgt.s32.f32 s6, s22
+ vcvtgt.s32.f32 s7, s23
+ ssat r8, #16, r8
+ ssat r7, #16, r7
+ ssat lr, #16, lr
+ ssat ip, #16, ip
+ pkhbt r5, r7, r8, lsl #16
+ pkhbt r6, ip, lr, lsl #16
+ stmia r0!, {r3-r6}
+ bgt 1b
+
+ vpop {d8-d11}
+ pop {r4-r8,pc}
+endfunc
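
Taken together, the float_to_int16 routines in this and the previous file implement the same contract: the input floats are assumed to be pre-scaled to the int16 range, and each value is converted to an integer and saturated to [-32768, 32767]. The plain C restatement below is an illustration of that contract, not the FFmpeg C fallback; the exact rounding of the float-to-integer step differs slightly between the C, VFP and NEON paths.

#include <stdint.h>

/* Illustrative restatement of the float_to_int16 contract above: inputs are
 * already scaled to int16 range; convert and saturate. The VFP path truncates
 * toward zero (vcvt.s32.f32) before saturating with ssat. */
static void float_to_int16_ref(int16_t *dst, const float *src, long len)
{
    for (long i = 0; i < len; i++) {
        float f = src[i];
        if (f < -32768.0f) f = -32768.0f;
        if (f >  32767.0f) f =  32767.0f;
        dst[i] = (int16_t)f;               /* truncation toward zero */
    }
}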
diff --git a/ffmpeg/libavcodec/arm/h264chroma_init_arm.c b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
new file mode 100644
index 0000000..13f7e0d
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264chroma_init_arm.c
@@ -0,0 +1,51 @@
+/*
+ * ARM NEON optimised H.264 chroma functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/h264chroma.h"
+
+void ff_put_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_h264_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_h264_chroma_mc2_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+av_cold void ff_h264chroma_init_arm(H264ChromaContext *c, int bit_depth)
+{
+ const int high_bit_depth = bit_depth > 8;
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags) && !high_bit_depth) {
+ c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_neon;
+ c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_neon;
+ c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_neon;
+
+ c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_neon;
+ c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_neon;
+ c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/h264cmc_neon.S b/ffmpeg/libavcodec/arm/h264cmc_neon.S
new file mode 100644
index 0000000..3427e36
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264cmc_neon.S
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+.macro h264_chroma_mc8 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
+ push {r4-r7, lr}
+ ldrd r4, r5, [sp, #20]
+ .ifc \type,avg
+ mov lr, r0
+ .endif
+ pld [r1]
+ pld [r1, r2]
+
+ .ifc \codec,rv40
+ movrel r6, rv40bias
+ lsr r7, r5, #1
+ add r6, r6, r7, lsl #3
+ lsr r7, r4, #1
+ add r6, r6, r7, lsl #1
+ vld1.16 {d22[],d23[]}, [r6,:16]
+ .endif
+
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
+ rsb r6, r7, r5, lsl #3
+ rsb r12, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ vdup.8 d0, r4
+ vdup.8 d1, r12
+ vld1.8 {d4, d5}, [r1], r2
+ vdup.8 d2, r6
+ vdup.8 d3, r7
+ vext.8 d5, d4, d5, #1
+
+1: vld1.8 {d6, d7}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vext.8 d7, d6, d7, #1
+ vld1.8 {d4, d5}, [r1], r2
+ vmlal.u8 q8, d6, d2
+ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
+ .ifc \type,avg
+ vld1.8 {d20}, [lr,:64], r2
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add r12, r12, r6
+ vdup.8 d0, r4
+ vdup.8 d1, r12
+
+ beq 4f
+
+ vld1.8 {d4}, [r1], r2
+
+3: vld1.8 {d6}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.8 {d4}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
+ pld [r1, r2]
+ .ifc \type,avg
+ vld1.8 {d20}, [lr,:64], r2
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+ subs r3, r3, #2
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.8 {d4, d5}, [r1], r2
+ vld1.8 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r1, r2]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vadd.u16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+ .endif
+ .ifc \type,avg
+ vld1.8 {d20}, [lr,:64], r2
+ vld1.8 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+ .endif
+ vst1.8 {d16}, [r0,:64], r2
+ vst1.8 {d17}, [r0,:64], r2
+ bgt 4b
+
+ pop {r4-r7, pc}
+endfunc
+.endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+.macro h264_chroma_mc4 type, codec=h264
+function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
+ push {r4-r7, lr}
+ ldrd r4, r5, [sp, #20]
+ .ifc \type,avg
+ mov lr, r0
+ .endif
+ pld [r1]
+ pld [r1, r2]
+
+ .ifc \codec,rv40
+ movrel r6, rv40bias
+ lsr r7, r5, #1
+ add r6, r6, r7, lsl #3
+ lsr r7, r4, #1
+ add r6, r6, r7, lsl #1
+ vld1.16 {d22[],d23[]}, [r6,:16]
+ .endif
+
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
+ rsb r6, r7, r5, lsl #3
+ rsb r12, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ vdup.8 d0, r4
+ vdup.8 d1, r12
+ vld1.8 {d4}, [r1], r2
+ vdup.8 d2, r6
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: vld1.8 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.8 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ pld [r1]
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
+ subs r3, r3, #2
+ pld [r1, r2]
+ .ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add r12, r12, r6
+ vdup.8 d0, r4
+ vdup.8 d1, r12
+ vtrn.32 d0, d1
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ vld1.32 {d4[0]}, [r1], r2
+
+3: vld1.32 {d4[1]}, [r1], r2
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r1], r2
+ vmull.u8 q9, d4, d1
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
+ .ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+ subs r3, r3, #2
+ pld [r1, r2]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.8 {d4}, [r1], r2
+ vld1.8 {d6}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+ vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+ .ifc \codec,h264
+ vrshrn.u16 d16, q8, #6
+ .else
+ vadd.u16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+ .endif
+ .ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+ .endif
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 4b
+
+ pop {r4-r7, pc}
+endfunc
+.endm
+
+.macro h264_chroma_mc2 type
+function ff_\type\()_h264_chroma_mc2_neon, export=1
+ push {r4-r6, lr}
+ ldr r4, [sp, #16]
+ ldr lr, [sp, #20]
+ pld [r1]
+ pld [r1, r2]
+ orrs r5, r4, lr
+ beq 2f
+
+ mul r5, r4, lr
+ rsb r6, r5, lr, lsl #3
+ rsb r12, r5, r4, lsl #3
+ sub r4, r5, r4, lsl #3
+ sub r4, r4, lr, lsl #3
+ add r4, r4, #64
+ vdup.8 d0, r4
+ vdup.8 d2, r12
+ vdup.8 d1, r6
+ vdup.8 d3, r5
+ vtrn.16 q0, q1
+1:
+ vld1.32 {d4[0]}, [r1], r2
+ vld1.32 {d4[1]}, [r1], r2
+ vrev64.32 d5, d4
+ vld1.32 {d5[1]}, [r1]
+ vext.8 q3, q2, q2, #1
+ vtrn.16 q2, q3
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ .ifc \type,avg
+ vld1.16 {d18[0]}, [r0,:16], r2
+ vld1.16 {d18[1]}, [r0,:16]
+ sub r0, r0, r2
+ .endif
+ vtrn.32 d16, d17
+ vadd.i16 d16, d16, d17
+ vrshrn.u16 d16, q8, #6
+ .ifc \type,avg
+ vrhadd.u8 d16, d16, d18
+ .endif
+ vst1.16 {d16[0]}, [r0,:16], r2
+ vst1.16 {d16[1]}, [r0,:16], r2
+ subs r3, r3, #2
+ bgt 1b
+ pop {r4-r6, pc}
+2:
+ .ifc \type,put
+ ldrh_post r5, r1, r2
+ strh_post r5, r0, r2
+ ldrh_post r6, r1, r2
+ strh_post r6, r0, r2
+ .else
+ vld1.16 {d16[0]}, [r1], r2
+ vld1.16 {d16[1]}, [r1], r2
+ vld1.16 {d18[0]}, [r0,:16], r2
+ vld1.16 {d18[1]}, [r0,:16]
+ sub r0, r0, r2
+ vrhadd.u8 d16, d16, d18
+ vst1.16 {d16[0]}, [r0,:16], r2
+ vst1.16 {d16[1]}, [r0,:16], r2
+ .endif
+ subs r3, r3, #2
+ bgt 2b
+ pop {r4-r6, pc}
+endfunc
+.endm
+
+#if CONFIG_H264_DECODER
+ h264_chroma_mc8 put
+ h264_chroma_mc8 avg
+ h264_chroma_mc4 put
+ h264_chroma_mc4 avg
+ h264_chroma_mc2 put
+ h264_chroma_mc2 avg
+#endif
+
+#if CONFIG_RV40_DECODER
+const rv40bias
+ .short 0, 16, 32, 16
+ .short 32, 28, 32, 28
+ .short 0, 32, 16, 32
+ .short 32, 28, 32, 28
+endconst
+
+ h264_chroma_mc8 put, rv40
+ h264_chroma_mc8 avg, rv40
+ h264_chroma_mc4 put, rv40
+ h264_chroma_mc4 avg, rv40
+#endif
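
The mc8/mc4/mc2 macros above implement standard H.264 chroma motion compensation: a bilinear blend of the four neighbouring chroma samples weighted by the eighth-pel offsets x and y, with the rv40 variants substituting an rv40bias table entry for the fixed +32 rounding term. A minimal scalar sketch of the put mc8 case, assuming 8-bit samples (the function name and loop bounds below are illustrative, not taken from the patch):

#include <stdint.h>

static void put_h264_chroma_mc8_sketch(uint8_t *dst, const uint8_t *src,
                                       int stride, int h, int x, int y)
{
    const int a = (8 - x) * (8 - y);   /* weight of src[0]          */
    const int b =      x  * (8 - y);   /* weight of src[1]          */
    const int c = (8 - x) *      y;    /* weight of src[stride]     */
    const int d =      x  *      y;    /* weight of src[stride + 1] */

    for (int j = 0; j < h; j++) {
        for (int i = 0; i < 8; i++)
            dst[i] = (a * src[i]          + b * src[i + 1] +
                      c * src[i + stride] + d * src[i + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

The special-cased branches in the assembly (x == 0, y == 0, or both) are just the degenerate forms of the same formula with fewer loads per row.
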
diff --git a/ffmpeg/libavcodec/arm/h264dsp_init_arm.c b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
new file mode 100644
index 0000000..785b604
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264dsp_init_arm.c
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/h264dsp.h"
+
+void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
+ int beta, int8_t *tc0);
+
+void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
+ int log2_den, int weight, int offset);
+
+void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
+ int height, int log2_den, int weightd,
+ int weights, int offset);
+
+void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
+void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
+ int16_t *block, int stride,
+ const uint8_t nnzc[6*8]);
+
+static av_cold void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
+ const int chroma_format_idc)
+{
+#if HAVE_NEON
+ if (bit_depth == 8) {
+ c->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_neon;
+ c->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_neon;
+ if (chroma_format_idc == 1) {
+ c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
+ c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+ }
+
+ c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
+ c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
+ c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
+
+ c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
+ c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
+ c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
+
+ c->h264_idct_add = ff_h264_idct_add_neon;
+ c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
+ c->h264_idct_add16 = ff_h264_idct_add16_neon;
+ c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
+ if (chroma_format_idc == 1)
+ c->h264_idct_add8 = ff_h264_idct_add8_neon;
+ c->h264_idct8_add = ff_h264_idct8_add_neon;
+ c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
+ c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
+ }
+#endif // HAVE_NEON
+}
+
+av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
+ const int chroma_format_idc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_h264dsp_init_neon(c, bit_depth, chroma_format_idc);
+}
diff --git a/ffmpeg/libavcodec/arm/h264dsp_neon.S b/ffmpeg/libavcodec/arm/h264dsp_neon.S
new file mode 100644
index 0000000..274a547
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264dsp_neon.S
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+ /* H.264 loop filter */
+
+.macro h264_loop_filter_start
+ ldr r12, [sp]
+ tst r2, r2
+ ldr r12, [r12]
+ it ne
+ tstne r3, r3
+ vmov.32 d24[0], r12
+ and r12, r12, r12, lsl #16
+ it eq
+ bxeq lr
+ ands r12, r12, r12, lsl #8
+ it lt
+ bxlt lr
+.endm
+
+.macro h264_loop_filter_luma
+ vdup.8 q11, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 q6, q8, q0 @ abs(p0 - q0)
+ vmovl.u16 q12, d24
+ vabd.u8 q14, q9, q8 @ abs(p1 - p0)
+ vsli.16 q12, q12, #8
+ vabd.u8 q15, q1, q0 @ abs(q1 - q0)
+ vsli.32 q12, q12, #16
+ vclt.u8 q6, q6, q11 @ < alpha
+ vdup.8 q11, r3 @ beta
+ vclt.s8 q7, q12, #0
+ vclt.u8 q14, q14, q11 @ < beta
+ vclt.u8 q15, q15, q11 @ < beta
+ vbic q6, q6, q7
+ vabd.u8 q4, q10, q8 @ abs(p2 - p0)
+ vand q6, q6, q14
+ vabd.u8 q5, q2, q0 @ abs(q2 - q0)
+ vclt.u8 q4, q4, q11 @ < beta
+ vand q6, q6, q15
+ vclt.u8 q5, q5, q11 @ < beta
+ vand q4, q4, q6
+ vand q5, q5, q6
+ vand q12, q12, q6
+ vrhadd.u8 q14, q8, q0
+ vsub.i8 q6, q12, q4
+ vqadd.u8 q7, q9, q12
+ vhadd.u8 q10, q10, q14
+ vsub.i8 q6, q6, q5
+ vhadd.u8 q14, q2, q14
+ vmin.u8 q7, q7, q10
+ vqsub.u8 q11, q9, q12
+ vqadd.u8 q2, q1, q12
+ vmax.u8 q7, q7, q11
+ vqsub.u8 q11, q1, q12
+ vmin.u8 q14, q2, q14
+ vmovl.u8 q2, d0
+ vmax.u8 q14, q14, q11
+ vmovl.u8 q10, d1
+ vsubw.u8 q2, q2, d16
+ vsubw.u8 q10, q10, d17
+ vshl.i16 q2, q2, #2
+ vshl.i16 q10, q10, #2
+ vaddw.u8 q2, q2, d18
+ vaddw.u8 q10, q10, d19
+ vsubw.u8 q2, q2, d2
+ vsubw.u8 q10, q10, d3
+ vrshrn.i16 d4, q2, #3
+ vrshrn.i16 d5, q10, #3
+ vbsl q4, q7, q9
+ vbsl q5, q14, q1
+ vneg.s8 q7, q6
+ vmovl.u8 q14, d16
+ vmin.s8 q2, q2, q6
+ vmovl.u8 q6, d17
+ vmax.s8 q2, q2, q7
+ vmovl.u8 q11, d0
+ vmovl.u8 q12, d1
+ vaddw.s8 q14, q14, d4
+ vaddw.s8 q6, q6, d5
+ vsubw.s8 q11, q11, d4
+ vsubw.s8 q12, q12, d5
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d17, q6
+ vqmovun.s16 d0, q11
+ vqmovun.s16 d1, q12
+.endm
+
+function ff_h264_v_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ vld1.8 {d0, d1}, [r0,:128], r1
+ vld1.8 {d2, d3}, [r0,:128], r1
+ vld1.8 {d4, d5}, [r0,:128], r1
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, r1, lsl #1
+ vld1.8 {d20,d21}, [r0,:128], r1
+ vld1.8 {d18,d19}, [r0,:128], r1
+ vld1.8 {d16,d17}, [r0,:128], r1
+
+ vpush {d8-d15}
+
+ h264_loop_filter_luma
+
+ sub r0, r0, r1, lsl #1
+ vst1.8 {d8, d9}, [r0,:128], r1
+ vst1.8 {d16,d17}, [r0,:128], r1
+ vst1.8 {d0, d1}, [r0,:128], r1
+ vst1.8 {d10,d11}, [r0,:128]
+
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+function ff_h264_h_loop_filter_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #4
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d20}, [r0], r1
+ vld1.8 {d18}, [r0], r1
+ vld1.8 {d16}, [r0], r1
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d26}, [r0], r1
+ vld1.8 {d7}, [r0], r1
+ vld1.8 {d21}, [r0], r1
+ vld1.8 {d19}, [r0], r1
+ vld1.8 {d17}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d5}, [r0], r1
+ vld1.8 {d27}, [r0], r1
+
+ transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
+
+ vpush {d8-d15}
+
+ h264_loop_filter_luma
+
+ transpose_4x4 q4, q8, q0, q5
+
+ sub r0, r0, r1, lsl #4
+ add r0, r0, #2
+ vst1.32 {d8[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d10[0]}, [r0], r1
+ vst1.32 {d8[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d10[1]}, [r0], r1
+ vst1.32 {d9[0]}, [r0], r1
+ vst1.32 {d17[0]}, [r0], r1
+ vst1.32 {d1[0]}, [r0], r1
+ vst1.32 {d11[0]}, [r0], r1
+ vst1.32 {d9[1]}, [r0], r1
+ vst1.32 {d17[1]}, [r0], r1
+ vst1.32 {d1[1]}, [r0], r1
+ vst1.32 {d11[1]}, [r0], r1
+
+ vpop {d8-d15}
+ bx lr
+endfunc
+
+.macro h264_loop_filter_chroma
+ vdup.8 d22, r2 @ alpha
+ vmovl.u8 q12, d24
+ vabd.u8 d26, d16, d0 @ abs(p0 - q0)
+ vmovl.u8 q2, d0
+ vabd.u8 d28, d18, d16 @ abs(p1 - p0)
+ vsubw.u8 q2, q2, d16
+ vsli.16 d24, d24, #8
+ vshl.i16 q2, q2, #2
+ vabd.u8 d30, d2, d0 @ abs(q1 - q0)
+ vaddw.u8 q2, q2, d18
+ vclt.u8 d26, d26, d22 @ < alpha
+ vsubw.u8 q2, q2, d2
+ vdup.8 d22, r3 @ beta
+ vrshrn.i16 d4, q2, #3
+ vclt.u8 d28, d28, d22 @ < beta
+ vclt.u8 d30, d30, d22 @ < beta
+ vmin.s8 d4, d4, d24
+ vneg.s8 d25, d24
+ vand d26, d26, d28
+ vmax.s8 d4, d4, d25
+ vand d26, d26, d30
+ vmovl.u8 q11, d0
+ vand d4, d4, d26
+ vmovl.u8 q14, d16
+ vaddw.s8 q14, q14, d4
+ vsubw.s8 q11, q11, d4
+ vqmovun.s16 d16, q14
+ vqmovun.s16 d0, q11
+.endm
+
+function ff_h264_v_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, r1, lsl #1
+ vld1.8 {d18}, [r0,:64], r1
+ vld1.8 {d16}, [r0,:64], r1
+ vld1.8 {d0}, [r0,:64], r1
+ vld1.8 {d2}, [r0,:64]
+
+ h264_loop_filter_chroma
+
+ sub r0, r0, r1, lsl #1
+ vst1.8 {d16}, [r0,:64], r1
+ vst1.8 {d0}, [r0,:64], r1
+
+ bx lr
+endfunc
+
+function ff_h264_h_loop_filter_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub r0, r0, #2
+ vld1.32 {d18[0]}, [r0], r1
+ vld1.32 {d16[0]}, [r0], r1
+ vld1.32 {d0[0]}, [r0], r1
+ vld1.32 {d2[0]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d0[1]}, [r0], r1
+ vld1.32 {d2[1]}, [r0], r1
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ h264_loop_filter_chroma
+
+ vtrn.16 d18, d0
+ vtrn.16 d16, d2
+ vtrn.8 d18, d16
+ vtrn.8 d0, d2
+
+ sub r0, r0, r1, lsl #3
+ vst1.32 {d18[0]}, [r0], r1
+ vst1.32 {d16[0]}, [r0], r1
+ vst1.32 {d0[0]}, [r0], r1
+ vst1.32 {d2[0]}, [r0], r1
+ vst1.32 {d18[1]}, [r0], r1
+ vst1.32 {d16[1]}, [r0], r1
+ vst1.32 {d0[1]}, [r0], r1
+ vst1.32 {d2[1]}, [r0], r1
+
+ bx lr
+endfunc
+
+@ Biweighted prediction
+
+.macro biweight_16 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q2, q8
+ vmov q3, q8
+1: subs r3, r3, #2
+ vld1.8 {d20-d21},[r0,:128], r2
+ \macd q2, d0, d20
+ pld [r0]
+ \macd q3, d0, d21
+ vld1.8 {d22-d23},[r1,:128], r2
+ \macs q2, d1, d22
+ pld [r1]
+ \macs q3, d1, d23
+ vmov q12, q8
+ vld1.8 {d28-d29},[r0,:128], r2
+ vmov q13, q8
+ \macd q12, d0, d28
+ pld [r0]
+ \macd q13, d0, d29
+ vld1.8 {d30-d31},[r1,:128], r2
+ \macs q12, d1, d30
+ pld [r1]
+ \macs q13, d1, d31
+ vshl.s16 q2, q2, q9
+ vshl.s16 q3, q3, q9
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ vshl.s16 q12, q12, q9
+ vshl.s16 q13, q13, q9
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vmov q3, q8
+ vst1.8 {d4- d5}, [r6,:128], r2
+ vmov q2, q8
+ vst1.8 {d24-d25},[r6,:128], r2
+ bne 1b
+ pop {r4-r6, pc}
+.endm
+
+.macro biweight_8 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q1, q8
+ vmov q10, q8
+1: subs r3, r3, #2
+ vld1.8 {d4},[r0,:64], r2
+ \macd q1, d0, d4
+ pld [r0]
+ vld1.8 {d5},[r1,:64], r2
+ \macs q1, d1, d5
+ pld [r1]
+ vld1.8 {d6},[r0,:64], r2
+ \macd q10, d0, d6
+ pld [r0]
+ vld1.8 {d7},[r1,:64], r2
+ \macs q10, d1, d7
+ pld [r1]
+ vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.8 {d2},[r6,:64], r2
+ vmov q1, q8
+ vst1.8 {d4},[r6,:64], r2
+ bne 1b
+ pop {r4-r6, pc}
+.endm
+
+.macro biweight_4 macs, macd
+ vdup.8 d0, r4
+ vdup.8 d1, r5
+ vmov q1, q8
+ vmov q10, q8
+1: subs r3, r3, #4
+ vld1.32 {d4[0]},[r0,:32], r2
+ vld1.32 {d4[1]},[r0,:32], r2
+ \macd q1, d0, d4
+ pld [r0]
+ vld1.32 {d5[0]},[r1,:32], r2
+ vld1.32 {d5[1]},[r1,:32], r2
+ \macs q1, d1, d5
+ pld [r1]
+ blt 2f
+ vld1.32 {d6[0]},[r0,:32], r2
+ vld1.32 {d6[1]},[r0,:32], r2
+ \macd q10, d0, d6
+ pld [r0]
+ vld1.32 {d7[0]},[r1,:32], r2
+ vld1.32 {d7[1]},[r1,:32], r2
+ \macs q10, d1, d7
+ pld [r1]
+ vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.32 {d2[0]},[r6,:32], r2
+ vst1.32 {d2[1]},[r6,:32], r2
+ vmov q1, q8
+ vst1.32 {d4[0]},[r6,:32], r2
+ vst1.32 {d4[1]},[r6,:32], r2
+ bne 1b
+ pop {r4-r6, pc}
+2: vshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vst1.32 {d2[0]},[r6,:32], r2
+ vst1.32 {d2[1]},[r6,:32], r2
+ pop {r4-r6, pc}
+.endm
+
+.macro biweight_func w
+function ff_biweight_h264_pixels_\w\()_neon, export=1
+ push {r4-r6, lr}
+ ldr r12, [sp, #16]
+ add r4, sp, #20
+ ldm r4, {r4-r6}
+ lsr lr, r4, #31
+ add r6, r6, #1
+ eors lr, lr, r5, lsr #30
+ orr r6, r6, #1
+ vdup.16 q9, r12
+ lsl r6, r6, r12
+ vmvn q9, q9
+ vdup.16 q8, r6
+ mov r6, r0
+ beq 10f
+ subs lr, lr, #1
+ beq 20f
+ subs lr, lr, #1
+ beq 30f
+ b 40f
+10: biweight_\w vmlal.u8, vmlal.u8
+20: rsb r4, r4, #0
+ biweight_\w vmlal.u8, vmlsl.u8
+30: rsb r4, r4, #0
+ rsb r5, r5, #0
+ biweight_\w vmlsl.u8, vmlsl.u8
+40: rsb r5, r5, #0
+ biweight_\w vmlsl.u8, vmlal.u8
+endfunc
+.endm
+
+ biweight_func 16
+ biweight_func 8
+ biweight_func 4
+
+@ Weighted prediction
+
+.macro weight_16 add
+ vdup.8 d0, r12
+1: subs r2, r2, #2
+ vld1.8 {d20-d21},[r0,:128], r1
+ vmull.u8 q2, d0, d20
+ pld [r0]
+ vmull.u8 q3, d0, d21
+ vld1.8 {d28-d29},[r0,:128], r1
+ vmull.u8 q12, d0, d28
+ pld [r0]
+ vmull.u8 q13, d0, d29
+ \add q2, q8, q2
+ vrshl.s16 q2, q2, q9
+ \add q3, q8, q3
+ vrshl.s16 q3, q3, q9
+ vqmovun.s16 d4, q2
+ vqmovun.s16 d5, q3
+ \add q12, q8, q12
+ vrshl.s16 q12, q12, q9
+ \add q13, q8, q13
+ vrshl.s16 q13, q13, q9
+ vqmovun.s16 d24, q12
+ vqmovun.s16 d25, q13
+ vst1.8 {d4- d5}, [r4,:128], r1
+ vst1.8 {d24-d25},[r4,:128], r1
+ bne 1b
+ pop {r4, pc}
+.endm
+
+.macro weight_8 add
+ vdup.8 d0, r12
+1: subs r2, r2, #2
+ vld1.8 {d4},[r0,:64], r1
+ vmull.u8 q1, d0, d4
+ pld [r0]
+ vld1.8 {d6},[r0,:64], r1
+ vmull.u8 q10, d0, d6
+ \add q1, q8, q1
+ pld [r0]
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ \add q10, q8, q10
+ vrshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vst1.8 {d2},[r4,:64], r1
+ vst1.8 {d4},[r4,:64], r1
+ bne 1b
+ pop {r4, pc}
+.endm
+
+.macro weight_4 add
+ vdup.8 d0, r12
+ vmov q1, q8
+ vmov q10, q8
+1: subs r2, r2, #4
+ vld1.32 {d4[0]},[r0,:32], r1
+ vld1.32 {d4[1]},[r0,:32], r1
+ vmull.u8 q1, d0, d4
+ pld [r0]
+ blt 2f
+ vld1.32 {d6[0]},[r0,:32], r1
+ vld1.32 {d6[1]},[r0,:32], r1
+ vmull.u8 q10, d0, d6
+ pld [r0]
+ \add q1, q8, q1
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ \add q10, q8, q10
+ vrshl.s16 q10, q10, q9
+ vqmovun.s16 d4, q10
+ vmov q10, q8
+ vst1.32 {d2[0]},[r4,:32], r1
+ vst1.32 {d2[1]},[r4,:32], r1
+ vmov q1, q8
+ vst1.32 {d4[0]},[r4,:32], r1
+ vst1.32 {d4[1]},[r4,:32], r1
+ bne 1b
+ pop {r4, pc}
+2: \add q1, q8, q1
+ vrshl.s16 q1, q1, q9
+ vqmovun.s16 d2, q1
+ vst1.32 {d2[0]},[r4,:32], r1
+ vst1.32 {d2[1]},[r4,:32], r1
+ pop {r4, pc}
+.endm
+
+.macro weight_func w
+function ff_weight_h264_pixels_\w\()_neon, export=1
+ push {r4, lr}
+ ldr r12, [sp, #8]
+ ldr r4, [sp, #12]
+ cmp r3, #1
+ lsl r4, r4, r3
+ vdup.16 q8, r4
+ mov r4, r0
+ ble 20f
+ rsb lr, r3, #1
+ vdup.16 q9, lr
+ cmp r12, #0
+ blt 10f
+ weight_\w vhadd.s16
+10: rsb r12, r12, #0
+ weight_\w vhsub.s16
+20: rsb lr, r3, #0
+ vdup.16 q9, lr
+ cmp r12, #0
+ blt 10f
+ weight_\w vadd.s16
+10: rsb r12, r12, #0
+ weight_\w vsub.s16
+endfunc
+.endm
+
+ weight_func 16
+ weight_func 8
+ weight_func 4
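
The weight/biweight macros above implement H.264 explicit weighted prediction: each sample is scaled by a weight, a rounding term and the shifted offset are added, the sum is shifted down by log2_den and clamped to 8 bits; the biweight variants combine two sources the same way. A rough scalar sketch of the single-source case, assuming 8-bit samples; the width parameter is added here only for illustration, since the NEON versions bake the block width into the function name:

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void weight_h264_pixels_sketch(uint8_t *block, int stride, int width,
                                      int height, int log2_den, int weight,
                                      int offset)
{
    offset <<= log2_den;
    if (log2_den)
        offset += 1 << (log2_den - 1);          /* rounding term */

    for (int y = 0; y < height; y++, block += stride)
        for (int x = 0; x < width; x++)
            block[x] = clip_uint8((block[x] * weight + offset) >> log2_den);
}

The branch ladder in biweight_func/weight_func exists because NEON has no signed-by-unsigned multiply-accumulate, so negative weights are handled by picking vmlal/vmlsl variants up front.
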
diff --git a/ffmpeg/libavcodec/arm/h264idct_neon.S b/ffmpeg/libavcodec/arm/h264idct_neon.S
new file mode 100644
index 0000000..fa5b90c
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264idct_neon.S
@@ -0,0 +1,413 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_h264_idct_add_neon, export=1
+ vld1.64 {d0-d3}, [r1,:128]
+ vmov.i16 q15, #0
+
+ vswp d1, d2
+ vst1.16 {q15}, [r1,:128]!
+ vadd.i16 d4, d0, d1
+ vst1.16 {q15}, [r1,:128]!
+ vshr.s16 q8, q1, #1
+ vsub.i16 d5, d0, d1
+ vadd.i16 d6, d2, d17
+ vsub.i16 d7, d16, d3
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+
+ vadd.i16 d4, d0, d3
+ vld1.32 {d18[0]}, [r0,:32], r2
+ vswp d1, d3
+ vshr.s16 q8, q1, #1
+ vld1.32 {d19[1]}, [r0,:32], r2
+ vsub.i16 d5, d0, d1
+ vld1.32 {d18[1]}, [r0,:32], r2
+ vadd.i16 d6, d16, d3
+ vld1.32 {d19[0]}, [r0,:32], r2
+ vsub.i16 d7, d2, d17
+ sub r0, r0, r2, lsl #2
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vrshr.s16 q0, q0, #6
+ vrshr.s16 q1, q1, #6
+
+ vaddw.u8 q0, q0, d18
+ vaddw.u8 q1, q1, d19
+
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+
+ sub r1, r1, #32
+ bx lr
+endfunc
+
+function ff_h264_idct_dc_add_neon, export=1
+ mov r3, #0
+ vld1.16 {d2[],d3[]}, [r1,:16]
+ strh r3, [r1]
+ vrshr.s16 q1, q1, #6
+ vld1.32 {d0[0]}, [r0,:32], r2
+ vld1.32 {d0[1]}, [r0,:32], r2
+ vaddw.u8 q2, q1, d0
+ vld1.32 {d1[0]}, [r0,:32], r2
+ vld1.32 {d1[1]}, [r0,:32], r2
+ vaddw.u8 q1, q1, d1
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q1
+ sub r0, r0, r2, lsl #2
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ bx lr
+endfunc
+
+function ff_h264_idct_add16_neon, export=1
+ push {r4-r8,lr}
+ mov r4, r0
+ mov r5, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r6, [sp, #24]
+ movrel r7, scan8
+ mov ip, #16
+1: ldrb r8, [r7], #1
+ ldr r0, [r5], #4
+ ldrb r8, [r6, r8]
+ subs r8, r8, #1
+ blt 2f
+ ldrsh lr, [r1]
+ add r0, r0, r4
+ it ne
+ movne lr, #0
+ cmp lr, #0
+ ite ne
+ adrne lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ blx lr
+2: subs ip, ip, #1
+ add r1, r1, #32
+ bne 1b
+ pop {r4-r8,pc}
+endfunc
+
+function ff_h264_idct_add16intra_neon, export=1
+ push {r4-r8,lr}
+ mov r4, r0
+ mov r5, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r6, [sp, #24]
+ movrel r7, scan8
+ mov ip, #16
+1: ldrb r8, [r7], #1
+ ldr r0, [r5], #4
+ ldrb r8, [r6, r8]
+ add r0, r0, r4
+ cmp r8, #0
+ ldrsh r8, [r1]
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+ cmpeq r8, #0
+ blxne lr
+ subs ip, ip, #1
+ add r1, r1, #32
+ bne 1b
+ pop {r4-r8,pc}
+endfunc
+
+function ff_h264_idct_add8_neon, export=1
+ push {r4-r10,lr}
+ ldm r0, {r4,r9}
+ add r5, r1, #16*4
+ add r1, r2, #16*32
+ mov r2, r3
+ mov r10, r1
+ ldr r6, [sp, #32]
+ movrel r7, scan8+16
+ mov r12, #0
+1: ldrb r8, [r7, r12]
+ ldr r0, [r5, r12, lsl #2]
+ ldrb r8, [r6, r8]
+ add r0, r0, r4
+ add r1, r10, r12, lsl #5
+ cmp r8, #0
+ ldrsh r8, [r1]
+ iteet ne
+ adrne lr, ff_h264_idct_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct_dc_add_neon + CONFIG_THUMB
+ cmpeq r8, #0
+ blxne lr
+ add r12, r12, #1
+ cmp r12, #4
+ itt eq
+ moveq r12, #16
+ moveq r4, r9
+ cmp r12, #20
+ blt 1b
+ pop {r4-r10,pc}
+endfunc
+
+.macro idct8x8_cols pass
+ .if \pass == 0
+ qa .req q2
+ qb .req q14
+ vshr.s16 q2, q10, #1
+ vadd.i16 q0, q8, q12
+ vld1.16 {q14-q15},[r1,:128]
+ vst1.16 {q7}, [r1,:128]!
+ vst1.16 {q7}, [r1,:128]!
+ vsub.i16 q1, q8, q12
+ vshr.s16 q3, q14, #1
+ vsub.i16 q2, q2, q14
+ vadd.i16 q3, q3, q10
+ .else
+ qa .req q14
+ qb .req q2
+ vtrn.32 q8, q10
+ vtrn.16 q12, q13
+ vtrn.32 q9, q11
+ vtrn.32 q12, q2
+ vtrn.32 q13, q15
+ vswp d21, d4
+ vshr.s16 q14, q10, #1
+ vswp d17, d24
+ vshr.s16 q3, q2, #1
+ vswp d19, d26
+ vadd.i16 q0, q8, q12
+ vswp d23, d30
+ vsub.i16 q1, q8, q12
+ vsub.i16 q14, q14, q2
+ vadd.i16 q3, q3, q10
+ .endif
+ vadd.i16 q10, q1, qa
+ vsub.i16 q12, q1, qa
+ vadd.i16 q8, q0, q3
+ vsub.i16 qb, q0, q3
+ vsub.i16 q0, q13, q11
+ vadd.i16 q1, q15, q9
+ vsub.i16 qa, q15, q9
+ vadd.i16 q3, q13, q11
+ vsub.i16 q0, q0, q15
+ vsub.i16 q1, q1, q11
+ vadd.i16 qa, qa, q13
+ vadd.i16 q3, q3, q9
+ vshr.s16 q9, q9, #1
+ vshr.s16 q11, q11, #1
+ vshr.s16 q13, q13, #1
+ vshr.s16 q15, q15, #1
+ vsub.i16 q0, q0, q15
+ vsub.i16 q1, q1, q11
+ vadd.i16 qa, qa, q13
+ vadd.i16 q3, q3, q9
+ vshr.s16 q9, q0, #2
+ vshr.s16 q11, q1, #2
+ vshr.s16 q13, qa, #2
+ vshr.s16 q15, q3, #2
+ vsub.i16 q3, q3, q9
+ vsub.i16 qa, q11, qa
+ vadd.i16 q1, q1, q13
+ vadd.i16 q0, q0, q15
+ .if \pass == 0
+ vsub.i16 q15, q8, q3
+ vadd.i16 q8, q8, q3
+ vadd.i16 q9, q10, q2
+ vsub.i16 q2, q10, q2
+ vtrn.16 q8, q9
+ vadd.i16 q10, q12, q1
+ vtrn.16 q2, q15
+ vadd.i16 q11, q14, q0
+ vsub.i16 q13, q12, q1
+ vtrn.16 q10, q11
+ vsub.i16 q12, q14, q0
+ .else
+ vsub.i16 q15, q8, q3
+ vadd.i16 q8, q8, q3
+ vadd.i16 q9, q10, q14
+ vsub.i16 q14, q10, q14
+ vadd.i16 q10, q12, q1
+ vsub.i16 q13, q12, q1
+ vadd.i16 q11, q2, q0
+ vsub.i16 q12, q2, q0
+ .endif
+ .unreq qa
+ .unreq qb
+.endm
+
+function ff_h264_idct8_add_neon, export=1
+ vmov.i16 q7, #0
+ vld1.16 {q8-q9}, [r1,:128]
+ vst1.16 {q7}, [r1,:128]!
+ vst1.16 {q7}, [r1,:128]!
+ vld1.16 {q10-q11},[r1,:128]
+ vst1.16 {q7}, [r1,:128]!
+ vst1.16 {q7}, [r1,:128]!
+ vld1.16 {q12-q13},[r1,:128]
+ vst1.16 {q7}, [r1,:128]!
+ vst1.16 {q7}, [r1,:128]!
+
+ idct8x8_cols 0
+ idct8x8_cols 1
+
+ mov r3, r0
+ vrshr.s16 q8, q8, #6
+ vld1.8 {d0}, [r0,:64], r2
+ vrshr.s16 q9, q9, #6
+ vld1.8 {d1}, [r0,:64], r2
+ vrshr.s16 q10, q10, #6
+ vld1.8 {d2}, [r0,:64], r2
+ vrshr.s16 q11, q11, #6
+ vld1.8 {d3}, [r0,:64], r2
+ vrshr.s16 q12, q12, #6
+ vld1.8 {d4}, [r0,:64], r2
+ vrshr.s16 q13, q13, #6
+ vld1.8 {d5}, [r0,:64], r2
+ vrshr.s16 q14, q14, #6
+ vld1.8 {d6}, [r0,:64], r2
+ vrshr.s16 q15, q15, #6
+ vld1.8 {d7}, [r0,:64], r2
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vqmovun.s16 d0, q8
+ vaddw.u8 q11, q11, d3
+ vqmovun.s16 d1, q9
+ vaddw.u8 q12, q12, d4
+ vqmovun.s16 d2, q10
+ vst1.8 {d0}, [r3,:64], r2
+ vaddw.u8 q13, q13, d5
+ vqmovun.s16 d3, q11
+ vst1.8 {d1}, [r3,:64], r2
+ vaddw.u8 q14, q14, d6
+ vqmovun.s16 d4, q12
+ vst1.8 {d2}, [r3,:64], r2
+ vaddw.u8 q15, q15, d7
+ vqmovun.s16 d5, q13
+ vst1.8 {d3}, [r3,:64], r2
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+ vst1.8 {d4}, [r3,:64], r2
+ vst1.8 {d5}, [r3,:64], r2
+ vst1.8 {d6}, [r3,:64], r2
+ vst1.8 {d7}, [r3,:64], r2
+
+ sub r1, r1, #128
+ bx lr
+endfunc
+
+function ff_h264_idct8_dc_add_neon, export=1
+ mov r3, #0
+ vld1.16 {d30[],d31[]},[r1,:16]
+ strh r3, [r1]
+ vld1.32 {d0}, [r0,:64], r2
+ vrshr.s16 q15, q15, #6
+ vld1.32 {d1}, [r0,:64], r2
+ vld1.32 {d2}, [r0,:64], r2
+ vaddw.u8 q8, q15, d0
+ vld1.32 {d3}, [r0,:64], r2
+ vaddw.u8 q9, q15, d1
+ vld1.32 {d4}, [r0,:64], r2
+ vaddw.u8 q10, q15, d2
+ vld1.32 {d5}, [r0,:64], r2
+ vaddw.u8 q11, q15, d3
+ vld1.32 {d6}, [r0,:64], r2
+ vaddw.u8 q12, q15, d4
+ vld1.32 {d7}, [r0,:64], r2
+ vaddw.u8 q13, q15, d5
+ vaddw.u8 q14, q15, d6
+ vaddw.u8 q15, q15, d7
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ sub r0, r0, r2, lsl #3
+ vst1.32 {d0}, [r0,:64], r2
+ vqmovun.s16 d4, q12
+ vst1.32 {d1}, [r0,:64], r2
+ vqmovun.s16 d5, q13
+ vst1.32 {d2}, [r0,:64], r2
+ vqmovun.s16 d6, q14
+ vst1.32 {d3}, [r0,:64], r2
+ vqmovun.s16 d7, q15
+ vst1.32 {d4}, [r0,:64], r2
+ vst1.32 {d5}, [r0,:64], r2
+ vst1.32 {d6}, [r0,:64], r2
+ vst1.32 {d7}, [r0,:64], r2
+ bx lr
+endfunc
+
+function ff_h264_idct8_add4_neon, export=1
+ push {r4-r8,lr}
+ mov r4, r0
+ mov r5, r1
+ mov r1, r2
+ mov r2, r3
+ ldr r6, [sp, #24]
+ movrel r7, scan8
+ mov r12, #16
+1: ldrb r8, [r7], #4
+ ldr r0, [r5], #16
+ ldrb r8, [r6, r8]
+ subs r8, r8, #1
+ blt 2f
+ ldrsh lr, [r1]
+ add r0, r0, r4
+ it ne
+ movne lr, #0
+ cmp lr, #0
+ ite ne
+ adrne lr, ff_h264_idct8_dc_add_neon + CONFIG_THUMB
+ adreq lr, ff_h264_idct8_add_neon + CONFIG_THUMB
+ blx lr
+2: subs r12, r12, #4
+ add r1, r1, #128
+ bne 1b
+ pop {r4-r8,pc}
+endfunc
+
+const scan8
+ .byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
+ .byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
+ .byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
+ .byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
+ .byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
+ .byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
+ .byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
+ .byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
+ .byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
+ .byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
+ .byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
+ .byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
+endconst
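
ff_h264_idct_add_neon above is the standard H.264 4x4 inverse transform: a shift-and-add butterfly applied per row and per column, then a (x + 32) >> 6 rounding, addition to the prediction and clamping. The NEON routine also zeroes the coefficient block as it reads it, which the scalar sketch below skips; the sketch only shows the arithmetic, under the assumption of 8-bit samples:

#include <stdint.h>

static inline uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void h264_idct_add_sketch(uint8_t *dst, const int16_t *block, int stride)
{
    int tmp[16];

    /* row pass */
    for (int i = 0; i < 4; i++) {
        const int z0 =  block[i*4 + 0]       +  block[i*4 + 2];
        const int z1 =  block[i*4 + 0]       -  block[i*4 + 2];
        const int z2 = (block[i*4 + 1] >> 1) -  block[i*4 + 3];
        const int z3 =  block[i*4 + 1]       + (block[i*4 + 3] >> 1);

        tmp[i*4 + 0] = z0 + z3;
        tmp[i*4 + 1] = z1 + z2;
        tmp[i*4 + 2] = z1 - z2;
        tmp[i*4 + 3] = z0 - z3;
    }

    /* column pass, round, add to prediction, clamp */
    for (int i = 0; i < 4; i++) {
        const int z0 =  tmp[0*4 + i]       +  tmp[2*4 + i];
        const int z1 =  tmp[0*4 + i]       -  tmp[2*4 + i];
        const int z2 = (tmp[1*4 + i] >> 1) -  tmp[3*4 + i];
        const int z3 =  tmp[1*4 + i]       + (tmp[3*4 + i] >> 1);

        dst[0*stride + i] = clip_uint8(dst[0*stride + i] + ((z0 + z3 + 32) >> 6));
        dst[1*stride + i] = clip_uint8(dst[1*stride + i] + ((z1 + z2 + 32) >> 6));
        dst[2*stride + i] = clip_uint8(dst[2*stride + i] + ((z1 - z2 + 32) >> 6));
        dst[3*stride + i] = clip_uint8(dst[3*stride + i] + ((z0 - z3 + 32) >> 6));
    }
}

The dc_add and idct8 routines are the DC-only and 8x8 counterparts of the same scheme, and add16/add16intra/add8/add4 simply walk the scan8 table to dispatch per sub-block.
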
diff --git a/ffmpeg/libavcodec/arm/h264pred_init_arm.c b/ffmpeg/libavcodec/arm/h264pred_init_arm.c
new file mode 100644
index 0000000..5ec39ce
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264pred_init_arm.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/h264pred.h"
+
+void ff_pred16x16_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred16x16_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+void ff_pred8x8_vert_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_hor_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_plane_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_128_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_left_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_top_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l0t_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0lt_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_l00_dc_neon(uint8_t *src, ptrdiff_t stride);
+void ff_pred8x8_0l0_dc_neon(uint8_t *src, ptrdiff_t stride);
+
+static av_cold void ff_h264_pred_init_neon(H264PredContext *h, int codec_id,
+ const int bit_depth,
+ const int chroma_format_idc)
+{
+#if HAVE_NEON
+ const int high_depth = bit_depth > 8;
+
+ if (high_depth)
+ return;
+ if (chroma_format_idc == 1) {
+ h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vert_neon;
+ h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_hor_neon;
+ if (codec_id != AV_CODEC_ID_VP8)
+ h->pred8x8[PLANE_PRED8x8] = ff_pred8x8_plane_neon;
+ h->pred8x8[DC_128_PRED8x8 ] = ff_pred8x8_128_dc_neon;
+ if (codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8) {
+ h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_neon;
+ h->pred8x8[LEFT_DC_PRED8x8] = ff_pred8x8_left_dc_neon;
+ h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_L0T_PRED8x8] = ff_pred8x8_l0t_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_0LT_PRED8x8] = ff_pred8x8_0lt_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_L00_PRED8x8] = ff_pred8x8_l00_dc_neon;
+ h->pred8x8[ALZHEIMER_DC_0L0_PRED8x8] = ff_pred8x8_0l0_dc_neon;
+ }
+ }
+
+ h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_neon;
+ h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vert_neon;
+ h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_hor_neon;
+ h->pred16x16[LEFT_DC_PRED8x8] = ff_pred16x16_left_dc_neon;
+ h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_neon;
+ h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_neon;
+ if (codec_id != AV_CODEC_ID_SVQ3 && codec_id != AV_CODEC_ID_RV40 && codec_id != AV_CODEC_ID_VP8)
+ h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_neon;
+#endif // HAVE_NEON
+}
+
+av_cold void ff_h264_pred_init_arm(H264PredContext *h, int codec_id,
+ int bit_depth, const int chroma_format_idc)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_h264_pred_init_neon(h, codec_id, bit_depth, chroma_format_idc);
+}
diff --git a/ffmpeg/libavcodec/arm/h264pred_neon.S b/ffmpeg/libavcodec/arm/h264pred_neon.S
new file mode 100644
index 0000000..4dc47ba
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264pred_neon.S
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+ .macro ldcol.8 rd, rs, rt, n=8, hi=0
+.if \n == 8 || \hi == 0
+ vld1.8 {\rd[0]}, [\rs], \rt
+ vld1.8 {\rd[1]}, [\rs], \rt
+ vld1.8 {\rd[2]}, [\rs], \rt
+ vld1.8 {\rd[3]}, [\rs], \rt
+.endif
+.if \n == 8 || \hi == 1
+ vld1.8 {\rd[4]}, [\rs], \rt
+ vld1.8 {\rd[5]}, [\rs], \rt
+ vld1.8 {\rd[6]}, [\rs], \rt
+ vld1.8 {\rd[7]}, [\rs], \rt
+.endif
+ .endm
+
+ .macro add16x8 dq, dl, dh, rl, rh
+ vaddl.u8 \dq, \rl, \rh
+ vadd.u16 \dl, \dl, \dh
+ vpadd.u16 \dl, \dl, \dl
+ vpadd.u16 \dl, \dl, \dl
+ .endm
+
+function ff_pred16x16_128_dc_neon, export=1
+ vmov.i8 q0, #128
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_top_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {q0}, [r2,:128]
+ add16x8 q0, d0, d1, d0, d1
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_left_dc_neon, export=1
+ sub r2, r0, #1
+ ldcol.8 d0, r2, r1
+ ldcol.8 d1, r2, r1
+ add16x8 q0, d0, d1, d0, d1
+ vrshrn.u16 d0, q0, #4
+ vdup.8 q0, d0[0]
+ b .L_pred16x16_dc_end
+endfunc
+
+function ff_pred16x16_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {q0}, [r2,:128]
+ sub r2, r0, #1
+ ldcol.8 d2, r2, r1
+ ldcol.8 d3, r2, r1
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vadd.u16 q0, q0, q1
+ vadd.u16 d0, d0, d1
+ vpadd.u16 d0, d0, d0
+ vpadd.u16 d0, d0, d0
+ vrshrn.u16 d0, q0, #5
+ vdup.8 q0, d0[0]
+.L_pred16x16_dc_end:
+ mov r3, #8
+6: vst1.8 {q0}, [r0,:128], r1
+ vst1.8 {q0}, [r0,:128], r1
+ subs r3, r3, #1
+ bne 6b
+ bx lr
+endfunc
+
+function ff_pred16x16_hor_neon, export=1
+ sub r2, r0, #1
+ mov r3, #16
+1: vld1.8 {d0[],d1[]},[r2], r1
+ vst1.8 {q0}, [r0,:128], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_pred16x16_vert_neon, export=1
+ sub r0, r0, r1
+ vld1.8 {q0}, [r0,:128], r1
+ mov r3, #8
+1: vst1.8 {q0}, [r0,:128], r1
+ vst1.8 {q0}, [r0,:128], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_pred16x16_plane_neon, export=1
+ sub r3, r0, r1
+ add r2, r3, #8
+ sub r3, r3, #1
+ vld1.8 {d0}, [r3]
+ vld1.8 {d2}, [r2,:64], r1
+ ldcol.8 d1, r3, r1
+ add r3, r3, r1
+ ldcol.8 d3, r3, r1
+ vrev64.8 q0, q0
+ vaddl.u8 q8, d2, d3
+ vsubl.u8 q2, d2, d0
+ vsubl.u8 q3, d3, d1
+ movrel r3, p16weight
+ vld1.8 {q0}, [r3,:128]
+ vmul.s16 q2, q2, q0
+ vmul.s16 q3, q3, q0
+ vadd.i16 d4, d4, d5
+ vadd.i16 d5, d6, d7
+ vpadd.i16 d4, d4, d5
+ vpadd.i16 d4, d4, d4
+ vshll.s16 q3, d4, #2
+ vaddw.s16 q2, q3, d4
+ vrshrn.s32 d4, q2, #6
+ mov r3, #0
+ vtrn.16 d4, d5
+ vadd.i16 d2, d4, d5
+ vshl.i16 d3, d2, #3
+ vrev64.16 d16, d17
+ vsub.i16 d3, d3, d2
+ vadd.i16 d16, d16, d0
+ vshl.i16 d2, d16, #4
+ vsub.i16 d2, d2, d3
+ vshl.i16 d3, d4, #4
+ vext.16 q0, q0, q0, #7
+ vsub.i16 d6, d5, d3
+ vmov.16 d0[0], r3
+ vmul.i16 q0, q0, d4[0]
+ vdup.16 q1, d2[0]
+ vdup.16 q2, d4[0]
+ vdup.16 q3, d6[0]
+ vshl.i16 q2, q2, #3
+ vadd.i16 q1, q1, q0
+ vadd.i16 q3, q3, q2
+ mov r3, #16
+1:
+ vqshrun.s16 d0, q1, #5
+ vadd.i16 q1, q1, q2
+ vqshrun.s16 d1, q1, #5
+ vadd.i16 q1, q1, q3
+ vst1.8 {q0}, [r0,:128], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+const p16weight, align=4
+ .short 1,2,3,4,5,6,7,8
+endconst
+
+function ff_pred8x8_hor_neon, export=1
+ sub r2, r0, #1
+ mov r3, #8
+1: vld1.8 {d0[]}, [r2], r1
+ vst1.8 {d0}, [r0,:64], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_pred8x8_vert_neon, export=1
+ sub r0, r0, r1
+ vld1.8 {d0}, [r0,:64], r1
+ mov r3, #4
+1: vst1.8 {d0}, [r0,:64], r1
+ vst1.8 {d0}, [r0,:64], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_pred8x8_plane_neon, export=1
+ sub r3, r0, r1
+ add r2, r3, #4
+ sub r3, r3, #1
+ vld1.32 {d0[0]}, [r3]
+ vld1.32 {d2[0]}, [r2,:32], r1
+ ldcol.8 d0, r3, r1, 4, hi=1
+ add r3, r3, r1
+ ldcol.8 d3, r3, r1, 4
+ vaddl.u8 q8, d2, d3
+ vrev32.8 d0, d0
+ vtrn.32 d2, d3
+ vsubl.u8 q2, d2, d0
+ movrel r3, p16weight
+ vld1.16 {q0}, [r3,:128]
+ vmul.s16 d4, d4, d0
+ vmul.s16 d5, d5, d0
+ vpadd.i16 d4, d4, d5
+ vpaddl.s16 d4, d4
+ vshl.i32 d5, d4, #4
+ vadd.s32 d4, d4, d5
+ vrshrn.s32 d4, q2, #5
+ mov r3, #0
+ vtrn.16 d4, d5
+ vadd.i16 d2, d4, d5
+ vshl.i16 d3, d2, #2
+ vrev64.16 d16, d16
+ vsub.i16 d3, d3, d2
+ vadd.i16 d16, d16, d0
+ vshl.i16 d2, d16, #4
+ vsub.i16 d2, d2, d3
+ vshl.i16 d3, d4, #3
+ vext.16 q0, q0, q0, #7
+ vsub.i16 d6, d5, d3
+ vmov.16 d0[0], r3
+ vmul.i16 q0, q0, d4[0]
+ vdup.16 q1, d2[0]
+ vdup.16 q2, d4[0]
+ vdup.16 q3, d6[0]
+ vshl.i16 q2, q2, #3
+ vadd.i16 q1, q1, q0
+ vadd.i16 q3, q3, q2
+ mov r3, #8
+1:
+ vqshrun.s16 d0, q1, #5
+ vadd.i16 q1, q1, q3
+ vst1.8 {d0}, [r0,:64], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+function ff_pred8x8_128_dc_neon, export=1
+ vmov.i8 q0, #128
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_top_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {d0}, [r2,:64]
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 d1, d0[1]
+ vdup.8 d0, d0[0]
+ vtrn.32 d0, d1
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_left_dc_neon, export=1
+ sub r2, r0, #1
+ ldcol.8 d0, r2, r1
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0, d0
+ vrshrn.u16 d0, q0, #2
+ vdup.8 d1, d0[1]
+ vdup.8 d0, d0[0]
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {d0}, [r2,:64]
+ sub r2, r0, #1
+ ldcol.8 d1, r2, r1
+ vtrn.32 d0, d1
+ vpaddl.u8 q0, q0
+ vpadd.u16 d0, d0, d1
+ vpadd.u16 d1, d0, d0
+ vrshrn.u16 d2, q0, #3
+ vrshrn.u16 d3, q0, #2
+ vdup.8 d0, d2[4]
+ vdup.8 d1, d3[3]
+ vdup.8 d4, d3[2]
+ vdup.8 d5, d2[5]
+ vtrn.32 q0, q2
+.L_pred8x8_dc_end:
+ mov r3, #4
+ add r2, r0, r1, lsl #2
+6: vst1.8 {d0}, [r0,:64], r1
+ vst1.8 {d1}, [r2,:64], r1
+ subs r3, r3, #1
+ bne 6b
+ bx lr
+endfunc
+
+function ff_pred8x8_l0t_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {d0}, [r2,:64]
+ sub r2, r0, #1
+ ldcol.8 d1, r2, r1, 4
+ vtrn.32 d0, d1
+ vpaddl.u8 q0, q0
+ vpadd.u16 d0, d0, d1
+ vpadd.u16 d1, d0, d0
+ vrshrn.u16 d2, q0, #3
+ vrshrn.u16 d3, q0, #2
+ vdup.8 d0, d2[4]
+ vdup.8 d1, d3[0]
+ vdup.8 q2, d3[2]
+ vtrn.32 q0, q2
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_l00_dc_neon, export=1
+ sub r2, r0, #1
+ ldcol.8 d0, r2, r1, 4
+ vpaddl.u8 d0, d0
+ vpadd.u16 d0, d0, d0
+ vrshrn.u16 d0, q0, #2
+ vmov.i8 d1, #128
+ vdup.8 d0, d0[0]
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0lt_dc_neon, export=1
+ sub r2, r0, r1
+ vld1.8 {d0}, [r2,:64]
+ add r2, r0, r1, lsl #2
+ sub r2, r2, #1
+ ldcol.8 d1, r2, r1, 4, hi=1
+ vtrn.32 d0, d1
+ vpaddl.u8 q0, q0
+ vpadd.u16 d0, d0, d1
+ vpadd.u16 d1, d0, d0
+ vrshrn.u16 d3, q0, #2
+ vrshrn.u16 d2, q0, #3
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[3]
+ vdup.8 d4, d3[2]
+ vdup.8 d5, d2[5]
+ vtrn.32 q0, q2
+ b .L_pred8x8_dc_end
+endfunc
+
+function ff_pred8x8_0l0_dc_neon, export=1
+ add r2, r0, r1, lsl #2
+ sub r2, r2, #1
+ ldcol.8 d1, r2, r1, 4
+ vpaddl.u8 d2, d1
+ vpadd.u16 d2, d2, d2
+ vrshrn.u16 d1, q1, #2
+ vmov.i8 d0, #128
+ vdup.8 d1, d1[0]
+ b .L_pred8x8_dc_end
+endfunc
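
The DC predictors above follow the usual H.264 intra DC rule: sum the available top and left neighbours and take the rounded average (the vrshrn shift count changing with how many neighbours contribute). A scalar sketch of the full 16x16 DC case, assuming 8-bit samples and both neighbour edges available:

#include <stddef.h>
#include <stdint.h>

static void pred16x16_dc_sketch(uint8_t *src, ptrdiff_t stride)
{
    int sum = 0;

    for (int i = 0; i < 16; i++)
        sum += src[i - stride];                  /* top row     */
    for (int i = 0; i < 16; i++)
        sum += src[i * stride - 1];              /* left column */

    const uint8_t dc = (uint8_t)((sum + 16) >> 5);  /* rounded mean of 32 samples */

    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++)
            src[y * stride + x] = dc;
}

The left/top/128 variants drop one or both neighbour sums, and the 8x8 "l0t/0lt/l00/0l0" forms fill each 4x4 quadrant from whichever half-edges are available.
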
diff --git a/ffmpeg/libavcodec/arm/h264qpel_init_arm.c b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
new file mode 100644
index 0000000..eaa1324
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264qpel_init_arm.c
@@ -0,0 +1,171 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/h264qpel.h"
+
+void ff_put_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
+
+void ff_put_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_put_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
+
+void ff_avg_h264_qpel16_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel16_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
+
+void ff_avg_h264_qpel8_mc00_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc10_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc20_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc30_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc01_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc11_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc21_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc31_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc02_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc12_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc22_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc32_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc03_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc13_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc23_neon(uint8_t *, uint8_t *, ptrdiff_t);
+void ff_avg_h264_qpel8_mc33_neon(uint8_t *, uint8_t *, ptrdiff_t);
+
+av_cold void ff_h264qpel_init_arm(H264QpelContext *c, int bit_depth)
+{
+ const int high_bit_depth = bit_depth > 8;
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags) && !high_bit_depth) {
+ c->put_h264_qpel_pixels_tab[0][ 0] = ff_put_h264_qpel16_mc00_neon;
+ c->put_h264_qpel_pixels_tab[0][ 1] = ff_put_h264_qpel16_mc10_neon;
+ c->put_h264_qpel_pixels_tab[0][ 2] = ff_put_h264_qpel16_mc20_neon;
+ c->put_h264_qpel_pixels_tab[0][ 3] = ff_put_h264_qpel16_mc30_neon;
+ c->put_h264_qpel_pixels_tab[0][ 4] = ff_put_h264_qpel16_mc01_neon;
+ c->put_h264_qpel_pixels_tab[0][ 5] = ff_put_h264_qpel16_mc11_neon;
+ c->put_h264_qpel_pixels_tab[0][ 6] = ff_put_h264_qpel16_mc21_neon;
+ c->put_h264_qpel_pixels_tab[0][ 7] = ff_put_h264_qpel16_mc31_neon;
+ c->put_h264_qpel_pixels_tab[0][ 8] = ff_put_h264_qpel16_mc02_neon;
+ c->put_h264_qpel_pixels_tab[0][ 9] = ff_put_h264_qpel16_mc12_neon;
+ c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_neon;
+ c->put_h264_qpel_pixels_tab[0][11] = ff_put_h264_qpel16_mc32_neon;
+ c->put_h264_qpel_pixels_tab[0][12] = ff_put_h264_qpel16_mc03_neon;
+ c->put_h264_qpel_pixels_tab[0][13] = ff_put_h264_qpel16_mc13_neon;
+ c->put_h264_qpel_pixels_tab[0][14] = ff_put_h264_qpel16_mc23_neon;
+ c->put_h264_qpel_pixels_tab[0][15] = ff_put_h264_qpel16_mc33_neon;
+
+ c->put_h264_qpel_pixels_tab[1][ 0] = ff_put_h264_qpel8_mc00_neon;
+ c->put_h264_qpel_pixels_tab[1][ 1] = ff_put_h264_qpel8_mc10_neon;
+ c->put_h264_qpel_pixels_tab[1][ 2] = ff_put_h264_qpel8_mc20_neon;
+ c->put_h264_qpel_pixels_tab[1][ 3] = ff_put_h264_qpel8_mc30_neon;
+ c->put_h264_qpel_pixels_tab[1][ 4] = ff_put_h264_qpel8_mc01_neon;
+ c->put_h264_qpel_pixels_tab[1][ 5] = ff_put_h264_qpel8_mc11_neon;
+ c->put_h264_qpel_pixels_tab[1][ 6] = ff_put_h264_qpel8_mc21_neon;
+ c->put_h264_qpel_pixels_tab[1][ 7] = ff_put_h264_qpel8_mc31_neon;
+ c->put_h264_qpel_pixels_tab[1][ 8] = ff_put_h264_qpel8_mc02_neon;
+ c->put_h264_qpel_pixels_tab[1][ 9] = ff_put_h264_qpel8_mc12_neon;
+ c->put_h264_qpel_pixels_tab[1][10] = ff_put_h264_qpel8_mc22_neon;
+ c->put_h264_qpel_pixels_tab[1][11] = ff_put_h264_qpel8_mc32_neon;
+ c->put_h264_qpel_pixels_tab[1][12] = ff_put_h264_qpel8_mc03_neon;
+ c->put_h264_qpel_pixels_tab[1][13] = ff_put_h264_qpel8_mc13_neon;
+ c->put_h264_qpel_pixels_tab[1][14] = ff_put_h264_qpel8_mc23_neon;
+ c->put_h264_qpel_pixels_tab[1][15] = ff_put_h264_qpel8_mc33_neon;
+
+ c->avg_h264_qpel_pixels_tab[0][ 0] = ff_avg_h264_qpel16_mc00_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 1] = ff_avg_h264_qpel16_mc10_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 2] = ff_avg_h264_qpel16_mc20_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 3] = ff_avg_h264_qpel16_mc30_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 4] = ff_avg_h264_qpel16_mc01_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 5] = ff_avg_h264_qpel16_mc11_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 6] = ff_avg_h264_qpel16_mc21_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 7] = ff_avg_h264_qpel16_mc31_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 8] = ff_avg_h264_qpel16_mc02_neon;
+ c->avg_h264_qpel_pixels_tab[0][ 9] = ff_avg_h264_qpel16_mc12_neon;
+ c->avg_h264_qpel_pixels_tab[0][10] = ff_avg_h264_qpel16_mc22_neon;
+ c->avg_h264_qpel_pixels_tab[0][11] = ff_avg_h264_qpel16_mc32_neon;
+ c->avg_h264_qpel_pixels_tab[0][12] = ff_avg_h264_qpel16_mc03_neon;
+ c->avg_h264_qpel_pixels_tab[0][13] = ff_avg_h264_qpel16_mc13_neon;
+ c->avg_h264_qpel_pixels_tab[0][14] = ff_avg_h264_qpel16_mc23_neon;
+ c->avg_h264_qpel_pixels_tab[0][15] = ff_avg_h264_qpel16_mc33_neon;
+
+ c->avg_h264_qpel_pixels_tab[1][ 0] = ff_avg_h264_qpel8_mc00_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 1] = ff_avg_h264_qpel8_mc10_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 2] = ff_avg_h264_qpel8_mc20_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 3] = ff_avg_h264_qpel8_mc30_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 4] = ff_avg_h264_qpel8_mc01_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 5] = ff_avg_h264_qpel8_mc11_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 6] = ff_avg_h264_qpel8_mc21_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 7] = ff_avg_h264_qpel8_mc31_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 8] = ff_avg_h264_qpel8_mc02_neon;
+ c->avg_h264_qpel_pixels_tab[1][ 9] = ff_avg_h264_qpel8_mc12_neon;
+ c->avg_h264_qpel_pixels_tab[1][10] = ff_avg_h264_qpel8_mc22_neon;
+ c->avg_h264_qpel_pixels_tab[1][11] = ff_avg_h264_qpel8_mc32_neon;
+ c->avg_h264_qpel_pixels_tab[1][12] = ff_avg_h264_qpel8_mc03_neon;
+ c->avg_h264_qpel_pixels_tab[1][13] = ff_avg_h264_qpel8_mc13_neon;
+ c->avg_h264_qpel_pixels_tab[1][14] = ff_avg_h264_qpel8_mc23_neon;
+ c->avg_h264_qpel_pixels_tab[1][15] = ff_avg_h264_qpel8_mc33_neon;
+ }
+}
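
The table layout above follows the usual FFmpeg qpel convention: entry [size][X + 4*Y] handles a quarter-pel offset of (X, Y), with size 0 covering 16x16 and size 1 covering 8x8 blocks, which is why the mc00..mc33 functions are registered in that order. A tiny hypothetical helper, only to make the indexing explicit:

/* Quarter-pel offsets qpel_x, qpel_y are in 0..3. */
static inline int h264_qpel_tab_index(int qpel_x, int qpel_y)
{
    return qpel_x + 4 * qpel_y;
}
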
diff --git a/ffmpeg/libavcodec/arm/h264qpel_neon.S b/ffmpeg/libavcodec/arm/h264qpel_neon.S
new file mode 100644
index 0000000..21336c6
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/h264qpel_neon.S
@@ -0,0 +1,955 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+ /* H.264 qpel MC */
+
+.macro lowpass_const r
+ movw \r, #5
+ movt \r, #20
+ vmov.32 d6[0], \r
+.endm
+
+.macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
+ .if \narrow
+ t0 .req q0
+ t1 .req q8
+ .else
+ t0 .req \d0
+ t1 .req \d1
+ .endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vext.8 d18, \r2, \r3, #2
+ vmla.i16 t0, q1, d6[1]
+ vext.8 d19, \r2, \r3, #3
+ vaddl.u8 q9, d18, d19
+ vext.8 d20, \r2, \r3, #1
+ vmls.i16 t0, q2, d6[0]
+ vext.8 d21, \r2, \r3, #4
+ vaddl.u8 q10, d20, d21
+ vext.8 d31, \r2, \r3, #5
+ vaddl.u8 t1, \r2, d31
+ vmla.i16 t1, q9, d6[1]
+ vmls.i16 t1, q10, d6[0]
+ .if \narrow
+ vqrshrun.s16 \d0, t0, #5
+ vqrshrun.s16 \d1, t1, #5
+ .endif
+ .unreq t0
+ .unreq t1
+.endm
+
+.macro lowpass_8_1 r0, r1, d0, narrow=1
+ .if \narrow
+ t0 .req q0
+ .else
+ t0 .req \d0
+ .endif
+ vext.8 d2, \r0, \r1, #2
+ vext.8 d3, \r0, \r1, #3
+ vaddl.u8 q1, d2, d3
+ vext.8 d4, \r0, \r1, #1
+ vext.8 d5, \r0, \r1, #4
+ vaddl.u8 q2, d4, d5
+ vext.8 d30, \r0, \r1, #5
+ vaddl.u8 t0, \r0, d30
+ vmla.i16 t0, q1, d6[1]
+ vmls.i16 t0, q2, d6[0]
+ .if \narrow
+ vqrshrun.s16 \d0, t0, #5
+ .endif
+ .unreq t0
+.endm
+
+.macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
+ vext.16 q1, \r0, \r1, #2
+ vext.16 q0, \r0, \r1, #3
+ vaddl.s16 q9, d2, d0
+ vext.16 q2, \r0, \r1, #1
+ vaddl.s16 q1, d3, d1
+ vext.16 q3, \r0, \r1, #4
+ vaddl.s16 q10, d4, d6
+ vext.16 \r1, \r0, \r1, #5
+ vaddl.s16 q2, d5, d7
+ vaddl.s16 q0, \h0, \h1
+ vaddl.s16 q8, \l0, \l1
+
+ vshl.i32 q3, q9, #4
+ vshl.i32 q9, q9, #2
+ vshl.i32 q15, q10, #2
+ vadd.i32 q9, q9, q3
+ vadd.i32 q10, q10, q15
+
+ vshl.i32 q3, q1, #4
+ vshl.i32 q1, q1, #2
+ vshl.i32 q15, q2, #2
+ vadd.i32 q1, q1, q3
+ vadd.i32 q2, q2, q15
+
+ vadd.i32 q9, q9, q8
+ vsub.i32 q9, q9, q10
+
+ vadd.i32 q1, q1, q0
+ vsub.i32 q1, q1, q2
+
+ vrshrn.s32 d18, q9, #10
+ vrshrn.s32 d19, q1, #10
+
+ vqmovun.s16 \d, q9
+.endm
+
+function put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, lr
+ mov r12, #16
+ mov r3, #8
+ bl put_h264_qpel8_h_lowpass_neon
+ sub r1, r1, r2, lsl #4
+ add r1, r1, #8
+ mov r12, #16
+ mov lr, r4
+ b put_h264_qpel8_h_lowpass_neon
+endfunc
+
+.macro h264_qpel_h_lowpass type
+function \type\()_h264_qpel16_h_lowpass_neon
+ push {lr}
+ mov r12, #16
+ bl \type\()_h264_qpel8_h_lowpass_neon
+ sub r0, r0, r3, lsl #4
+ sub r1, r1, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ mov r12, #16
+ pop {lr}
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_neon
+1: vld1.8 {d0, d1}, [r1], r2
+ vld1.8 {d16,d17}, [r1], r2
+ subs r12, r12, #2
+ lowpass_8 d0, d1, d16, d17, d0, d16
+ .ifc \type,avg
+ vld1.8 {d2}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 d16, d16, d3
+ sub r0, r0, r3
+ .endif
+ vst1.8 {d0}, [r0,:64], r3
+ vst1.8 {d16}, [r0,:64], r3
+ bne 1b
+ bx lr
+endfunc
+.endm
+
+ h264_qpel_h_lowpass put
+ h264_qpel_h_lowpass avg
+
+.macro h264_qpel_h_lowpass_l2 type
+function \type\()_h264_qpel16_h_lowpass_l2_neon
+ push {lr}
+ mov r12, #16
+ bl \type\()_h264_qpel8_h_lowpass_l2_neon
+ sub r0, r0, r2, lsl #4
+ sub r1, r1, r2, lsl #4
+ sub r3, r3, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ add r3, r3, #8
+ mov r12, #16
+ pop {lr}
+endfunc
+
+function \type\()_h264_qpel8_h_lowpass_l2_neon
+1: vld1.8 {d0, d1}, [r1], r2
+ vld1.8 {d16,d17}, [r1], r2
+ vld1.8 {d28}, [r3], r2
+ vld1.8 {d29}, [r3], r2
+ subs r12, r12, #2
+ lowpass_8 d0, d1, d16, d17, d0, d1
+ vrhadd.u8 q0, q0, q14
+ .ifc \type,avg
+ vld1.8 {d2}, [r0,:64], r2
+ vrhadd.u8 d0, d0, d2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 d1, d1, d3
+ sub r0, r0, r2
+ .endif
+ vst1.8 {d0}, [r0,:64], r2
+ vst1.8 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+endfunc
+.endm
+
+ h264_qpel_h_lowpass_l2 put
+ h264_qpel_h_lowpass_l2 avg
+
+function put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, lr
+ mov r2, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl put_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+ b put_h264_qpel8_v_lowpass_neon
+endfunc
+
+.macro h264_qpel_v_lowpass type
+function \type\()_h264_qpel16_v_lowpass_neon
+ mov r4, lr
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_neon
+ vld1.8 {d8}, [r1], r3
+ vld1.8 {d10}, [r1], r3
+ vld1.8 {d12}, [r1], r3
+ vld1.8 {d14}, [r1], r3
+ vld1.8 {d22}, [r1], r3
+ vld1.8 {d24}, [r1], r3
+ vld1.8 {d26}, [r1], r3
+ vld1.8 {d28}, [r1], r3
+ vld1.8 {d9}, [r1], r3
+ vld1.8 {d11}, [r1], r3
+ vld1.8 {d13}, [r1], r3
+ vld1.8 {d15}, [r1], r3
+ vld1.8 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d10
+ lowpass_8 d12, d13, d14, d15, d12, d14
+ lowpass_8 d22, d23, d24, d25, d22, d24
+ lowpass_8 d26, d27, d28, d29, d26, d28
+ transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
+
+ .ifc \type,avg
+ vld1.8 {d9}, [r0,:64], r2
+ vrhadd.u8 d8, d8, d9
+ vld1.8 {d11}, [r0,:64], r2
+ vrhadd.u8 d10, d10, d11
+ vld1.8 {d13}, [r0,:64], r2
+ vrhadd.u8 d12, d12, d13
+ vld1.8 {d15}, [r0,:64], r2
+ vrhadd.u8 d14, d14, d15
+ vld1.8 {d23}, [r0,:64], r2
+ vrhadd.u8 d22, d22, d23
+ vld1.8 {d25}, [r0,:64], r2
+ vrhadd.u8 d24, d24, d25
+ vld1.8 {d27}, [r0,:64], r2
+ vrhadd.u8 d26, d26, d27
+ vld1.8 {d29}, [r0,:64], r2
+ vrhadd.u8 d28, d28, d29
+ sub r0, r0, r2, lsl #3
+ .endif
+
+ vst1.8 {d8}, [r0,:64], r2
+ vst1.8 {d10}, [r0,:64], r2
+ vst1.8 {d12}, [r0,:64], r2
+ vst1.8 {d14}, [r0,:64], r2
+ vst1.8 {d22}, [r0,:64], r2
+ vst1.8 {d24}, [r0,:64], r2
+ vst1.8 {d26}, [r0,:64], r2
+ vst1.8 {d28}, [r0,:64], r2
+
+ bx lr
+endfunc
+.endm
+
+ h264_qpel_v_lowpass put
+ h264_qpel_v_lowpass avg
+
+.macro h264_qpel_v_lowpass_l2 type
+function \type\()_h264_qpel16_v_lowpass_l2_neon
+ mov r4, lr
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub r0, r0, r3, lsl #4
+ sub r12, r12, r2, lsl #4
+ add r0, r0, #8
+ add r12, r12, #8
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r4
+endfunc
+
+function \type\()_h264_qpel8_v_lowpass_l2_neon
+ vld1.8 {d8}, [r1], r3
+ vld1.8 {d10}, [r1], r3
+ vld1.8 {d12}, [r1], r3
+ vld1.8 {d14}, [r1], r3
+ vld1.8 {d22}, [r1], r3
+ vld1.8 {d24}, [r1], r3
+ vld1.8 {d26}, [r1], r3
+ vld1.8 {d28}, [r1], r3
+ vld1.8 {d9}, [r1], r3
+ vld1.8 {d11}, [r1], r3
+ vld1.8 {d13}, [r1], r3
+ vld1.8 {d15}, [r1], r3
+ vld1.8 {d23}, [r1]
+
+ transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
+ lowpass_8 d8, d9, d10, d11, d8, d9
+ lowpass_8 d12, d13, d14, d15, d12, d13
+ lowpass_8 d22, d23, d24, d25, d22, d23
+ lowpass_8 d26, d27, d28, d29, d26, d27
+ transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
+
+ vld1.8 {d0}, [r12], r2
+ vld1.8 {d1}, [r12], r2
+ vld1.8 {d2}, [r12], r2
+ vld1.8 {d3}, [r12], r2
+ vld1.8 {d4}, [r12], r2
+ vrhadd.u8 q0, q0, q4
+ vld1.8 {d5}, [r12], r2
+ vrhadd.u8 q1, q1, q6
+ vld1.8 {d10}, [r12], r2
+ vrhadd.u8 q2, q2, q11
+ vld1.8 {d11}, [r12], r2
+ vrhadd.u8 q5, q5, q13
+
+ .ifc \type,avg
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d1, d1, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d2, d2, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d3, d3, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d4, d4, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d5, d5, d17
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d10, d10, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d11, d11, d17
+ sub r0, r0, r3, lsl #3
+ .endif
+
+ vst1.8 {d0}, [r0,:64], r3
+ vst1.8 {d1}, [r0,:64], r3
+ vst1.8 {d2}, [r0,:64], r3
+ vst1.8 {d3}, [r0,:64], r3
+ vst1.8 {d4}, [r0,:64], r3
+ vst1.8 {d5}, [r0,:64], r3
+ vst1.8 {d10}, [r0,:64], r3
+ vst1.8 {d11}, [r0,:64], r3
+
+ bx lr
+endfunc
+.endm
+
+ h264_qpel_v_lowpass_l2 put
+ h264_qpel_v_lowpass_l2 avg
+
+function put_h264_qpel8_hv_lowpass_neon_top
+ lowpass_const r12
+ mov r12, #12
+1: vld1.8 {d0, d1}, [r1], r3
+ vld1.8 {d16,d17}, [r1], r3
+ subs r12, r12, #2
+ lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
+ vst1.8 {d22-d25}, [r4,:128]!
+ bne 1b
+
+ vld1.8 {d0, d1}, [r1]
+ lowpass_8_1 d0, d1, q12, narrow=0
+
+ mov r12, #-16
+ add r4, r4, r12
+ vld1.8 {d30,d31}, [r4,:128], r12
+ vld1.8 {d20,d21}, [r4,:128], r12
+ vld1.8 {d18,d19}, [r4,:128], r12
+ vld1.8 {d16,d17}, [r4,:128], r12
+ vld1.8 {d14,d15}, [r4,:128], r12
+ vld1.8 {d12,d13}, [r4,:128], r12
+ vld1.8 {d10,d11}, [r4,:128], r12
+ vld1.8 {d8, d9}, [r4,:128], r12
+ vld1.8 {d6, d7}, [r4,:128], r12
+ vld1.8 {d4, d5}, [r4,:128], r12
+ vld1.8 {d2, d3}, [r4,:128], r12
+ vld1.8 {d0, d1}, [r4,:128]
+
+ swap4 d1, d3, d5, d7, d8, d10, d12, d14
+ transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
+
+ swap4 d17, d19, d21, d31, d24, d26, d28, d22
+ transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
+
+ vst1.8 {d30,d31}, [r4,:128]!
+ vst1.8 {d6, d7}, [r4,:128]!
+ vst1.8 {d20,d21}, [r4,:128]!
+ vst1.8 {d4, d5}, [r4,:128]!
+ vst1.8 {d18,d19}, [r4,:128]!
+ vst1.8 {d2, d3}, [r4,:128]!
+ vst1.8 {d16,d17}, [r4,:128]!
+ vst1.8 {d0, d1}, [r4,:128]
+
+ lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
+ lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
+ lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
+ lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
+
+ vld1.8 {d16,d17}, [r4,:128], r12
+ vld1.8 {d30,d31}, [r4,:128], r12
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
+ vld1.8 {d16,d17}, [r4,:128], r12
+ vld1.8 {d30,d31}, [r4,:128], r12
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
+ vld1.8 {d16,d17}, [r4,:128], r12
+ vld1.8 {d30,d31}, [r4,:128], r12
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
+ vld1.8 {d16,d17}, [r4,:128], r12
+ vld1.8 {d30,d31}, [r4,:128]
+ lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
+
+ transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
+
+ bx lr
+endfunc
+
+.macro h264_qpel8_hv_lowpass type
+function \type\()_h264_qpel8_hv_lowpass_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+ .ifc \type,avg
+ vld1.8 {d0}, [r0,:64], r2
+ vrhadd.u8 d12, d12, d0
+ vld1.8 {d1}, [r0,:64], r2
+ vrhadd.u8 d13, d13, d1
+ vld1.8 {d2}, [r0,:64], r2
+ vrhadd.u8 d14, d14, d2
+ vld1.8 {d3}, [r0,:64], r2
+ vrhadd.u8 d15, d15, d3
+ vld1.8 {d4}, [r0,:64], r2
+ vrhadd.u8 d8, d8, d4
+ vld1.8 {d5}, [r0,:64], r2
+ vrhadd.u8 d9, d9, d5
+ vld1.8 {d6}, [r0,:64], r2
+ vrhadd.u8 d10, d10, d6
+ vld1.8 {d7}, [r0,:64], r2
+ vrhadd.u8 d11, d11, d7
+ sub r0, r0, r2, lsl #3
+ .endif
+
+ vst1.8 {d12}, [r0,:64], r2
+ vst1.8 {d13}, [r0,:64], r2
+ vst1.8 {d14}, [r0,:64], r2
+ vst1.8 {d15}, [r0,:64], r2
+ vst1.8 {d8}, [r0,:64], r2
+ vst1.8 {d9}, [r0,:64], r2
+ vst1.8 {d10}, [r0,:64], r2
+ vst1.8 {d11}, [r0,:64], r2
+
+ mov lr, r10
+ bx lr
+endfunc
+.endm
+
+ h264_qpel8_hv_lowpass put
+ h264_qpel8_hv_lowpass avg
+
+.macro h264_qpel8_hv_lowpass_l2 type
+function \type\()_h264_qpel8_hv_lowpass_l2_neon
+ mov r10, lr
+ bl put_h264_qpel8_hv_lowpass_neon_top
+
+ vld1.8 {d0, d1}, [r2,:128]!
+ vld1.8 {d2, d3}, [r2,:128]!
+ vrhadd.u8 q0, q0, q6
+ vld1.8 {d4, d5}, [r2,:128]!
+ vrhadd.u8 q1, q1, q7
+ vld1.8 {d6, d7}, [r2,:128]!
+ vrhadd.u8 q2, q2, q4
+ vrhadd.u8 q3, q3, q5
+ .ifc \type,avg
+ vld1.8 {d16}, [r0,:64], r3
+ vrhadd.u8 d0, d0, d16
+ vld1.8 {d17}, [r0,:64], r3
+ vrhadd.u8 d1, d1, d17
+ vld1.8 {d18}, [r0,:64], r3
+ vrhadd.u8 d2, d2, d18
+ vld1.8 {d19}, [r0,:64], r3
+ vrhadd.u8 d3, d3, d19
+ vld1.8 {d20}, [r0,:64], r3
+ vrhadd.u8 d4, d4, d20
+ vld1.8 {d21}, [r0,:64], r3
+ vrhadd.u8 d5, d5, d21
+ vld1.8 {d22}, [r0,:64], r3
+ vrhadd.u8 d6, d6, d22
+ vld1.8 {d23}, [r0,:64], r3
+ vrhadd.u8 d7, d7, d23
+ sub r0, r0, r3, lsl #3
+ .endif
+ vst1.8 {d0}, [r0,:64], r3
+ vst1.8 {d1}, [r0,:64], r3
+ vst1.8 {d2}, [r0,:64], r3
+ vst1.8 {d3}, [r0,:64], r3
+ vst1.8 {d4}, [r0,:64], r3
+ vst1.8 {d5}, [r0,:64], r3
+ vst1.8 {d6}, [r0,:64], r3
+ vst1.8 {d7}, [r0,:64], r3
+
+ mov lr, r10
+ bx lr
+endfunc
+.endm
+
+ h264_qpel8_hv_lowpass_l2 put
+ h264_qpel8_hv_lowpass_l2 avg
+
+.macro h264_qpel16_hv type
+function \type\()_h264_qpel16_hv_lowpass_neon
+ mov r9, lr
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b \type\()_h264_qpel8_hv_lowpass_neon
+endfunc
+
+function \type\()_h264_qpel16_hv_lowpass_l2_neon
+ mov r9, lr
+ sub r2, r4, #256
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #4
+ sub r1, r1, r3, lsl #2
+ add r1, r1, #8
+ sub r0, r0, r3, lsl #4
+ add r0, r0, #8
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ sub r1, r1, r3, lsl #2
+ mov lr, r9
+ b \type\()_h264_qpel8_hv_lowpass_l2_neon
+endfunc
+.endm
+
+ h264_qpel16_hv put
+ h264_qpel16_hv avg
+
+.macro h264_qpel8 type
+function ff_\type\()_h264_qpel8_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ mov r12, #8
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ mov r12, #8
+ b \type\()_h264_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ mov r12, #8
+ b \type\()_h264_qpel8_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel8_mc01_neon, export=1
+ push {lr}
+ mov r12, r1
+\type\()_h264_qpel8_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc11_neon, export=1
+ push {r0, r1, r11, lr}
+\type\()_h264_qpel8_mc11:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #64
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #8
+ mov r12, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ ldrd r0, r1, [r11], #8
+ mov r3, r2
+ add r12, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #8
+ bl \type\()_h264_qpel8_v_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc21_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+\type\()_h264_qpel8_mc21:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, #2
+ mov r3, #8
+ mov r0, sp
+ mov r12, #8
+ vpush {d8-d15}
+ bl put_h264_qpel8_h_lowpass_neon
+ mov r4, r0
+ ldrd r0, r1, [r11], #8
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub r2, r4, #64
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r11, lr}
+ sub r1, r1, #1
+ b \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc02_neon, export=1
+ push {lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl \type\()_h264_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc12_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+\type\()_h264_qpel8_mc12:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #(8*8+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ mov r2, #8
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel8_v_lowpass_neon
+ mov r4, r0
+ ldrd r0, r1, [r11], #8
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ sub r2, r4, #64
+ bl \type\()_h264_qpel8_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc22_neon, export=1
+ push {r4, r10, r11, lr}
+ mov r11, sp
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl \type\()_h264_qpel8_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r10, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel8_mc32_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, #1
+ b \type\()_h264_qpel8_mc12
+endfunc
+
+function ff_\type\()_h264_qpel8_mc03_neon, export=1
+ push {lr}
+ add r12, r1, r2
+ b \type\()_h264_qpel8_mc01
+endfunc
+
+function ff_\type\()_h264_qpel8_mc13_neon, export=1
+ push {r0, r1, r11, lr}
+ add r1, r1, r2
+ b \type\()_h264_qpel8_mc11
+endfunc
+
+function ff_\type\()_h264_qpel8_mc23_neon, export=1
+ push {r0, r1, r4, r10, r11, lr}
+ add r1, r1, r2
+ b \type\()_h264_qpel8_mc21
+endfunc
+
+function ff_\type\()_h264_qpel8_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r11, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b \type\()_h264_qpel8_mc11
+endfunc
+.endm
+
+ h264_qpel8 put
+ h264_qpel8 avg
+
+.macro h264_qpel16 type
+function ff_\type\()_h264_qpel16_mc10_neon, export=1
+ lowpass_const r3
+ mov r3, r1
+ sub r1, r1, #2
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc20_neon, export=1
+ lowpass_const r3
+ sub r1, r1, #2
+ mov r3, r2
+ b \type\()_h264_qpel16_h_lowpass_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc30_neon, export=1
+ lowpass_const r3
+ add r3, r1, #1
+ sub r1, r1, #2
+ b \type\()_h264_qpel16_h_lowpass_l2_neon
+endfunc
+
+function ff_\type\()_h264_qpel16_mc01_neon, export=1
+ push {r4, lr}
+ mov r12, r1
+\type\()_h264_qpel16_mc01:
+ lowpass_const r3
+ mov r3, r2
+ sub r1, r1, r2, lsl #1
+ vpush {d8-d15}
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc11_neon, export=1
+ push {r0, r1, r4, r11, lr}
+\type\()_h264_qpel16_mc11:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #256
+ mov r0, sp
+ sub r1, r1, #2
+ mov r3, #16
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon
+ ldrd r0, r1, [r11], #8
+ mov r3, r2
+ add r12, sp, #64
+ sub r1, r1, r2, lsl #1
+ mov r2, #16
+ bl \type\()_h264_qpel16_v_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc21_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+\type\()_h264_qpel16_mc21:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, #2
+ mov r0, sp
+ vpush {d8-d15}
+ bl put_h264_qpel16_h_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, r1, [r11], #8
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4-r5, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc31_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, r11, lr}
+ sub r1, r1, #1
+ b \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc02_neon, export=1
+ push {r4, lr}
+ lowpass_const r3
+ sub r1, r1, r2, lsl #1
+ mov r3, r2
+ vpush {d8-d15}
+ bl \type\()_h264_qpel16_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc12_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+\type\()_h264_qpel16_mc12:
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r0, r11, #15
+T mov sp, r0
+ sub sp, sp, #(16*16+16*12)
+ sub r1, r1, r2, lsl #1
+ mov r0, sp
+ mov r3, r2
+ vpush {d8-d15}
+ bl put_h264_qpel16_v_lowpass_neon_packed
+ mov r4, r0
+ ldrd r0, r1, [r11], #8
+ sub r1, r1, r3, lsl #1
+ sub r1, r1, #2
+ mov r2, r3
+ bl \type\()_h264_qpel16_hv_lowpass_l2_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4-r5, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc22_neon, export=1
+ push {r4, r9-r11, lr}
+ lowpass_const r3
+ mov r11, sp
+A bic sp, sp, #15
+T bic r4, r11, #15
+T mov sp, r4
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, r2
+ sub sp, sp, #(16*12)
+ mov r4, sp
+ vpush {d8-d15}
+ bl \type\()_h264_qpel16_hv_lowpass_neon
+ vpop {d8-d15}
+ mov sp, r11
+ pop {r4, r9-r11, pc}
+endfunc
+
+function ff_\type\()_h264_qpel16_mc32_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, #1
+ b \type\()_h264_qpel16_mc12
+endfunc
+
+function ff_\type\()_h264_qpel16_mc03_neon, export=1
+ push {r4, lr}
+ add r12, r1, r2
+ b \type\()_h264_qpel16_mc01
+endfunc
+
+function ff_\type\()_h264_qpel16_mc13_neon, export=1
+ push {r0, r1, r4, r11, lr}
+ add r1, r1, r2
+ b \type\()_h264_qpel16_mc11
+endfunc
+
+function ff_\type\()_h264_qpel16_mc23_neon, export=1
+ push {r0, r1, r4-r5, r9-r11, lr}
+ add r1, r1, r2
+ b \type\()_h264_qpel16_mc21
+endfunc
+
+function ff_\type\()_h264_qpel16_mc33_neon, export=1
+ add r1, r1, #1
+ push {r0, r1, r4, r11, lr}
+ add r1, r1, r2
+ sub r1, r1, #1
+ b \type\()_h264_qpel16_mc11
+endfunc
+.endm
+
+ h264_qpel16 put
+ h264_qpel16 avg
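
For reference (not part of this patch): the lowpass_8 family of macros above vectorises the standard H.264 six-tap half-pel filter, (E + J) - 5*(F + I) + 20*(G + H), rounded and narrowed with vqrshrun #5; lowpass_const preloads the 5 and 20 multipliers into d6. A scalar C sketch of the horizontal filter, with the function name put_h264_qpel_h_lowpass_ref invented here:

    #include <stdint.h>
    #include <stddef.h>

    static uint8_t clip_u8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }

    /* Six-tap half-pel filter per output pixel; the caller must provide a
     * 2-byte left and 3-byte right margin, just as the NEON code does by
     * biasing its source pointer before entering the loop. */
    static void put_h264_qpel_h_lowpass_ref(uint8_t *dst, const uint8_t *src,
                                            ptrdiff_t dst_stride, ptrdiff_t src_stride,
                                            int width, int height)
    {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < width; x++) {
                const uint8_t *s = src + x;
                int v = (s[-2] + s[3]) - 5 * (s[-1] + s[2]) + 20 * (s[0] + s[1]);
                dst[x] = clip_u8((v + 16) >> 5);   /* round, shift by 5, clip to 0..255 */
            }
            dst += dst_stride;
            src += src_stride;
        }
    }

The hv variants run this filter horizontally into a 16-bit scratch buffer first and then apply the same taps vertically, which is why lowpass_8.16 keeps 32-bit intermediates and narrows with a #10 shift.
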
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.S b/ffmpeg/libavcodec/arm/hpeldsp_arm.S
new file mode 100644
index 0000000..2f3d311
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_arm.S
@@ -0,0 +1,611 @@
+@
+@ ARMv4 optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg.
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+#if !HAVE_ARMV5TE_EXTERNAL
+#define pld @
+#endif
+
+.macro ALIGN_QWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
+ mov \Rd0, \Rn0, lsr #(\shift * 8)
+ mov \Rd1, \Rn1, lsr #(\shift * 8)
+ mov \Rd2, \Rn2, lsr #(\shift * 8)
+ mov \Rd3, \Rn3, lsr #(\shift * 8)
+ orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
+ orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
+ orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
+ orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
+.endm
+.macro ALIGN_DWORD shift, R0, R1, R2
+ mov \R0, \R0, lsr #(\shift * 8)
+ orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
+ mov \R1, \R1, lsr #(\shift * 8)
+ orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
+.endm
+.macro ALIGN_DWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
+ mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
+ mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
+ orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
+ orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
+.endm
+
+.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroyed
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ orr \Rn0, \Rn0, \Rm0
+ orr \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ sub \Rd0, \Rn0, \Rd0, lsr #1
+ sub \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
+ @ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
+ @ Rmask = 0xFEFEFEFE
+ @ Rn = destroyed
+ eor \Rd0, \Rn0, \Rm0
+ eor \Rd1, \Rn1, \Rm1
+ and \Rn0, \Rn0, \Rm0
+ and \Rn1, \Rn1, \Rm1
+ and \Rd0, \Rd0, \Rmask
+ and \Rd1, \Rd1, \Rmask
+ add \Rd0, \Rn0, \Rd0, lsr #1
+ add \Rd1, \Rn1, \Rd1, lsr #1
+.endm
+
+.macro JMP_ALIGN tmp, reg
+ ands \tmp, \reg, #3
+ bic \reg, \reg, #3
+ beq 1f
+ subs \tmp, \tmp, #1
+ beq 2f
+ subs \tmp, \tmp, #1
+ beq 3f
+ b 4f
+.endm
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels16_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11, lr}
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r7}
+ add r1, r1, r2
+ stm r0, {r4-r7}
+ pld [r1]
+ subs r3, r3, #1
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r11, pc}
+ .align 5
+2:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r11, pc}
+ .align 5
+3:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r11, pc}
+ .align 5
+4:
+ ldm r1, {r4-r8}
+ add r1, r1, r2
+ ALIGN_QWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r9-r12}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r11,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r5,lr}
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ subs r3, r3, #1
+ pld [r1]
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r5,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 1, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r5,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 2, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r5,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r12}
+ add r1, r1, r2
+ ALIGN_DWORD 3, r4, r5, r12
+ pld [r1]
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r5,pc}
+endfunc
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r10,lr}
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r10,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r10,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r10,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r10,pc}
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_x2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r10,lr}
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 1b
+ pop {r4-r10,pc}
+ .align 5
+2:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 1, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 2, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 2b
+ pop {r4-r10,pc}
+ .align 5
+3:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 2, r6, r7, r4, r5, r10
+ ALIGN_DWORD_D 3, r8, r9, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
+ subs r3, r3, #1
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bne 3b
+ pop {r4-r10,pc}
+ .align 5
+4:
+ ldm r1, {r4-r5, r10}
+ add r1, r1, r2
+ ALIGN_DWORD_D 3, r6, r7, r4, r5, r10
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 4b
+ pop {r4-r10,pc}
+endfunc
+
+
+@ ----------------------------------------------------------------
+ .align 5
+function ff_put_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ mov r3, r3, lsr #1
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+6: ldm r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+2:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+3:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+4:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r7, r8, r9
+ RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_y2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ mov r3, r3, lsr #1
+ ldr r12, =0xfefefefe
+ JMP_ALIGN r5, r1
+1:
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+6: ldm r1, {r6-r7}
+ add r1, r1, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
+ ldm r1, {r4-r5}
+ add r1, r1, r2
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ pld [r1]
+ NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
+ subs r3, r3, #1
+ stm r0, {r8-r9}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+2:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 1, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+3:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 2, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+ .align 5
+4:
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+6: ldm r1, {r7-r9}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r7, r8, r9
+ NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ ldm r1, {r4-r6}
+ add r1, r1, r2
+ pld [r1]
+ ALIGN_DWORD 3, r4, r5, r6
+ subs r3, r3, #1
+ NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
+ stm r0, {r10-r11}
+ add r0, r0, r2
+ bne 6b
+ pop {r4-r11,pc}
+endfunc
+
+ .ltorg
+
+@ ----------------------------------------------------------------
+.macro RND_XY2_IT align, rnd
+ @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
+ @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
+.if \align == 0
+ ldm r1, {r6-r8}
+.elseif \align == 3
+ ldm r1, {r5-r7}
+.else
+ ldm r1, {r8-r10}
+.endif
+ add r1, r1, r2
+ pld [r1]
+.if \align == 0
+ ALIGN_DWORD_D 1, r4, r5, r6, r7, r8
+.elseif \align == 1
+ ALIGN_DWORD_D 1, r4, r5, r8, r9, r10
+ ALIGN_DWORD_D 2, r6, r7, r8, r9, r10
+.elseif \align == 2
+ ALIGN_DWORD_D 2, r4, r5, r8, r9, r10
+ ALIGN_DWORD_D 3, r6, r7, r8, r9, r10
+.elseif \align == 3
+ ALIGN_DWORD_D 3, r4, r5, r5, r6, r7
+.endif
+ ldr r14, =0x03030303
+ tst r3, #1
+ and r8, r4, r14
+ and r9, r5, r14
+ and r10, r6, r14
+ and r11, r7, r14
+ it eq
+ andeq r14, r14, r14, \rnd #1
+ add r8, r8, r10
+ add r9, r9, r11
+ ldr r12, =0xfcfcfcfc >> 2
+ itt eq
+ addeq r8, r8, r14
+ addeq r9, r9, r14
+ and r4, r12, r4, lsr #2
+ and r5, r12, r5, lsr #2
+ and r6, r12, r6, lsr #2
+ and r7, r12, r7, lsr #2
+ add r10, r4, r6
+ add r11, r5, r7
+ subs r3, r3, #1
+.endm
+
+.macro RND_XY2_EXPAND align, rnd
+ RND_XY2_IT \align, \rnd
+6: push {r8-r11}
+ RND_XY2_IT \align, \rnd
+ pop {r4-r7}
+ add r4, r4, r8
+ add r5, r5, r9
+ ldr r14, =0x0f0f0f0f
+ add r6, r6, r10
+ add r7, r7, r11
+ and r4, r14, r4, lsr #2
+ and r5, r14, r5, lsr #2
+ add r4, r4, r6
+ add r5, r5, r7
+ stm r0, {r4-r5}
+ add r0, r0, r2
+ bge 6b
+ pop {r4-r11,pc}
+.endm
+
+ .align 5
+function ff_put_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr} @ R14 is also called LR
+ JMP_ALIGN r5, r1
+1: RND_XY2_EXPAND 0, lsl
+ .align 5
+2: RND_XY2_EXPAND 1, lsl
+ .align 5
+3: RND_XY2_EXPAND 2, lsl
+ .align 5
+4: RND_XY2_EXPAND 3, lsl
+endfunc
+
+ .align 5
+function ff_put_no_rnd_pixels8_xy2_arm, export=1
+ @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+ @ block = word aligned, pixels = unaligned
+ pld [r1]
+ push {r4-r11,lr}
+ JMP_ALIGN r5, r1
+1: RND_XY2_EXPAND 0, lsr
+ .align 5
+2: RND_XY2_EXPAND 1, lsr
+ .align 5
+3: RND_XY2_EXPAND 2, lsr
+ .align 5
+4: RND_XY2_EXPAND 3, lsr
+endfunc
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_arm.h b/ffmpeg/libavcodec/arm/hpeldsp_arm.h
new file mode 100644
index 0000000..e79bc6f
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_arm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_HPELDSP_H
+#define AVCODEC_ARM_HPELDSP_H
+
+#include "libavcodec/hpeldsp.h"
+
+void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags);
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags);
+
+#endif /* AVCODEC_ARM_HPELDSP_H */
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_armv6.S b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
new file mode 100644
index 0000000..cd50150
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_armv6.S
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro call_2x_pixels type, subp
+function ff_\type\()_pixels16\subp\()_armv6, export=1
+ push {r0-r3, lr}
+ bl ff_\type\()_pixels8\subp\()_armv6
+ pop {r0-r3, lr}
+ add r0, r0, #8
+ add r1, r1, #8
+ b ff_\type\()_pixels8\subp\()_armv6
+endfunc
+.endm
+
+call_2x_pixels avg
+call_2x_pixels put, _x2
+call_2x_pixels put, _y2
+call_2x_pixels put, _x2_no_rnd
+call_2x_pixels put, _y2_no_rnd
+
+function ff_put_pixels16_armv6, export=1
+ push {r4-r11}
+1:
+ ldr r5, [r1, #4]
+ ldr r6, [r1, #8]
+ ldr r7, [r1, #12]
+ ldr_post r4, r1, r2
+ strd r6, r7, [r0, #8]
+ ldr r9, [r1, #4]
+ strd_post r4, r5, r0, r2
+ ldr r10, [r1, #8]
+ ldr r11, [r1, #12]
+ ldr_post r8, r1, r2
+ strd r10, r11, [r0, #8]
+ subs r3, r3, #2
+ strd_post r8, r9, r0, r2
+ bne 1b
+
+ pop {r4-r11}
+ bx lr
+endfunc
+
+function ff_put_pixels8_armv6, export=1
+ push {r4-r7}
+1:
+ ldr r5, [r1, #4]
+ ldr_post r4, r1, r2
+ ldr r7, [r1, #4]
+ strd_post r4, r5, r0, r2
+ ldr_post r6, r1, r2
+ subs r3, r3, #2
+ strd_post r6, r7, r0, r2
+ bne 1b
+
+ pop {r4-r7}
+ bx lr
+endfunc
+
+function ff_put_pixels8_x2_armv6, export=1
+ push {r4-r11, lr}
+ mov r12, #1
+ orr r12, r12, r12, lsl #8
+ orr r12, r12, r12, lsl #16
+1:
+ ldr r4, [r1]
+ subs r3, r3, #2
+ ldr r5, [r1, #4]
+ ldr r7, [r1, #5]
+ lsr r6, r4, #8
+ ldr_pre r8, r1, r2
+ orr r6, r6, r5, lsl #24
+ ldr r9, [r1, #4]
+ ldr r11, [r1, #5]
+ lsr r10, r8, #8
+ add r1, r1, r2
+ orr r10, r10, r9, lsl #24
+ eor r14, r4, r6
+ uhadd8 r4, r4, r6
+ eor r6, r5, r7
+ uhadd8 r5, r5, r7
+ and r14, r14, r12
+ and r6, r6, r12
+ uadd8 r4, r4, r14
+ eor r14, r8, r10
+ uadd8 r5, r5, r6
+ eor r6, r9, r11
+ uhadd8 r8, r8, r10
+ and r14, r14, r12
+ uhadd8 r9, r9, r11
+ and r6, r6, r12
+ uadd8 r8, r8, r14
+ strd_post r4, r5, r0, r2
+ uadd8 r9, r9, r6
+ strd_post r8, r9, r0, r2
+ bne 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+function ff_put_pixels8_y2_armv6, export=1
+ push {r4-r11}
+ mov r12, #1
+ orr r12, r12, r12, lsl #8
+ orr r12, r12, r12, lsl #16
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr_pre r6, r1, r2
+ ldr r7, [r1, #4]
+1:
+ subs r3, r3, #2
+ uhadd8 r8, r4, r6
+ eor r10, r4, r6
+ uhadd8 r9, r5, r7
+ eor r11, r5, r7
+ and r10, r10, r12
+ ldr_pre r4, r1, r2
+ uadd8 r8, r8, r10
+ and r11, r11, r12
+ uadd8 r9, r9, r11
+ ldr r5, [r1, #4]
+ uhadd8 r10, r4, r6
+ eor r6, r4, r6
+ uhadd8 r11, r5, r7
+ and r6, r6, r12
+ eor r7, r5, r7
+ uadd8 r10, r10, r6
+ and r7, r7, r12
+ ldr_pre r6, r1, r2
+ uadd8 r11, r11, r7
+ strd_post r8, r9, r0, r2
+ ldr r7, [r1, #4]
+ strd_post r10, r11, r0, r2
+ bne 1b
+
+ pop {r4-r11}
+ bx lr
+endfunc
+
+function ff_put_pixels8_x2_no_rnd_armv6, export=1
+ push {r4-r9, lr}
+1:
+ subs r3, r3, #2
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr r7, [r1, #5]
+ ldr_pre r8, r1, r2
+ ldr r9, [r1, #4]
+ ldr r14, [r1, #5]
+ add r1, r1, r2
+ lsr r6, r4, #8
+ orr r6, r6, r5, lsl #24
+ lsr r12, r8, #8
+ orr r12, r12, r9, lsl #24
+ uhadd8 r4, r4, r6
+ uhadd8 r5, r5, r7
+ uhadd8 r8, r8, r12
+ uhadd8 r9, r9, r14
+ stm r0, {r4,r5}
+ add r0, r0, r2
+ stm r0, {r8,r9}
+ add r0, r0, r2
+ bne 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_put_pixels8_y2_no_rnd_armv6, export=1
+ push {r4-r9, lr}
+ ldr r4, [r1]
+ ldr r5, [r1, #4]
+ ldr_pre r6, r1, r2
+ ldr r7, [r1, #4]
+1:
+ subs r3, r3, #2
+ uhadd8 r8, r4, r6
+ ldr_pre r4, r1, r2
+ uhadd8 r9, r5, r7
+ ldr r5, [r1, #4]
+ uhadd8 r12, r4, r6
+ ldr_pre r6, r1, r2
+ uhadd8 r14, r5, r7
+ ldr r7, [r1, #4]
+ stm r0, {r8,r9}
+ add r0, r0, r2
+ stm r0, {r12,r14}
+ add r0, r0, r2
+ bne 1b
+
+ pop {r4-r9, pc}
+endfunc
+
+function ff_avg_pixels8_armv6, export=1
+ pld [r1, r2]
+ push {r4-r10, lr}
+ mov lr, #1
+ orr lr, lr, lr, lsl #8
+ orr lr, lr, lr, lsl #16
+ ldrd r4, r5, [r0]
+ ldr r10, [r1, #4]
+ ldr_post r9, r1, r2
+ subs r3, r3, #2
+1:
+ pld [r1, r2]
+ eor r8, r4, r9
+ uhadd8 r4, r4, r9
+ eor r12, r5, r10
+ ldrd_reg r6, r7, r0, r2
+ uhadd8 r5, r5, r10
+ and r8, r8, lr
+ ldr r10, [r1, #4]
+ and r12, r12, lr
+ uadd8 r4, r4, r8
+ ldr_post r9, r1, r2
+ eor r8, r6, r9
+ uadd8 r5, r5, r12
+ pld [r1, r2, lsl #1]
+ eor r12, r7, r10
+ uhadd8 r6, r6, r9
+ strd_post r4, r5, r0, r2
+ uhadd8 r7, r7, r10
+ beq 2f
+ and r8, r8, lr
+ ldrd_reg r4, r5, r0, r2
+ uadd8 r6, r6, r8
+ ldr r10, [r1, #4]
+ and r12, r12, lr
+ subs r3, r3, #2
+ uadd8 r7, r7, r12
+ ldr_post r9, r1, r2
+ strd_post r6, r7, r0, r2
+ b 1b
+2:
+ and r8, r8, lr
+ and r12, r12, lr
+ uadd8 r6, r6, r8
+ uadd8 r7, r7, r12
+ strd_post r6, r7, r0, r2
+
+ pop {r4-r10, pc}
+endfunc
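
For reference (not part of this patch): the ARMv6 routines above rely on uhadd8, which computes a truncated per-byte average; where rounding is required, the code adds back the low bit of a ^ b in each byte via the eor / and-with-0x01010101 / uadd8 sequence. A small C model of that correction, with the function name invented here:

    #include <stdint.h>

    /* Mirrors the eor / uhadd8 / and / uadd8 sequence used above:
     * uhadd8 yields floor((a + b) / 2) per byte; adding the per-byte low bit
     * of (a ^ b) turns truncation into rounding, (a + b + 1) >> 1. */
    static uint32_t rnd_avg32_armv6_style(uint32_t a, uint32_t b)
    {
        uint32_t halves = (a & b) + (((a ^ b) & 0xFEFEFEFEu) >> 1); /* what uhadd8 computes */
        uint32_t lsbs   = (a ^ b) & 0x01010101u;                    /* the correction mask  */
        return halves + lsbs; /* models uadd8: each byte stays <= 255, so no carry escapes */
    }
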
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
new file mode 100644
index 0000000..bae93eb
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_init_arm.c
@@ -0,0 +1,68 @@
+/*
+ * ARM optimized DSP utils
+ * Copyright (c) 2001 Lionel Ulmer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/bit_depth_template.c" // for CALL_2X_PIXELS
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels8_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+void ff_put_pixels16_arm(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
+
+CALL_2X_PIXELS(ff_put_pixels16_x2_arm, ff_put_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_y2_arm, ff_put_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_pixels16_xy2_arm, ff_put_pixels8_xy2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_x2_arm, ff_put_no_rnd_pixels8_x2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_y2_arm, ff_put_no_rnd_pixels8_y2_arm, 8)
+CALL_2X_PIXELS(ff_put_no_rnd_pixels16_xy2_arm, ff_put_no_rnd_pixels8_xy2_arm, 8)
+
+void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ c->put_pixels_tab[0][0] = ff_put_pixels16_arm;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_arm;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_arm;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_arm;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_arm;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_arm;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_arm;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_arm;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_arm;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_arm;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_arm;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_no_rnd_pixels16_xy2_arm;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_arm;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_arm;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_arm;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_no_rnd_pixels8_xy2_arm;
+
+ if (have_armv6(cpu_flags)) ff_hpeldsp_init_armv6(c, flags);
+ if (have_neon(cpu_flags)) ff_hpeldsp_init_neon(c, flags);
+}
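
For reference (not part of this patch): CALL_2X_PIXELS here, like the call_2x_pixels macro in hpeldsp_armv6.S above, derives each 16-pixel-wide function from the 8-pixel-wide one by running it twice, with source and destination advanced by 8 bytes for the second half. A minimal C sketch of the pattern; wrap16 and op8_fn are invented names:

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*op8_fn)(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);

    /* Cover a 16-pixel-wide block with two calls to an 8-pixel-wide kernel. */
    static void wrap16(op8_fn op8, uint8_t *block, const uint8_t *pixels,
                       ptrdiff_t line_size, int h)
    {
        op8(block,     pixels,     line_size, h);  /* left 8 columns  */
        op8(block + 8, pixels + 8, line_size, h);  /* right 8 columns */
    }
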
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
new file mode 100644
index 0000000..da4caf8
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_init_armv6.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels16_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_put_pixels8_x2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels8_armv6(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+av_cold void ff_hpeldsp_init_armv6(HpelDSPContext *c, int flags)
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_armv6;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_armv6;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_armv6;
+/* c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_armv6; */
+ c->put_pixels_tab[1][0] = ff_put_pixels8_armv6;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_armv6;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_armv6;
+/* c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_armv6; */
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_armv6;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_armv6;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_armv6; */
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_armv6;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_armv6;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_armv6;
+/* c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_armv6; */
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_armv6;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_armv6;
+}
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
new file mode 100644
index 0000000..d577735
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_init_neon.c
@@ -0,0 +1,86 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "hpeldsp_arm.h"
+
+void ff_put_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_put_pixels8_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_avg_pixels16_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_x2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_y2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels8_xy2_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_x2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_y2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+void ff_avg_pixels16_xy2_no_rnd_neon(uint8_t *, const uint8_t *, ptrdiff_t, int);
+
+void ff_hpeldsp_init_neon(HpelDSPContext *c, int flags)
+{
+ c->put_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_pixels_tab[0][1] = ff_put_pixels16_x2_neon;
+ c->put_pixels_tab[0][2] = ff_put_pixels16_y2_neon;
+ c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_neon;
+ c->put_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_pixels_tab[1][1] = ff_put_pixels8_x2_neon;
+ c->put_pixels_tab[1][2] = ff_put_pixels8_y2_neon;
+ c->put_pixels_tab[1][3] = ff_put_pixels8_xy2_neon;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_neon;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_pixels16_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_pixels16_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[0][3] = ff_put_pixels16_xy2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][0] = ff_put_pixels8_neon;
+ c->put_no_rnd_pixels_tab[1][1] = ff_put_pixels8_x2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][2] = ff_put_pixels8_y2_no_rnd_neon;
+ c->put_no_rnd_pixels_tab[1][3] = ff_put_pixels8_xy2_no_rnd_neon;
+
+ c->avg_pixels_tab[0][0] = ff_avg_pixels16_neon;
+ c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_neon;
+ c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_neon;
+ c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_neon;
+ c->avg_pixels_tab[1][0] = ff_avg_pixels8_neon;
+ c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_neon;
+ c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_neon;
+ c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_neon;
+
+ c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_neon;
+ c->avg_no_rnd_pixels_tab[1] = ff_avg_pixels16_x2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[2] = ff_avg_pixels16_y2_no_rnd_neon;
+ c->avg_no_rnd_pixels_tab[3] = ff_avg_pixels16_xy2_no_rnd_neon;
+}
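
For reference (not part of this patch): these tables are indexed first by block width (0 for 16 pixels, 1 for 8) and then by the half-pel offset, with bit 0 selecting horizontal (_x2) and bit 1 vertical (_y2) interpolation, so index 3 is the diagonal _xy2 case. A small C sketch of the lookup, with dispatch_hpel invented here and the kernel prototype taken from the declarations above:

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h);

    /* tab[0][*] holds 16-pixel-wide kernels, tab[1][*] 8-pixel-wide ones;
     * the second index packs the half-pel flags as (hpel_y << 1) | hpel_x. */
    static void dispatch_hpel(op_pixels_func tab[2][4], int is_8wide,
                              int hpel_x, int hpel_y,
                              uint8_t *dst, const uint8_t *src,
                              ptrdiff_t stride, int h)
    {
        tab[is_8wide][(hpel_y << 1) | hpel_x](dst, src, stride, h);
    }
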
diff --git a/ffmpeg/libavcodec/arm/hpeldsp_neon.S b/ffmpeg/libavcodec/arm/hpeldsp_neon.S
new file mode 100644
index 0000000..cf4a6cf
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/hpeldsp_neon.S
@@ -0,0 +1,410 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro pixels16 rnd=1, avg=0
+ .if \avg
+ mov r12, r0
+ .endif
+1: vld1.8 {q0}, [r1], r2
+ vld1.8 {q1}, [r1], r2
+ vld1.8 {q2}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.8 {q3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+ vld1.8 {q8}, [r12,:128], r2
+ vrhadd.u8 q0, q0, q8
+ vld1.8 {q9}, [r12,:128], r2
+ vrhadd.u8 q1, q1, q9
+ vld1.8 {q10}, [r12,:128], r2
+ vrhadd.u8 q2, q2, q10
+ vld1.8 {q11}, [r12,:128], r2
+ vrhadd.u8 q3, q3, q11
+ .endif
+ subs r3, r3, #4
+ vst1.64 {q0}, [r0,:128], r2
+ vst1.64 {q1}, [r0,:128], r2
+ vst1.64 {q2}, [r0,:128], r2
+ vst1.64 {q3}, [r0,:128], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels16_x2 rnd=1, avg=0
+1: vld1.8 {d0-d2}, [r1], r2
+ vld1.8 {d4-d6}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vext.8 q1, q0, q1, #1
+ avg q0, q0, q1
+ vext.8 q3, q2, q3, #1
+ avg q2, q2, q3
+ .if \avg
+ vld1.8 {q1}, [r0,:128], r2
+ vld1.8 {q3}, [r0,:128]
+ vrhadd.u8 q0, q0, q1
+ vrhadd.u8 q2, q2, q3
+ sub r0, r0, r2
+ .endif
+ vst1.8 {q0}, [r0,:128], r2
+ vst1.8 {q2}, [r0,:128], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels16_y2 rnd=1, avg=0
+ sub r3, r3, #2
+ vld1.8 {q0}, [r1], r2
+ vld1.8 {q1}, [r1], r2
+1: subs r3, r3, #2
+ avg q2, q0, q1
+ vld1.8 {q0}, [r1], r2
+ avg q3, q0, q1
+ vld1.8 {q1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {q8}, [r0,:128], r2
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q2, q2, q8
+ vrhadd.u8 q3, q3, q9
+ sub r0, r0, r2
+ .endif
+ vst1.8 {q2}, [r0,:128], r2
+ vst1.8 {q3}, [r0,:128], r2
+ bne 1b
+
+ avg q2, q0, q1
+ vld1.8 {q0}, [r1], r2
+ avg q3, q0, q1
+ .if \avg
+ vld1.8 {q8}, [r0,:128], r2
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q2, q2, q8
+ vrhadd.u8 q3, q3, q9
+ sub r0, r0, r2
+ .endif
+ vst1.8 {q2}, [r0,:128], r2
+ vst1.8 {q3}, [r0,:128], r2
+
+ bx lr
+.endm
+
+.macro pixels16_xy2 rnd=1, avg=0
+ sub r3, r3, #2
+ vld1.8 {d0-d2}, [r1], r2
+ vld1.8 {d4-d6}, [r1], r2
+NRND vmov.i16 q13, #1
+ pld [r1]
+ pld [r1, r2]
+ vext.8 q1, q0, q1, #1
+ vext.8 q3, q2, q3, #1
+ vaddl.u8 q8, d0, d2
+ vaddl.u8 q10, d1, d3
+ vaddl.u8 q9, d4, d6
+ vaddl.u8 q11, d5, d7
+1: subs r3, r3, #2
+ vld1.8 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9
+ pld [r1]
+NRND vadd.u16 q12, q12, q13
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ shrn d28, q12, #2
+NRND vadd.u16 q1, q1, q13
+ shrn d29, q1, #2
+ .if \avg
+ vld1.8 {q8}, [r0,:128]
+ vrhadd.u8 q14, q14, q8
+ .endif
+ vaddl.u8 q8, d0, d30
+ vld1.8 {d2-d4}, [r1], r2
+ vaddl.u8 q10, d1, d31
+ vst1.8 {q14}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+ pld [r1, r2]
+NRND vadd.u16 q12, q12, q13
+ vext.8 q2, q1, q2, #1
+ vadd.u16 q0, q10, q11
+ shrn d30, q12, #2
+NRND vadd.u16 q0, q0, q13
+ shrn d31, q0, #2
+ .if \avg
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q15, q15, q9
+ .endif
+ vaddl.u8 q9, d2, d4
+ vaddl.u8 q11, d3, d5
+ vst1.8 {q15}, [r0,:128], r2
+ bgt 1b
+
+ vld1.8 {d0-d2}, [r1], r2
+ vadd.u16 q12, q8, q9
+NRND vadd.u16 q12, q12, q13
+ vext.8 q15, q0, q1, #1
+ vadd.u16 q1 , q10, q11
+ shrn d28, q12, #2
+NRND vadd.u16 q1, q1, q13
+ shrn d29, q1, #2
+ .if \avg
+ vld1.8 {q8}, [r0,:128]
+ vrhadd.u8 q14, q14, q8
+ .endif
+ vaddl.u8 q8, d0, d30
+ vaddl.u8 q10, d1, d31
+ vst1.8 {q14}, [r0,:128], r2
+ vadd.u16 q12, q8, q9
+NRND vadd.u16 q12, q12, q13
+ vadd.u16 q0, q10, q11
+ shrn d30, q12, #2
+NRND vadd.u16 q0, q0, q13
+ shrn d31, q0, #2
+ .if \avg
+ vld1.8 {q9}, [r0,:128]
+ vrhadd.u8 q15, q15, q9
+ .endif
+ vst1.8 {q15}, [r0,:128], r2
+
+ bx lr
+.endm
+
+.macro pixels8 rnd=1, avg=0
+1: vld1.8 {d0}, [r1], r2
+ vld1.8 {d1}, [r1], r2
+ vld1.8 {d2}, [r1], r2
+ pld [r1, r2, lsl #2]
+ vld1.8 {d3}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ pld [r1, r2, lsl #1]
+ .if \avg
+ vld1.8 {d4}, [r0,:64], r2
+ vrhadd.u8 d0, d0, d4
+ vld1.8 {d5}, [r0,:64], r2
+ vrhadd.u8 d1, d1, d5
+ vld1.8 {d6}, [r0,:64], r2
+ vrhadd.u8 d2, d2, d6
+ vld1.8 {d7}, [r0,:64], r2
+ vrhadd.u8 d3, d3, d7
+ sub r0, r0, r2, lsl #2
+ .endif
+ subs r3, r3, #4
+ vst1.8 {d0}, [r0,:64], r2
+ vst1.8 {d1}, [r0,:64], r2
+ vst1.8 {d2}, [r0,:64], r2
+ vst1.8 {d3}, [r0,:64], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels8_x2 rnd=1, avg=0
+1: vld1.8 {q0}, [r1], r2
+ vext.8 d1, d0, d1, #1
+ vld1.8 {q1}, [r1], r2
+ vext.8 d3, d2, d3, #1
+ pld [r1]
+ pld [r1, r2]
+ subs r3, r3, #2
+ vswp d1, d2
+ avg q0, q0, q1
+ .if \avg
+ vld1.8 {d4}, [r0,:64], r2
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 q0, q0, q2
+ sub r0, r0, r2
+ .endif
+ vst1.8 {d0}, [r0,:64], r2
+ vst1.8 {d1}, [r0,:64], r2
+ bne 1b
+ bx lr
+.endm
+
+.macro pixels8_y2 rnd=1, avg=0
+ sub r3, r3, #2
+ vld1.8 {d0}, [r1], r2
+ vld1.8 {d1}, [r1], r2
+1: subs r3, r3, #2
+ avg d4, d0, d1
+ vld1.8 {d0}, [r1], r2
+ avg d5, d0, d1
+ vld1.8 {d1}, [r1], r2
+ pld [r1]
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {d2}, [r0,:64], r2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 q2, q2, q1
+ sub r0, r0, r2
+ .endif
+ vst1.8 {d4}, [r0,:64], r2
+ vst1.8 {d5}, [r0,:64], r2
+ bne 1b
+
+ avg d4, d0, d1
+ vld1.8 {d0}, [r1], r2
+ avg d5, d0, d1
+ .if \avg
+ vld1.8 {d2}, [r0,:64], r2
+ vld1.8 {d3}, [r0,:64]
+ vrhadd.u8 q2, q2, q1
+ sub r0, r0, r2
+ .endif
+ vst1.8 {d4}, [r0,:64], r2
+ vst1.8 {d5}, [r0,:64], r2
+
+ bx lr
+.endm
+
+.macro pixels8_xy2 rnd=1, avg=0
+ sub r3, r3, #2
+ vld1.8 {q0}, [r1], r2
+ vld1.8 {q1}, [r1], r2
+NRND vmov.i16 q11, #1
+ pld [r1]
+ pld [r1, r2]
+ vext.8 d4, d0, d1, #1
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q8, d0, d4
+ vaddl.u8 q9, d2, d6
+1: subs r3, r3, #2
+ vld1.8 {q0}, [r1], r2
+ pld [r1]
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+NRND vadd.u16 q10, q10, q11
+ vaddl.u8 q8, d0, d4
+ shrn d5, q10, #2
+ vld1.8 {q1}, [r1], r2
+ vadd.u16 q10, q8, q9
+ pld [r1, r2]
+ .if \avg
+ vld1.8 {d7}, [r0,:64]
+ vrhadd.u8 d5, d5, d7
+ .endif
+NRND vadd.u16 q10, q10, q11
+ vst1.8 {d5}, [r0,:64], r2
+ shrn d7, q10, #2
+ .if \avg
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 d7, d7, d5
+ .endif
+ vext.8 d6, d2, d3, #1
+ vaddl.u8 q9, d2, d6
+ vst1.8 {d7}, [r0,:64], r2
+ bgt 1b
+
+ vld1.8 {q0}, [r1], r2
+ vadd.u16 q10, q8, q9
+ vext.8 d4, d0, d1, #1
+NRND vadd.u16 q10, q10, q11
+ vaddl.u8 q8, d0, d4
+ shrn d5, q10, #2
+ vadd.u16 q10, q8, q9
+ .if \avg
+ vld1.8 {d7}, [r0,:64]
+ vrhadd.u8 d5, d5, d7
+ .endif
+NRND vadd.u16 q10, q10, q11
+ vst1.8 {d5}, [r0,:64], r2
+ shrn d7, q10, #2
+ .if \avg
+ vld1.8 {d5}, [r0,:64]
+ vrhadd.u8 d7, d7, d5
+ .endif
+ vst1.8 {d7}, [r0,:64], r2
+
+ bx lr
+.endm
+
+.macro pixfunc pfx, name, suf, rnd=1, avg=0
+ .if \rnd
+ .macro avg rd, rn, rm
+ vrhadd.u8 \rd, \rn, \rm
+ .endm
+ .macro shrn rd, rn, rm
+ vrshrn.u16 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg
+ .endm
+ .else
+ .macro avg rd, rn, rm
+ vhadd.u8 \rd, \rn, \rm
+ .endm
+ .macro shrn rd, rn, rm
+ vshrn.u16 \rd, \rn, \rm
+ .endm
+ .macro NRND insn:vararg
+ \insn
+ .endm
+ .endif
+function ff_\pfx\name\suf\()_neon, export=1
+ \name \rnd, \avg
+endfunc
+ .purgem avg
+ .purgem shrn
+ .purgem NRND
+.endm
+
+.macro pixfunc2 pfx, name, avg=0
+ pixfunc \pfx, \name, rnd=1, avg=\avg
+ pixfunc \pfx, \name, _no_rnd, rnd=0, avg=\avg
+.endm
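
The two flavours generated by pixfunc/pixfunc2 above differ only in rounding: the default binds avg/shrn to vrhadd.u8/vrshrn.u16 (round-to-nearest halving), while the _no_rnd variant binds them to vhadd.u8/vshrn.u16 and lets NRND emit the explicit +1 bias where the xy2 cases need it. A minimal scalar sketch of the horizontal (x2) case, with hypothetical names and purely for reference:

    #include <stddef.h>
    #include <stdint.h>

    /* (a + b + 1) >> 1 is the rounded average (vrhadd.u8); dropping the +1
     * gives the truncating average (vhadd.u8) used by the _no_rnd variants. */
    static uint8_t half(uint8_t a, uint8_t b, int rnd)
    {
        return (uint8_t)((a + b + rnd) >> 1);
    }

    /* Scalar model of put_pixels16_x2: each output pixel is the half-pel
     * average of two horizontally adjacent source pixels. */
    static void put_pixels16_x2_ref(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int h, int rnd)
    {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++)
                dst[x] = half(src[x], src[x + 1], rnd);
            src += stride;
            dst += stride;
        }
    }
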
+
+function ff_put_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+endfunc
+
+ pixfunc put_, pixels16, avg=0
+ pixfunc2 put_, pixels16_x2, avg=0
+ pixfunc2 put_, pixels16_y2, avg=0
+ pixfunc2 put_, pixels16_xy2, avg=0
+
+function ff_avg_h264_qpel16_mc00_neon, export=1
+ mov r3, #16
+endfunc
+
+ pixfunc avg_, pixels16, avg=1
+ pixfunc2 avg_, pixels16_x2, avg=1
+ pixfunc2 avg_, pixels16_y2, avg=1
+ pixfunc2 avg_, pixels16_xy2, avg=1
+
+function ff_put_h264_qpel8_mc00_neon, export=1
+ mov r3, #8
+endfunc
+
+ pixfunc put_, pixels8, avg=0
+ pixfunc2 put_, pixels8_x2, avg=0
+ pixfunc2 put_, pixels8_y2, avg=0
+ pixfunc2 put_, pixels8_xy2, avg=0
+
+function ff_avg_h264_qpel8_mc00_neon, export=1
+ mov r3, #8
+endfunc
+
+ pixfunc avg_, pixels8, avg=1
+ pixfunc avg_, pixels8_x2, avg=1
+ pixfunc avg_, pixels8_y2, avg=1
+ pixfunc avg_, pixels8_xy2, avg=1
diff --git a/ffmpeg/libavcodec/arm/int_neon.S b/ffmpeg/libavcodec/arm/int_neon.S
new file mode 100644
index 0000000..6b28a97
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/int_neon.S
@@ -0,0 +1,92 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+ .fpu neon
+
+function ff_scalarproduct_int16_neon, export=1
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+1: vld1.16 {d16-d17}, [r0]!
+ vld1.16 {d20-d21}, [r1,:128]!
+ vmlal.s16 q0, d16, d20
+ vld1.16 {d18-d19}, [r0]!
+ vmlal.s16 q1, d17, d21
+ vld1.16 {d22-d23}, [r1,:128]!
+ vmlal.s16 q2, d18, d22
+ vmlal.s16 q3, d19, d23
+ subs r2, r2, #16
+ bne 1b
+
+ vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+ vld1.16 {d28[],d29[]}, [sp]
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ mov r12, r0
+
+1: vld1.16 {d16-d17}, [r0,:128]!
+ vld1.16 {d18-d19}, [r1]!
+ vld1.16 {d20-d21}, [r2]!
+ vld1.16 {d22-d23}, [r0,:128]!
+ vld1.16 {d24-d25}, [r1]!
+ vld1.16 {d26-d27}, [r2]!
+ vmul.s16 q10, q10, q14
+ vmul.s16 q13, q13, q14
+ vmlal.s16 q0, d16, d18
+ vmlal.s16 q1, d17, d19
+ vadd.s16 q10, q8, q10
+ vadd.s16 q13, q11, q13
+ vmlal.s16 q2, d22, d24
+ vmlal.s16 q3, d23, d25
+ vst1.16 {q10}, [r12,:128]!
+ subs r3, r3, #16
+ vst1.16 {q13}, [r12,:128]!
+ bne 1b
+
+ vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
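
As a plain-C sketch of what the two routines above compute (names are illustrative; the length is assumed to be a multiple of 16, matching the subs #16 loop step): the first returns the dot product of two int16 vectors, the second accumulates the dot product of v1 and v2 while updating v1 in place with mul * v3.

    #include <stdint.h>

    static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                           int order)
    {
        int32_t res = 0;
        while (order--)
            res += *v1++ * *v2++;
        return res;
    }

    static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                                    const int16_t *v3,
                                                    int order, int mul)
    {
        int32_t res = 0;
        while (order--) {
            res   += *v1 * *v2++;   /* dot product uses the pre-update v1 */
            *v1++ += mul * *v3++;   /* in-place multiply-accumulate       */
        }
        return res;
    }
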
diff --git a/ffmpeg/libavcodec/arm/jrevdct_arm.S b/ffmpeg/libavcodec/arm/jrevdct_arm.S
new file mode 100644
index 0000000..f951e2a
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/jrevdct_arm.S
@@ -0,0 +1,383 @@
+/*
+   C-like prototype:
+ void j_rev_dct_arm(DCTBLOCK data)
+
+ With DCTBLOCK being a pointer to an array of 64 'signed shorts'
+
+ Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org)
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+#include "libavutil/arm/asm.S"
+
+#define FIX_0_298631336 2446
+#define FIX_0_541196100 4433
+#define FIX_0_765366865 6270
+#define FIX_1_175875602 9633
+#define FIX_1_501321110 12299
+#define FIX_2_053119869 16819
+#define FIX_3_072711026 25172
+#define FIX_M_0_390180644 -3196
+#define FIX_M_0_899976223 -7373
+#define FIX_M_1_847759065 -15137
+#define FIX_M_1_961570560 -16069
+#define FIX_M_2_562915447 -20995
+#define FIX_0xFFFF 0xFFFF
+
+#define FIX_0_298631336_ID 0
+#define FIX_0_541196100_ID 4
+#define FIX_0_765366865_ID 8
+#define FIX_1_175875602_ID 12
+#define FIX_1_501321110_ID 16
+#define FIX_2_053119869_ID 20
+#define FIX_3_072711026_ID 24
+#define FIX_M_0_390180644_ID 28
+#define FIX_M_0_899976223_ID 32
+#define FIX_M_1_847759065_ID 36
+#define FIX_M_1_961570560_ID 40
+#define FIX_M_2_562915447_ID 44
+#define FIX_0xFFFF_ID 48
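
These are the usual jpeglib-style fixed-point constants: each FIX_* value is the corresponding real coefficient scaled by 2^13 (CONST_BITS = 13) and rounded, and the *_ID defines are byte offsets into the const_array table at the end of this file. A small sketch of the convention, matching the "add #(1<<10) / asr #11" descaling of the first pass and the "add #(1<<17) / asr #18" of the second:

    /* Illustrative only -- the asm below hard-codes the resulting values. */
    #define CONST_BITS 13
    #define PASS1_BITS 2
    #define FIX(x)        ((int)((x) * (1 << CONST_BITS) + 0.5)) /* FIX(0.541196100) == 4433 */
    #define DESCALE(x, n) (((x) + (1 << ((n) - 1))) >> (n))      /* n = 11 in pass 1, 18 in pass 2 */
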
+
+function ff_j_rev_dct_arm, export=1
+ push {r0, r4 - r11, lr}
+
+ mov lr, r0 @ lr = pointer to the current row
+ mov r12, #8 @ r12 = row-counter
+ movrel r11, const_array @ r11 = base pointer to the constants array
+row_loop:
+ ldrsh r0, [lr, # 0] @ r0 = 'd0'
+ ldrsh r2, [lr, # 2] @ r2 = 'd2'
+
+        @ Optimization for rows that have all items except the first set to 0
+        @ (this works because the int16_t values are always 4-byte aligned)
+ ldr r5, [lr, # 0]
+ ldr r6, [lr, # 4]
+ ldr r3, [lr, # 8]
+ ldr r4, [lr, #12]
+ orr r3, r3, r4
+ orr r3, r3, r6
+ orrs r5, r3, r5
+ beq end_of_row_loop @ nothing to be done as ALL of them are '0'
+ orrs r3, r3, r2
+ beq empty_row
+
+ ldrsh r1, [lr, # 8] @ r1 = 'd1'
+ ldrsh r4, [lr, # 4] @ r4 = 'd4'
+ ldrsh r6, [lr, # 6] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r7, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r7, r3, r7 @ r7 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r7 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r7 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r3, r6, r3, lsl #13 @ r3 = tmp12
+
+ push {r0, r2, r3, r4} @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ ldrsh r3, [lr, #10] @ r3 = 'd3'
+ ldrsh r5, [lr, #12] @ r5 = 'd5'
+ ldrsh r7, [lr, #14] @ r7 = 'd7'
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6 @ r8 = z3 + z4
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS)
+ add r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 0]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS)
+ sub r8, r0, r1
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #14]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS)
+ add r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 2]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS)
+ sub r8, r6, r3
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #12]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS)
+ add r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 4]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS)
+ sub r8, r4, r5
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, #10]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS)
+ add r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 6]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS)
+ sub r8, r2, r7
+ add r8, r8, #(1<<10)
+ mov r8, r8, asr #11
+ strh r8, [lr, # 8]
+
+ @ End of row loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
+ beq start_column_loop
+
+empty_row:
+ ldr r1, [r11, #FIX_0xFFFF_ID]
+ mov r0, r0, lsl #2
+ and r0, r0, r1
+ add r0, r0, r0, lsl #16
+ str r0, [lr, # 0]
+ str r0, [lr, # 4]
+ str r0, [lr, # 8]
+ str r0, [lr, #12]
+
+end_of_row_loop:
+ @ End of loop
+ add lr, lr, #16
+ subs r12, r12, #1
+ bne row_loop
+
+start_column_loop:
+ @ Start of column loop
+ pop {lr}
+ mov r12, #8
+column_loop:
+ ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0'
+ ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2'
+ ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4'
+ ldrsh r6, [lr, #(12*8)] @ r6 = 'd6'
+
+ ldr r3, [r11, #FIX_0_541196100_ID]
+ add r1, r2, r6
+ ldr r5, [r11, #FIX_M_1_847759065_ID]
+ mul r1, r3, r1 @ r1 = z1
+ ldr r3, [r11, #FIX_0_765366865_ID]
+ mla r6, r5, r6, r1 @ r6 = tmp2
+ add r5, r0, r4 @ r5 = tmp0
+ mla r2, r3, r2, r1 @ r2 = tmp3
+ sub r3, r0, r4 @ r3 = tmp1
+
+ add r0, r2, r5, lsl #13 @ r0 = tmp10
+ rsb r2, r2, r5, lsl #13 @ r2 = tmp13
+ add r4, r6, r3, lsl #13 @ r4 = tmp11
+ rsb r6, r6, r3, lsl #13 @ r6 = tmp12
+
+ ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1'
+ ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3'
+ ldrsh r5, [lr, #(10*8)] @ r5 = 'd5'
+ ldrsh r7, [lr, #(14*8)] @ r7 = 'd7'
+
+        @ Check for empty odd column (happens about 20 to 25% of the time according to my stats)
+ orr r9, r1, r3
+ orr r10, r5, r7
+ orrs r10, r9, r10
+ beq empty_odd_column
+
+ push {r0, r2, r4, r6} @ save on the stack tmp10, tmp13, tmp12, tmp11
+
+ add r0, r3, r5 @ r0 = 'z2'
+ add r2, r1, r7 @ r2 = 'z1'
+ add r4, r3, r7 @ r4 = 'z3'
+ add r6, r1, r5 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_1_175875602_ID]
+ add r8, r4, r6
+ ldr r10, [r11, #FIX_M_0_899976223_ID]
+ mul r8, r9, r8 @ r8 = 'z5'
+ ldr r9, [r11, #FIX_M_2_562915447_ID]
+ mul r2, r10, r2 @ r2 = 'z1'
+ ldr r10, [r11, #FIX_M_1_961570560_ID]
+ mul r0, r9, r0 @ r0 = 'z2'
+ ldr r9, [r11, #FIX_M_0_390180644_ID]
+ mla r4, r10, r4, r8 @ r4 = 'z3'
+ ldr r10, [r11, #FIX_0_298631336_ID]
+ mla r6, r9, r6, r8 @ r6 = 'z4'
+ ldr r9, [r11, #FIX_2_053119869_ID]
+ mla r7, r10, r7, r2 @ r7 = tmp0 + z1
+ ldr r10, [r11, #FIX_3_072711026_ID]
+ mla r5, r9, r5, r0 @ r5 = tmp1 + z2
+ ldr r9, [r11, #FIX_1_501321110_ID]
+ mla r3, r10, r3, r0 @ r3 = tmp2 + z2
+ add r7, r7, r4 @ r7 = tmp0
+ mla r1, r9, r1, r2 @ r1 = tmp3 + z1
+ add r5, r5, r6 @ r5 = tmp1
+ add r3, r3, r4 @ r3 = tmp2
+ add r1, r1, r6 @ r1 = tmp3
+
+ pop {r0, r2, r4, r6} @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12
+ @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0
+
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ add r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 0*8)]
+
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ sub r8, r0, r1
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ add r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 2*8)]
+
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ sub r8, r4, r3
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ add r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 4*8)]
+
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ sub r8, r6, r5
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ add r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 6*8)]
+
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ sub r8, r2, r7
+ add r8, r8, #(1<<17)
+ mov r8, r8, asr #18
+ strh r8, [lr, #( 8*8)]
+
+        @ End of column loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+ beq the_end
+
+empty_odd_column:
+ @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3)
+ add r0, r0, #(1<<17)
+ mov r0, r0, asr #18
+ strh r0, [lr, #( 0*8)]
+ strh r0, [lr, #(14*8)]
+
+ @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3)
+ add r4, r4, #(1<<17)
+ mov r4, r4, asr #18
+ strh r4, [lr, #( 2*8)]
+ strh r4, [lr, #(12*8)]
+
+ @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3)
+ add r6, r6, #(1<<17)
+ mov r6, r6, asr #18
+ strh r6, [lr, #( 4*8)]
+ strh r6, [lr, #(10*8)]
+
+ @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3)
+ @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3)
+ add r2, r2, #(1<<17)
+ mov r2, r2, asr #18
+ strh r2, [lr, #( 6*8)]
+ strh r2, [lr, #( 8*8)]
+
+        @ End of column loop
+ add lr, lr, #2
+ subs r12, r12, #1
+ bne column_loop
+
+the_end:
+ @ The end....
+ pop {r4 - r11, pc}
+endfunc
+
+const const_array
+ .word FIX_0_298631336
+ .word FIX_0_541196100
+ .word FIX_0_765366865
+ .word FIX_1_175875602
+ .word FIX_1_501321110
+ .word FIX_2_053119869
+ .word FIX_3_072711026
+ .word FIX_M_0_390180644
+ .word FIX_M_0_899976223
+ .word FIX_M_1_847759065
+ .word FIX_M_1_961570560
+ .word FIX_M_2_562915447
+ .word FIX_0xFFFF
+endconst
diff --git a/ffmpeg/libavcodec/arm/mathops.h b/ffmpeg/libavcodec/arm/mathops.h
new file mode 100644
index 0000000..dc57c55
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mathops.h
@@ -0,0 +1,108 @@
+/*
+ * simple math operations
+ * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MATHOPS_H
+#define AVCODEC_ARM_MATHOPS_H
+
+#include <stdint.h>
+#include "config.h"
+#include "libavutil/common.h"
+
+#if HAVE_INLINE_ASM
+
+#if HAVE_ARMV6_INLINE
+#define MULH MULH
+static inline av_const int MULH(int a, int b)
+{
+ int r;
+ __asm__ ("smmul %0, %1, %2" : "=r"(r) : "r"(a), "r"(b));
+ return r;
+}
+
+#define FASTDIV FASTDIV
+static av_always_inline av_const int FASTDIV(int a, int b)
+{
+ int r;
+ __asm__ ("cmp %2, #2 \n\t"
+ "ldr %0, [%3, %2, lsl #2] \n\t"
+ "ite le \n\t"
+ "lsrle %0, %1, #1 \n\t"
+ "smmulgt %0, %0, %1 \n\t"
+ : "=&r"(r) : "r"(a), "r"(b), "r"(ff_inverse) : "cc");
+ return r;
+}
+
+#else /* HAVE_ARMV6_INLINE */
+
+#define FASTDIV FASTDIV
+static av_always_inline av_const int FASTDIV(int a, int b)
+{
+ int r, t;
+ __asm__ ("umull %1, %0, %2, %3"
+ : "=&r"(r), "=&r"(t) : "r"(a), "r"(ff_inverse[b]));
+ return r;
+}
+#endif
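
For reference, the semantics of the two inline-asm helpers above in portable C (a sketch mirroring FFmpeg's generic C fallbacks; the ARMv6 FASTDIV additionally special-cases very small divisors):

    #include <stdint.h>

    extern const uint32_t ff_inverse[];   /* FFmpeg's reciprocal table, as used above */

    static inline int mulh_ref(int a, int b)
    {
        return (int)(((int64_t)a * b) >> 32);   /* what smmul computes */
    }

    static inline int fastdiv_ref(int a, int b)
    {
        /* ff_inverse[b] approximates 2^32 / b, so the high 32 bits of the
         * 64-bit product give a / b for the ranges FASTDIV is used with. */
        return (int)(uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32);
    }
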
+
+#define MLS64(d, a, b) MAC64(d, -(a), b)
+
+#if HAVE_ARMV5TE_INLINE
+
+/* signed 16x16 -> 32 multiply add accumulate */
+# define MAC16(rt, ra, rb) \
+ __asm__ ("smlabb %0, %1, %2, %0" : "+r"(rt) : "r"(ra), "r"(rb));
+
+/* signed 16x16 -> 32 multiply */
+# define MUL16 MUL16
+static inline av_const int MUL16(int ra, int rb)
+{
+ int rt;
+ __asm__ ("smulbb %0, %1, %2" : "=r"(rt) : "r"(ra), "r"(rb));
+ return rt;
+}
+
+#endif
+
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+ int m;
+ __asm__ (
+ "mov %0, %2 \n\t"
+ "cmp %1, %2 \n\t"
+ "itt gt \n\t"
+ "movgt %0, %1 \n\t"
+ "movgt %1, %2 \n\t"
+ "cmp %1, %3 \n\t"
+ "it le \n\t"
+ "movle %1, %3 \n\t"
+ "cmp %0, %1 \n\t"
+ "it gt \n\t"
+ "movgt %0, %1 \n\t"
+ : "=&r"(m), "+r"(a)
+ : "r"(b), "r"(c)
+ : "cc");
+ return m;
+}
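
The conditional-move sequence above computes the median of its three arguments; a straightforward scalar sketch, for reference only:

    static inline int mid_pred_ref(int a, int b, int c)
    {
        /* median of three values */
        if (a > b) { int t = a; a = b; b = t; } /* order so that a <= b     */
        if (b > c) b = c;                       /* b = min(larger value, c) */
        return a > b ? a : b;                   /* the middle value         */
    }
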
+
+#endif /* HAVE_INLINE_ASM */
+
+#endif /* AVCODEC_ARM_MATHOPS_H */
diff --git a/ffmpeg/libavcodec/arm/mdct_fixed_neon.S b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
new file mode 100644
index 0000000..c77be59
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mdct_fixed_neon.S
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro prerot dst, rt
+ lsr r3, r6, #2 @ n4
+ add \rt, r4, r6, lsr #1 @ revtab + n4
+ add r9, r3, r3, lsl #1 @ n3
+ add r8, r7, r6 @ tcos + n4
+ add r3, r2, r6, lsr #1 @ in + n4
+ add r9, r2, r9, lsl #1 @ in + n3
+ sub r8, r8, #16
+ sub r10, r3, #16
+ sub r11, r9, #16
+ mov r12, #-16
+1:
+ vld2.16 {d0,d1}, [r9, :128]!
+ vld2.16 {d2,d3}, [r11,:128], r12
+ vld2.16 {d4,d5}, [r3, :128]!
+ vld2.16 {d6,d7}, [r10,:128], r12
+ vld2.16 {d16,d17},[r7, :128]! @ cos, sin
+ vld2.16 {d18,d19},[r8, :128], r12
+ vrev64.16 q1, q1
+ vrev64.16 q3, q3
+ vrev64.16 q9, q9
+ vneg.s16 d0, d0
+ vneg.s16 d2, d2
+ vneg.s16 d16, d16
+ vneg.s16 d18, d18
+ vhsub.s16 d0, d0, d3 @ re
+ vhsub.s16 d4, d7, d4 @ im
+ vhsub.s16 d6, d6, d5
+ vhsub.s16 d2, d2, d1
+ vmull.s16 q10, d0, d16
+ vmlsl.s16 q10, d4, d17
+ vmull.s16 q11, d0, d17
+ vmlal.s16 q11, d4, d16
+ vmull.s16 q12, d6, d18
+ vmlsl.s16 q12, d2, d19
+ vmull.s16 q13, d6, d19
+ vmlal.s16 q13, d2, d18
+ vshrn.s32 d0, q10, #15
+ vshrn.s32 d1, q11, #15
+ vshrn.s32 d2, q12, #15
+ vshrn.s32 d3, q13, #15
+ vzip.16 d0, d1
+ vzip.16 d2, d3
+ ldrh lr, [r4], #2
+ ldrh r2, [\rt, #-2]!
+ add lr, \dst, lr, lsl #2
+ add r2, \dst, r2, lsl #2
+ vst1.32 {d0[0]}, [lr,:32]
+ vst1.32 {d2[0]}, [r2,:32]
+ ldrh lr, [r4], #2
+ ldrh r2, [\rt, #-2]!
+ add lr, \dst, lr, lsl #2
+ add r2, \dst, r2, lsl #2
+ vst1.32 {d0[1]}, [lr,:32]
+ vst1.32 {d2[1]}, [r2,:32]
+ ldrh lr, [r4], #2
+ ldrh r2, [\rt, #-2]!
+ add lr, \dst, lr, lsl #2
+ add r2, \dst, r2, lsl #2
+ vst1.32 {d1[0]}, [lr,:32]
+ vst1.32 {d3[0]}, [r2,:32]
+ ldrh lr, [r4], #2
+ ldrh r2, [\rt, #-2]!
+ add lr, \dst, lr, lsl #2
+ add r2, \dst, r2, lsl #2
+ vst1.32 {d1[1]}, [lr,:32]
+ vst1.32 {d3[1]}, [r2,:32]
+ subs r6, r6, #32
+ bgt 1b
+.endm
+
+function ff_mdct_fixed_calc_neon, export=1
+ push {r1,r4-r11,lr}
+
+ ldr r4, [r0, #8] @ revtab
+ ldr r6, [r0, #16] @ mdct_size; n
+ ldr r7, [r0, #24] @ tcos
+
+ prerot r1, r5
+
+ mov r4, r0
+ bl X(ff_fft_fixed_calc_neon)
+
+ pop {r5}
+ mov r12, #-16
+ ldr r6, [r4, #16] @ mdct_size; n
+ ldr r7, [r4, #24] @ tcos
+ add r5, r5, r6, lsr #1
+ add r7, r7, r6, lsr #1
+ sub r1, r5, #16
+ sub r2, r7, #16
+1:
+ vld2.16 {d4,d5}, [r7,:128]!
+ vld2.16 {d6,d7}, [r2,:128], r12
+ vld2.16 {d0,d1}, [r5,:128]
+ vld2.16 {d2,d3}, [r1,:128]
+ vrev64.16 q3, q3
+ vrev64.16 q1, q1
+ vneg.s16 q3, q3
+ vneg.s16 q2, q2
+ vmull.s16 q11, d2, d6
+ vmlal.s16 q11, d3, d7
+ vmull.s16 q8, d0, d5
+ vmlsl.s16 q8, d1, d4
+ vmull.s16 q9, d0, d4
+ vmlal.s16 q9, d1, d5
+ vmull.s16 q10, d2, d7
+ vmlsl.s16 q10, d3, d6
+ vshrn.s32 d0, q11, #15
+ vshrn.s32 d1, q8, #15
+ vshrn.s32 d2, q9, #15
+ vshrn.s32 d3, q10, #15
+ vrev64.16 q0, q0
+ vst2.16 {d2,d3}, [r5,:128]!
+ vst2.16 {d0,d1}, [r1,:128], r12
+ subs r6, r6, #32
+ bgt 1b
+
+ pop {r4-r11,pc}
+endfunc
+
+function ff_mdct_fixed_calcw_neon, export=1
+ push {r1,r4-r11,lr}
+
+ ldrd r4, r5, [r0, #8] @ revtab, tmp_buf
+ ldr r6, [r0, #16] @ mdct_size; n
+ ldr r7, [r0, #24] @ tcos
+
+ prerot r5, r1
+
+ mov r4, r0
+ mov r1, r5
+ bl X(ff_fft_fixed_calc_neon)
+
+ pop {r7}
+ mov r12, #-16
+ ldr r6, [r4, #16] @ mdct_size; n
+ ldr r9, [r4, #24] @ tcos
+ add r5, r5, r6, lsr #1
+ add r7, r7, r6
+ add r9, r9, r6, lsr #1
+ sub r3, r5, #16
+ sub r1, r7, #16
+ sub r2, r9, #16
+1:
+ vld2.16 {d4,d5}, [r9,:128]!
+ vld2.16 {d6,d7}, [r2,:128], r12
+ vld2.16 {d0,d1}, [r5,:128]!
+ vld2.16 {d2,d3}, [r3,:128], r12
+ vrev64.16 q3, q3
+ vrev64.16 q1, q1
+ vneg.s16 q3, q3
+ vneg.s16 q2, q2
+ vmull.s16 q8, d2, d6
+ vmlal.s16 q8, d3, d7
+ vmull.s16 q9, d0, d5
+ vmlsl.s16 q9, d1, d4
+ vmull.s16 q10, d0, d4
+ vmlal.s16 q10, d1, d5
+ vmull.s16 q11, d2, d7
+ vmlsl.s16 q11, d3, d6
+ vrev64.32 q8, q8
+ vrev64.32 q9, q9
+ vst2.32 {q10,q11},[r7,:128]!
+ vst2.32 {d16,d18},[r1,:128], r12
+ vst2.32 {d17,d19},[r1,:128], r12
+ subs r6, r6, #32
+ bgt 1b
+
+ pop {r4-r11,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/mdct_neon.S b/ffmpeg/libavcodec/arm/mdct_neon.S
new file mode 100644
index 0000000..e481cd1
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mdct_neon.S
@@ -0,0 +1,301 @@
+/*
+ * ARM NEON optimised MDCT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define ff_fft_calc_neon X(ff_fft_calc_neon)
+
+function ff_imdct_half_neon, export=1
+ push {r4-r8,lr}
+
+ mov r12, #1
+ ldr lr, [r0, #20] @ mdct_bits
+ ldr r4, [r0, #24] @ tcos
+ ldr r3, [r0, #8] @ revtab
+ lsl r12, r12, lr @ n = 1 << nbits
+ lsr lr, r12, #2 @ n4 = n >> 2
+ add r7, r2, r12, lsl #1
+ mov r12, #-16
+ sub r7, r7, #16
+
+ vld2.32 {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
+ vld2.32 {d0-d1}, [r2,:128]! @ d0 =m0,x d1 =m1,x
+ vrev64.32 d17, d17
+ vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
+ vmul.f32 d6, d17, d2
+ vmul.f32 d7, d0, d2
+1:
+ subs lr, lr, #2
+ ldr r6, [r3], #4
+ vmul.f32 d4, d0, d3
+ vmul.f32 d5, d17, d3
+ vsub.f32 d4, d6, d4
+ vadd.f32 d5, d5, d7
+ uxth r8, r6, ror #16
+ uxth r6, r6
+ add r8, r1, r8, lsl #3
+ add r6, r1, r6, lsl #3
+ beq 1f
+ vld2.32 {d16-d17},[r7,:128],r12
+ vld2.32 {d0-d1}, [r2,:128]!
+ vrev64.32 d17, d17
+ vld2.32 {d2,d3}, [r4,:128]! @ d2=c0,c1 d3=s0,s2
+ vmul.f32 d6, d17, d2
+ vmul.f32 d7, d0, d2
+ vst2.32 {d4[0],d5[0]}, [r6,:64]
+ vst2.32 {d4[1],d5[1]}, [r8,:64]
+ b 1b
+1:
+ vst2.32 {d4[0],d5[0]}, [r6,:64]
+ vst2.32 {d4[1],d5[1]}, [r8,:64]
+
+ mov r4, r0
+ mov r6, r1
+ bl ff_fft_calc_neon
+
+ mov r12, #1
+ ldr lr, [r4, #20] @ mdct_bits
+ ldr r4, [r4, #24] @ tcos
+ lsl r12, r12, lr @ n = 1 << nbits
+ lsr lr, r12, #3 @ n8 = n >> 3
+
+ add r4, r4, lr, lsl #3
+ add r6, r6, lr, lsl #3
+ sub r1, r4, #16
+ sub r3, r6, #16
+
+ mov r7, #-16
+ mov r8, r6
+ mov r0, r3
+
+ vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
+ vld2.32 {d20-d21},[r6,:128]! @ d20=i2,r2 d21=i3,r3
+ vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
+1:
+ subs lr, lr, #2
+ vmul.f32 d7, d0, d18
+ vld2.32 {d17,d19},[r4,:128]! @ d17=c2,c3 d19=s2,s3
+ vmul.f32 d4, d1, d18
+ vmul.f32 d5, d21, d19
+ vmul.f32 d6, d20, d19
+ vmul.f32 d22, d1, d16
+ vmul.f32 d23, d21, d17
+ vmul.f32 d24, d0, d16
+ vmul.f32 d25, d20, d17
+ vadd.f32 d7, d7, d22
+ vadd.f32 d6, d6, d23
+ vsub.f32 d4, d4, d24
+ vsub.f32 d5, d5, d25
+ beq 1f
+ vld2.32 {d0-d1}, [r3,:128], r7
+ vld2.32 {d20-d21},[r6,:128]!
+ vld2.32 {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
+ vrev64.32 q3, q3
+ vst2.32 {d4,d6}, [r0,:128], r7
+ vst2.32 {d5,d7}, [r8,:128]!
+ b 1b
+1:
+ vrev64.32 q3, q3
+ vst2.32 {d4,d6}, [r0,:128]
+ vst2.32 {d5,d7}, [r8,:128]
+
+ pop {r4-r8,pc}
+endfunc
+
+function ff_imdct_calc_neon, export=1
+ push {r4-r6,lr}
+
+ ldr r3, [r0, #20]
+ mov r4, #1
+ mov r5, r1
+ lsl r4, r4, r3
+ add r1, r1, r4
+
+ bl ff_imdct_half_neon
+
+ add r0, r5, r4, lsl #2
+ add r1, r5, r4, lsl #1
+ sub r0, r0, #8
+ sub r2, r1, #16
+ mov r3, #-16
+ mov r6, #-8
+ vmov.i32 d30, #1<<31
+1:
+ vld1.32 {d0-d1}, [r2,:128], r3
+ pld [r0, #-16]
+ vrev64.32 q0, q0
+ vld1.32 {d2-d3}, [r1,:128]!
+ veor d4, d1, d30
+ pld [r2, #-16]
+ vrev64.32 q1, q1
+ veor d5, d0, d30
+ vst1.32 {d2}, [r0,:64], r6
+ vst1.32 {d3}, [r0,:64], r6
+ vst1.32 {d4-d5}, [r5,:128]!
+ subs r4, r4, #16
+ bgt 1b
+
+ pop {r4-r6,pc}
+endfunc
+
+function ff_mdct_calc_neon, export=1
+ push {r4-r10,lr}
+
+ mov r12, #1
+ ldr lr, [r0, #20] @ mdct_bits
+ ldr r4, [r0, #24] @ tcos
+ ldr r3, [r0, #8] @ revtab
+ lsl lr, r12, lr @ n = 1 << nbits
+ add r7, r2, lr @ in4u
+ sub r9, r7, #16 @ in4d
+ add r2, r7, lr, lsl #1 @ in3u
+ add r8, r9, lr, lsl #1 @ in3d
+ add r5, r4, lr, lsl #1
+ sub r5, r5, #16
+ sub r3, r3, #4
+ mov r12, #-16
+
+ vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
+ vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
+ vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
+ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
+ vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
+ vsub.f32 d0, d18, d0 @ in4d-in4u I
+ vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
+ vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
+ vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
+ vadd.f32 d1, d1, d19 @ in3u+in3d -R
+ vsub.f32 d16, d16, d2 @ in0u-in2d R
+ vadd.f32 d17, d17, d3 @ in2u+in1d -I
+1:
+ vmul.f32 d7, d0, d21 @ I*s
+A ldr r10, [r3, lr, lsr #1]
+T lsr r10, lr, #1
+T ldr r10, [r3, r10]
+ vmul.f32 d6, d1, d20 @ -R*c
+ ldr r6, [r3, #4]!
+ vmul.f32 d4, d1, d21 @ -R*s
+ vmul.f32 d5, d0, d20 @ I*c
+ vmul.f32 d24, d16, d30 @ R*c
+ vmul.f32 d25, d17, d31 @ -I*s
+ vmul.f32 d22, d16, d31 @ R*s
+ vmul.f32 d23, d17, d30 @ I*c
+ subs lr, lr, #16
+ vsub.f32 d6, d6, d7 @ -R*c-I*s
+ vadd.f32 d7, d4, d5 @ -R*s+I*c
+ vsub.f32 d24, d25, d24 @ I*s-R*c
+ vadd.f32 d25, d22, d23 @ R*s-I*c
+ beq 1f
+ mov r12, #-16
+ vld2.32 {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
+ vld2.32 {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
+ vneg.f32 d7, d7 @ R*s-I*c
+ vld2.32 {d0, d2}, [r7,:128]! @ in4u0,in4u1 in2d1,in2d0
+ vrev64.32 q9, q9 @ in4d0,in4d1 in3d0,in3d1
+ vld2.32 {d1, d3}, [r2,:128]! @ in3u0,in3u1 in1d1,in1d0
+ vsub.f32 d0, d18, d0 @ in4d-in4u I
+ vld2.32 {d20,d21},[r4,:128]! @ c0,c1 s0,s1
+ vrev64.32 q1, q1 @ in2d0,in2d1 in1d0,in1d1
+ vld2.32 {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
+ vadd.f32 d1, d1, d19 @ in3u+in3d -R
+ vsub.f32 d16, d16, d2 @ in0u-in2d R
+ vadd.f32 d17, d17, d3 @ in2u+in1d -I
+ uxth r12, r6, ror #16
+ uxth r6, r6
+ add r12, r1, r12, lsl #3
+ add r6, r1, r6, lsl #3
+ vst2.32 {d6[0],d7[0]}, [r6,:64]
+ vst2.32 {d6[1],d7[1]}, [r12,:64]
+ uxth r6, r10, ror #16
+ uxth r10, r10
+ add r6 , r1, r6, lsl #3
+ add r10, r1, r10, lsl #3
+ vst2.32 {d24[0],d25[0]},[r10,:64]
+ vst2.32 {d24[1],d25[1]},[r6,:64]
+ b 1b
+1:
+ vneg.f32 d7, d7 @ R*s-I*c
+ uxth r12, r6, ror #16
+ uxth r6, r6
+ add r12, r1, r12, lsl #3
+ add r6, r1, r6, lsl #3
+ vst2.32 {d6[0],d7[0]}, [r6,:64]
+ vst2.32 {d6[1],d7[1]}, [r12,:64]
+ uxth r6, r10, ror #16
+ uxth r10, r10
+ add r6 , r1, r6, lsl #3
+ add r10, r1, r10, lsl #3
+ vst2.32 {d24[0],d25[0]},[r10,:64]
+ vst2.32 {d24[1],d25[1]},[r6,:64]
+
+ mov r4, r0
+ mov r6, r1
+ bl ff_fft_calc_neon
+
+ mov r12, #1
+ ldr lr, [r4, #20] @ mdct_bits
+ ldr r4, [r4, #24] @ tcos
+ lsl r12, r12, lr @ n = 1 << nbits
+ lsr lr, r12, #3 @ n8 = n >> 3
+
+ add r4, r4, lr, lsl #3
+ add r6, r6, lr, lsl #3
+ sub r1, r4, #16
+ sub r3, r6, #16
+
+ mov r7, #-16
+ mov r8, r6
+ mov r0, r3
+
+ vld2.32 {d0-d1}, [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
+ vld2.32 {d20-d21},[r6,:128]! @ d20=r2,i2 d21=r3,i3
+ vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
+1:
+ subs lr, lr, #2
+ vmul.f32 d7, d0, d18 @ r1*s1,r0*s0
+ vld2.32 {d17,d19},[r4,:128]! @ c2,c3 s2,s3
+ vmul.f32 d4, d1, d18 @ i1*s1,i0*s0
+ vmul.f32 d5, d21, d19 @ i2*s2,i3*s3
+ vmul.f32 d6, d20, d19 @ r2*s2,r3*s3
+ vmul.f32 d24, d0, d16 @ r1*c1,r0*c0
+ vmul.f32 d25, d20, d17 @ r2*c2,r3*c3
+ vmul.f32 d22, d21, d17 @ i2*c2,i3*c3
+ vmul.f32 d23, d1, d16 @ i1*c1,i0*c0
+ vadd.f32 d4, d4, d24 @ i1*s1+r1*c1,i0*s0+r0*c0
+ vadd.f32 d5, d5, d25 @ i2*s2+r2*c2,i3*s3+r3*c3
+ vsub.f32 d6, d22, d6 @ i2*c2-r2*s2,i3*c3-r3*s3
+ vsub.f32 d7, d23, d7 @ i1*c1-r1*s1,i0*c0-r0*s0
+ vneg.f32 q2, q2
+ beq 1f
+ vld2.32 {d0-d1}, [r3,:128], r7
+ vld2.32 {d20-d21},[r6,:128]!
+ vld2.32 {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
+ vrev64.32 q3, q3
+ vst2.32 {d4,d6}, [r0,:128], r7
+ vst2.32 {d5,d7}, [r8,:128]!
+ b 1b
+1:
+ vrev64.32 q3, q3
+ vst2.32 {d4,d6}, [r0,:128]
+ vst2.32 {d5,d7}, [r8,:128]
+
+ pop {r4-r10,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
new file mode 100644
index 0000000..49bd0bc
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegaudiodsp_fixed_armv6.S
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro skip args:vararg
+.endm
+
+.macro sum8 lo, hi, w, p, t1, t2, t3, t4, rsb=skip, offs=0
+ ldr \t1, [\w, #4*\offs]
+ ldr \t2, [\p, #4]!
+ \rsb \t1, \t1, #0
+ .irpc i, 135
+ ldr \t3, [\w, #4*64*\i+4*\offs]
+ ldr \t4, [\p, #4*64*\i]
+ smlal \lo, \hi, \t1, \t2
+ \rsb \t3, \t3, #0
+ ldr \t1, [\w, #4*64*(\i+1)+4*\offs]
+ ldr \t2, [\p, #4*64*(\i+1)]
+ smlal \lo, \hi, \t3, \t4
+ \rsb \t1, \t1, #0
+ .endr
+ ldr \t3, [\w, #4*64*7+4*\offs]
+ ldr \t4, [\p, #4*64*7]
+ smlal \lo, \hi, \t1, \t2
+ \rsb \t3, \t3, #0
+ smlal \lo, \hi, \t3, \t4
+.endm
+
+.macro round rd, lo, hi
+ lsr \rd, \lo, #24
+ bic \lo, \lo, #0xff000000
+ orr \rd, \rd, \hi, lsl #8
+ mov \hi, #0
+ ssat \rd, #16, \rd
+.endm
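
The round macro converts the 64-bit accumulator held in the lo/hi register pair into one output sample: shift right by 24 bits, saturate to signed 16 bits (ssat), and keep the low 24 bits in the accumulator as rounding-error feedback for the next sample. A scalar sketch, modelling the accumulator as an int64_t:

    #include <stdint.h>

    static int16_t round_sample_ref(int64_t *sum)
    {
        int32_t out = (int32_t)(*sum >> 24);  /* lsr #24 | hi << 8            */
        *sum &= (1 << 24) - 1;                /* bic: keep fractional residue */
        if (out >  32767) out =  32767;       /* ssat #16                     */
        if (out < -32768) out = -32768;
        return (int16_t)out;
    }
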
+
+function ff_mpadsp_apply_window_fixed_armv6, export=1
+ push {r2,r4-r11,lr}
+
+ add r4, r0, #4*512 @ synth_buf + 512
+ .rept 4
+ ldm r0!, {r5-r12}
+ stm r4!, {r5-r12}
+ .endr
+
+ ldr r4, [sp, #40] @ incr
+ sub r0, r0, #4*17 @ synth_buf + 16
+ ldr r8, [r2] @ sum:low
+ add r2, r0, #4*32 @ synth_buf + 48
+ rsb r5, r4, r4, lsl #5 @ 31 * incr
+ lsl r4, r4, #1
+ asr r9, r8, #31 @ sum:high
+ add r5, r3, r5, lsl #1 @ samples2
+ add r6, r1, #4*32 @ w2
+ str r4, [sp, #40]
+
+ sum8 r8, r9, r1, r0, r10, r11, r12, lr
+ sum8 r8, r9, r1, r2, r10, r11, r12, lr, rsb, 32
+ round r10, r8, r9
+ strh_post r10, r3, r4
+
+ mov lr, #15
+1:
+ ldr r12, [r0, #4]!
+ ldr r11, [r6, #-4]!
+ ldr r10, [r1, #4]!
+ .irpc i, 0246
+ .if \i
+ ldr r11, [r6, #4*64*\i]
+ ldr r10, [r1, #4*64*\i]
+ .endif
+ rsb r11, r11, #0
+ smlal r8, r9, r10, r12
+ ldr r10, [r0, #4*64*(\i+1)]
+ .ifeq \i
+ smull r4, r7, r11, r12
+ .else
+ smlal r4, r7, r11, r12
+ .endif
+ ldr r11, [r6, #4*64*(\i+1)]
+ ldr r12, [r1, #4*64*(\i+1)]
+ rsb r11, r11, #0
+ smlal r8, r9, r12, r10
+ .iflt \i-6
+ ldr r12, [r0, #4*64*(\i+2)]
+ .else
+ ldr r12, [r2, #-4]!
+ .endif
+ smlal r4, r7, r11, r10
+ .endr
+ .irpc i, 0246
+ ldr r10, [r1, #4*64*\i+4*32]
+ rsb r12, r12, #0
+ ldr r11, [r6, #4*64*\i+4*32]
+ smlal r8, r9, r10, r12
+ ldr r10, [r2, #4*64*(\i+1)]
+ smlal r4, r7, r11, r12
+ ldr r12, [r1, #4*64*(\i+1)+4*32]
+ rsb r10, r10, #0
+ ldr r11, [r6, #4*64*(\i+1)+4*32]
+ smlal r8, r9, r12, r10
+ .iflt \i-6
+ ldr r12, [r2, #4*64*(\i+2)]
+ .else
+ ldr r12, [sp, #40]
+ .endif
+ smlal r4, r7, r11, r10
+ .endr
+ round r10, r8, r9
+ adds r8, r8, r4
+ adc r9, r9, r7
+ strh_post r10, r3, r12
+ round r11, r8, r9
+ subs lr, lr, #1
+ strh_dpost r11, r5, r12
+ bgt 1b
+
+ sum8 r8, r9, r1, r0, r10, r11, r12, lr, rsb, 33
+ pop {r4}
+ round r10, r8, r9
+ str r8, [r4]
+ strh r10, [r3]
+
+ pop {r4-r11,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
new file mode 100644
index 0000000..e73aee6
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegaudiodsp_init_arm.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2011 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/mpegaudiodsp.h"
+#include "config.h"
+
+void ff_mpadsp_apply_window_fixed_armv6(int32_t *synth_buf, int32_t *window,
+ int *dither, int16_t *out, int incr);
+
+av_cold void ff_mpadsp_init_arm(MPADSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_armv6(cpu_flags)) {
+ s->apply_window_fixed = ff_mpadsp_apply_window_fixed_armv6;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.c b/ffmpeg/libavcodec/arm/mpegvideo_arm.c
new file mode 100644
index 0000000..6566798
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegvideo_arm.c
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2002 Michael Niedermayer
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideo_arm.h"
+#include "asm-offsets.h"
+
+#if HAVE_NEON
+CHK_OFFS(MpegEncContext, y_dc_scale, Y_DC_SCALE);
+CHK_OFFS(MpegEncContext, c_dc_scale, C_DC_SCALE);
+CHK_OFFS(MpegEncContext, ac_pred, AC_PRED);
+CHK_OFFS(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
+CHK_OFFS(MpegEncContext, inter_scantable.raster_end, INTER_SCANTAB_RASTER_END);
+CHK_OFFS(MpegEncContext, h263_aic, H263_AIC);
+#endif
+
+void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+ int n, int qscale);
+void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+ int n, int qscale);
+
+av_cold void ff_MPV_common_init_arm(MpegEncContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_armv5te(cpu_flags))
+ ff_MPV_common_init_armv5te(s);
+
+ if (have_neon(cpu_flags)) {
+ s->dct_unquantize_h263_intra = ff_dct_unquantize_h263_intra_neon;
+ s->dct_unquantize_h263_inter = ff_dct_unquantize_h263_inter_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_arm.h b/ffmpeg/libavcodec/arm/mpegvideo_arm.h
new file mode 100644
index 0000000..4ff93b7
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegvideo_arm.h
@@ -0,0 +1,26 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_MPEGVIDEO_H
+#define AVCODEC_ARM_MPEGVIDEO_H
+
+#include "libavcodec/mpegvideo.h"
+
+void ff_MPV_common_init_armv5te(MpegEncContext *s);
+
+#endif /* AVCODEC_ARM_MPEGVIDEO_H */
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
new file mode 100644
index 0000000..a572290
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegvideo_armv5te.c
@@ -0,0 +1,102 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/avassert.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/mpegvideo.h"
+#include "mpegvideo_arm.h"
+
+void ff_dct_unquantize_h263_armv5te(int16_t *block, int qmul, int qadd, int count);
+
+#ifdef ENABLE_ARM_TESTS
+/**
+ * H.263 dequantizer helper function. It is performance critical and needs
+ * optimized implementations for each architecture. It is also used as a
+ * reference implementation in regression tests.
+ */
+static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qadd, int count)
+{
+ int i, level;
+ for (i = 0; i < count; i++) {
+ level = block[i];
+ if (level) {
+ if (level < 0) {
+ level = level * qmul - qadd;
+ } else {
+ level = level * qmul + qadd;
+ }
+ block[i] = level;
+ }
+ }
+}
+#endif
+
+static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ int level, qmul, qadd;
+ int nCoeffs;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ qmul = qscale << 1;
+
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level = block[0];
+ }
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+ block[0] = level;
+}
+
+static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
+ int16_t *block, int n, int qscale)
+{
+ int qmul, qadd;
+ int nCoeffs;
+
+ av_assert2(s->block_last_index[n]>=0);
+
+ qadd = (qscale - 1) | 1;
+ qmul = qscale << 1;
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+
+ ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
+}
+
+av_cold void ff_MPV_common_init_armv5te(MpegEncContext *s)
+{
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te;
+}
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
new file mode 100644
index 0000000..8687d6b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegvideo_armv5te_s.S
@@ -0,0 +1,114 @@
+/*
+ * Optimization of some functions from mpegvideo.c for armv5te
+ * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+/*
+ * Specially optimized version of dct_unquantize_h263_helper_c. It
+ * requires the block to be at least 8-byte aligned and may process more
+ * elements than requested, but it is guaranteed never to process more
+ * than 64 elements as long as the count argument is <= 64, so it is
+ * safe. The function is optimized for the common distribution of
+ * nCoeffs values (mostly a multiple of 8 plus one or two extra
+ * elements): it processes 8 elements per loop iteration, with an
+ * optional 2-element step at the end.
+ *
+ * The inner loop should take 6 cycles per element on arm926ej-s (Nokia 770).
+ */
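
The per-element operation implemented by the dequant_t/dequant_b macros below is the same one shown by the C helper in mpegvideo_armv5te.c; one element, as a sketch:

    static inline int16_t dequant_one_ref(int16_t level, int qmul, int qadd)
    {
        if (!level)
            return 0;                          /* the "ne" guards in the macros */
        return (int16_t)(level > 0 ? level * qmul + qadd
                                   : level * qmul - qadd);
    }
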
+
+.macro dequant_t dst, src, mul, add, tmp
+ rsbs \tmp, ip, \src, asr #16
+ it gt
+ addgt \tmp, \add, #0
+ it lt
+ rsblt \tmp, \add, #0
+ it ne
+ smlatbne \dst, \src, \mul, \tmp
+.endm
+
+.macro dequant_b dst, src, mul, add, tmp
+ rsbs \tmp, ip, \src, lsl #16
+ it gt
+ addgt \tmp, \add, #0
+ it lt
+ rsblt \tmp, \add, #0
+ it ne
+ smlabbne \dst, \src, \mul, \tmp
+.endm
+
+function ff_dct_unquantize_h263_armv5te, export=1
+ push {r4-r9,lr}
+ mov ip, #0
+ subs r3, r3, #2
+ ble 2f
+ ldrd r4, r5, [r0, #0]
+1:
+ ldrd r6, r7, [r0, #8]
+
+ dequant_t r9, r4, r1, r2, r9
+ dequant_t lr, r5, r1, r2, lr
+ dequant_b r4, r4, r1, r2, r8
+ dequant_b r5, r5, r1, r2, r8
+
+ strh r4, [r0], #2
+ strh r9, [r0], #2
+ strh r5, [r0], #2
+ strh lr, [r0], #2
+
+ dequant_t r9, r6, r1, r2, r9
+ dequant_t lr, r7, r1, r2, lr
+ dequant_b r6, r6, r1, r2, r8
+ dequant_b r7, r7, r1, r2, r8
+
+ strh r6, [r0], #2
+ strh r9, [r0], #2
+ strh r7, [r0], #2
+ strh lr, [r0], #2
+
+ subs r3, r3, #8
+ it gt
+ ldrdgt r4, r5, [r0, #0] /* load data early to avoid load/use pipeline stall */
+ bgt 1b
+
+ adds r3, r3, #2
+ it le
+ pople {r4-r9,pc}
+2:
+ ldrsh r9, [r0, #0]
+ ldrsh lr, [r0, #2]
+ mov r8, r2
+ cmp r9, #0
+ it lt
+ rsblt r8, r2, #0
+ it ne
+ smlabbne r9, r9, r1, r8
+ mov r8, r2
+ cmp lr, #0
+ it lt
+ rsblt r8, r2, #0
+ it ne
+ smlabbne lr, lr, r1, r8
+ strh r9, [r0], #2
+ strh lr, [r0], #2
+ pop {r4-r9,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/mpegvideo_neon.S b/ffmpeg/libavcodec/arm/mpegvideo_neon.S
new file mode 100644
index 0000000..e05df8e
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/mpegvideo_neon.S
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "asm-offsets.h"
+
+function ff_dct_unquantize_h263_inter_neon, export=1
+ add r12, r0, #BLOCK_LAST_INDEX
+ ldr r12, [r12, r2, lsl #2]
+ add r0, r0, #INTER_SCANTAB_RASTER_END
+ ldrb r12, [r0, r12]
+ sub r2, r3, #1
+ lsl r0, r3, #1
+ orr r2, r2, #1
+ add r3, r12, #1
+endfunc
+
+function ff_dct_unquantize_h263_neon, export=1
+ vdup.16 q15, r0 @ qmul
+ vdup.16 q14, r2 @ qadd
+ vneg.s16 q13, q14
+ cmp r3, #4
+ mov r0, r1
+ ble 2f
+1:
+ vld1.16 {q0}, [r0,:128]!
+ vclt.s16 q3, q0, #0
+ vld1.16 {q8}, [r0,:128]!
+ vceq.s16 q1, q0, #0
+ vmul.s16 q2, q0, q15
+ vclt.s16 q11, q8, #0
+ vmul.s16 q10, q8, q15
+ vbsl q3, q13, q14
+ vbsl q11, q13, q14
+ vadd.s16 q2, q2, q3
+ vceq.s16 q9, q8, #0
+ vadd.s16 q10, q10, q11
+ vbif q0, q2, q1
+ vbif q8, q10, q9
+ subs r3, r3, #16
+ vst1.16 {q0}, [r1,:128]!
+ vst1.16 {q8}, [r1,:128]!
+ it le
+ bxle lr
+ cmp r3, #8
+ bgt 1b
+2:
+ vld1.16 {d0}, [r0,:64]
+ vclt.s16 d3, d0, #0
+ vceq.s16 d1, d0, #0
+ vmul.s16 d2, d0, d30
+ vbsl d3, d26, d28
+ vadd.s16 d2, d2, d3
+ vbif d0, d2, d1
+ vst1.16 {d0}, [r1,:64]
+ bx lr
+endfunc
+
+function ff_dct_unquantize_h263_intra_neon, export=1
+ push {r4-r6,lr}
+ add r12, r0, #BLOCK_LAST_INDEX
+ ldr r6, [r0, #AC_PRED]
+ add lr, r0, #INTER_SCANTAB_RASTER_END
+ cmp r6, #0
+ it ne
+ movne r12, #63
+ bne 1f
+ ldr r12, [r12, r2, lsl #2]
+ ldrb r12, [lr, r12]
+1: ldr r5, [r0, #H263_AIC]
+ ldrsh r4, [r1]
+ cmp r5, #0
+ mov r5, r1
+ it ne
+ movne r2, #0
+ bne 2f
+ cmp r2, #4
+ it ge
+ addge r0, r0, #4
+ sub r2, r3, #1
+ ldr r6, [r0, #Y_DC_SCALE]
+ orr r2, r2, #1
+ smulbb r4, r4, r6
+2: lsl r0, r3, #1
+ add r3, r12, #1
+ bl ff_dct_unquantize_h263_neon
+ vmov.16 d0[0], r4
+ vst1.16 {d0[0]}, [r5]
+ pop {r4-r6,pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/neon.S b/ffmpeg/libavcodec/arm/neon.S
new file mode 100644
index 0000000..716a607
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/neon.S
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+.macro transpose_8x8 r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \r0, \r4
+ vtrn.32 \r1, \r5
+ vtrn.32 \r2, \r6
+ vtrn.32 \r3, \r7
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.16 \r4, \r6
+ vtrn.16 \r5, \r7
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+ vtrn.8 \r4, \r5
+ vtrn.8 \r6, \r7
+.endm
+
+.macro transpose_4x4 r0, r1, r2, r3
+ vtrn.16 \r0, \r2
+ vtrn.16 \r1, \r3
+ vtrn.8 \r0, \r1
+ vtrn.8 \r2, \r3
+.endm
+
+.macro swap4 r0, r1, r2, r3, r4, r5, r6, r7
+ vswp \r0, \r4
+ vswp \r1, \r5
+ vswp \r2, \r6
+ vswp \r3, \r7
+.endm
+
+.macro transpose16_4x4 r0, r1, r2, r3, r4, r5, r6, r7
+ vtrn.32 \r0, \r2
+ vtrn.32 \r1, \r3
+ vtrn.32 \r4, \r6
+ vtrn.32 \r5, \r7
+ vtrn.16 \r0, \r1
+ vtrn.16 \r2, \r3
+ vtrn.16 \r4, \r5
+ vtrn.16 \r6, \r7
+.endm
diff --git a/ffmpeg/libavcodec/arm/rdft_neon.S b/ffmpeg/libavcodec/arm/rdft_neon.S
new file mode 100644
index 0000000..781d976
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/rdft_neon.S
@@ -0,0 +1,150 @@
+/*
+ * ARM NEON optimised RDFT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_rdft_calc_neon, export=1
+ push {r4-r8,lr}
+
+ ldr r6, [r0, #4] @ inverse
+ mov r4, r0
+ mov r5, r1
+
+ lsls r6, r6, #31
+ bne 1f
+ add r0, r4, #20
+ bl X(ff_fft_permute_neon)
+ add r0, r4, #20
+ mov r1, r5
+ bl X(ff_fft_calc_neon)
+1:
+ ldr r12, [r4, #0] @ nbits
+ mov r2, #1
+ lsl r12, r2, r12
+ add r0, r5, #8
+ add r1, r5, r12, lsl #2
+ lsr r12, r12, #2
+ ldr r2, [r4, #12] @ tcos
+ sub r12, r12, #2
+ ldr r3, [r4, #16] @ tsin
+ mov r7, r0
+ sub r1, r1, #8
+ mov lr, r1
+ mov r8, #-8
+ vld1.32 {d0}, [r0,:64]! @ d1[0,1]
+ vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
+ vld1.32 {d4}, [r2,:64]! @ tcos[i]
+ vld1.32 {d5}, [r3,:64]! @ tsin[i]
+ vmov.f32 d18, #0.5 @ k1
+ vdup.32 d19, r6
+ pld [r0, #32]
+ veor d19, d18, d19 @ k2
+ vmov.i32 d16, #0
+ vmov.i32 d17, #1<<31
+ pld [r1, #-32]
+ vtrn.32 d16, d17
+ pld [r2, #32]
+ vrev64.32 d16, d16 @ d16=1,0 d17=0,1
+ pld [r3, #32]
+2:
+ veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+ vld1.32 {d24}, [r0,:64]! @ d1[0,1]
+ vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
+ vld1.32 {d25}, [r1,:64], r8 @ d2[0,1]
+ vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
+ veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+ pld [r0, #32]
+ vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
+ pld [r1, #-32]
+ vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1]
+ vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1]
+ vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re
+ veor d7, d21, d16 @ -od.im, od.re
+ vrev64.32 d3, d21 @ od.re, od.im
+ veor d6, d20, d17 @ ev.re,-ev.im
+ veor d2, d3, d16 @ -od.re, od.im
+ vmla.f32 d20, d3, d4[1]
+ vmla.f32 d20, d7, d5[1]
+ vmla.f32 d6, d2, d4[1]
+ vmla.f32 d6, d21, d5[1]
+ vld1.32 {d4}, [r2,:64]! @ tcos[i]
+ veor d7, d23, d16 @ -od.im, od.re
+ vld1.32 {d5}, [r3,:64]! @ tsin[i]
+ veor d24, d22, d17 @ ev.re,-ev.im
+ vrev64.32 d3, d23 @ od.re, od.im
+ pld [r2, #32]
+ veor d2, d3, d16 @ -od.re, od.im
+ pld [r3, #32]
+ vmla.f32 d22, d3, d4[0]
+ vmla.f32 d22, d7, d5[0]
+ vmla.f32 d24, d2, d4[0]
+ vmla.f32 d24, d23, d5[0]
+ vld1.32 {d0}, [r0,:64]! @ d1[0,1]
+ vld1.32 {d1}, [r1,:64], r8 @ d2[0,1]
+ vst1.32 {d20}, [r7,:64]!
+ vst1.32 {d6}, [lr,:64], r8
+ vst1.32 {d22}, [r7,:64]!
+ vst1.32 {d24}, [lr,:64], r8
+ subs r12, r12, #2
+ bgt 2b
+
+ veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1]
+ vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1]
+ vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1]
+ ldr r2, [r4, #8] @ sign_convention
+ vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re
+ add r0, r0, #4
+ bfc r2, #0, #31
+ vld1.32 {d0[0]}, [r0,:32]
+ veor d7, d21, d16 @ -od.im, od.re
+ vrev64.32 d3, d21 @ od.re, od.im
+ veor d6, d20, d17 @ ev.re,-ev.im
+ vld1.32 {d22}, [r5,:64]
+ vdup.32 d1, r2
+ vmov d23, d22
+ veor d2, d3, d16 @ -od.re, od.im
+ vtrn.32 d22, d23
+ veor d0, d0, d1
+ veor d23, d23, d17
+ vmla.f32 d20, d3, d4[1]
+ vmla.f32 d20, d7, d5[1]
+ vmla.f32 d6, d2, d4[1]
+ vmla.f32 d6, d21, d5[1]
+ vadd.f32 d22, d22, d23
+ vst1.32 {d20}, [r7,:64]
+ vst1.32 {d6}, [lr,:64]
+ vst1.32 {d0[0]}, [r0,:32]
+ vst1.32 {d22}, [r5,:64]
+
+ cmp r6, #0
+ it eq
+ popeq {r4-r8,pc}
+
+ vmul.f32 d22, d22, d18
+ vst1.32 {d22}, [r5,:64]
+ add r0, r4, #20
+ mov r1, r5
+ bl X(ff_fft_permute_neon)
+ add r0, r4, #20
+ mov r1, r5
+ pop {r4-r8,lr}
+ b X(ff_fft_calc_neon)
+endfunc
diff --git a/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
new file mode 100644
index 0000000..8bfe90b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/rv34dsp_init_arm.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+#include "libavutil/arm/cpu.h"
+
+void ff_rv34_inv_transform_noround_neon(int16_t *block);
+
+void ff_rv34_inv_transform_noround_dc_neon(int16_t *block);
+
+void ff_rv34_idct_add_neon(uint8_t *dst, ptrdiff_t stride, int16_t *block);
+void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc);
+
+av_cold void ff_rv34dsp_init_arm(RV34DSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->rv34_inv_transform = ff_rv34_inv_transform_noround_neon;
+ c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
+
+ c->rv34_idct_add = ff_rv34_idct_add_neon;
+ c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/rv34dsp_neon.S b/ffmpeg/libavcodec/arm/rv34dsp_neon.S
new file mode 100644
index 0000000..a29123f
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/rv34dsp_neon.S
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
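+/* RV34 4x4 inverse transform: loads the 16 int16 coefficients at \r0 and
+ * applies both 1-D passes (with a transpose in between). The second-pass
+ * results are left as 32-bit values in q1 (z0+z3), q2 (z1+z2), q3 (z1-z2)
+ * and q15 (z0-z3) for the caller to scale and narrow. */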
+.macro rv34_inv_transform r0
+ vld1.16 {q14-q15}, [\r0,:128]
+ vmov.s16 d0, #13
+ vshll.s16 q12, d29, #3
+ vshll.s16 q13, d29, #4
+ vshll.s16 q9, d31, #3
+ vshll.s16 q1, d31, #4
+ vmull.s16 q10, d28, d0
+ vmlal.s16 q10, d30, d0
+ vmull.s16 q11, d28, d0
+ vmlsl.s16 q11, d30, d0
+ vsubw.s16 q12, q12, d29 @ z2 = block[i+4*1]*7
+ vaddw.s16 q13, q13, d29 @ z3 = block[i+4*1]*17
+ vsubw.s16 q9, q9, d31
+ vaddw.s16 q1, q1, d31
+ vadd.s32 q13, q13, q9 @ z3 = 17*block[i+4*1] + 7*block[i+4*3]
+ vsub.s32 q12, q12, q1 @ z2 = 7*block[i+4*1] - 17*block[i+4*3]
+ vadd.s32 q1, q10, q13 @ z0 + z3
+ vadd.s32 q2, q11, q12 @ z1 + z2
+ vsub.s32 q8, q10, q13 @ z0 - z3
+ vsub.s32 q3, q11, q12 @ z1 - z2
+ vtrn.32 q1, q2
+ vtrn.32 q3, q8
+ vswp d3, d6
+ vswp d5, d16
+ vmov.s32 d0, #13
+ vadd.s32 q10, q1, q3
+ vsub.s32 q11, q1, q3
+ vshl.s32 q12, q2, #3
+ vshl.s32 q9, q2, #4
+ vmul.s32 q13, q11, d0[0]
+ vshl.s32 q11, q8, #4
+ vadd.s32 q9, q9, q2
+ vshl.s32 q15, q8, #3
+ vsub.s32 q12, q12, q2
+ vadd.s32 q11, q11, q8
+ vmul.s32 q14, q10, d0[0]
+ vsub.s32 q8, q15, q8
+ vsub.s32 q12, q12, q11
+ vadd.s32 q9, q9, q8
+ vadd.s32 q2, q13, q12 @ z1 + z2
+ vadd.s32 q1, q14, q9 @ z0 + z3
+ vsub.s32 q3, q13, q12 @ z1 - z2
+ vsub.s32 q15, q14, q9 @ z0 - z3
+.endm
+
+/* void rv34_idct_add_c(uint8_t *dst, ptrdiff_t stride, int16_t *block) */
+function ff_rv34_idct_add_neon, export=1
+ mov r3, r0
+ rv34_inv_transform r2
+ vmov.i16 q12, #0
+ vrshrn.s32 d16, q1, #10 @ (z0 + z3) >> 10
+ vrshrn.s32 d17, q2, #10 @ (z1 + z2) >> 10
+ vrshrn.s32 d18, q3, #10 @ (z1 - z2) >> 10
+ vrshrn.s32 d19, q15, #10 @ (z0 - z3) >> 10
+ vld1.32 {d28[]}, [r0,:32], r1
+ vld1.32 {d29[]}, [r0,:32], r1
+ vtrn.32 q8, q9
+ vld1.32 {d28[1]}, [r0,:32], r1
+ vld1.32 {d29[1]}, [r0,:32], r1
+ vst1.16 {q12}, [r2,:128]! @ memset(block, 0, 16)
+ vst1.16 {q12}, [r2,:128] @ memset(block+16, 0, 16)
+ vtrn.16 d16, d17
+ vtrn.32 d28, d29
+ vtrn.16 d18, d19
+ vaddw.u8 q0, q8, d28
+ vaddw.u8 q1, q9, d29
+ vqmovun.s16 d28, q0
+ vqmovun.s16 d29, q1
+ vst1.32 {d28[0]}, [r3,:32], r1
+ vst1.32 {d28[1]}, [r3,:32], r1
+ vst1.32 {d29[0]}, [r3,:32], r1
+ vst1.32 {d29[1]}, [r3,:32], r1
+ bx lr
+endfunc
+
+/* void rv34_inv_transform_noround_neon(int16_t *block); */
+function ff_rv34_inv_transform_noround_neon, export=1
+ rv34_inv_transform r0
+ vshl.s32 q11, q2, #1
+ vshl.s32 q10, q1, #1
+ vshl.s32 q12, q3, #1
+ vshl.s32 q13, q15, #1
+ vadd.s32 q11, q11, q2
+ vadd.s32 q10, q10, q1
+ vadd.s32 q12, q12, q3
+ vadd.s32 q13, q13, q15
+ vshrn.s32 d0, q10, #11 @ (z0 + z3)*3 >> 11
+ vshrn.s32 d1, q11, #11 @ (z1 + z2)*3 >> 11
+ vshrn.s32 d2, q12, #11 @ (z1 - z2)*3 >> 11
+ vshrn.s32 d3, q13, #11 @ (z0 - z3)*3 >> 11
+ vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0,:64]!
+ vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0,:64]!
+ vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0,:64]!
+ vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0,:64]!
+ bx lr
+endfunc
+
+/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, ptrdiff_t stride, int dc) */
+function ff_rv34_idct_dc_add_neon, export=1
+ mov r3, r0
+ vld1.32 {d28[]}, [r0,:32], r1
+ vld1.32 {d29[]}, [r0,:32], r1
+ vdup.16 d0, r2
+ vmov.s16 d1, #169
+ vld1.32 {d28[1]}, [r0,:32], r1
+ vmull.s16 q1, d0, d1 @ dc * 13 * 13
+ vld1.32 {d29[1]}, [r0,:32], r1
+ vrshrn.s32 d0, q1, #10 @ (dc * 13 * 13 + 0x200) >> 10
+ vmov d1, d0
+ vaddw.u8 q2, q0, d28
+ vaddw.u8 q3, q0, d29
+ vqmovun.s16 d28, q2
+ vqmovun.s16 d29, q3
+ vst1.32 {d28[0]}, [r3,:32], r1
+ vst1.32 {d29[0]}, [r3,:32], r1
+ vst1.32 {d28[1]}, [r3,:32], r1
+ vst1.32 {d29[1]}, [r3,:32], r1
+ bx lr
+endfunc
+
+/* void rv34_inv_transform_dc_noround_c(int16_t *block) */
+function ff_rv34_inv_transform_noround_dc_neon, export=1
+ vld1.16 {d28[]}, [r0,:16] @ block[0]
+ vmov.i16 d4, #251
+ vorr.s16 d4, #256 @ 13^2 * 3
+ vmull.s16 q3, d28, d4
+ vshrn.s32 d0, q3, #11
+ vmov.i16 d1, d0
+ vst1.64 {q0}, [r0,:128]!
+ vst1.64 {q0}, [r0,:128]!
+ bx lr
+endfunc
diff --git a/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
new file mode 100644
index 0000000..fec3702
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/rv40dsp_init_arm.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+#include "libavutil/arm/cpu.h"
+
+#define DECL_QPEL3(type, w, pos) \
+ void ff_##type##_rv40_qpel##w##_mc##pos##_neon(uint8_t *dst, uint8_t *src,\
+ ptrdiff_t stride)
+#define DECL_QPEL2(w, pos) \
+ DECL_QPEL3(put, w, pos); \
+ DECL_QPEL3(avg, w, pos)
+
+#define DECL_QPEL_XY(x, y) \
+ DECL_QPEL2(16, x ## y); \
+ DECL_QPEL2(8, x ## y)
+
+#define DECL_QPEL_Y(y) \
+ DECL_QPEL_XY(0, y); \
+ DECL_QPEL_XY(1, y); \
+ DECL_QPEL_XY(2, y); \
+ DECL_QPEL_XY(3, y); \
+
+DECL_QPEL_Y(0);
+DECL_QPEL_Y(1);
+DECL_QPEL_Y(2);
+DECL_QPEL_Y(3);
+
+void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
+void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, ptrdiff_t);
+
+int ff_rv40_h_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
+ int beta, int beta2, int edge,
+ int *p1, int *q1);
+int ff_rv40_v_loop_filter_strength_neon(uint8_t *src, ptrdiff_t stride,
+ int beta, int beta2, int edge,
+ int *p1, int *q1);
+
+void ff_rv40_h_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
+ int filter_q1, int alpha, int beta,
+ int lim_p0q0, int lim_q1, int lim_p1);
+void ff_rv40_v_weak_loop_filter_neon(uint8_t *src, ptrdiff_t stride, int filter_p1,
+ int filter_q1, int alpha, int beta,
+ int lim_p0q0, int lim_q1, int lim_p1);
+
+static av_cold void ff_rv40dsp_init_neon(RV34DSPContext *c)
+{
+ c->put_pixels_tab[0][ 1] = ff_put_rv40_qpel16_mc10_neon;
+ c->put_pixels_tab[0][ 3] = ff_put_rv40_qpel16_mc30_neon;
+ c->put_pixels_tab[0][ 4] = ff_put_rv40_qpel16_mc01_neon;
+ c->put_pixels_tab[0][ 5] = ff_put_rv40_qpel16_mc11_neon;
+ c->put_pixels_tab[0][ 6] = ff_put_rv40_qpel16_mc21_neon;
+ c->put_pixels_tab[0][ 7] = ff_put_rv40_qpel16_mc31_neon;
+ c->put_pixels_tab[0][ 9] = ff_put_rv40_qpel16_mc12_neon;
+ c->put_pixels_tab[0][10] = ff_put_rv40_qpel16_mc22_neon;
+ c->put_pixels_tab[0][11] = ff_put_rv40_qpel16_mc32_neon;
+ c->put_pixels_tab[0][12] = ff_put_rv40_qpel16_mc03_neon;
+ c->put_pixels_tab[0][13] = ff_put_rv40_qpel16_mc13_neon;
+ c->put_pixels_tab[0][14] = ff_put_rv40_qpel16_mc23_neon;
+ c->put_pixels_tab[0][15] = ff_put_rv40_qpel16_mc33_neon;
+ c->avg_pixels_tab[0][ 1] = ff_avg_rv40_qpel16_mc10_neon;
+ c->avg_pixels_tab[0][ 3] = ff_avg_rv40_qpel16_mc30_neon;
+ c->avg_pixels_tab[0][ 4] = ff_avg_rv40_qpel16_mc01_neon;
+ c->avg_pixels_tab[0][ 5] = ff_avg_rv40_qpel16_mc11_neon;
+ c->avg_pixels_tab[0][ 6] = ff_avg_rv40_qpel16_mc21_neon;
+ c->avg_pixels_tab[0][ 7] = ff_avg_rv40_qpel16_mc31_neon;
+ c->avg_pixels_tab[0][ 9] = ff_avg_rv40_qpel16_mc12_neon;
+ c->avg_pixels_tab[0][10] = ff_avg_rv40_qpel16_mc22_neon;
+ c->avg_pixels_tab[0][11] = ff_avg_rv40_qpel16_mc32_neon;
+ c->avg_pixels_tab[0][12] = ff_avg_rv40_qpel16_mc03_neon;
+ c->avg_pixels_tab[0][13] = ff_avg_rv40_qpel16_mc13_neon;
+ c->avg_pixels_tab[0][14] = ff_avg_rv40_qpel16_mc23_neon;
+ c->avg_pixels_tab[0][15] = ff_avg_rv40_qpel16_mc33_neon;
+ c->put_pixels_tab[1][ 1] = ff_put_rv40_qpel8_mc10_neon;
+ c->put_pixels_tab[1][ 3] = ff_put_rv40_qpel8_mc30_neon;
+ c->put_pixels_tab[1][ 4] = ff_put_rv40_qpel8_mc01_neon;
+ c->put_pixels_tab[1][ 5] = ff_put_rv40_qpel8_mc11_neon;
+ c->put_pixels_tab[1][ 6] = ff_put_rv40_qpel8_mc21_neon;
+ c->put_pixels_tab[1][ 7] = ff_put_rv40_qpel8_mc31_neon;
+ c->put_pixels_tab[1][ 9] = ff_put_rv40_qpel8_mc12_neon;
+ c->put_pixels_tab[1][10] = ff_put_rv40_qpel8_mc22_neon;
+ c->put_pixels_tab[1][11] = ff_put_rv40_qpel8_mc32_neon;
+ c->put_pixels_tab[1][12] = ff_put_rv40_qpel8_mc03_neon;
+ c->put_pixels_tab[1][13] = ff_put_rv40_qpel8_mc13_neon;
+ c->put_pixels_tab[1][14] = ff_put_rv40_qpel8_mc23_neon;
+ c->put_pixels_tab[1][15] = ff_put_rv40_qpel8_mc33_neon;
+ c->avg_pixels_tab[1][ 1] = ff_avg_rv40_qpel8_mc10_neon;
+ c->avg_pixels_tab[1][ 3] = ff_avg_rv40_qpel8_mc30_neon;
+ c->avg_pixels_tab[1][ 4] = ff_avg_rv40_qpel8_mc01_neon;
+ c->avg_pixels_tab[1][ 5] = ff_avg_rv40_qpel8_mc11_neon;
+ c->avg_pixels_tab[1][ 6] = ff_avg_rv40_qpel8_mc21_neon;
+ c->avg_pixels_tab[1][ 7] = ff_avg_rv40_qpel8_mc31_neon;
+ c->avg_pixels_tab[1][ 9] = ff_avg_rv40_qpel8_mc12_neon;
+ c->avg_pixels_tab[1][10] = ff_avg_rv40_qpel8_mc22_neon;
+ c->avg_pixels_tab[1][11] = ff_avg_rv40_qpel8_mc32_neon;
+ c->avg_pixels_tab[1][12] = ff_avg_rv40_qpel8_mc03_neon;
+ c->avg_pixels_tab[1][13] = ff_avg_rv40_qpel8_mc13_neon;
+ c->avg_pixels_tab[1][14] = ff_avg_rv40_qpel8_mc23_neon;
+ c->avg_pixels_tab[1][15] = ff_avg_rv40_qpel8_mc33_neon;
+
+ c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
+ c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
+ c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
+ c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+
+ c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
+ c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
+
+ c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
+ c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
+ c->rv40_weak_loop_filter[0] = ff_rv40_h_weak_loop_filter_neon;
+ c->rv40_weak_loop_filter[1] = ff_rv40_v_weak_loop_filter_neon;
+}
+
+av_cold void ff_rv40dsp_init_arm(RV34DSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags))
+ ff_rv40dsp_init_neon(c);
+}
diff --git a/ffmpeg/libavcodec/arm/rv40dsp_neon.S b/ffmpeg/libavcodec/arm/rv40dsp_neon.S
new file mode 100644
index 0000000..6bd45eb
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/rv40dsp_neon.S
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
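+/* 6-tap lowpass filter used for RV40 quarter-pel interpolation:
+ * out = (src[-2] + src[3] - 5*(src[-1] + src[2]) + \rc1*src[0] + \rc2*src[1]
+ * + rounding) >> \shift, saturated to 8 bits. The coefficient pair
+ * \rc1/\rc2 is 52/20, 20/52 or 20/20 and \shift is 6 or 5 accordingly,
+ * depending on the subpel position. */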
+.macro qpel_lowpass r0, r1, rc1, rc2, shift
+ vext.8 d25, \r0, \r1, #1 @ src[-1]
+ vext.8 d26, \r0, \r1, #4 @ src[ 2]
+ vext.8 d24, \r0, \r1, #5 @ src[ 3]
+ vaddl.u8 q9, d25, d26
+ vaddl.u8 q8, \r0, d24
+ vext.8 d27, \r0, \r1, #2 @ src[ 0]
+ vshl.s16 q12, q9, #2
+ vsub.s16 q8, q8, q9
+ vext.8 d28, \r0, \r1, #3 @ src[ 1]
+ vsub.s16 q8, q8, q12
+ vmlal.u8 q8, d27, \rc1
+ vmlal.u8 q8, d28, \rc2
+ vqrshrun.s16 \r0, q8, #\shift
+.endm
+
+.macro qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift
+ vext.8 d25, \r0, \r1, #1 @ src[-1]
+ vext.8 d26, \r0, \r1, #4 @ src[ 2]
+ vext.8 d24, \r0, \r1, #5 @ src[ 3]
+ vaddl.u8 q9, d25, d26
+ vaddl.u8 q8, \r0, d24
+ vext.8 d29, \r0, \r1, #2 @ src[ 0]
+ vext.8 d28, \r0, \r1, #3 @ src[ 1]
+ vshl.s16 q10, q9, #2
+ vext.8 \r1, \r2, \r3, #1 @ src[-1]
+ vsub.s16 q8, q8, q9
+ vext.8 d22, \r2, \r3, #4 @ src[ 2]
+ vext.8 \r0, \r2, \r3, #5 @ src[ 3]
+ vaddl.u8 q13, \r1, d22
+ vaddl.u8 q12, \r2, \r0
+ vsub.s16 q8, q8, q10
+ vshl.s16 q9, q13, #2
+ vsub.s16 q12, q12, q13
+ vmlal.u8 q8, d29, \rc1
+ vmlal.u8 q8, d28, \rc2
+ vsub.s16 q12, q12, q9
+ vext.8 d26, \r2, \r3, #2 @ src[ 0]
+ vext.8 d27, \r2, \r3, #3 @ src[ 1]
+ vmlal.u8 q12, d26, \rc1
+ vmlal.u8 q12, d27, \rc2
+ vqrshrun.s16 \r0, q8, #\shift
+ vqrshrun.s16 \r2, q12, #\shift
+.endm
+
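+/* Horizontal lowpass of an 8-pixel-wide block into a packed scratch buffer:
+ * r1 = source, r2 = source stride, r3 = row count, r12 = destination
+ * (8 bytes per row), d0/d1 = centre-tap coefficients. r3+1 rows are
+ * filtered so the following vertical pass has the extra lines it needs. */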
+.macro rv40_qpel8_h shift
+function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
+1:
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r1], r2
+ qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, \shift
+ vst1.8 {d4}, [r12,:64]!
+ vst1.8 {d6}, [r12,:64]!
+ subs r3, r3, #2
+ bgt 1b
+ vld1.8 {q2}, [r1]
+ qpel_lowpass d4, d5, d0, d1, \shift
+ vst1.8 {d4}, [r12,:64]!
+ bx lr
+endfunc
+.endm
+
+.macro rv40_qpel8_v shift, type
+function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
+ vld1.64 {d2}, [r1,:64]!
+ vld1.64 {d3}, [r1,:64]!
+ vld1.64 {d4}, [r1,:64]!
+ vld1.64 {d5}, [r1,:64]!
+ vld1.64 {d6}, [r1,:64]!
+ vld1.64 {d7}, [r1,:64]!
+ vld1.64 {d8}, [r1,:64]!
+ vld1.64 {d9}, [r1,:64]!
+ vld1.64 {d10}, [r1,:64]!
+ vld1.64 {d11}, [r1,:64]!
+ vld1.64 {d12}, [r1,:64]!
+ vld1.64 {d13}, [r1,:64]!
+ vld1.64 {d14}, [r1,:64]!
+ transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
+ transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
+ qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, \shift
+ qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, \shift
+ qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, \shift
+ qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, \shift
+ transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
+ .ifc \type,avg
+ vld1.64 d12, [r0,:64], r2
+ vld1.64 d13, [r0,:64], r2
+ vld1.64 d14, [r0,:64], r2
+ vld1.64 d15, [r0,:64], r2
+ vld1.64 d16, [r0,:64], r2
+ vld1.64 d17, [r0,:64], r2
+ vld1.64 d18, [r0,:64], r2
+ vld1.64 d19, [r0,:64], r2
+ sub r0, r0, r2, lsl #3
+ vrhadd.u8 q1, q1, q6
+ vrhadd.u8 q2, q2, q7
+ vrhadd.u8 q3, q3, q8
+ vrhadd.u8 q4, q4, q9
+ .endif
+ vst1.64 d2, [r0,:64], r2
+ vst1.64 d3, [r0,:64], r2
+ vst1.64 d4, [r0,:64], r2
+ vst1.64 d5, [r0,:64], r2
+ vst1.64 d6, [r0,:64], r2
+ vst1.64 d7, [r0,:64], r2
+ vst1.64 d8, [r0,:64], r2
+ vst1.64 d9, [r0,:64], r2
+ bx lr
+endfunc
+.endm
+
+ rv40_qpel8_h 5
+ rv40_qpel8_h 6
+
+.macro rv40_qpel type
+function \type\()_rv40_qpel8_h_lowpass_neon
+ .ifc \type,avg
+ mov r12, r0
+ .endif
+1:
+ vld1.8 {q2}, [r1], r2
+ vld1.8 {q3}, [r1], r2
+ qpel_lowpass_x2 d4, d5, d6, d7, d0, d1, 6
+ .ifc \type,avg
+ vld1.8 {d3}, [r12,:64], r2
+ vld1.8 {d16}, [r12,:64], r2
+ vrhadd.u8 d4, d4, d3
+ vrhadd.u8 d6, d6, d16
+ .endif
+ vst1.8 {d4}, [r0,:64], r2
+ vst1.8 {d6}, [r0,:64], r2
+ subs r3, r3, #2
+ bgt 1b
+ bx lr
+endfunc
+
+function \type\()_rv40_qpel8_v_lowpass_neon
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1], r2
+ vld1.64 {d8}, [r1], r2
+ vld1.64 {d9}, [r1], r2
+ vld1.64 {d10}, [r1], r2
+ vld1.64 {d11}, [r1], r2
+ vld1.64 {d12}, [r1], r2
+ vld1.64 {d13}, [r1], r2
+ vld1.64 {d14}, [r1]
+ transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
+ transpose_8x8 d10, d11, d12, d13, d14, d15, d30, d31
+ qpel_lowpass_x2 d2, d10, d3, d11, d0, d1, 6
+ qpel_lowpass_x2 d4, d12, d5, d13, d0, d1, 6
+ qpel_lowpass_x2 d6, d14, d7, d15, d0, d1, 6
+ qpel_lowpass_x2 d8, d30, d9, d31, d0, d1, 6
+ transpose_8x8 d2, d3, d4, d5, d6, d7, d8, d9
+ .ifc \type,avg
+ vld1.64 d12, [r0,:64], r2
+ vld1.64 d13, [r0,:64], r2
+ vld1.64 d14, [r0,:64], r2
+ vld1.64 d15, [r0,:64], r2
+ vld1.64 d16, [r0,:64], r2
+ vld1.64 d17, [r0,:64], r2
+ vld1.64 d18, [r0,:64], r2
+ vld1.64 d19, [r0,:64], r2
+ sub r0, r0, r2, lsl #3
+ vrhadd.u8 q1, q1, q6
+ vrhadd.u8 q2, q2, q7
+ vrhadd.u8 q3, q3, q8
+ vrhadd.u8 q4, q4, q9
+ .endif
+ vst1.64 d2, [r0,:64], r2
+ vst1.64 d3, [r0,:64], r2
+ vst1.64 d4, [r0,:64], r2
+ vst1.64 d5, [r0,:64], r2
+ vst1.64 d6, [r0,:64], r2
+ vst1.64 d7, [r0,:64], r2
+ vst1.64 d8, [r0,:64], r2
+ vst1.64 d9, [r0,:64], r2
+ bx lr
+endfunc
+
+ rv40_qpel8_v 5, \type
+ rv40_qpel8_v 6, \type
+
+function ff_\type\()_rv40_qpel8_mc10_neon, export=1
+ sub r1, r1, #2
+ mov r3, #8
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ b \type\()_rv40_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc30_neon, export=1
+ sub r1, r1, #2
+ mov r3, #8
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ b \type\()_rv40_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc01_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub r1, r1, r2, lsl #1
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc11_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc21_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vmov.i8 d0, #52
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc31_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vswp d0, d1
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc12_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vmov.i8 d0, #20
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc22_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc32_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vmov.i8 d1, #20
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc03_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub r1, r1, r2, lsl #1
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc33_neon, export=1
+ mov r3, #8
+ b X(ff_\type\()_pixels8_xy2_neon)
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc13_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vswp d0, d1
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel8_mc23_neon, export=1
+ push {r4, lr}
+ vpush {d8-d15}
+ sub sp, sp, #14*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ mov r3, #12
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ add r1, sp, #7
+ bic r1, r1, #7
+ vmov.i8 d1, #52
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #14*8
+ vpop {d8-d15}
+ pop {r4, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc10_neon, export=1
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+.L\type\()_rv40_qpel16_h:
+ push {r1, lr}
+ sub r1, r1, #2
+ mov r3, #16
+ bl \type\()_rv40_qpel8_h_lowpass_neon
+ pop {r1, lr}
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #6
+ mov r3, #16
+ b \type\()_rv40_qpel8_h_lowpass_neon
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc30_neon, export=1
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ b .L\type\()_rv40_qpel16_h
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc01_neon, export=1
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+.L\type\()_rv40_qpel16_v:
+ sub r1, r1, r2, lsl #1
+ push {r1, lr}
+ vpush {d8-d15}
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ sub r1, r1, r2, lsl #2
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ ldr r1, [sp, #64]
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ add r1, r1, #8
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ sub r1, r1, r2, lsl #2
+ bl \type\()_rv40_qpel8_v_lowpass_neon
+ vpop {d8-d15}
+ pop {r1, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc11_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+.L\type\()_rv40_qpel16_v_s6:
+ add r1, sp, #7
+ bic r1, r1, #7
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ sub r1, r1, #40
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ sub r1, r1, #40
+ bl \type\()_rv40_qpel8_v_lp_packed_s6_neon
+ add sp, sp, #44*8
+ vpop {d8-d15}
+ pop {r1, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc21_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ vmov.i8 d0, #52
+ b .L\type\()_rv40_qpel16_v_s6
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc31_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ vswp d0, d1
+ b .L\type\()_rv40_qpel16_v_s6
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc12_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ vmov.i8 d0, #20
+.L\type\()_rv40_qpel16_v_s5:
+ add r1, sp, #7
+ bic r1, r1, #7
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ sub r1, r1, #40
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ sub r0, r0, r2, lsl #4
+ add r0, r0, #8
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ sub r1, r1, #40
+ bl \type\()_rv40_qpel8_v_lp_packed_s5_neon
+ add sp, sp, #44*8
+ vpop {d8-d15}
+ pop {r1, pc}
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc22_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ b .L\type\()_rv40_qpel16_v_s5
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc32_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ vmov.i8 d1, #20
+ b .L\type\()_rv40_qpel16_v_s5
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc03_neon, export=1
+ vmov.i8 d0, #20
+ vmov.i8 d1, #52
+ b .L\type\()_rv40_qpel16_v
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc13_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #52
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s6_neon
+ vswp d0, d1
+ b .L\type\()_rv40_qpel16_v_s6
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc23_neon, export=1
+ sub r1, r1, r2, lsl #1
+ sub r1, r1, #2
+ push {r1, lr}
+ vpush {d8-d15}
+ sub sp, sp, #44*8
+ add r12, sp, #7
+ bic r12, r12, #7
+ mov r3, #20
+ vmov.i8 d0, #20
+ vmov.i8 d1, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ ldr r1, [sp, #416]
+ add r1, r1, #8
+ mov r3, #20
+ bl put_rv40_qpel8_h_lp_packed_s5_neon
+ vmov.i8 d1, #52
+ b .L\type\()_rv40_qpel16_v_s6
+endfunc
+
+function ff_\type\()_rv40_qpel16_mc33_neon, export=1
+ mov r3, #16
+ b X(ff_\type\()_pixels16_xy2_neon)
+endfunc
+.endm
+
+ rv40_qpel put
+ rv40_qpel avg
+
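+/* Weighted prediction: with w1 in d0[0] and w2 in d0[2] (set up by the
+ * callers below), computes
+ *   dst = ((src1 * w2 >> 9) + (src2 * w1 >> 9) + 16) >> 5
+ * for the bytes held in q1 (src1) and q2 (src2); the result is left in q1. */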
+.macro rv40_weight
+ vmovl.u8 q8, d2
+ vmovl.u8 q9, d3
+ vmovl.u8 q10, d4
+ vmovl.u8 q11, d5
+ vmull.u16 q2, d16, d0[2]
+ vmull.u16 q3, d17, d0[2]
+ vmull.u16 q8, d18, d0[2]
+ vmull.u16 q9, d19, d0[2]
+ vmull.u16 q12, d20, d0[0]
+ vmull.u16 q13, d21, d0[0]
+ vmull.u16 q14, d22, d0[0]
+ vmull.u16 q15, d23, d0[0]
+ vshrn.i32 d4, q2, #9
+ vshrn.i32 d5, q3, #9
+ vshrn.i32 d6, q8, #9
+ vshrn.i32 d7, q9, #9
+ vshrn.i32 d16, q12, #9
+ vshrn.i32 d17, q13, #9
+ vshrn.i32 d18, q14, #9
+ vshrn.i32 d19, q15, #9
+ vadd.u16 q2, q2, q8
+ vadd.u16 q3, q3, q9
+ vrshrn.i16 d2, q2, #5
+ vrshrn.i16 d3, q3, #5
+.endm
+
+/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int w1, int w2, ptrdiff_t stride) */
+function ff_rv40_weight_func_16_neon, export=1
+ ldr r12, [sp]
+ vmov d0, r3, r12
+ ldr r12, [sp, #4]
+ mov r3, #16
+1:
+ vld1.8 {q1}, [r1,:128], r12
+ vld1.8 {q2}, [r2,:128], r12
+ rv40_weight
+ vst1.8 {q1}, [r0,:128], r12
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+ int w1, int w2, ptrdiff_t stride) */
+function ff_rv40_weight_func_8_neon, export=1
+ ldr r12, [sp]
+ vmov d0, r3, r12
+ ldr r12, [sp, #4]
+ mov r3, #8
+1:
+ vld1.8 {d2}, [r1,:64], r12
+ vld1.8 {d3}, [r1,:64], r12
+ vld1.8 {d4}, [r2,:64], r12
+ vld1.8 {d5}, [r2,:64], r12
+ rv40_weight
+ vst1.8 {d2}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ subs r3, r3, #2
+ bne 1b
+ bx lr
+endfunc
+
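+/* Loop-filter strength for one horizontal 4-pixel edge: stores the p1/q1
+ * filter flags through the two int pointers passed on the stack and, when
+ * the edge argument is non-zero, returns the strong-filter decision in r0
+ * (zero otherwise). The early exit at 1: covers the case where the two rows
+ * adjacent to the edge are identical. */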
+function ff_rv40_h_loop_filter_strength_neon, export=1
+ pkhbt r2, r3, r2, lsl #18
+
+ ldr r3, [r0]
+ ldr_dpre r12, r0, r1
+ teq r3, r12
+ beq 1f
+
+ sub r0, r0, r1, lsl #1
+
+ vld1.32 {d4[]}, [r0,:32], r1 @ -3
+ vld1.32 {d0[]}, [r0,:32], r1 @ -2
+ vld1.32 {d4[1]}, [r0,:32], r1 @ -1
+ vld1.32 {d5[]}, [r0,:32], r1 @ 0
+ vld1.32 {d1[]}, [r0,:32], r1 @ 1
+ vld1.32 {d5[0]}, [r0,:32], r1 @ 2
+
+ vpaddl.u8 q8, q0 @ -2, -2, -2, -2, 1, 1, 1, 1
+ vpaddl.u8 q9, q2 @ -3, -3, -1, -1, 2, 2, 0, 0
+ vdup.32 d30, r2 @ beta2, beta << 2
+ vpadd.u16 d16, d16, d17 @ -2, -2, 1, 1
+ vpadd.u16 d18, d18, d19 @ -3, -1, 2, 0
+ vabd.u16 d16, d18, d16
+ vclt.u16 d16, d16, d30
+
+ ldrd r2, r3, [sp, #4]
+ vmovl.u16 q12, d16
+ vtrn.16 d16, d17
+ vshr.u32 q12, q12, #15
+ ldr r0, [sp]
+ vst1.32 {d24[1]}, [r2,:32]
+ vst1.32 {d25[1]}, [r3,:32]
+
+ cmp r0, #0
+ it eq
+ bxeq lr
+
+ vand d18, d16, d17
+ vtrn.32 d18, d19
+ vand d18, d18, d19
+ vmov.u16 r0, d18[0]
+ bx lr
+1:
+ ldrd r2, r3, [sp, #4]
+ mov r0, #0
+ str r0, [r2]
+ str r0, [r3]
+ bx lr
+endfunc
+
+function ff_rv40_v_loop_filter_strength_neon, export=1
+ sub r0, r0, #3
+ pkhbt r2, r3, r2, lsl #18
+
+ vld1.8 {d0}, [r0], r1
+ vld1.8 {d1}, [r0], r1
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d3}, [r0], r1
+
+ vaddl.u8 q0, d0, d1
+ vaddl.u8 q1, d2, d3
+ vdup.32 q15, r2
+ vadd.u16 q0, q0, q1 @ -3, -2, -1, 0, 1, 2
+ vext.16 q1, q0, q0, #1 @ -2, -1, 0, 1, 2
+ vabd.u16 q0, q1, q0
+ vclt.u16 q0, q0, q15
+
+ ldrd r2, r3, [sp, #4]
+ vmovl.u16 q1, d0
+ vext.16 d1, d0, d1, #3
+ vshr.u32 q1, q1, #15
+ ldr r0, [sp]
+ vst1.32 {d2[1]}, [r2,:32]
+ vst1.32 {d3[1]}, [r3,:32]
+
+ cmp r0, #0
+ it eq
+ bxeq lr
+
+ vand d0, d0, d1
+ vtrn.16 d0, d1
+ vand d0, d0, d1
+ vmov.u16 r0, d0[0]
+ bx lr
+endfunc
+
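+/* RV40 weak deblocking across one 4-pixel edge. Expects the six source
+ * rows/columns in d0/d1/d4/d5 as loaded by the callers below (d4 = -3,-1,
+ * d5 = 2,0, d0 = -2, d1 = 1). p0/q0 receive a correction clipped to
+ * +/-lim_p0q0; p1/q1 are corrected within lim_p1/lim_q1 only where the
+ * filter_p1/filter_q1 flags allow it. The filtered -2..1 samples are
+ * returned in d5[0], d4[0], d4[1], d5[1] for the callers to store. */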
+.macro rv40_weak_loop_filter
+ vdup.16 d30, r2 @ filter_p1
+ vdup.16 d31, r3 @ filter_q1
+ ldrd r2, r3, [sp]
+ vdup.16 d28, r2 @ alpha
+ vdup.16 d29, r3 @ beta
+ ldr r12, [sp, #8]
+ vdup.16 d25, r12 @ lim_p0q0
+ ldrd r2, r3, [sp, #12]
+ vsubl.u8 q9, d5, d4 @ x, t
+ vabdl.u8 q8, d5, d4 @ x, abs(t)
+ vneg.s16 q15, q15
+ vceq.i16 d16, d19, #0 @ !t
+ vshl.s16 d19, d19, #2 @ t << 2
+ vmul.u16 d18, d17, d28 @ alpha * abs(t)
+ vand d24, d30, d31 @ filter_p1 & filter_q1
+ vsubl.u8 q1, d0, d4 @ p1p2, p1p0
+ vsubl.u8 q3, d1, d5 @ q1q2, q1q0
+ vmov.i16 d22, #3
+ vshr.u16 d18, d18, #7
+ vadd.i16 d22, d22, d24 @ 3 - (filter_p1 & filter_q1)
+ vsubl.u8 q10, d0, d1 @ src[-2] - src[1]
+ vcle.u16 d18, d18, d22
+ vand d20, d20, d24
+ vneg.s16 d23, d25 @ -lim_p0q0
+ vadd.s16 d19, d19, d20
+ vbic d16, d18, d16 @ t && u <= 3 - (fp1 & fq1)
+ vtrn.32 d4, d5 @ -3, 2, -1, 0
+ vrshr.s16 d19, d19, #3
+ vmov d28, d29 @ beta
+ vswp d3, d6 @ q1q2, p1p0
+ vmin.s16 d19, d19, d25
+ vand d30, d30, d16
+ vand d31, d31, d16
+ vadd.s16 q10, q1, q3 @ p1p2 + p1p0, q1q2 + q1q0
+ vmax.s16 d19, d19, d23 @ diff
+ vabs.s16 q1, q1 @ abs(p1p2), abs(q1q2)
+ vand d18, d19, d16 @ diff
+ vcle.u16 q1, q1, q14
+ vneg.s16 d19, d18 @ -diff
+ vdup.16 d26, r3 @ lim_p1
+ vaddw.u8 q2, q9, d5 @ src[-1]+diff, src[0]-diff
+ vhsub.s16 q11, q10, q9
+ vand q1, q1, q15
+ vqmovun.s16 d4, q2 @ -1, 0
+ vand q9, q11, q1
+ vdup.16 d27, r2 @ lim_q1
+ vneg.s16 q9, q9
+ vneg.s16 q14, q13
+ vmin.s16 q9, q9, q13
+ vtrn.32 d0, d1 @ -2, 1, -2, 1
+ vmax.s16 q9, q9, q14
+ vaddw.u8 q3, q9, d0
+ vqmovun.s16 d5, q3 @ -2, 1
+.endm
+
+function ff_rv40_h_weak_loop_filter_neon, export=1
+ sub r0, r0, r1, lsl #1
+ sub r0, r0, r1
+
+ vld1.32 {d4[]}, [r0,:32], r1
+ vld1.32 {d0[]}, [r0,:32], r1
+ vld1.32 {d4[1]}, [r0,:32], r1
+ vld1.32 {d5[]}, [r0,:32], r1
+ vld1.32 {d1[]}, [r0,:32], r1
+ vld1.32 {d5[0]}, [r0,:32]
+
+ sub r0, r0, r1, lsl #2
+
+ rv40_weak_loop_filter
+
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+
+ bx lr
+endfunc
+
+function ff_rv40_v_weak_loop_filter_neon, export=1
+ sub r12, r0, #3
+ sub r0, r0, #2
+
+ vld1.8 {d4}, [r12], r1
+ vld1.8 {d5}, [r12], r1
+ vld1.8 {d2}, [r12], r1
+ vld1.8 {d3}, [r12], r1
+
+ vtrn.16 q2, q1
+ vtrn.8 d4, d5
+ vtrn.8 d2, d3
+
+ vrev64.32 d5, d5
+ vtrn.32 q2, q1
+ vdup.32 d0, d3[0]
+ vdup.32 d1, d2[0]
+
+ rv40_weak_loop_filter
+
+ vtrn.32 q2, q3
+ vswp d4, d5
+
+ vst4.8 {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
+ vst4.8 {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
+ vst4.8 {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
+ vst4.8 {d4[3],d5[3],d6[3],d7[3]}, [r0], r1
+
+ bx lr
+endfunc
diff --git a/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
new file mode 100644
index 0000000..4da7967
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/sbrdsp_init_arm.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/sbrdsp.h"
+
+void ff_sbr_sum64x5_neon(float *z);
+float ff_sbr_sum_square_neon(float (*x)[2], int n);
+void ff_sbr_neg_odd_64_neon(float *x);
+void ff_sbr_qmf_pre_shuffle_neon(float *z);
+void ff_sbr_qmf_post_shuffle_neon(float W[32][2], const float *z);
+void ff_sbr_qmf_deint_neg_neon(float *v, const float *src);
+void ff_sbr_qmf_deint_bfly_neon(float *v, const float *src0, const float *src1);
+void ff_sbr_hf_g_filt_neon(float (*Y)[2], const float (*X_high)[40][2],
+ const float *g_filt, int m_max, intptr_t ixh);
+void ff_sbr_hf_gen_neon(float (*X_high)[2], const float (*X_low)[2],
+ const float alpha0[2], const float alpha1[2],
+ float bw, int start, int end);
+void ff_sbr_autocorrelate_neon(const float x[40][2], float phi[3][2][2]);
+
+void ff_sbr_hf_apply_noise_0_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_1_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_2_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+void ff_sbr_hf_apply_noise_3_neon(float Y[64][2], const float *s_m,
+ const float *q_filt, int noise,
+ int kx, int m_max);
+
+av_cold void ff_sbrdsp_init_arm(SBRDSPContext *s)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ s->sum64x5 = ff_sbr_sum64x5_neon;
+ s->sum_square = ff_sbr_sum_square_neon;
+ s->neg_odd_64 = ff_sbr_neg_odd_64_neon;
+ s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_neon;
+ s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_neon;
+ s->qmf_deint_neg = ff_sbr_qmf_deint_neg_neon;
+ s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_neon;
+ s->hf_g_filt = ff_sbr_hf_g_filt_neon;
+ s->hf_gen = ff_sbr_hf_gen_neon;
+ s->autocorrelate = ff_sbr_autocorrelate_neon;
+ s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_neon;
+ s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_neon;
+ s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_neon;
+ s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/sbrdsp_neon.S b/ffmpeg/libavcodec/arm/sbrdsp_neon.S
new file mode 100644
index 0000000..610397f
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/sbrdsp_neon.S
@@ -0,0 +1,411 @@
+/*
+ * Copyright (c) 2012 Mans Rullgard
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
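+/* z[i] += z[i+64] + z[i+128] + z[i+192] + z[i+256] for i = 0..63 */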
+function ff_sbr_sum64x5_neon, export=1
+ push {lr}
+ add r1, r0, # 64*4
+ add r2, r0, #128*4
+ add r3, r0, #192*4
+ add lr, r0, #256*4
+ mov r12, #64
+1:
+ vld1.32 {q0}, [r0,:128]
+ vld1.32 {q1}, [r1,:128]!
+ vadd.f32 q0, q0, q1
+ vld1.32 {q2}, [r2,:128]!
+ vadd.f32 q0, q0, q2
+ vld1.32 {q3}, [r3,:128]!
+ vadd.f32 q0, q0, q3
+ vld1.32 {q8}, [lr,:128]!
+ vadd.f32 q0, q0, q8
+ vst1.32 {q0}, [r0,:128]!
+ subs r12, #4
+ bgt 1b
+ pop {pc}
+endfunc
+
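+/* Sums x[i][0]^2 + x[i][1]^2 over n complex values; the result is left in
+ * d0[0] (s0) and, for softfp callers, also moved to r0. */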
+function ff_sbr_sum_square_neon, export=1
+ vmov.f32 q0, #0.0
+1:
+ vld1.32 {q1}, [r0,:128]!
+ vmla.f32 q0, q1, q1
+ subs r1, r1, #2
+ bgt 1b
+ vadd.f32 d0, d0, d1
+ vpadd.f32 d0, d0, d0
+NOVFP vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
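+/* Negates the odd-indexed (imaginary) floats of a 64-float buffer by
+ * flipping their sign bits. */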
+function ff_sbr_neg_odd_64_neon, export=1
+ mov r1, r0
+ vmov.i32 q8, #1<<31
+ vld2.32 {q0,q1}, [r0,:128]!
+ veor q1, q1, q8
+ vld2.32 {q2,q3}, [r0,:128]!
+ .rept 3
+ vst2.32 {q0,q1}, [r1,:128]!
+ veor q3, q3, q8
+ vld2.32 {q0,q1}, [r0,:128]!
+ vst2.32 {q2,q3}, [r1,:128]!
+ veor q1, q1, q8
+ vld2.32 {q2,q3}, [r0,:128]!
+ .endr
+ veor q3, q3, q8
+ vst2.32 {q0,q1}, [r1,:128]!
+ vst2.32 {q2,q3}, [r1,:128]!
+ bx lr
+endfunc
+
+function ff_sbr_qmf_pre_shuffle_neon, export=1
+ add r1, r0, #60*4
+ add r2, r0, #64*4
+ vld1.32 {d0}, [r0,:64]!
+ vst1.32 {d0}, [r2,:64]!
+ mov r3, #-16
+ mov r12, #24
+ vmov.i32 q8, #1<<31
+ vld1.32 {q0}, [r1,:128], r3
+ vld1.32 {d2}, [r0,:64]!
+1:
+ vld1.32 {d3,d4}, [r0,:128]!
+ vrev64.32 q0, q0
+ vld1.32 {q9}, [r1,:128], r3
+ veor q0, q0, q8
+ vld1.32 {d5,d6}, [r0,:128]!
+ vswp d0, d1
+ vrev64.32 q9, q9
+ vst2.32 {q0,q1}, [r2,:64]!
+ vmov q10, q2
+ veor q9, q9, q8
+ vmov d2, d6
+ vswp d18, d19
+ vld1.32 {q0}, [r1,:128], r3
+ vst2.32 {q9,q10}, [r2,:64]!
+ subs r12, r12, #8
+ bgt 1b
+ vld1.32 {d3,d4}, [r0,:128]!
+ vrev64.32 q0, q0
+ vld1.32 {q9}, [r1,:128], r3
+ veor q0, q0, q8
+ vld1.32 {d5}, [r0,:64]!
+ vswp d0, d1
+ vrev64.32 q9, q9
+ vst2.32 {q0,q1}, [r2,:64]!
+ vswp d4, d5
+ veor q1, q9, q8
+ vst2.32 {d3,d5}, [r2,:64]!
+ vst2.32 {d2[0],d4[0]}, [r2,:64]!
+ bx lr
+endfunc
+
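+/* W[k][0] = -z[63-k], W[k][1] = z[k] for k = 0..31; the sign flip is done
+ * by XORing with the 1<<31 mask in q8. */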
+function ff_sbr_qmf_post_shuffle_neon, export=1
+ add r2, r1, #60*4
+ mov r3, #-16
+ mov r12, #32
+ vmov.i32 q8, #1<<31
+ vld1.32 {q0}, [r2,:128], r3
+ vld1.32 {q1}, [r1,:128]!
+1:
+ pld [r2, #-32]
+ vrev64.32 q0, q0
+ vswp d2, d3
+ veor q0, q0, q8
+ vld1.32 {q2}, [r2,:128], r3
+ vld1.32 {q3}, [r1,:128]!
+ vst2.32 {d1,d3}, [r0,:128]!
+ vst2.32 {d0,d2}, [r0,:128]!
+ pld [r2, #-32]
+ vrev64.32 q2, q2
+ vswp d6, d7
+ veor q2, q2, q8
+ vld1.32 {q0}, [r2,:128], r3
+ vld1.32 {q1}, [r1,:128]!
+ vst2.32 {d5,d7}, [r0,:128]!
+ vst2.32 {d4,d6}, [r0,:128]!
+ subs r12, r12, #8
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_qmf_deint_neg_neon, export=1
+ add r1, r1, #60*4
+ add r2, r0, #62*4
+ mov r3, #-16
+ mov r12, #32
+ vmov.i32 d2, #1<<31
+1:
+ vld2.32 {d0,d1}, [r1,:128], r3
+ veor d0, d0, d2
+ vrev64.32 d1, d1
+ vst1.32 {d0}, [r2,:64]
+ vst1.32 {d1}, [r0,:64]!
+ sub r2, r2, #8
+ subs r12, r12, #2
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_qmf_deint_bfly_neon, export=1
+ push {lr}
+ add r2, r2, #60*4
+ add r3, r0, #124*4
+ mov r12, #64
+ mov lr, #-16
+1:
+ vld1.32 {q0}, [r1,:128]!
+ vld1.32 {q1}, [r2,:128], lr
+ vrev64.32 q2, q0
+ vrev64.32 q3, q1
+ vadd.f32 d3, d4, d3
+ vadd.f32 d2, d5, d2
+ vsub.f32 d0, d0, d7
+ vsub.f32 d1, d1, d6
+ vst1.32 {q1}, [r3,:128], lr
+ vst1.32 {q0}, [r0,:128]!
+ subs r12, r12, #4
+ bgt 1b
+ pop {pc}
+endfunc
+
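+/* Y[m] = X_high[m][ixh] * g_filt[m] for m_max complex values; consecutive m
+ * are 40 complex entries (40*2*4 bytes) apart in X_high. */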
+function ff_sbr_hf_g_filt_neon, export=1
+ ldr r12, [sp]
+ add r1, r1, r12, lsl #3
+ mov r12, #40*2*4
+ sub r3, r3, #1
+ vld2.32 {d2[],d3[]},[r2,:64]!
+ vld1.32 {d0}, [r1,:64], r12
+1:
+ vld1.32 {d1}, [r1,:64], r12
+ vmul.f32 q3, q0, q1
+ vld2.32 {d2[],d3[]},[r2,:64]!
+ vld1.32 {d0}, [r1,:64], r12
+ vst1.32 {q3}, [r0,:64]!
+ subs r3, r3, #2
+ bgt 1b
+ it lt
+ bxlt lr
+ vmul.f32 d0, d0, d2
+ vst1.32 {d0}, [r0,:64]!
+ bx lr
+endfunc
+
+function ff_sbr_hf_gen_neon, export=1
+NOVFP vld1.32 {d1[]}, [sp,:32]
+VFP vdup.32 d1, d0[0]
+ vmul.f32 d0, d1, d1
+ vld1.32 {d3}, [r2,:64]
+ vld1.32 {d2}, [r3,:64]
+ vmul.f32 q0, q0, q1
+ ldrd r2, r3, [sp, #4*!HAVE_VFP_ARGS]
+ vtrn.32 d0, d1
+ vneg.f32 d18, d1
+ vtrn.32 d18, d1
+ add r0, r0, r2, lsl #3
+ add r1, r1, r2, lsl #3
+ sub r1, r1, #2*8
+ sub r3, r3, r2
+ vld1.32 {q1}, [r1,:128]!
+1:
+ vld1.32 {q3}, [r1,:128]!
+ vrev64.32 q2, q1
+ vmov q8, q3
+ vrev64.32 d20, d3
+ vrev64.32 d21, d6
+ vmla.f32 q3, q1, d0[0]
+ vmla.f32 d6, d4, d18
+ vmla.f32 d7, d20, d18
+ vmla.f32 d6, d3, d0[1]
+ vmla.f32 d7, d16, d0[1]
+ vmla.f32 d6, d5, d1
+ vmla.f32 d7, d21, d1
+ vmov q1, q8
+ vst1.32 {q3}, [r0,:128]!
+ subs r3, r3, #2
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_sbr_autocorrelate_neon, export=1
+ vld1.32 {q0}, [r0,:128]!
+ vmov.f32 q1, #0.0
+ vmov.f32 q3, #0.0
+ vmov.f32 d20, #0.0
+ vmul.f32 d21, d1, d1
+ vmov q8, q0
+ vmov q11, q0
+ mov r12, #36
+1:
+ vld1.32 {q2}, [r0,:128]!
+ vrev64.32 q12, q2
+ vmla.f32 q10, q2, q2
+ vmla.f32 d2, d1, d4
+ vmla.f32 d3, d1, d24
+ vmla.f32 d6, d0, d4
+ vmla.f32 d7, d0, d24
+ vmla.f32 d2, d4, d5
+ vmla.f32 d3, d4, d25
+ vmla.f32 d6, d1, d5
+ vmla.f32 d7, d1, d25
+ vmov q0, q2
+ subs r12, r12, #2
+ bgt 1b
+ vld1.32 {q2}, [r0,:128]!
+ vrev64.32 q12, q2
+ vmla.f32 d2, d1, d4
+ vmla.f32 d3, d1, d24
+ vmla.f32 d6, d0, d4
+ vmla.f32 d7, d0, d24
+ vadd.f32 d20, d20, d21
+ vrev64.32 d18, d17
+ vmla.f32 d6, d1, d5
+ vmla.f32 d7, d1, d25
+ vmov q0, q1
+ vmla.f32 d0, d16, d17
+ vmla.f32 d1, d16, d18
+ vmla.f32 d2, d4, d5
+ vmla.f32 d3, d4, d25
+ vneg.f32 s15, s15
+ vmov d21, d20
+ vpadd.f32 d0, d0, d2
+ vpadd.f32 d7, d6, d7
+ vtrn.32 d1, d3
+ vsub.f32 d6, d1, d3
+ vmla.f32 d20, d22, d22
+ vmla.f32 d21, d4, d4
+ vtrn.32 d0, d6
+ vpadd.f32 d20, d20, d21
+ vst1.32 {q3}, [r1,:128]!
+ vst1.32 {d20[1]}, [r1,:32]
+ add r1, r1, #2*4
+ vst1.32 {d0}, [r1,:64]
+ add r1, r1, #4*4
+ vst1.32 {d20[0]}, [r1,:32]
+ bx lr
+endfunc
+
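+/* The four hf_apply_noise variants share these loops and differ only in the
+ * sign mask set up in d3, which is XORed with s_m. Per sample, if s_m[m] is
+ * zero, q_filt[m] times the ff_sbr_noise_table entry (index advancing
+ * mod 512) is added to Y[m]; otherwise the sign-adjusted s_m[m] is added to
+ * the real (variants 0/2) or imaginary (variants 1/3) part. */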
+function ff_sbr_hf_apply_noise_0_neon, export=1
+ vmov.i32 d3, #0
+.Lhf_apply_noise_0:
+ push {r4,lr}
+ movrelx r4, X(ff_sbr_noise_table)
+ ldr r12, [sp, #12]
+ add r3, r3, #1
+ bfc r3, #9, #23
+ sub r12, r12, #1
+1:
+ add lr, r4, r3, lsl #3
+ vld2.32 {q0}, [r0,:64]
+ vld2.32 {q3}, [lr,:64]
+ vld1.32 {d2}, [r1,:64]!
+ vld1.32 {d18}, [r2,:64]!
+ vceq.f32 d16, d2, #0
+ veor d2, d2, d3
+ vmov q2, q0
+ vmla.f32 d0, d6, d18
+ vmla.f32 d1, d7, d18
+ vadd.f32 d4, d4, d2
+ add r3, r3, #2
+ bfc r3, #9, #23
+ vbif d0, d4, d16
+ vbif d1, d5, d16
+ vst2.32 {q0}, [r0,:64]!
+ subs r12, r12, #2
+ bgt 1b
+ blt 2f
+ add lr, r4, r3, lsl #3
+ vld1.32 {d0}, [r0,:64]
+ vld1.32 {d6}, [lr,:64]
+ vld1.32 {d2[]}, [r1,:32]!
+ vld1.32 {d3[]}, [r2,:32]!
+ vceq.f32 d4, d2, #0
+ veor d2, d2, d3
+ vmov d1, d0
+ vmla.f32 d0, d6, d3
+ vadd.f32 s2, s2, s4
+ vbif d0, d1, d4
+ vst1.32 {d0}, [r0,:64]!
+2:
+ pop {r4,pc}
+endfunc
+
+function ff_sbr_hf_apply_noise_1_neon, export=1
+ ldr r12, [sp]
+ push {r4,lr}
+ lsl r12, r12, #31
+ eor lr, r12, #1<<31
+ vmov d3, r12, lr
+.Lhf_apply_noise_1:
+ movrelx r4, X(ff_sbr_noise_table)
+ ldr r12, [sp, #12]
+ add r3, r3, #1
+ bfc r3, #9, #23
+ sub r12, r12, #1
+1:
+ add lr, r4, r3, lsl #3
+ vld2.32 {q0}, [r0,:64]
+ vld2.32 {q3}, [lr,:64]
+ vld1.32 {d2}, [r1,:64]!
+ vld1.32 {d18}, [r2,:64]!
+ vceq.f32 d16, d2, #0
+ veor d2, d2, d3
+ vmov q2, q0
+ vmla.f32 d0, d6, d18
+ vmla.f32 d1, d7, d18
+ vadd.f32 d5, d5, d2
+ add r3, r3, #2
+ bfc r3, #9, #23
+ vbif d0, d4, d16
+ vbif d1, d5, d16
+ vst2.32 {q0}, [r0,:64]!
+ subs r12, r12, #2
+ bgt 1b
+ blt 2f
+ add lr, r4, r3, lsl #3
+ vld1.32 {d0}, [r0,:64]
+ vld1.32 {d6}, [lr,:64]
+ vld1.32 {d2[]}, [r1,:32]!
+ vld1.32 {d18[]}, [r2,:32]!
+ vceq.f32 d4, d2, #0
+ veor d2, d2, d3
+ vmov d1, d0
+ vmla.f32 d0, d6, d18
+ vadd.f32 s3, s3, s5
+ vbif d0, d1, d4
+ vst1.32 {d0}, [r0,:64]!
+2:
+ pop {r4,pc}
+endfunc
+
+function ff_sbr_hf_apply_noise_2_neon, export=1
+ vmov.i32 d3, #1<<31
+ b .Lhf_apply_noise_0
+endfunc
+
+function ff_sbr_hf_apply_noise_3_neon, export=1
+ ldr r12, [sp]
+ push {r4,lr}
+ lsl r12, r12, #31
+ eor lr, r12, #1<<31
+ vmov d3, lr, r12
+ b .Lhf_apply_noise_1
+endfunc
diff --git a/ffmpeg/libavcodec/arm/simple_idct_arm.S b/ffmpeg/libavcodec/arm/simple_idct_arm.S
new file mode 100644
index 0000000..dd1c815
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/simple_idct_arm.S
@@ -0,0 +1,479 @@
+/*
+ * Copyright (C) 2002 Frederic 'dilb' Boulay
+ *
+ * Author: Frederic Boulay <dilb@handhelds.org>
+ *
+ * The function defined in this file is derived from the simple_idct function
+ * from the libavcodec library part of the FFmpeg project.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+/* useful constants for the algorithm */
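+/* W1..W7 = round(sqrt(2) * cos(i*pi/16) * (1 << 14)); W4 is stored as 16383
+ * rather than the rounded 16384. */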
+#define W1 22725
+#define W2 21407
+#define W3 19266
+#define W4 16383
+#define W5 12873
+#define W6 8867
+#define W7 4520
+#define MASK_MSHW 0xFFFF0000
+
+#define ROW_SHIFT 11
+#define ROW_SHIFT2MSHW (16-11)
+#define COL_SHIFT 20
+#define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
+#define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
+
+
+function ff_simple_idct_arm, export=1
+ @@ void simple_idct_arm(int16_t *block)
+ @@ save stack for reg needed (take all of them),
+ @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
+ @@ so it must not be overwritten, if it is not saved!!
+ @@ R12 is another scratch register, so it should not be saved too
+ @@ save all registers
+ stmfd sp!, {r4-r11, r14} @ R14 is also called LR
+ @@ at this point, R0=block, other registers are free.
+ add r14, r0, #112 @ R14=&block[8*7]; start from the last row and work down to row 0, i.e. until R14=block.
+ @@ add 2 temporary variables in the stack: R0 and R14
+ sub sp, sp, #8 @ allow 2 local variables
+ str r0, [sp, #0] @ save block in sp[0]
+ @@ stack status
+ @@ sp+4 free
+ @@ sp+0 R0 (block)
+
+
+ @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
+
+
+__row_loop:
+ @@ Read the row and check whether it is null, almost null, or neither. According to the StrongARM specs it is not necessary to optimize the ldr accesses (i.e. to split each 32-bit load into two 16-bit ones), and keeping them whole leaves more usable registers :)
+ ldr r1, [r14, #0] @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
+ ldr r2, [r14, #4] @ R2=(int32)(R12)[1]=ROWr32[1]
+ ldr r3, [r14, #8] @ R3=ROWr32[2]
+ ldr r4, [r14, #12] @ R4=ROWr32[3]
+ @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
+ @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
+ @@ else follow the complete algorithm.
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+ @@ R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
+ orr r5, r4, r3 @ R5=R4 | R3
+ orr r5, r5, r2 @ R5=R4 | R3 | R2
+ orrs r6, r5, r1 @ Test R5 | R1 (the aim is to check if everything is null)
+ beq __end_row_loop
+ mov r7, r1, asr #16 @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
+ ldrsh r6, [r14, #0] @ R6=ROWr16[0]
+ orrs r5, r5, r7 @ R5=R4 | R3 | R2 | R7
+ beq __almost_empty_row
+
+__b_evaluation:
+ @@ at this point, R0=block (temp), R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
+
+ @@ MUL16(b0, W1, row[1]);
+ @@ MUL16(b1, W3, row[1]);
+ @@ MUL16(b2, W5, row[1]);
+ @@ MUL16(b3, W7, row[1]);
+ @@ MAC16(b0, W3, row[3]);
+ @@ MAC16(b1, -W7, row[3]);
+ @@ MAC16(b2, -W1, row[3]);
+ @@ MAC16(b3, -W5, row[3]);
+ ldr r8, =W1 @ R8=W1
+ mov r2, r2, asr #16 @ R2=ROWr16[3]
+ mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r9, =W3 @ R9=W3
+ ldr r10, =W5 @ R10=W5
+ mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r11, =W7 @ R11=W7
+ mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if null avoid muls
+ itttt ne
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ rsbne r2, r2, #0 @ R2=-ROWr16[3]
+ mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
+ mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+ @@ if (temp != 0) {}
+ orrs r2, r3, r4 @ R2=ROWr32[2] | ROWr32[3]
+ beq __end_b_evaluation
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ MAC16(b0, W5, row[5]);
+ @@ MAC16(b2, W7, row[5]);
+ @@ MAC16(b3, W3, row[5]);
+ @@ MAC16(b1, -W1, row[5]);
+ @@ MAC16(b0, W7, row[7]);
+ @@ MAC16(b2, W3, row[7]);
+ @@ MAC16(b3, -W1, row[7]);
+ @@ MAC16(b1, -W5, row[7]);
+ mov r3, r3, asr #16 @ R3=ROWr16[5]
+ teq r3, #0 @ if null avoid muls
+ it ne
+ mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5]=b0
+ mov r4, r4, asr #16 @ R4=ROWr16[7]
+ itttt ne
+ mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5]=b2
+ mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5]=b3
+ rsbne r3, r3, #0 @ R3=-ROWr16[5]
+ mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5]=b1
+ @@ R3 is free now
+ teq r4, #0 @ if null avoid muls
+ itttt ne
+ mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7]=b0
+ mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7]=b2
+ rsbne r4, r4, #0 @ R4=-ROWr16[7]
+ mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7]=b3
+ it ne
+ mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7]=b1
+ @@ R4 is free now
+__end_b_evaluation:
+ @@ at this point, R0=b0, R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
+ @@ R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation:
+ @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
+ @@ a1 = a0 + W6 * row[2];
+ @@ a2 = a0 - W6 * row[2];
+ @@ a3 = a0 - W2 * row[2];
+ @@ a0 = a0 + W2 * row[2];
+ ldr r9, =W4 @ R9=W4
+ mul r6, r9, r6 @ R6=W4*ROWr16[0]
+ ldr r10, =W6 @ R10=W6
+ ldrsh r4, [r14, #4] @ R4=ROWr16[2] (a3 not defined yet)
+ add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
+
+ mul r11, r10, r4 @ R11=W6*ROWr16[2]
+ ldr r8, =W2 @ R8=W2
+ sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+ @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
+ @@ if (temp != 0) {}
+ teq r2, #0
+ beq __end_bef_a_evaluation
+
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+
+ @@ a0 += W4*row[4]
+ @@ a1 -= W4*row[4]
+ @@ a2 -= W4*row[4]
+ @@ a3 += W4*row[4]
+ ldrsh r11, [r14, #8] @ R11=ROWr16[4]
+ teq r11, #0 @ if null avoid muls
+ it ne
+ mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+ @@ R9 is free now
+ ldrsh r9, [r14, #12] @ R9=ROWr16[6]
+ itttt ne
+ addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+ subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+ subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+ addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+ @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
+ teq r9, #0 @ if null avoid muls
+ itttt ne
+ mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+ addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+ mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+ @@ a0 += W6*row[6];
+ @@ a3 -= W6*row[6];
+ @@ a1 -= W2*row[6];
+ @@ a2 += W2*row[6];
+ subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
+ subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+ addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+
+__end_a_evaluation:
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ row[0] = (a0 + b0) >> ROW_SHIFT;
+ @@ row[1] = (a1 + b1) >> ROW_SHIFT;
+ @@ row[2] = (a2 + b2) >> ROW_SHIFT;
+ @@ row[3] = (a3 + b3) >> ROW_SHIFT;
+ @@ row[4] = (a3 - b3) >> ROW_SHIFT;
+ @@ row[5] = (a2 - b2) >> ROW_SHIFT;
+ @@ row[6] = (a1 - b1) >> ROW_SHIFT;
+ @@ row[7] = (a0 - b0) >> ROW_SHIFT;
+ add r8, r6, r0 @ R8=a0+b0
+ add r9, r2, r1 @ R9=a1+b1
+ @@ pack two 16-bit halfwords into one 32-bit word
+ @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (little-endian only!)
+ ldr r10, =MASK_MSHW @ R10=0xFFFF0000
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
+ mvn r11, r10 @ R11= NOT R10= 0x0000FFFF
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #0]
+
+ add r8, r3, r5 @ R8=a2+b2
+ add r9, r4, r7 @ R9=a3+b3
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #4]
+
+ sub r8, r4, r7 @ R8=a3-b3
+ sub r9, r3, r5 @ R9=a2-b2
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #8]
+
+ sub r8, r2, r1 @ R8=a1-b1
+ sub r9, r6, r0 @ R9=a0-b0
+ and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
+ and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
+ orr r8, r8, r9
+ str r8, [r14, #12]
+
+ bal __end_row_loop
+
+__almost_empty_row:
+ @@ the row was empty except for ROWr16[0]; handle this special case
+ @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
+ @@ R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
+ @@ R8=0xFFFF (temp), R9-R11 free
+ mov r8, #0x10000 @ build R8=0xFFFF in two steps; saves a ldr of the constant
+ sub r8, r8, #1 @ R8=0xFFFF is now ready
+ and r5, r8, r6, lsl #3 @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
+ orr r5, r5, r5, lsl #16 @ R5=R5 | (R5<<16)
+ str r5, [r14, #0] @ R14[0]=ROWr32[0]=R5
+ str r5, [r14, #4] @ R14[4]=ROWr32[1]=R5
+ str r5, [r14, #8] @ R14[8]=ROWr32[2]=R5
+ str r5, [r14, #12] @ R14[12]=ROWr32[3]=R5
+
+__end_row_loop:
+ @@ at this point, R0-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ ldr r0, [sp, #0] @ R0=block
+ teq r0, r14 @ compare current &block[8*n] with block; once block is reached, the loop is finished
+ sub r14, r14, #16
+ bne __row_loop
+
+
+
+ @@ at this point, R0=block, R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ add r14, r0, #14 @ R14=&block[7]; start from the last column and work down until col=0, i.e. R14=block
+__col_loop:
+
+__b_evaluation2:
+ @@ at this point, R0=block (temp), R1-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ proceed with b0-b3 first, followed by a0-a3
+ @@ MUL16(b0, W1, col[8x1]);
+ @@ MUL16(b1, W3, col[8x1]);
+ @@ MUL16(b2, W5, col[8x1]);
+ @@ MUL16(b3, W7, col[8x1]);
+ @@ MAC16(b0, W3, col[8x3]);
+ @@ MAC16(b1, -W7, col[8x3]);
+ @@ MAC16(b2, -W1, col[8x3]);
+ @@ MAC16(b3, -W5, col[8x3]);
+ ldr r8, =W1 @ R8=W1
+ ldrsh r7, [r14, #16]
+ mul r0, r8, r7 @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r9, =W3 @ R9=W3
+ ldr r10, =W5 @ R10=W5
+ mul r1, r9, r7 @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldr r11, =W7 @ R11=W7
+ mul r5, r10, r7 @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ ldrsh r2, [r14, #48]
+ mul r7, r11, r7 @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
+ teq r2, #0 @ if 0, then avoid muls
+ itttt ne
+ mlane r0, r9, r2, r0 @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ rsbne r2, r2, #0 @ R2=-ROWr16[3]
+ mlane r1, r11, r2, r1 @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ mlane r5, r8, r2, r5 @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+ it ne
+ mlane r7, r10, r2, r7 @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
+
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ MAC16(b0, W5, col[5x8]);
+ @@ MAC16(b2, W7, col[5x8]);
+ @@ MAC16(b3, W3, col[5x8]);
+ @@ MAC16(b1, -W1, col[5x8]);
+ @@ MAC16(b0, W7, col[7x8]);
+ @@ MAC16(b2, W3, col[7x8]);
+ @@ MAC16(b3, -W1, col[7x8]);
+ @@ MAC16(b1, -W5, col[7x8]);
+ ldrsh r3, [r14, #80] @ R3=COLr16[5x8]
+ teq r3, #0 @ if 0 then avoid muls
+ itttt ne
+ mlane r0, r10, r3, r0 @ R0+=W5*ROWr16[5x8]=b0
+ mlane r5, r11, r3, r5 @ R5+=W7*ROWr16[5x8]=b2
+ mlane r7, r9, r3, r7 @ R7+=W3*ROWr16[5x8]=b3
+ rsbne r3, r3, #0 @ R3=-ROWr16[5x8]
+ ldrsh r4, [r14, #112] @ R4=COLr16[7x8]
+ it ne
+ mlane r1, r8, r3, r1 @ R1-=W1*ROWr16[5x8]=b1
+ @@ R3 is free now
+ teq r4, #0 @ if 0 then avoid muls
+ itttt ne
+ mlane r0, r11, r4, r0 @ R0+=W7*ROWr16[7x8]=b0
+ mlane r5, r9, r4, r5 @ R5+=W3*ROWr16[7x8]=b2
+ rsbne r4, r4, #0 @ R4=-ROWr16[7x8]
+ mlane r7, r8, r4, r7 @ R7-=W1*ROWr16[7x8]=b3
+ it ne
+ mlane r1, r10, r4, r1 @ R1-=W5*ROWr16[7x8]=b1
+ @@ R4 is free now
+__end_b_evaluation2:
+ @@ at this point, R0=b0, R1=b1, R2 (free), R3 (free), R4 (free),
+ @@ R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+
+__a_evaluation2:
+ @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
+ @@ a1 = a0 + W6 * row[2];
+ @@ a2 = a0 - W6 * row[2];
+ @@ a3 = a0 - W2 * row[2];
+ @@ a0 = a0 + W2 * row[2];
+ ldrsh r6, [r14, #0]
+ ldr r9, =W4 @ R9=W4
+ mul r6, r9, r6 @ R6=W4*ROWr16[0]
+ ldr r10, =W6 @ R10=W6
+ ldrsh r4, [r14, #32] @ R4=ROWr16[2] (a3 not defined yet)
+ add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
+ mul r11, r10, r4 @ R11=W6*ROWr16[2]
+ ldr r8, =W2 @ R8=W2
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ sub r3, r6, r11 @ R3=a0-W6*ROWr16[2] (a2)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ a0 += W4*row[4]
+ @@ a1 -= W4*row[4]
+ @@ a2 -= W4*row[4]
+ @@ a3 += W4*row[4]
+ ldrsh r11, [r14, #64] @ R11=ROWr16[4]
+ teq r11, #0 @ if null avoid muls
+ itttt ne
+ mulne r11, r9, r11 @ R11=W4*ROWr16[4]
+ @@ R9 is free now
+ addne r6, r6, r11 @ R6+=W4*ROWr16[4] (a0)
+ subne r2, r2, r11 @ R2-=W4*ROWr16[4] (a1)
+ subne r3, r3, r11 @ R3-=W4*ROWr16[4] (a2)
+ ldrsh r9, [r14, #96] @ R9=ROWr16[6]
+ it ne
+ addne r4, r4, r11 @ R4+=W4*ROWr16[4] (a3)
+ @@ W6 is no longer needed; reuse R10 to hold W2*ROWr16[6] instead
+ teq r9, #0 @ if null avoid muls
+ itttt ne
+ mulne r11, r10, r9 @ R11=W6*ROWr16[6]
+ addne r6, r6, r11 @ R6+=W6*ROWr16[6] (a0)
+ mulne r10, r8, r9 @ R10=W2*ROWr16[6]
+ @@ a0 += W6*row[6];
+ @@ a3 -= W6*row[6];
+ @@ a1 -= W2*row[6];
+ @@ a2 += W2*row[6];
+ subne r4, r4, r11 @ R4-=W6*ROWr16[6] (a3)
+ itt ne
+ subne r2, r2, r10 @ R2-=W2*ROWr16[6] (a1)
+ addne r3, r3, r10 @ R3+=W2*ROWr16[6] (a2)
+__end_a_evaluation2:
+ @@ at this point, R0=b0, R1=b1, R2=a1, R3=a2, R4=a3,
+ @@ R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
+ @@ R12=__const_ptr_, R14=&block[n]
+ @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
+ @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
+ @@ col[16] = ((a2 + b2) >> COL_SHIFT);
+ @@ col[24] = ((a3 + b3) >> COL_SHIFT);
+ @@ col[32] = ((a3 - b3) >> COL_SHIFT);
+ @@ col[40] = ((a2 - b2) >> COL_SHIFT);
+ @@ col[48] = ((a1 - b1) >> COL_SHIFT);
+ @@ col[56] = ((a0 - b0) >> COL_SHIFT);
+ @@@@@ no optimization here @@@@@
+ add r8, r6, r0 @ R8=a0+b0
+ add r9, r2, r1 @ R9=a1+b1
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #0]
+ strh r9, [r14, #16]
+ add r8, r3, r5 @ R8=a2+b2
+ add r9, r4, r7 @ R9=a3+b3
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #32]
+ strh r9, [r14, #48]
+ sub r8, r4, r7 @ R8=a3-b3
+ sub r9, r3, r5 @ R9=a2-b2
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #64]
+ strh r9, [r14, #80]
+ sub r8, r2, r1 @ R8=a1-b1
+ sub r9, r6, r0 @ R9=a0-b0
+ mov r8, r8, asr #COL_SHIFT
+ mov r9, r9, asr #COL_SHIFT
+ strh r8, [r14, #96]
+ strh r9, [r14, #112]
+
+__end_col_loop:
+ @@ at this point, R0-R11 (free)
+ @@ R12=__const_ptr_, R14=&block[n]
+ ldr r0, [sp, #0] @ R0=block
+ teq r0, r14 @ compare current &block[n] with block; once block is reached, the loop is finished
+ sub r14, r14, #2
+ bne __col_loop
+
+
+
+
+__end_simple_idct_arm:
+ @@ restore registers to previous status!
+ add sp, sp, #8 @@ the local variables!
+ ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
+
+
+
+@@ out-of-line helper, kept here so the common path is not burdened.
+__end_bef_a_evaluation:
+ add r2, r6, r11 @ R2=a0+W6*ROWr16[2] (a1)
+ mul r11, r8, r4 @ R11=W2*ROWr16[2]
+ sub r4, r6, r11 @ R4=a0-W2*ROWr16[2] (a3)
+ add r6, r6, r11 @ R6=a0+W2*ROWr16[2] (a0)
+ bal __end_a_evaluation
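
For reference, the arithmetic that the row loop above spells out in its @@ comments is the usual even/odd butterfly of the "simple IDCT". The plain-C sketch below is reconstructed from those comments only (W1..W7 and ROW_SHIFT use the same coefficient values that the ARMv5TE/ARMv6 files later in this commit #define; the plain-ARM file defines its own copies near its top, not shown here). It is an illustration of the algorithm, not a verbatim copy of FFmpeg's C reference.

    #include <stdint.h>

    #define W1 22725
    #define W2 21407
    #define W3 19266
    #define W4 16383
    #define W5 12873
    #define W6 8867
    #define W7 4520
    #define ROW_SHIFT 11

    /* One row pass: even part a0..a3, odd part b0..b3, then the 8 outputs. */
    static void idct_row_sketch(int16_t row[8])
    {
        int a0 = W4 * row[0] + (1 << (ROW_SHIFT - 1));
        int a1 = a0 + W6 * row[2] - W4 * row[4] - W2 * row[6];
        int a2 = a0 - W6 * row[2] - W4 * row[4] + W2 * row[6];
        int a3 = a0 - W2 * row[2] + W4 * row[4] - W6 * row[6];
        a0    +=      W2 * row[2] + W4 * row[4] + W6 * row[6];

        int b0 = W1 * row[1] + W3 * row[3] + W5 * row[5] + W7 * row[7];
        int b1 = W3 * row[1] - W7 * row[3] - W1 * row[5] - W5 * row[7];
        int b2 = W5 * row[1] - W1 * row[3] + W7 * row[5] + W3 * row[7];
        int b3 = W7 * row[1] - W5 * row[3] + W3 * row[5] - W1 * row[7];

        row[0] = (a0 + b0) >> ROW_SHIFT;
        row[1] = (a1 + b1) >> ROW_SHIFT;
        row[2] = (a2 + b2) >> ROW_SHIFT;
        row[3] = (a3 + b3) >> ROW_SHIFT;
        row[4] = (a3 - b3) >> ROW_SHIFT;
        row[5] = (a2 - b2) >> ROW_SHIFT;
        row[6] = (a1 - b1) >> ROW_SHIFT;
        row[7] = (a0 - b0) >> ROW_SHIFT;
    }
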
diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv5te.S b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S
new file mode 100644
index 0000000..d1f10b7
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/simple_idct_armv5te.S
@@ -0,0 +1,620 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2006 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define W13 (W1 | (W3 << 16))
+#define W26 (W2 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+
+function idct_row_armv5te
+ str lr, [sp, #-4]!
+
+ ldrd v1, v2, [a1, #8]
+ ldrd a3, a4, [a1] /* a3 = row[1:0], a4 = row[3:2] */
+ orrs v1, v1, v2
+ itt eq
+ cmpeq v1, a4
+ cmpeq v1, a3, lsr #16
+ beq row_dc_only
+
+ mov v1, #(1<<(ROW_SHIFT-1))
+ mov ip, #16384
+ sub ip, ip, #1 /* ip = W4 */
+ smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */
+ ldr ip, =W26 /* ip = W2 | (W6 << 16) */
+ smultb a2, ip, a4
+ smulbb lr, ip, a4
+ add v2, v1, a2
+ sub v3, v1, a2
+ sub v4, v1, lr
+ add v1, v1, lr
+
+ ldr ip, =W13 /* ip = W1 | (W3 << 16) */
+ ldr lr, =W57 /* lr = W5 | (W7 << 16) */
+ smulbt v5, ip, a3
+ smultt v6, lr, a4
+ smlatt v5, ip, a4, v5
+ smultt a2, ip, a3
+ smulbt v7, lr, a3
+ sub v6, v6, a2
+ smulbt a2, ip, a4
+ smultt fp, lr, a3
+ sub v7, v7, a2
+ smulbt a2, lr, a4
+ ldrd a3, a4, [a1, #8] /* a3=row[5:4] a4=row[7:6] */
+ sub fp, fp, a2
+
+ orrs a2, a3, a4
+ beq 1f
+
+ smlabt v5, lr, a3, v5
+ smlabt v6, ip, a3, v6
+ smlatt v5, lr, a4, v5
+ smlabt v6, lr, a4, v6
+ smlatt v7, lr, a3, v7
+ smlatt fp, ip, a3, fp
+ smulbt a2, ip, a4
+ smlatt v7, ip, a4, v7
+ sub fp, fp, a2
+
+ ldr ip, =W26 /* ip = W2 | (W6 << 16) */
+ mov a2, #16384
+ sub a2, a2, #1 /* a2 = W4 */
+ smulbb a2, a2, a3 /* a2 = W4*row[4] */
+ smultb lr, ip, a4 /* lr = W6*row[6] */
+ add v1, v1, a2 /* v1 += W4*row[4] */
+ add v1, v1, lr /* v1 += W6*row[6] */
+ add v4, v4, a2 /* v4 += W4*row[4] */
+ sub v4, v4, lr /* v4 -= W6*row[6] */
+ smulbb lr, ip, a4 /* lr = W2*row[6] */
+ sub v2, v2, a2 /* v2 -= W4*row[4] */
+ sub v2, v2, lr /* v2 -= W2*row[6] */
+ sub v3, v3, a2 /* v3 -= W4*row[4] */
+ add v3, v3, lr /* v3 += W2*row[6] */
+
+1: add a2, v1, v5
+ mov a3, a2, lsr #11
+ bic a3, a3, #0x1f0000
+ sub a2, v2, v6
+ mov a2, a2, lsr #11
+ add a3, a3, a2, lsl #16
+ add a2, v3, v7
+ mov a4, a2, lsr #11
+ bic a4, a4, #0x1f0000
+ add a2, v4, fp
+ mov a2, a2, lsr #11
+ add a4, a4, a2, lsl #16
+ strd a3, a4, [a1]
+
+ sub a2, v4, fp
+ mov a3, a2, lsr #11
+ bic a3, a3, #0x1f0000
+ sub a2, v3, v7
+ mov a2, a2, lsr #11
+ add a3, a3, a2, lsl #16
+ add a2, v2, v6
+ mov a4, a2, lsr #11
+ bic a4, a4, #0x1f0000
+ sub a2, v1, v5
+ mov a2, a2, lsr #11
+ add a4, a4, a2, lsl #16
+ strd a3, a4, [a1, #8]
+
+ ldr pc, [sp], #4
+
+row_dc_only:
+ orr a3, a3, a3, lsl #16
+ bic a3, a3, #0xe000
+ mov a3, a3, lsl #3
+ mov a4, a3
+ strd a3, a4, [a1]
+ strd a3, a4, [a1, #8]
+
+ ldr pc, [sp], #4
+endfunc
+
+ .macro idct_col
+ ldr a4, [a1] /* a4 = col[1:0] */
+ mov ip, #16384
+ sub ip, ip, #1 /* ip = W4 */
+#if 0
+ mov v1, #(1<<(COL_SHIFT-1))
+ smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */
+ smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */
+ ldr a4, [a1, #(16*4)]
+#else
+ mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */
+ add v2, v1, a4, asr #16
+ rsb v2, v2, v2, lsl #14
+ mov a4, a4, lsl #16
+ add v1, v1, a4, asr #16
+ ldr a4, [a1, #(16*4)]
+ rsb v1, v1, v1, lsl #14
+#endif
+
+ smulbb lr, ip, a4
+ smulbt a3, ip, a4
+ sub v3, v1, lr
+ sub v5, v1, lr
+ add v7, v1, lr
+ add v1, v1, lr
+ sub v4, v2, a3
+ sub v6, v2, a3
+ add fp, v2, a3
+ ldr ip, =W26
+ ldr a4, [a1, #(16*2)]
+ add v2, v2, a3
+
+ smulbb lr, ip, a4
+ smultb a3, ip, a4
+ add v1, v1, lr
+ sub v7, v7, lr
+ add v3, v3, a3
+ sub v5, v5, a3
+ smulbt lr, ip, a4
+ smultt a3, ip, a4
+ add v2, v2, lr
+ sub fp, fp, lr
+ add v4, v4, a3
+ ldr a4, [a1, #(16*6)]
+ sub v6, v6, a3
+
+ smultb lr, ip, a4
+ smulbb a3, ip, a4
+ add v1, v1, lr
+ sub v7, v7, lr
+ sub v3, v3, a3
+ add v5, v5, a3
+ smultt lr, ip, a4
+ smulbt a3, ip, a4
+ add v2, v2, lr
+ sub fp, fp, lr
+ sub v4, v4, a3
+ add v6, v6, a3
+
+ stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp}
+
+ ldr ip, =W13
+ ldr a4, [a1, #(16*1)]
+ ldr lr, =W57
+ smulbb v1, ip, a4
+ smultb v3, ip, a4
+ smulbb v5, lr, a4
+ smultb v7, lr, a4
+ smulbt v2, ip, a4
+ smultt v4, ip, a4
+ smulbt v6, lr, a4
+ smultt fp, lr, a4
+ rsb v4, v4, #0
+ ldr a4, [a1, #(16*3)]
+ rsb v3, v3, #0
+
+ smlatb v1, ip, a4, v1
+ smlatb v3, lr, a4, v3
+ smulbb a3, ip, a4
+ smulbb a2, lr, a4
+ sub v5, v5, a3
+ sub v7, v7, a2
+ smlatt v2, ip, a4, v2
+ smlatt v4, lr, a4, v4
+ smulbt a3, ip, a4
+ smulbt a2, lr, a4
+ sub v6, v6, a3
+ ldr a4, [a1, #(16*5)]
+ sub fp, fp, a2
+
+ smlabb v1, lr, a4, v1
+ smlabb v3, ip, a4, v3
+ smlatb v5, lr, a4, v5
+ smlatb v7, ip, a4, v7
+ smlabt v2, lr, a4, v2
+ smlabt v4, ip, a4, v4
+ smlatt v6, lr, a4, v6
+ ldr a3, [a1, #(16*7)]
+ smlatt fp, ip, a4, fp
+
+ smlatb v1, lr, a3, v1
+ smlabb v3, lr, a3, v3
+ smlatb v5, ip, a3, v5
+ smulbb a4, ip, a3
+ smlatt v2, lr, a3, v2
+ sub v7, v7, a4
+ smlabt v4, lr, a3, v4
+ smulbt a4, ip, a3
+ smlatt v6, ip, a3, v6
+ sub fp, fp, a4
+ .endm
+
+function idct_col_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldmfd sp!, {a3, a4}
+ adds a2, a3, v1
+ mov a2, a2, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ add ip, a4, v2
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1]
+ subs a3, a3, v1
+ mov a2, a3, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ sub a4, a4, v2
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*7)]
+
+ subs a2, a3, v3
+ mov a2, a2, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ sub ip, a4, v4
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*1)]
+ adds a3, a3, v3
+ mov a2, a3, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ add a4, a4, v4
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*6)]
+
+ adds a2, a3, v5
+ mov a2, a2, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ add ip, a4, v6
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*2)]
+ subs a3, a3, v5
+ mov a2, a3, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ sub a4, a4, v6
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ ldmfd sp!, {a3, a4}
+ str a2, [a1, #(16*5)]
+
+ adds a2, a3, v7
+ mov a2, a2, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ add ip, a4, fp
+ mov ip, ip, asr #20
+ orr a2, a2, ip, lsl #16
+ str a2, [a1, #(16*3)]
+ subs a3, a3, v7
+ mov a2, a3, lsr #20
+ it mi
+ orrmi a2, a2, #0xf000
+ sub a4, a4, fp
+ mov a4, a4, asr #20
+ orr a2, a2, a4, lsl #16
+ str a2, [a1, #(16*4)]
+
+ ldr pc, [sp], #4
+endfunc
+
+.macro clip dst, src:vararg
+ movs \dst, \src
+ it mi
+ movmi \dst, #0
+ cmp \dst, #255
+ it gt
+ movgt \dst, #255
+.endm
+
+.macro aclip dst, src:vararg
+ adds \dst, \src
+ it mi
+ movmi \dst, #0
+ cmp \dst, #255
+ it gt
+ movgt \dst, #255
+.endm
+
+function idct_col_put_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldmfd sp!, {a3, a4}
+ ldr lr, [sp, #32]
+ add a2, a3, v1
+ clip a2, a2, asr #20
+ add ip, a4, v2
+ clip ip, ip, asr #20
+ orr a2, a2, ip, lsl #8
+ sub a3, a3, v1
+ clip a3, a3, asr #20
+ sub a4, a4, v2
+ clip a4, a4, asr #20
+ ldr v1, [sp, #28]
+ strh a2, [v1]
+ add a2, v1, #2
+ str a2, [sp, #28]
+ orr a2, a3, a4, lsl #8
+ rsb v2, lr, lr, lsl #3
+ ldmfd sp!, {a3, a4}
+ strh_pre a2, v2, v1
+
+ sub a2, a3, v3
+ clip a2, a2, asr #20
+ sub ip, a4, v4
+ clip ip, ip, asr #20
+ orr a2, a2, ip, lsl #8
+ strh_pre a2, v1, lr
+ add a3, a3, v3
+ clip a2, a3, asr #20
+ add a4, a4, v4
+ clip a4, a4, asr #20
+ orr a2, a2, a4, lsl #8
+ ldmfd sp!, {a3, a4}
+ strh_dpre a2, v2, lr
+
+ add a2, a3, v5
+ clip a2, a2, asr #20
+ add ip, a4, v6
+ clip ip, ip, asr #20
+ orr a2, a2, ip, lsl #8
+ strh_pre a2, v1, lr
+ sub a3, a3, v5
+ clip a2, a3, asr #20
+ sub a4, a4, v6
+ clip a4, a4, asr #20
+ orr a2, a2, a4, lsl #8
+ ldmfd sp!, {a3, a4}
+ strh_dpre a2, v2, lr
+
+ add a2, a3, v7
+ clip a2, a2, asr #20
+ add ip, a4, fp
+ clip ip, ip, asr #20
+ orr a2, a2, ip, lsl #8
+ strh a2, [v1, lr]
+ sub a3, a3, v7
+ clip a2, a3, asr #20
+ sub a4, a4, fp
+ clip a4, a4, asr #20
+ orr a2, a2, a4, lsl #8
+ strh_dpre a2, v2, lr
+
+ ldr pc, [sp], #4
+endfunc
+
+function idct_col_add_armv5te
+ str lr, [sp, #-4]!
+
+ idct_col
+
+ ldr lr, [sp, #36]
+
+ ldmfd sp!, {a3, a4}
+ ldrh ip, [lr]
+ add a2, a3, v1
+ sub a3, a3, v1
+ and v1, ip, #255
+ aclip a2, v1, a2, asr #20
+ add v1, a4, v2
+ mov v1, v1, asr #20
+ aclip v1, v1, ip, lsr #8
+ orr a2, a2, v1, lsl #8
+ ldr v1, [sp, #32]
+ sub a4, a4, v2
+ rsb v2, v1, v1, lsl #3
+ ldrh_pre ip, v2, lr
+ strh a2, [lr]
+ and a2, ip, #255
+ aclip a3, a2, a3, asr #20
+ mov a4, a4, asr #20
+ aclip a4, a4, ip, lsr #8
+ add a2, lr, #2
+ str a2, [sp, #28]
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh_pre ip, lr, v1
+ sub a2, a3, v3
+ add a3, a3, v3
+ and v3, ip, #255
+ aclip a2, v3, a2, asr #20
+ sub v3, a4, v4
+ mov v3, v3, asr #20
+ aclip v3, v3, ip, lsr #8
+ orr a2, a2, v3, lsl #8
+ add a4, a4, v4
+ ldrh_dpre ip, v2, v1
+ strh a2, [lr]
+ and a2, ip, #255
+ aclip a3, a2, a3, asr #20
+ mov a4, a4, asr #20
+ aclip a4, a4, ip, lsr #8
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh_pre ip, lr, v1
+ add a2, a3, v5
+ sub a3, a3, v5
+ and v3, ip, #255
+ aclip a2, v3, a2, asr #20
+ add v3, a4, v6
+ mov v3, v3, asr #20
+ aclip v3, v3, ip, lsr #8
+ orr a2, a2, v3, lsl #8
+ sub a4, a4, v6
+ ldrh_dpre ip, v2, v1
+ strh a2, [lr]
+ and a2, ip, #255
+ aclip a3, a2, a3, asr #20
+ mov a4, a4, asr #20
+ aclip a4, a4, ip, lsr #8
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldmfd sp!, {a3, a4}
+ ldrh_pre ip, lr, v1
+ add a2, a3, v7
+ sub a3, a3, v7
+ and v3, ip, #255
+ aclip a2, v3, a2, asr #20
+ add v3, a4, fp
+ mov v3, v3, asr #20
+ aclip v3, v3, ip, lsr #8
+ orr a2, a2, v3, lsl #8
+ sub a4, a4, fp
+ ldrh_dpre ip, v2, v1
+ strh a2, [lr]
+ and a2, ip, #255
+ aclip a3, a2, a3, asr #20
+ mov a4, a4, asr #20
+ aclip a4, a4, ip, lsr #8
+ orr a2, a3, a4, lsl #8
+ strh a2, [v2]
+
+ ldr pc, [sp], #4
+endfunc
+
+function ff_simple_idct_armv5te, export=1
+ stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+ add a1, a1, #4
+ bl idct_col_armv5te
+
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
+
+function ff_simple_idct_add_armv5te, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ mov a1, a3
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+ add a1, a1, #4
+ bl idct_col_add_armv5te
+
+ add sp, sp, #8
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
+
+function ff_simple_idct_put_armv5te, export=1
+ stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr}
+
+ mov a1, a3
+
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+ add a1, a1, #16
+ bl idct_row_armv5te
+
+ sub a1, a1, #(16*7)
+
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+ add a1, a1, #4
+ bl idct_col_put_armv5te
+
+ add sp, sp, #8
+ ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc}
+endfunc
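
Both idct_row_armv5te above (the row_dc_only path) and the __almost_empty_row path in the plain-ARM file take a shortcut when everything but row[0] is zero: all eight outputs of the full row pass would then be the same value, (W4*row[0] + (1 << (ROW_SHIFT-1))) >> ROW_SHIFT, which the assembly approximates as row[0] << 3. A minimal C sketch of that shortcut (names are illustrative, not FFmpeg's source):

    #include <stdint.h>

    static int idct_row_dc_only(int16_t row[8])
    {
        for (int i = 1; i < 8; i++)
            if (row[i])
                return 0;            /* not DC-only: caller runs the full row pass */

        int16_t dc = (int16_t)(row[0] << 3); /* ~ (W4*row[0] + rounding) >> ROW_SHIFT */
        for (int i = 0; i < 8; i++)
            row[i] = dc;             /* every output collapses to the scaled DC */
        return 1;
    }
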
diff --git a/ffmpeg/libavcodec/arm/simple_idct_armv6.S b/ffmpeg/libavcodec/arm/simple_idct_armv6.S
new file mode 100644
index 0000000..79cf5d4
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/simple_idct_armv6.S
@@ -0,0 +1,425 @@
+/*
+ * Simple IDCT
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define W13 (W1 | (W3 << 16))
+#define W26 (W2 | (W6 << 16))
+#define W42 (W4 | (W2 << 16))
+#define W42n (-W4&0xffff | (-W2 << 16))
+#define W46 (W4 | (W6 << 16))
+#define W57 (W5 | (W7 << 16))
+
+/*
+ Compute partial IDCT of single row.
+ shift = left-shift amount
+ r0 = source address
+ r2 = row[2,0] <= 2 cycles
+ r3 = row[3,1]
+ ip = w42 <= 2 cycles
+
+ Output in registers r4--r11
+*/
+ .macro idct_row shift
+ ldr lr, =W46 /* lr = W4 | (W6 << 16) */
+ mov r1, #(1<<(\shift-1))
+ smlad r4, r2, ip, r1
+ smlsd r7, r2, ip, r1
+ ldr ip, =W13 /* ip = W1 | (W3 << 16) */
+ ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
+ smlad r5, r2, lr, r1
+ smlsd r6, r2, lr, r1
+
+ smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
+ smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
+ ldr lr, [r0, #12] /* lr = row[7,5] */
+ pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
+ pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
+ smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
+ smlad r8, lr, r10,r8 /* B0 += W5*row[5] + W7*row[7] */
+ smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
+
+ ldr r3, =W42n /* r3 = -W4 | (-W2 << 16) */
+ smlad r10,lr, r2, r10 /* B2 += W7*row[5] + W3*row[7] */
+ ldr r2, [r0, #4] /* r2 = row[6,4] */
+ smlsdx r11,lr, ip, r11 /* B3 += W3*row[5] - W1*row[7] */
+ ldr ip, =W46 /* ip = W4 | (W6 << 16) */
+ smlad r9, lr, r1, r9 /* B1 -= W1*row[5] + W5*row[7] */
+
+ smlad r5, r2, r3, r5 /* A1 += -W4*row[4] - W2*row[6] */
+ smlsd r6, r2, r3, r6 /* A2 += -W4*row[4] + W2*row[6] */
+ smlad r4, r2, ip, r4 /* A0 += W4*row[4] + W6*row[6] */
+ smlsd r7, r2, ip, r7 /* A3 += W4*row[4] - W6*row[6] */
+ .endm
+
+/*
+ Compute partial IDCT of half row.
+ shift = left-shift amount
+ r2 = row[2,0]
+ r3 = row[3,1]
+ ip = w42
+
+ Output in registers r4--r11
+*/
+ .macro idct_row4 shift
+ ldr lr, =W46 /* lr = W4 | (W6 << 16) */
+ ldr r10,=W57 /* r10 = W5 | (W7 << 16) */
+ mov r1, #(1<<(\shift-1))
+ smlad r4, r2, ip, r1
+ smlsd r7, r2, ip, r1
+ ldr ip, =W13 /* ip = W1 | (W3 << 16) */
+ smlad r5, r2, lr, r1
+ smlsd r6, r2, lr, r1
+ smusdx r11,r3, r10 /* r11 = B3 = W7*row[1] - W5*row[3] */
+ smuad r8, r3, ip /* r8 = B0 = W1*row[1] + W3*row[3] */
+ pkhtb r2, ip, r10,asr #16 /* r2 = W7 | (W3 << 16) */
+ pkhbt r1, ip, r10,lsl #16 /* r1 = W1 | (W5 << 16) */
+ smusdx r9, r2, r3 /* r9 = -B1 = W7*row[3] - W3*row[1] */
+ smusdx r10,r3, r1 /* r10 = B2 = W5*row[1] - W1*row[3] */
+ .endm
+
+/*
+ Compute final part of IDCT single row without shift.
+ Input in registers r4--r11
+ Output in registers ip, r4--r6, lr, r8--r10
+*/
+ .macro idct_finish
+ add ip, r4, r8 /* ip = A0 + B0 */
+ sub lr, r4, r8 /* lr = A0 - B0 */
+ sub r4, r5, r9 /* r4 = A1 + B1 (r9 holds -B1) */
+ add r8, r5, r9 /* r8 = A1 - B1 */
+ add r5, r6, r10 /* r5 = A2 + B2 */
+ sub r9, r6, r10 /* r9 = A2 - B2 */
+ add r6, r7, r11 /* r6 = A3 + B3 */
+ sub r10,r7, r11 /* r10 = A3 - B3 */
+ .endm
+
+/*
+ Compute final part of IDCT single row.
+ shift = right-shift amount
+ Input/output in registers r4--r11
+*/
+ .macro idct_finish_shift shift
+ add r3, r4, r8 /* r3 = A0 + B0 */
+ sub r2, r4, r8 /* r2 = A0 - B0 */
+ mov r4, r3, asr #\shift
+ mov r8, r2, asr #\shift
+
+ sub r3, r5, r9 /* r3 = A1 + B1 */
+ add r2, r5, r9 /* r2 = A1 - B1 */
+ mov r5, r3, asr #\shift
+ mov r9, r2, asr #\shift
+
+ add r3, r6, r10 /* r3 = A2 + B2 */
+ sub r2, r6, r10 /* r2 = A2 - B2 */
+ mov r6, r3, asr #\shift
+ mov r10,r2, asr #\shift
+
+ add r3, r7, r11 /* r3 = A3 + B3 */
+ sub r2, r7, r11 /* r2 = A3 - B3 */
+ mov r7, r3, asr #\shift
+ mov r11,r2, asr #\shift
+ .endm
+
+/*
+ Compute final part of IDCT single row, saturating results at 8 bits.
+ shift = right-shift amount
+ Input/output in registers r4--r11
+*/
+ .macro idct_finish_shift_sat shift
+ add r3, r4, r8 /* r3 = A0 + B0 */
+ sub ip, r4, r8 /* ip = A0 - B0 */
+ usat r4, #8, r3, asr #\shift
+ usat r8, #8, ip, asr #\shift
+
+ sub r3, r5, r9 /* r3 = A1 + B1 */
+ add ip, r5, r9 /* ip = A1 - B1 */
+ usat r5, #8, r3, asr #\shift
+ usat r9, #8, ip, asr #\shift
+
+ add r3, r6, r10 /* r3 = A2 + B2 */
+ sub ip, r6, r10 /* ip = A2 - B2 */
+ usat r6, #8, r3, asr #\shift
+ usat r10,#8, ip, asr #\shift
+
+ add r3, r7, r11 /* r3 = A3 + B3 */
+ sub ip, r7, r11 /* ip = A3 - B3 */
+ usat r7, #8, r3, asr #\shift
+ usat r11,#8, ip, asr #\shift
+ .endm
+
+/*
+ Compute IDCT of single row, storing as column.
+ r0 = source
+ r1 = dest
+*/
+function idct_row_armv6
+ push {lr}
+
+ ldr lr, [r0, #12] /* lr = row[7,5] */
+ ldr ip, [r0, #4] /* ip = row[6,4] */
+ ldr r3, [r0, #8] /* r3 = row[3,1] */
+ ldr r2, [r0] /* r2 = row[2,0] */
+ orrs lr, lr, ip
+ itt eq
+ cmpeq lr, r3
+ cmpeq lr, r2, lsr #16
+ beq 1f
+ push {r1}
+ ldr ip, =W42 /* ip = W4 | (W2 << 16) */
+ cmp lr, #0
+ beq 2f
+
+ idct_row ROW_SHIFT
+ b 3f
+
+2: idct_row4 ROW_SHIFT
+
+3: pop {r1}
+ idct_finish_shift ROW_SHIFT
+
+ strh r4, [r1]
+ strh r5, [r1, #(16*2)]
+ strh r6, [r1, #(16*4)]
+ strh r7, [r1, #(16*6)]
+ strh r11,[r1, #(16*1)]
+ strh r10,[r1, #(16*3)]
+ strh r9, [r1, #(16*5)]
+ strh r8, [r1, #(16*7)]
+
+ pop {pc}
+
+1: mov r2, r2, lsl #3
+ strh r2, [r1]
+ strh r2, [r1, #(16*2)]
+ strh r2, [r1, #(16*4)]
+ strh r2, [r1, #(16*6)]
+ strh r2, [r1, #(16*1)]
+ strh r2, [r1, #(16*3)]
+ strh r2, [r1, #(16*5)]
+ strh r2, [r1, #(16*7)]
+ pop {pc}
+endfunc
+
+/*
+ Compute IDCT of single column, read as row.
+ r0 = source
+ r1 = dest
+*/
+function idct_col_armv6
+ push {r1, lr}
+
+ ldr r2, [r0] /* r2 = row[2,0] */
+ ldr ip, =W42 /* ip = W4 | (W2 << 16) */
+ ldr r3, [r0, #8] /* r3 = row[3,1] */
+ idct_row COL_SHIFT
+ pop {r1}
+ idct_finish_shift COL_SHIFT
+
+ strh r4, [r1]
+ strh r5, [r1, #(16*1)]
+ strh r6, [r1, #(16*2)]
+ strh r7, [r1, #(16*3)]
+ strh r11,[r1, #(16*4)]
+ strh r10,[r1, #(16*5)]
+ strh r9, [r1, #(16*6)]
+ strh r8, [r1, #(16*7)]
+
+ pop {pc}
+endfunc
+
+/*
+ Compute IDCT of single column, read as row, store saturated 8-bit.
+ r0 = source
+ r1 = dest
+ r2 = line size
+*/
+function idct_col_put_armv6
+ push {r1, r2, lr}
+
+ ldr r2, [r0] /* r2 = row[2,0] */
+ ldr ip, =W42 /* ip = W4 | (W2 << 16) */
+ ldr r3, [r0, #8] /* r3 = row[3,1] */
+ idct_row COL_SHIFT
+ pop {r1, r2}
+ idct_finish_shift_sat COL_SHIFT
+
+ strb_post r4, r1, r2
+ strb_post r5, r1, r2
+ strb_post r6, r1, r2
+ strb_post r7, r1, r2
+ strb_post r11,r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
+
+ sub r1, r1, r2, lsl #3
+
+ pop {pc}
+endfunc
+
+/*
+ Compute IDCT of single column, read as row, add/store saturated 8-bit.
+ r0 = source
+ r1 = dest
+ r2 = line size
+*/
+function idct_col_add_armv6
+ push {r1, r2, lr}
+
+ ldr r2, [r0] /* r2 = row[2,0] */
+ ldr ip, =W42 /* ip = W4 | (W2 << 16) */
+ ldr r3, [r0, #8] /* r3 = row[3,1] */
+ idct_row COL_SHIFT
+ pop {r1, r2}
+ idct_finish
+
+ ldrb r3, [r1]
+ ldrb r7, [r1, r2]
+ ldrb r11,[r1, r2, lsl #2]
+ add ip, r3, ip, asr #COL_SHIFT
+ usat ip, #8, ip
+ add r4, r7, r4, asr #COL_SHIFT
+ strb_post ip, r1, r2
+ ldrb ip, [r1, r2]
+ usat r4, #8, r4
+ ldrb r11,[r1, r2, lsl #2]
+ add r5, ip, r5, asr #COL_SHIFT
+ usat r5, #8, r5
+ strb_post r4, r1, r2
+ ldrb r3, [r1, r2]
+ ldrb ip, [r1, r2, lsl #2]
+ strb_post r5, r1, r2
+ ldrb r7, [r1, r2]
+ ldrb r4, [r1, r2, lsl #2]
+ add r6, r3, r6, asr #COL_SHIFT
+ usat r6, #8, r6
+ add r10,r7, r10,asr #COL_SHIFT
+ usat r10,#8, r10
+ add r9, r11,r9, asr #COL_SHIFT
+ usat r9, #8, r9
+ add r8, ip, r8, asr #COL_SHIFT
+ usat r8, #8, r8
+ add lr, r4, lr, asr #COL_SHIFT
+ usat lr, #8, lr
+ strb_post r6, r1, r2
+ strb_post r10,r1, r2
+ strb_post r9, r1, r2
+ strb_post r8, r1, r2
+ strb_post lr, r1, r2
+
+ sub r1, r1, r2, lsl #3
+
+ pop {pc}
+endfunc
+
+/*
+ Compute 8 IDCT row transforms.
+ func = IDCT row->col function
+ width = width of columns in bytes
+*/
+ .macro idct_rows func width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+ sub r0, r0, #(16*5)
+ add r1, r1, #\width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+ add r0, r0, #(16*2)
+ add r1, r1, #\width
+ bl \func
+
+ sub r0, r0, #(16*7)
+ .endm
+
+/* void ff_simple_idct_armv6(int16_t *data); */
+function ff_simple_idct_armv6, export=1
+ push {r4-r11, lr}
+ sub sp, sp, #128
+
+ mov r1, sp
+ idct_rows idct_row_armv6, 2
+ mov r1, r0
+ mov r0, sp
+ idct_rows idct_col_armv6, 2
+
+ add sp, sp, #128
+ pop {r4-r11, pc}
+endfunc
+
+/* ff_simple_idct_add_armv6(uint8_t *dest, int line_size, int16_t *data); */
+function ff_simple_idct_add_armv6, export=1
+ push {r0, r1, r4-r11, lr}
+ sub sp, sp, #128
+
+ mov r0, r2
+ mov r1, sp
+ idct_rows idct_row_armv6, 2
+ mov r0, sp
+ ldr r1, [sp, #128]
+ ldr r2, [sp, #(128+4)]
+ idct_rows idct_col_add_armv6, 1
+
+ add sp, sp, #(128+8)
+ pop {r4-r11, pc}
+endfunc
+
+/* ff_simple_idct_put_armv6(uint8_t *dest, int line_size, int16_t *data); */
+function ff_simple_idct_put_armv6, export=1
+ push {r0, r1, r4-r11, lr}
+ sub sp, sp, #128
+
+ mov r0, r2
+ mov r1, sp
+ idct_rows idct_row_armv6, 2
+ mov r0, sp
+ ldr r1, [sp, #128]
+ ldr r2, [sp, #(128+4)]
+ idct_rows idct_col_put_armv6, 1
+
+ add sp, sp, #(128+8)
+ pop {r4-r11, pc}
+endfunc
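
The ARMv6 variant above leans on the dual 16-bit multiply instructions (SMUAD/SMLAD/SMUSDX): two coefficients share one 32-bit word, e.g. W13 = W1 | (W3 << 16), and a single SMUAD against a packed pair of samples produces W1*row[1] + W3*row[3] in one instruction. A rough C model of that dual multiply, offered only to make the packed-operand comments easier to follow (the helper name is illustrative, not an FFmpeg API):

    #include <stdint.h>

    /* What SMUAD Rd, Rn, Rm computes: bottom*bottom + top*top of the packed
     * signed 16-bit halves. */
    static int32_t smuad_model(uint32_t n, uint32_t m)
    {
        int16_t nl = (int16_t)n, nh = (int16_t)(n >> 16);
        int16_t ml = (int16_t)m, mh = (int16_t)(m >> 16);
        return (int32_t)nl * ml + (int32_t)nh * mh;
    }

    /* With r31 = row[1] | (row[3] << 16) and W13 = W1 | (W3 << 16):
     *   smuad_model(r31, W13) == W1*row[1] + W3*row[3]  (B0 in the comments above) */
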
diff --git a/ffmpeg/libavcodec/arm/simple_idct_neon.S b/ffmpeg/libavcodec/arm/simple_idct_neon.S
new file mode 100644
index 0000000..c3e573c
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/simple_idct_neon.S
@@ -0,0 +1,375 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4c ((1<<(COL_SHIFT-1))/W4)
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+#define w1 d0[0]
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+ .macro idct_col4_top
+ vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */
+ vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */
+ vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */
+ vadd.i32 q11, q15, q7
+ vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */
+ vadd.i32 q12, q15, q8
+ vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */
+ vsub.i32 q13, q15, q8
+ vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */
+ vsub.i32 q14, q15, q7
+
+ vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */
+ vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */
+ vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */
+ vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */
+ .endm
+
+ .text
+ .align 6
+
+function idct_row4_pld_neon
+ pld [r0]
+ add r3, r0, r1, lsl #2
+ pld [r0, r1]
+ pld [r0, r1, lsl #1]
+A pld [r3, -r1]
+ pld [r3]
+ pld [r3, r1]
+ add r3, r3, r1, lsl #1
+ pld [r3]
+ pld [r3, r1]
+endfunc
+
+function idct_row4_neon
+ vmov.i32 q15, #(1<<(ROW_SHIFT-1))
+ vld1.64 {d2-d5}, [r2,:128]!
+ vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */
+ vld1.64 {d6,d7}, [r2,:128]!
+ vorr d10, d3, d5
+ vld1.64 {d8,d9}, [r2,:128]!
+ add r2, r2, #-64
+
+ vorr d11, d7, d9
+ vorr d10, d10, d11
+ vmov r3, r4, d10
+
+ idct_col4_top
+
+ orrs r3, r3, r4
+ beq 1f
+
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+ vsub.i32 q14, q14, q7
+
+1: vadd.i32 q3, q11, q9
+ vadd.i32 q4, q12, q10
+ vshrn.i32 d2, q3, #ROW_SHIFT
+ vshrn.i32 d4, q4, #ROW_SHIFT
+ vadd.i32 q7, q13, q5
+ vadd.i32 q8, q14, q6
+ vtrn.16 d2, d4
+ vshrn.i32 d6, q7, #ROW_SHIFT
+ vshrn.i32 d8, q8, #ROW_SHIFT
+ vsub.i32 q14, q14, q6
+ vsub.i32 q11, q11, q9
+ vtrn.16 d6, d8
+ vsub.i32 q13, q13, q5
+ vshrn.i32 d3, q14, #ROW_SHIFT
+ vtrn.32 d2, d6
+ vsub.i32 q12, q12, q10
+ vtrn.32 d4, d8
+ vshrn.i32 d5, q13, #ROW_SHIFT
+ vshrn.i32 d7, q12, #ROW_SHIFT
+ vshrn.i32 d9, q11, #ROW_SHIFT
+
+ vtrn.16 d3, d5
+ vtrn.16 d7, d9
+ vtrn.32 d3, d7
+ vtrn.32 d5, d9
+
+ vst1.64 {d2-d5}, [r2,:128]!
+ vst1.64 {d6-d9}, [r2,:128]!
+
+ bx lr
+endfunc
+
+function idct_col4_neon
+ mov ip, #16
+ vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */
+ vdup.16 d30, w4c
+ vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */
+ vadd.i16 d30, d30, d2
+ vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */
+ vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
+ vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */
+
+ ldrd r4, r5, [r2]
+ ldrd r6, r7, [r2, #16]
+ orrs r4, r4, r5
+
+ idct_col4_top
+ it eq
+ addeq r2, r2, #16
+ beq 1f
+
+ vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */
+ vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q12, q12, q7
+ vsub.i32 q13, q13, q7
+ vadd.i32 q14, q14, q7
+
+1: orrs r6, r6, r7
+ ldrd r4, r5, [r2, #16]
+ it eq
+ addeq r2, r2, #16
+ beq 2f
+
+ vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */
+ vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */
+ vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */
+ vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */
+ vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */
+
+2: orrs r4, r4, r5
+ ldrd r4, r5, [r2, #16]
+ it eq
+ addeq r2, r2, #16
+ beq 3f
+
+ vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */
+ vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */
+ vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */
+ vadd.i32 q11, q11, q7
+ vsub.i32 q14, q14, q7
+ vsub.i32 q12, q12, q8
+ vadd.i32 q13, q13, q8
+
+3: orrs r4, r4, r5
+ it eq
+ addeq r2, r2, #16
+ beq 4f
+
+ vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */
+ vmlal.s16 q9, d9, w7
+ vmlsl.s16 q10, d9, w5
+ vmlal.s16 q5, d9, w3
+ vmlsl.s16 q6, d9, w1
+
+4: vaddhn.i32 d2, q11, q9
+ vaddhn.i32 d3, q12, q10
+ vaddhn.i32 d4, q13, q5
+ vaddhn.i32 d5, q14, q6
+ vsubhn.i32 d9, q11, q9
+ vsubhn.i32 d8, q12, q10
+ vsubhn.i32 d7, q13, q5
+ vsubhn.i32 d6, q14, q6
+
+ bx lr
+endfunc
+
+ .align 6
+
+function idct_col4_st8_neon
+ vqshrun.s16 d2, q1, #COL_SHIFT-16
+ vqshrun.s16 d3, q2, #COL_SHIFT-16
+ vqshrun.s16 d4, q3, #COL_SHIFT-16
+ vqshrun.s16 d5, q4, #COL_SHIFT-16
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ vst1.32 {d5[0]}, [r0,:32], r1
+ vst1.32 {d5[1]}, [r0,:32], r1
+
+ bx lr
+endfunc
+
+const idct_coeff_neon, align=4
+ .short W1, W2, W3, W4, W5, W6, W7, W4c
+endconst
+
+ .macro idct_start data
+ push {r4-r7, lr}
+ pld [\data]
+ pld [\data, #64]
+ vpush {d8-d15}
+ movrel r3, idct_coeff_neon
+ vld1.64 {d0,d1}, [r3,:128]
+ .endm
+
+ .macro idct_end
+ vpop {d8-d15}
+ pop {r4-r7, pc}
+ .endm
+
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, int16_t *data); */
+function ff_simple_idct_put_neon, export=1
+ idct_start r2
+
+ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_st8_neon
+
+ idct_end
+endfunc
+
+ .align 6
+
+function idct_col4_add8_neon
+ mov ip, r0
+
+ vld1.32 {d10[0]}, [r0,:32], r1
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vld1.32 {d10[1]}, [r0,:32], r1
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vld1.32 {d11[0]}, [r0,:32], r1
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vld1.32 {d11[1]}, [r0,:32], r1
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vld1.32 {d12[0]}, [r0,:32], r1
+ vaddw.u8 q1, q1, d10
+ vld1.32 {d12[1]}, [r0,:32], r1
+ vaddw.u8 q2, q2, d11
+ vld1.32 {d13[0]}, [r0,:32], r1
+ vqmovun.s16 d2, q1
+ vld1.32 {d13[1]}, [r0,:32], r1
+ vaddw.u8 q3, q3, d12
+ vst1.32 {d2[0]}, [ip,:32], r1
+ vqmovun.s16 d3, q2
+ vst1.32 {d2[1]}, [ip,:32], r1
+ vaddw.u8 q4, q4, d13
+ vst1.32 {d3[0]}, [ip,:32], r1
+ vqmovun.s16 d4, q3
+ vst1.32 {d3[1]}, [ip,:32], r1
+ vqmovun.s16 d5, q4
+ vst1.32 {d4[0]}, [ip,:32], r1
+ vst1.32 {d4[1]}, [ip,:32], r1
+ vst1.32 {d5[0]}, [ip,:32], r1
+ vst1.32 {d5[1]}, [ip,:32], r1
+
+ bx lr
+endfunc
+
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, int16_t *data); */
+function ff_simple_idct_add_neon, export=1
+ idct_start r2
+
+ bl idct_row4_pld_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+ sub r0, r0, r1, lsl #3
+ add r0, r0, #4
+ add r2, r2, #-120
+ bl idct_col4_neon
+ bl idct_col4_add8_neon
+
+ idct_end
+endfunc
+
+ .align 6
+
+function idct_col4_st16_neon
+ mov ip, #16
+
+ vshr.s16 q1, q1, #COL_SHIFT-16
+ vshr.s16 q2, q2, #COL_SHIFT-16
+ vst1.64 {d2}, [r2,:64], ip
+ vshr.s16 q3, q3, #COL_SHIFT-16
+ vst1.64 {d3}, [r2,:64], ip
+ vshr.s16 q4, q4, #COL_SHIFT-16
+ vst1.64 {d4}, [r2,:64], ip
+ vst1.64 {d5}, [r2,:64], ip
+ vst1.64 {d6}, [r2,:64], ip
+ vst1.64 {d7}, [r2,:64], ip
+ vst1.64 {d8}, [r2,:64], ip
+ vst1.64 {d9}, [r2,:64], ip
+
+ bx lr
+endfunc
+
+/* void ff_simple_idct_neon(int16_t *data); */
+function ff_simple_idct_neon, export=1
+ idct_start r0
+
+ mov r2, r0
+ bl idct_row4_neon
+ bl idct_row4_neon
+ add r2, r2, #-128
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+ add r2, r2, #-120
+ bl idct_col4_neon
+ add r2, r2, #-128
+ bl idct_col4_st16_neon
+
+ idct_end
+endfunc
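
One detail shared by idct_col4_neon above and the non-#if-0 branch of the ARMv5TE column code: instead of adding the rounding constant 1 << (COL_SHIFT-1) after the W4 multiply, the bias is pre-divided by W4 (the w4c lane / W4c define) and added to col[0] before the multiply, so a single vmull covers both. A scalar model of the trick, using the values from the #defines above (the small truncation error stays well below one output LSB after the COL_SHIFT, which is why the ARMv5TE source notes it "matches the C version"):

    #include <stdint.h>

    #define W4        16383
    #define COL_SHIFT 20
    #define W4c       ((1 << (COL_SHIFT - 1)) / W4)   /* = 32, the w4c lane above */

    static int32_t col_dc_term(int16_t col0)
    {
        /* approximately W4*col0 + (1 << (COL_SHIFT-1)); the only difference is
         * the truncation of (1 << (COL_SHIFT-1)) / W4. */
        return (int32_t)W4 * ((int32_t)col0 + W4c);
    }
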
diff --git a/ffmpeg/libavcodec/arm/synth_filter_neon.S b/ffmpeg/libavcodec/arm/synth_filter_neon.S
new file mode 100644
index 0000000..5417be7
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/synth_filter_neon.S
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_synth_filter_float_neon, export=1
+ push {r3-r11,lr}
+
+ ldr r4, [r2] @ synth_buf_offset
+ add r1, r1, r4, lsl #2 @ synth_buf
+ sub r12, r4, #32
+ bfc r12, #9, #23
+ bic r4, r4, #63
+ str r12, [r2]
+
+ ldr r2, [sp, #12*4] @ in
+ mov r9, r1 @ synth_buf
+
+VFP vpush {d0}
+ bl X(ff_imdct_half_neon)
+VFP vpop {d0}
+ pop {r3}
+
+ ldr r5, [sp, #9*4] @ window
+ ldr r2, [sp, #10*4] @ out
+NOVFP vldr s0, [sp, #12*4] @ scale
+ add r8, r9, #12*4
+
+ mov lr, #64*4
+ mov r1, #4
+1:
+ add r10, r9, #16*4 @ synth_buf
+ add r11, r8, #16*4
+ add r0, r5, #16*4 @ window
+ add r6, r5, #32*4
+ add r7, r5, #48*4
+
+ vld1.32 {q10}, [r3,:128] @ a
+ add r3, r3, #16*4
+ vld1.32 {q1}, [r3,:128] @ b
+ vmov.f32 q2, #0.0 @ c
+ vmov.f32 q3, #0.0 @ d
+
+ mov r12, #512
+2:
+ vld1.32 {q9}, [r8, :128], lr
+ vrev64.32 q9, q9
+ vld1.32 {q8}, [r5, :128], lr
+ vmls.f32 d20, d16, d19
+ vld1.32 {q11}, [r0, :128], lr
+ vmls.f32 d21, d17, d18
+ vld1.32 {q12}, [r9, :128], lr
+ vmla.f32 d2, d22, d24
+ vld1.32 {q8}, [r6, :128], lr
+ vmla.f32 d3, d23, d25
+ vld1.32 {q9}, [r10,:128], lr
+ vmla.f32 d4, d16, d18
+ vld1.32 {q12}, [r11,:128], lr
+ vmla.f32 d5, d17, d19
+ vrev64.32 q12, q12
+ vld1.32 {q11}, [r7, :128], lr
+ vmla.f32 d6, d22, d25
+ vmla.f32 d7, d23, d24
+ subs r12, r12, #64
+ beq 3f
+ cmp r12, r4
+ bne 2b
+ sub r8, r8, #512*4
+ sub r9, r9, #512*4
+ sub r10, r10, #512*4
+ sub r11, r11, #512*4
+ b 2b
+3:
+ vmul.f32 q8, q10, d0[0]
+ vmul.f32 q9, q1, d0[0]
+ vst1.32 {q3}, [r3,:128]
+ sub r3, r3, #16*4
+ vst1.32 {q2}, [r3,:128]
+ vst1.32 {q8}, [r2,:128]
+ add r2, r2, #16*4
+ vst1.32 {q9}, [r2,:128]
+
+ subs r1, r1, #1
+ it eq
+ popeq {r4-r11,pc}
+
+ cmp r4, #0
+ itt eq
+ subeq r8, r8, #512*4
+ subeq r9, r9, #512*4
+ sub r5, r5, #512*4
+ sub r2, r2, #12*4 @ out
+ add r3, r3, #4*4 @ synth_buf2
+ add r5, r5, #4*4 @ window
+ add r9, r9, #4*4 @ synth_buf
+ sub r8, r8, #4*4 @ synth_buf
+ b 1b
+endfunc
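
The bookkeeping at the top of ff_synth_filter_float_neon (the sub #32 / bfc #9, #23 / str sequence) maintains a 512-entry ring buffer of past samples: the stored offset steps back by 32 each call and wraps modulo 512. A minimal C sketch of just that part, with hypothetical names that are not part of the FFmpeg synth-filter API:

    static float *advance_synth_buf(float *synth_buf_base, int *synth_buf_offset)
    {
        float *cur = synth_buf_base + *synth_buf_offset;    /* add r1, r1, r4, lsl #2 */
        *synth_buf_offset = (*synth_buf_offset - 32) & 511; /* sub #32; bfc #9, #23   */
        return cur;
    }
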
diff --git a/ffmpeg/libavcodec/arm/videodsp_arm.h b/ffmpeg/libavcodec/arm/videodsp_arm.h
new file mode 100644
index 0000000..112cbb8
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/videodsp_arm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VIDEODSP_ARM_H
+#define AVCODEC_ARM_VIDEODSP_ARM_H
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/videodsp.h"
+
+void ff_videodsp_init_armv5te(VideoDSPContext* ctx, int bpc);
+
+#endif /* AVCODEC_ARM_VIDEODSP_ARM_H */
diff --git a/ffmpeg/libavcodec/arm/videodsp_armv5te.S b/ffmpeg/libavcodec/arm/videodsp_armv5te.S
new file mode 100644
index 0000000..48a6c3b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/videodsp_armv5te.S
@@ -0,0 +1,31 @@
+@
+@ ARMv5te optimized DSP utils
+@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
+@
+@ This file is part of FFmpeg
+@
+@ FFmpeg is free software; you can redistribute it and/or
+@ modify it under the terms of the GNU Lesser General Public
+@ License as published by the Free Software Foundation; either
+@ version 2.1 of the License, or (at your option) any later version.
+@
+@ FFmpeg is distributed in the hope that it will be useful,
+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+@ Lesser General Public License for more details.
+@
+@ You should have received a copy of the GNU Lesser General Public
+@ License along with FFmpeg; if not, write to the Free Software
+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+@
+
+#include "config.h"
+#include "libavutil/arm/asm.S"
+
+function ff_prefetch_arm, export=1
+ subs r2, r2, #1
+ pld [r0]
+ add r0, r0, r1
+ bne ff_prefetch_arm
+ bx lr
+endfunc
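
ff_prefetch_arm above is a tiny loop that issues one PLD per row: decrement the row count, hint-load the current address, advance by the stride, repeat. An equivalent portable sketch using the GCC/Clang prefetch builtin (illustrative only; FFmpeg installs the assembly version through ff_videodsp_init_armv5te below):

    #include <stddef.h>
    #include <stdint.h>

    static void prefetch_rows(const uint8_t *mem, ptrdiff_t stride, int h)
    {
        while (h-- > 0) {
            __builtin_prefetch(mem);   /* stands in for the PLD hint */
            mem += stride;
        }
    }
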
diff --git a/ffmpeg/libavcodec/arm/videodsp_init_arm.c b/ffmpeg/libavcodec/arm/videodsp_init_arm.c
new file mode 100644
index 0000000..a89abb2
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/videodsp_init_arm.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2012 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/videodsp.h"
+#include "videodsp_arm.h"
+
+av_cold void ff_videodsp_init_arm(VideoDSPContext *ctx, int bpc)
+{
+ int cpu_flags = av_get_cpu_flags();
+ if (have_armv5te(cpu_flags)) ff_videodsp_init_armv5te(ctx, bpc);
+}
diff --git a/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c
new file mode 100644
index 0000000..1ea1f34
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/videodsp_init_armv5te.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2012 Ronald S. Bultje
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/videodsp.h"
+#include "videodsp_arm.h"
+
+void ff_prefetch_arm(uint8_t *mem, ptrdiff_t stride, int h);
+
+av_cold void ff_videodsp_init_armv5te(VideoDSPContext *ctx, int bpc)
+{
+#if HAVE_ARMV5TE_EXTERNAL
+ ctx->prefetch = ff_prefetch_arm;
+#endif
+}
diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c
new file mode 100644
index 0000000..f4b3d80
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vorbisdsp_init_arm.c
@@ -0,0 +1,37 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vorbisdsp.h"
+
+void ff_vorbis_inverse_coupling_neon(float *mag, float *ang,
+ intptr_t blocksize);
+
+av_cold void ff_vorbisdsp_init_arm(VorbisDSPContext *c)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/vorbisdsp_neon.S b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S
new file mode 100644
index 0000000..79ce54f
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vorbisdsp_neon.S
@@ -0,0 +1,83 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+function ff_vorbis_inverse_coupling_neon, export=1
+ vmov.i32 q10, #1<<31
+ subs r2, r2, #4
+ mov r3, r0
+ mov r12, r1
+ beq 3f
+
+ vld1.32 {d24-d25},[r1,:128]!
+ vld1.32 {d22-d23},[r0,:128]!
+ vcle.s32 q8, q12, #0
+ vand q9, q11, q10
+ veor q12, q12, q9
+ vand q2, q12, q8
+ vbic q3, q12, q8
+ vadd.f32 q12, q11, q2
+ vsub.f32 q11, q11, q3
+1: vld1.32 {d2-d3}, [r1,:128]!
+ vld1.32 {d0-d1}, [r0,:128]!
+ vcle.s32 q8, q1, #0
+ vand q9, q0, q10
+ veor q1, q1, q9
+ vst1.32 {d24-d25},[r3, :128]!
+ vst1.32 {d22-d23},[r12,:128]!
+ vand q2, q1, q8
+ vbic q3, q1, q8
+ vadd.f32 q1, q0, q2
+ vsub.f32 q0, q0, q3
+ subs r2, r2, #8
+ ble 2f
+ vld1.32 {d24-d25},[r1,:128]!
+ vld1.32 {d22-d23},[r0,:128]!
+ vcle.s32 q8, q12, #0
+ vand q9, q11, q10
+ veor q12, q12, q9
+ vst1.32 {d2-d3}, [r3, :128]!
+ vst1.32 {d0-d1}, [r12,:128]!
+ vand q2, q12, q8
+ vbic q3, q12, q8
+ vadd.f32 q12, q11, q2
+ vsub.f32 q11, q11, q3
+ b 1b
+
+2: vst1.32 {d2-d3}, [r3, :128]!
+ vst1.32 {d0-d1}, [r12,:128]!
+ it lt
+ bxlt lr
+
+3: vld1.32 {d2-d3}, [r1,:128]
+ vld1.32 {d0-d1}, [r0,:128]
+ vcle.s32 q8, q1, #0
+ vand q9, q0, q10
+ veor q1, q1, q9
+ vand q2, q1, q8
+ vbic q3, q1, q8
+ vadd.f32 q1, q0, q2
+ vsub.f32 q0, q0, q3
+ vst1.32 {d2-d3}, [r0,:128]!
+ vst1.32 {d0-d1}, [r1,:128]!
+ bx lr
+endfunc
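
ff_vorbis_inverse_coupling_neon vectorises the Vorbis magnitude/angle inverse coupling with sign masks (the vcle/vand/vbic/veor sequence) rather than branches. The scalar form of the coupling, as described by the Vorbis specification, looks roughly like the sketch below; it is an illustration of the transform, not a verbatim copy of FFmpeg's C reference:

    #include <stdint.h>

    static void vorbis_inverse_coupling_sketch(float *mag, float *ang, intptr_t blocksize)
    {
        for (intptr_t i = 0; i < blocksize; i++) {
            if (mag[i] > 0.0f) {
                if (ang[i] > 0.0f) {
                    ang[i] = mag[i] - ang[i];
                } else {
                    float tmp = ang[i];
                    ang[i]    = mag[i];
                    mag[i]   += tmp;
                }
            } else {
                if (ang[i] > 0.0f) {
                    ang[i] += mag[i];
                } else {
                    float tmp = ang[i];
                    ang[i]    = mag[i];
                    mag[i]   -= tmp;
                }
            }
        }
    }
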
diff --git a/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c
new file mode 100644
index 0000000..5af795b
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp3dsp_init_arm.c
@@ -0,0 +1,45 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/dsputil.h"
+#include "libavcodec/vp3dsp.h"
+
+void ff_vp3_idct_put_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_vp3_idct_add_neon(uint8_t *dest, int line_size, int16_t *data);
+void ff_vp3_idct_dc_add_neon(uint8_t *dest, int line_size, const int16_t *data);
+
+void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
+void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
+
+av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->idct_put = ff_vp3_idct_put_neon;
+ c->idct_add = ff_vp3_idct_add_neon;
+ c->idct_dc_add = ff_vp3_idct_dc_add_neon;
+ c->v_loop_filter = ff_vp3_v_loop_filter_neon;
+ c->h_loop_filter = ff_vp3_h_loop_filter_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/vp3dsp_neon.S b/ffmpeg/libavcodec/arm/vp3dsp_neon.S
new file mode 100644
index 0000000..f133905
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp3dsp_neon.S
@@ -0,0 +1,395 @@
+/*
+ * Copyright (c) 2009 David Conrad
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
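+@ IDCT coefficients Ck = round(cos(k*pi/16) * 65536) for k = 1..7 (16.16
+@ fixed point), loaded into d0/d1 and addressed through the xC1S7..xC7S1
+@ lane aliases defined below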
+const vp3_idct_constants, align=4
+.short 64277, 60547, 54491, 46341, 36410, 25080, 12785
+endconst
+
+#define xC1S7 d0[0]
+#define xC2S6 d0[1]
+#define xC3S5 d0[2]
+#define xC4S4 d0[3]
+#define xC5S3 d1[0]
+#define xC6S2 d1[1]
+#define xC7S1 d1[2]
+
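+@ Filter the two pixels either side of an edge: f = (p[-2*s] - p[s] +
+@ 3*(p[0] - p[-s]) + 4) >> 3 is run through the bounding function derived
+@ from r2, then p[-s] += f and p[0] -= f. Input rows are expected in
+@ d16-d19, the filtered pixel pair is returned in d0/d1.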
+.macro vp3_loop_filter
+ vsubl.u8 q3, d18, d17
+ vsubl.u8 q2, d16, d19
+ vadd.i16 q1, q3, q3
+ vadd.i16 q2, q2, q3
+ vadd.i16 q0, q1, q2
+ vrshr.s16 q0, q0, #3
+ vmovl.u8 q9, d18
+ vdup.u16 q15, r2
+
+ vabs.s16 q1, q0
+ vshr.s16 q0, q0, #15
+ vqsub.u16 q2, q15, q1
+ vqsub.u16 q3, q2, q1
+ vsub.i16 q1, q2, q3
+ veor q1, q1, q0
+ vsub.i16 q0, q1, q0
+
+ vaddw.u8 q2, q0, d17
+ vsub.i16 q3, q9, q0
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+.endm
+
+function ff_vp3_v_loop_filter_neon, export=1
+ sub ip, r0, r1
+ sub r0, r0, r1, lsl #1
+ vld1.64 {d16}, [r0,:64], r1
+ vld1.64 {d17}, [r0,:64], r1
+ vld1.64 {d18}, [r0,:64], r1
+ vld1.64 {d19}, [r0,:64], r1
+ ldrb r2, [r2, #129*4]
+
+ vp3_loop_filter
+
+ vst1.64 {d0}, [ip,:64], r1
+ vst1.64 {d1}, [ip,:64], r1
+ bx lr
+endfunc
+
+function ff_vp3_h_loop_filter_neon, export=1
+ sub ip, r0, #1
+ sub r0, r0, #2
+ vld1.32 {d16[]}, [r0], r1
+ vld1.32 {d17[]}, [r0], r1
+ vld1.32 {d18[]}, [r0], r1
+ vld1.32 {d19[]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d17[1]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d19[1]}, [r0], r1
+ ldrb r2, [r2, #129*4]
+
+ vtrn.8 d16, d17
+ vtrn.8 d18, d19
+ vtrn.16 d16, d18
+ vtrn.16 d17, d19
+
+ vp3_loop_filter
+
+ vtrn.8 d0, d1
+
+ vst1.16 {d0[0]}, [ip], r1
+ vst1.16 {d1[0]}, [ip], r1
+ vst1.16 {d0[1]}, [ip], r1
+ vst1.16 {d1[1]}, [ip], r1
+ vst1.16 {d0[2]}, [ip], r1
+ vst1.16 {d1[2]}, [ip], r1
+ vst1.16 {d0[3]}, [ip], r1
+ vst1.16 {d1[3]}, [ip], r1
+ bx lr
+endfunc
+
+
+function vp3_idct_start_neon
+ vpush {d8-d15}
+ vmov.i16 q4, #0
+ vmov.i16 q5, #0
+ movrel r3, vp3_idct_constants
+ vld1.64 {d0-d1}, [r3,:128]
+ vld1.64 {d16-d19}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d20-d23}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vld1.64 {d24-d27}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+ vadd.s16 q1, q8, q12
+ vsub.s16 q8, q8, q12
+ vld1.64 {d28-d31}, [r2,:128]
+ vst1.64 {q4-q5}, [r2,:128]!
+
+vp3_idct_core_neon:
+ vmull.s16 q2, d18, xC1S7 // (ip[1] * C1) << 16
+ vmull.s16 q3, d19, xC1S7
+ vmull.s16 q4, d2, xC4S4 // ((ip[0] + ip[4]) * C4) << 16
+ vmull.s16 q5, d3, xC4S4
+ vmull.s16 q6, d16, xC4S4 // ((ip[0] - ip[4]) * C4) << 16
+ vmull.s16 q7, d17, xC4S4
+ vshrn.s32 d4, q2, #16
+ vshrn.s32 d5, q3, #16
+ vshrn.s32 d6, q4, #16
+ vshrn.s32 d7, q5, #16
+ vshrn.s32 d8, q6, #16
+ vshrn.s32 d9, q7, #16
+ vadd.s16 q12, q1, q3 // E = (ip[0] + ip[4]) * C4
+ vadd.s16 q8, q8, q4 // F = (ip[0] - ip[4]) * C4
+ vadd.s16 q1, q2, q9 // ip[1] * C1
+
+ vmull.s16 q2, d30, xC1S7 // (ip[7] * C1) << 16
+ vmull.s16 q3, d31, xC1S7
+ vmull.s16 q4, d30, xC7S1 // (ip[7] * C7) << 16
+ vmull.s16 q5, d31, xC7S1
+ vmull.s16 q6, d18, xC7S1 // (ip[1] * C7) << 16
+ vmull.s16 q7, d19, xC7S1
+ vshrn.s32 d4, q2, #16
+ vshrn.s32 d5, q3, #16
+ vshrn.s32 d6, q4, #16 // ip[7] * C7
+ vshrn.s32 d7, q5, #16
+ vshrn.s32 d8, q6, #16 // ip[1] * C7
+ vshrn.s32 d9, q7, #16
+ vadd.s16 q2, q2, q15 // ip[7] * C1
+ vadd.s16 q9, q1, q3 // A = ip[1] * C1 + ip[7] * C7
+ vsub.s16 q15, q4, q2 // B = ip[1] * C7 - ip[7] * C1
+
+ vmull.s16 q2, d22, xC5S3 // (ip[3] * C5) << 16
+ vmull.s16 q3, d23, xC5S3
+ vmull.s16 q4, d22, xC3S5 // (ip[3] * C3) << 16
+ vmull.s16 q5, d23, xC3S5
+ vmull.s16 q6, d26, xC5S3 // (ip[5] * C5) << 16
+ vmull.s16 q7, d27, xC5S3
+ vshrn.s32 d4, q2, #16
+ vshrn.s32 d5, q3, #16
+ vshrn.s32 d6, q4, #16
+ vshrn.s32 d7, q5, #16
+ vshrn.s32 d8, q6, #16
+ vshrn.s32 d9, q7, #16
+ vadd.s16 q3, q3, q11 // ip[3] * C3
+ vadd.s16 q4, q4, q13 // ip[5] * C5
+ vadd.s16 q1, q2, q11 // ip[3] * C5
+ vadd.s16 q11, q3, q4 // C = ip[3] * C3 + ip[5] * C5
+
+ vmull.s16 q2, d26, xC3S5 // (ip[5] * C3) << 16
+ vmull.s16 q3, d27, xC3S5
+ vmull.s16 q4, d20, xC2S6 // (ip[2] * C2) << 16
+ vmull.s16 q5, d21, xC2S6
+ vmull.s16 q6, d28, xC6S2 // (ip[6] * C6) << 16
+ vmull.s16 q7, d29, xC6S2
+ vshrn.s32 d4, q2, #16
+ vshrn.s32 d5, q3, #16
+ vshrn.s32 d6, q4, #16
+ vshrn.s32 d7, q5, #16
+ vshrn.s32 d8, q6, #16 // ip[6] * C6
+ vshrn.s32 d9, q7, #16
+ vadd.s16 q2, q2, q13 // ip[5] * C3
+ vadd.s16 q3, q3, q10 // ip[2] * C2
+ vsub.s16 q13, q2, q1 // D = ip[5] * C3 - ip[3] * C5
+ vsub.s16 q1, q9, q11 // (A - C)
+ vadd.s16 q11, q9, q11 // Cd = A + C
+ vsub.s16 q9, q15, q13 // (B - D)
+ vadd.s16 q13, q15, q13 // Dd = B + D
+ vadd.s16 q15, q3, q4 // G = ip[2] * C2 + ip[6] * C6
+
+ vmull.s16 q2, d2, xC4S4 // ((A - C) * C4) << 16
+ vmull.s16 q3, d3, xC4S4
+ vmull.s16 q4, d28, xC2S6 // (ip[6] * C2) << 16
+ vmull.s16 q5, d29, xC2S6
+ vmull.s16 q6, d20, xC6S2 // (ip[2] * C6) << 16
+ vmull.s16 q7, d21, xC6S2
+ vshrn.s32 d4, q2, #16
+ vshrn.s32 d5, q3, #16
+ vshrn.s32 d6, q4, #16
+ vshrn.s32 d7, q5, #16
+ vshrn.s32 d8, q6, #16 // ip[2] * C6
+ vmull.s16 q5, d18, xC4S4 // ((B - D) * C4) << 16
+ vmull.s16 q6, d19, xC4S4
+ vshrn.s32 d9, q7, #16
+ vadd.s16 q3, q3, q14 // ip[6] * C2
+ vadd.s16 q10, q1, q2 // Ad = (A - C) * C4
+ vsub.s16 q14, q4, q3 // H = ip[2] * C6 - ip[6] * C2
+ bx lr
+endfunc
+
+.macro VP3_IDCT_END type
+function vp3_idct_end_\type\()_neon
+.ifc \type, col
+ vdup.16 q0, r3
+ vadd.s16 q12, q12, q0
+ vadd.s16 q8, q8, q0
+.endif
+
+ vshrn.s32 d2, q5, #16
+ vshrn.s32 d3, q6, #16
+ vadd.s16 q2, q12, q15 // Gd = E + G
+ vadd.s16 q9, q1, q9 // (B - D) * C4
+ vsub.s16 q12, q12, q15 // Ed = E - G
+ vsub.s16 q3, q8, q10 // Fd = F - Ad
+ vadd.s16 q10, q8, q10 // Add = F + Ad
+ vadd.s16 q4, q9, q14 // Hd = Bd + H
+ vsub.s16 q14, q9, q14 // Bdd = Bd - H
+ vadd.s16 q8, q2, q11 // [0] = Gd + Cd
+ vsub.s16 q15, q2, q11 // [7] = Gd - Cd
+ vadd.s16 q9, q10, q4 // [1] = Add + Hd
+ vsub.s16 q10, q10, q4 // [2] = Add - Hd
+ vadd.s16 q11, q12, q13 // [3] = Ed + Dd
+ vsub.s16 q12, q12, q13 // [4] = Ed - Dd
+.ifc \type, row
+ vtrn.16 q8, q9
+.endif
+ vadd.s16 q13, q3, q14 // [5] = Fd + Bdd
+ vsub.s16 q14, q3, q14 // [6] = Fd - Bdd
+
+.ifc \type, row
+ // 8x8 transpose
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vswp d17, d24
+ vswp d19, d26
+ vadd.s16 q1, q8, q12
+ vswp d21, d28
+ vsub.s16 q8, q8, q12
+ vswp d23, d30
+.endif
+ bx lr
+endfunc
+.endm
+
+VP3_IDCT_END row
+VP3_IDCT_END col
+
+function ff_vp3_idct_put_neon, export=1
+ mov ip, lr
+ bl vp3_idct_start_neon
+ bl vp3_idct_end_row_neon
+ mov r3, #8
+ add r3, r3, #2048 // convert signed pixel to unsigned
+ bl vp3_idct_core_neon
+ bl vp3_idct_end_col_neon
+ mov lr, ip
+ vpop {d8-d15}
+
+ vqshrun.s16 d0, q8, #4
+ vqshrun.s16 d1, q9, #4
+ vqshrun.s16 d2, q10, #4
+ vqshrun.s16 d3, q11, #4
+ vst1.64 {d0}, [r0,:64], r1
+ vqshrun.s16 d4, q12, #4
+ vst1.64 {d1}, [r0,:64], r1
+ vqshrun.s16 d5, q13, #4
+ vst1.64 {d2}, [r0,:64], r1
+ vqshrun.s16 d6, q14, #4
+ vst1.64 {d3}, [r0,:64], r1
+ vqshrun.s16 d7, q15, #4
+ vst1.64 {d4}, [r0,:64], r1
+ vst1.64 {d5}, [r0,:64], r1
+ vst1.64 {d6}, [r0,:64], r1
+ vst1.64 {d7}, [r0,:64], r1
+ bx lr
+endfunc
+
+function ff_vp3_idct_add_neon, export=1
+ mov ip, lr
+ bl vp3_idct_start_neon
+ bl vp3_idct_end_row_neon
+ mov r3, #8
+ bl vp3_idct_core_neon
+ bl vp3_idct_end_col_neon
+ mov lr, ip
+ vpop {d8-d15}
+ mov r2, r0
+
+ vld1.64 {d0}, [r0,:64], r1
+ vshr.s16 q8, q8, #4
+ vld1.64 {d1}, [r0,:64], r1
+ vshr.s16 q9, q9, #4
+ vld1.64 {d2}, [r0,:64], r1
+ vaddw.u8 q8, q8, d0
+ vld1.64 {d3}, [r0,:64], r1
+ vaddw.u8 q9, q9, d1
+ vld1.64 {d4}, [r0,:64], r1
+ vshr.s16 q10, q10, #4
+ vld1.64 {d5}, [r0,:64], r1
+ vshr.s16 q11, q11, #4
+ vld1.64 {d6}, [r0,:64], r1
+ vqmovun.s16 d0, q8
+ vld1.64 {d7}, [r0,:64], r1
+ vqmovun.s16 d1, q9
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vshr.s16 q12, q12, #4
+ vshr.s16 q13, q13, #4
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vshr.s16 q14, q14, #4
+ vshr.s16 q15, q15, #4
+ vst1.64 {d0}, [r2,:64], r1
+ vqmovun.s16 d4, q12
+ vst1.64 {d1}, [r2,:64], r1
+ vqmovun.s16 d5, q13
+ vst1.64 {d2}, [r2,:64], r1
+ vaddw.u8 q14, q14, d6
+ vst1.64 {d3}, [r2,:64], r1
+ vaddw.u8 q15, q15, d7
+ vst1.64 {d4}, [r2,:64], r1
+ vqmovun.s16 d6, q14
+ vst1.64 {d5}, [r2,:64], r1
+ vqmovun.s16 d7, q15
+ vst1.64 {d6}, [r2,:64], r1
+ vst1.64 {d7}, [r2,:64], r1
+ bx lr
+endfunc
+
+function ff_vp3_idct_dc_add_neon, export=1
+ ldrsh r12, [r2]
+ mov r3, r0
+ add r12, r12, #15
+ vdup.16 q15, r12
+ mov r12, #0
+ strh r12, [r2]
+ vshr.s16 q15, q15, #5
+
+ vld1.8 {d0}, [r0,:64], r1
+ vld1.8 {d1}, [r0,:64], r1
+ vld1.8 {d2}, [r0,:64], r1
+ vaddw.u8 q8, q15, d0
+ vld1.8 {d3}, [r0,:64], r1
+ vaddw.u8 q9, q15, d1
+ vld1.8 {d4}, [r0,:64], r1
+ vaddw.u8 q10, q15, d2
+ vld1.8 {d5}, [r0,:64], r1
+ vaddw.u8 q11, q15, d3
+ vld1.8 {d6}, [r0,:64], r1
+ vaddw.u8 q12, q15, d4
+ vld1.8 {d7}, [r0,:64], r1
+ vaddw.u8 q13, q15, d5
+ vqmovun.s16 d0, q8
+ vaddw.u8 q14, q15, d6
+ vqmovun.s16 d1, q9
+ vaddw.u8 q15, q15, d7
+ vqmovun.s16 d2, q10
+ vst1.8 {d0}, [r3,:64], r1
+ vqmovun.s16 d3, q11
+ vst1.8 {d1}, [r3,:64], r1
+ vqmovun.s16 d4, q12
+ vst1.8 {d2}, [r3,:64], r1
+ vqmovun.s16 d5, q13
+ vst1.8 {d3}, [r3,:64], r1
+ vqmovun.s16 d6, q14
+ vst1.8 {d4}, [r3,:64], r1
+ vqmovun.s16 d7, q15
+ vst1.8 {d5}, [r3,:64], r1
+ vst1.8 {d6}, [r3,:64], r1
+ vst1.8 {d7}, [r3,:64], r1
+ bx lr
+endfunc
diff --git a/ffmpeg/libavcodec/arm/vp56_arith.h b/ffmpeg/libavcodec/arm/vp56_arith.h
new file mode 100644
index 0000000..feb1247
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp56_arith.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP56_ARITH_H
+#define AVCODEC_ARM_VP56_ARITH_H
+
+#if CONFIG_THUMB
+# define A(x)
+# define T(x) x
+#else
+# define A(x) x
+# define T(x)
+#endif
+
+#if CONFIG_THUMB || defined __clang__
+# define L(x)
+# define U(x) x
+#else
+# define L(x) x
+# define U(x)
+#endif
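+
+/* A()/T() emit their argument only when assembling ARM or Thumb code
+ * respectively; L()/U() choose between the pre-UAL ("ldrcsh") and unified
+ * ("ldrhcs") spellings, as clang and Thumb mode accept only the latter. */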
+
+#if HAVE_ARMV6_INLINE
+
+#define vp56_rac_get_prob vp56_rac_get_prob_armv6
+static inline int vp56_rac_get_prob_armv6(VP56RangeCoder *c, int pr)
+{
+ unsigned shift = ff_vp56_norm_shift[c->high];
+ unsigned code_word = c->code_word << shift;
+ unsigned high = c->high << shift;
+ unsigned bit;
+
+ __asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
+ "cmpcs %7, %4 \n"
+ L("ldrcsh %2, [%4], #2 \n")
+ U("ldrhcs %2, [%4], #2 \n")
+ "rsb %0, %6, #256 \n"
+ "smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
+ "rev16cs %2, %2 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
+ "subcs %3, %3, #16 \n"
+ "lsr %0, %0, #8 \n"
+ "cmp %1, %0, lsl #16 \n"
+ "ittte ge \n"
+ "subge %1, %1, %0, lsl #16 \n"
+ "subge %0, %5, %0 \n"
+ "movge %2, #1 \n"
+ "movlt %2, #0 \n"
+ : "=&r"(c->high), "=&r"(c->code_word), "=&r"(bit),
+ "+&r"(c->bits), "+&r"(c->buffer)
+ : "r"(high), "r"(pr), "r"(c->end - 1),
+ "0"(shift), "1"(code_word)
+ : "cc");
+
+ return bit;
+}
+
+#define vp56_rac_get_prob_branchy vp56_rac_get_prob_branchy_armv6
+static inline int vp56_rac_get_prob_branchy_armv6(VP56RangeCoder *c, int pr)
+{
+ unsigned shift = ff_vp56_norm_shift[c->high];
+ unsigned code_word = c->code_word << shift;
+ unsigned high = c->high << shift;
+ unsigned low;
+ unsigned tmp;
+
+ __asm__ ("adds %3, %3, %0 \n"
+ "itt cs \n"
+ "cmpcs %7, %4 \n"
+ L("ldrcsh %2, [%4], #2 \n")
+ U("ldrhcs %2, [%4], #2 \n")
+ "rsb %0, %6, #256 \n"
+ "smlabb %0, %5, %6, %0 \n"
+ T("itttt cs \n")
+ "rev16cs %2, %2 \n"
+ T("lslcs %2, %2, %3 \n")
+ T("orrcs %1, %1, %2 \n")
+ A("orrcs %1, %1, %2, lsl %3 \n")
+ "subcs %3, %3, #16 \n"
+ "lsr %0, %0, #8 \n"
+ "lsl %2, %0, #16 \n"
+ : "=&r"(low), "+&r"(code_word), "=&r"(tmp),
+ "+&r"(c->bits), "+&r"(c->buffer)
+ : "r"(high), "r"(pr), "r"(c->end - 1), "0"(shift)
+ : "cc");
+
+ if (code_word >= tmp) {
+ c->high = high - low;
+ c->code_word = code_word - tmp;
+ return 1;
+ }
+
+ c->high = low;
+ c->code_word = code_word;
+ return 0;
+}
+
+#endif
+
+#endif /* AVCODEC_ARM_VP56_ARITH_H */
diff --git a/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c
new file mode 100644
index 0000000..f53cbae
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp56dsp_init_arm.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/vp56dsp.h"
+
+void ff_vp6_edge_filter_hor_neon(uint8_t *yuv, int stride, int t);
+void ff_vp6_edge_filter_ver_neon(uint8_t *yuv, int stride, int t);
+
+av_cold void ff_vp56dsp_init_arm(VP56DSPContext *s, enum AVCodecID codec)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (codec != AV_CODEC_ID_VP5 && have_neon(cpu_flags)) {
+ s->edge_filter_hor = ff_vp6_edge_filter_hor_neon;
+ s->edge_filter_ver = ff_vp6_edge_filter_ver_neon;
+ }
+}
diff --git a/ffmpeg/libavcodec/arm/vp56dsp_neon.S b/ffmpeg/libavcodec/arm/vp56dsp_neon.S
new file mode 100644
index 0000000..03dd28d
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp56dsp_neon.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro vp6_edge_filter
+ vdup.16 q3, r2 @ t
+ vmov.i16 q13, #1
+ vsubl.u8 q0, d20, d18 @ p[ 0] - p[-s]
+ vsubl.u8 q1, d16, d22 @ p[-2*s] - p[ s]
+ vsubl.u8 q14, d21, d19
+ vsubl.u8 q15, d17, d23
+ vadd.i16 q2, q0, q0 @ 2*(p[0]-p[-s])
+ vadd.i16 d29, d28, d28
+ vadd.i16 q0, q0, q1 @ p[0]-p[-s] + p[-2*s]-p[s]
+ vadd.i16 d28, d28, d30
+ vadd.i16 q0, q0, q2 @ 3*(p[0]-p[-s]) + p[-2*s]-p[s]
+ vadd.i16 d28, d28, d29
+ vrshr.s16 q0, q0, #3 @ v
+ vrshr.s16 d28, d28, #3
+ vsub.i16 q8, q3, q13 @ t-1
+ vabs.s16 q1, q0 @ V
+ vshr.s16 q2, q0, #15 @ s
+ vabs.s16 d30, d28
+ vshr.s16 d29, d28, #15
+ vsub.i16 q12, q1, q3 @ V-t
+ vsub.i16 d31, d30, d6
+ vsub.i16 q12, q12, q13 @ V-t-1
+ vsub.i16 d31, d31, d26
+ vcge.u16 q12, q12, q8 @ V-t-1 >= t-1
+ vcge.u16 d31, d31, d16
+ vadd.i16 q13, q3, q3 @ 2*t
+ vadd.i16 d16, d6, d6
+ vsub.i16 q13, q13, q1 @ 2*t - V
+ vsub.i16 d16, d16, d30
+ vadd.i16 q13, q13, q2 @ += s
+ vadd.i16 d16, d16, d29
+ veor q13, q13, q2 @ ^= s
+ veor d16, d16, d29
+ vbif q0, q13, q12
+ vbif d28, d16, d31
+ vmovl.u8 q1, d20
+ vmovl.u8 q15, d21
+ vaddw.u8 q2, q0, d18
+ vaddw.u8 q3, q14, d19
+ vsub.i16 q1, q1, q0
+ vsub.i16 d30, d30, d28
+ vqmovun.s16 d18, q2
+ vqmovun.s16 d19, q3
+ vqmovun.s16 d20, q1
+ vqmovun.s16 d21, q15
+.endm
+
+function ff_vp6_edge_filter_ver_neon, export=1
+ sub r0, r0, r1, lsl #1
+ vld1.8 {q8}, [r0], r1 @ p[-2*s]
+ vld1.8 {q9}, [r0], r1 @ p[-s]
+ vld1.8 {q10}, [r0], r1 @ p[0]
+ vld1.8 {q11}, [r0] @ p[s]
+ vp6_edge_filter
+ sub r0, r0, r1, lsl #1
+ sub r1, r1, #8
+ vst1.8 {d18}, [r0]!
+ vst1.32 {d19[0]}, [r0], r1
+ vst1.8 {d20}, [r0]!
+ vst1.32 {d21[0]}, [r0]
+ bx lr
+endfunc
+
+function ff_vp6_edge_filter_hor_neon, export=1
+ sub r3, r0, #1
+ sub r0, r0, #2
+ vld1.32 {d16[0]}, [r0], r1
+ vld1.32 {d18[0]}, [r0], r1
+ vld1.32 {d20[0]}, [r0], r1
+ vld1.32 {d22[0]}, [r0], r1
+ vld1.32 {d16[1]}, [r0], r1
+ vld1.32 {d18[1]}, [r0], r1
+ vld1.32 {d20[1]}, [r0], r1
+ vld1.32 {d22[1]}, [r0], r1
+ vld1.32 {d17[0]}, [r0], r1
+ vld1.32 {d19[0]}, [r0], r1
+ vld1.32 {d21[0]}, [r0], r1
+ vld1.32 {d23[0]}, [r0], r1
+ vtrn.8 q8, q9
+ vtrn.8 q10, q11
+ vtrn.16 q8, q10
+ vtrn.16 q9, q11
+ vp6_edge_filter
+ vtrn.8 q9, q10
+ vst1.16 {d18[0]}, [r3], r1
+ vst1.16 {d20[0]}, [r3], r1
+ vst1.16 {d18[1]}, [r3], r1
+ vst1.16 {d20[1]}, [r3], r1
+ vst1.16 {d18[2]}, [r3], r1
+ vst1.16 {d20[2]}, [r3], r1
+ vst1.16 {d18[3]}, [r3], r1
+ vst1.16 {d20[3]}, [r3], r1
+ vst1.16 {d19[0]}, [r3], r1
+ vst1.16 {d21[0]}, [r3], r1
+ vst1.16 {d19[1]}, [r3], r1
+ vst1.16 {d21[1]}, [r3], r1
+ bx lr
+endfunc
diff --git a/ffmpeg/libavcodec/arm/vp8.h b/ffmpeg/libavcodec/arm/vp8.h
new file mode 100644
index 0000000..ddaa120
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP8_H
+#define AVCODEC_ARM_VP8_H
+
+#include <stdint.h>
+
+#include "config.h"
+#include "libavcodec/vp56.h"
+#include "libavcodec/vp8.h"
+
+#if HAVE_ARMV6_EXTERNAL
+#define decode_block_coeffs_internal ff_decode_block_coeffs_armv6
+int ff_decode_block_coeffs_armv6(VP56RangeCoder *rc, int16_t block[16],
+ uint8_t probs[8][3][NUM_DCT_TOKENS-1],
+ int i, uint8_t *token_prob, int16_t qmul[2]);
+#endif
+
+#endif /* AVCODEC_ARM_VP8_H */
diff --git a/ffmpeg/libavcodec/arm/vp8_armv6.S b/ffmpeg/libavcodec/arm/vp8_armv6.S
new file mode 100644
index 0000000..e7d25a4
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8_armv6.S
@@ -0,0 +1,248 @@
+/*
+ * Copyright (C) 2010 Mans Rullgard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro rac_get_prob h, bs, buf, cw, pr, t0, t1
+ adds \bs, \bs, \t0
+ lsl \cw, \cw, \t0
+ lsl \t0, \h, \t0
+ rsb \h, \pr, #256
+ it cs
+ ldrhcs \t1, [\buf], #2
+ smlabb \h, \t0, \pr, \h
+T itttt cs
+ rev16cs \t1, \t1
+A orrcs \cw, \cw, \t1, lsl \bs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
+ subcs \bs, \bs, #16
+ lsr \h, \h, #8
+ cmp \cw, \h, lsl #16
+ itt ge
+ subge \cw, \cw, \h, lsl #16
+ subge \h, \t0, \h
+.endm
+
+.macro rac_get_128 h, bs, buf, cw, t0, t1
+ adds \bs, \bs, \t0
+ lsl \cw, \cw, \t0
+ lsl \t0, \h, \t0
+ it cs
+ ldrhcs \t1, [\buf], #2
+ mov \h, #128
+ it cs
+ rev16cs \t1, \t1
+ add \h, \h, \t0, lsl #7
+A orrcs \cw, \cw, \t1, lsl \bs
+T ittt cs
+T lslcs \t1, \t1, \bs
+T orrcs \cw, \cw, \t1
+ subcs \bs, \bs, #16
+ lsr \h, \h, #8
+ cmp \cw, \h, lsl #16
+ itt ge
+ subge \cw, \cw, \h, lsl #16
+ subge \h, \t0, \h
+.endm
+
+function ff_decode_block_coeffs_armv6, export=1
+ push {r0,r1,r4-r11,lr}
+ movrelx lr, X(ff_vp56_norm_shift)
+ ldrd r4, r5, [sp, #44] @ token_prob, qmul
+ cmp r3, #0
+ ldr r11, [r5]
+ ldm r0, {r5-r7} @ high, bits, buf
+ it ne
+ pkhtbne r11, r11, r11, asr #16
+ ldr r8, [r0, #16] @ code_word
+0:
+ ldrb r9, [lr, r5]
+ add r3, r3, #1
+ ldrb r0, [r4, #1]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ blt 2f
+
+ ldrb r9, [lr, r5]
+ ldrb r0, [r4, #2]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 3f
+
+ add r4, r3, r3, lsl #5
+ sxth r12, r11
+ add r4, r4, r2
+ adds r6, r6, r9
+ add r4, r4, #11
+ lsl r8, r8, r9
+ it cs
+ ldrhcs r10, [r7], #2
+ lsl r9, r5, r9
+ mov r5, #128
+ it cs
+ rev16cs r10, r10
+ add r5, r5, r9, lsl #7
+T ittt cs
+T lslcs r10, r10, r6
+T orrcs r8, r8, r10
+A orrcs r8, r8, r10, lsl r6
+ subcs r6, r6, #16
+ lsr r5, r5, #8
+ cmp r8, r5, lsl #16
+ movrel r10, zigzag_scan-1
+ itt ge
+ subge r8, r8, r5, lsl #16
+ subge r5, r9, r5
+ ldrb r10, [r10, r3]
+ it ge
+ rsbge r12, r12, #0
+ cmp r3, #16
+ strh r12, [r1, r10]
+ bge 6f
+5:
+ ldrb r9, [lr, r5]
+ ldrb r0, [r4]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ pkhtb r11, r11, r11, asr #16
+ bge 0b
+
+6:
+ ldr r0, [sp]
+ ldr r9, [r0, #12]
+ cmp r7, r9
+ it hi
+ movhi r7, r9
+ stm r0, {r5-r7} @ high, bits, buf
+ str r8, [r0, #16] @ code_word
+
+ add sp, sp, #8
+ mov r0, r3
+ pop {r4-r11,pc}
+2:
+ add r4, r3, r3, lsl #5
+ cmp r3, #16
+ add r4, r4, r2
+ pkhtb r11, r11, r11, asr #16
+ bne 0b
+ b 6b
+3:
+ ldrb r0, [r4, #3]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 1f
+
+ mov r12, #2
+ ldrb r0, [r4, #4]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r12, #1
+ ldrb r9, [lr, r5]
+ blt 4f
+ ldrb r0, [r4, #5]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r12, #1
+ ldrb r9, [lr, r5]
+ b 4f
+1:
+ ldrb r0, [r4, #6]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 3f
+
+ ldrb r0, [r4, #7]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r9, [lr, r5]
+ bge 2f
+
+ mov r12, #5
+ mov r0, #159
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r12, r12, #1
+ ldrb r9, [lr, r5]
+ b 4f
+2:
+ mov r12, #7
+ mov r0, #165
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r12, r12, #2
+ ldrb r9, [lr, r5]
+ mov r0, #145
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r12, r12, #1
+ ldrb r9, [lr, r5]
+ b 4f
+3:
+ ldrb r0, [r4, #8]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ it ge
+ addge r4, r4, #1
+ ldrb r9, [lr, r5]
+ ite ge
+ movge r12, #2
+ movlt r12, #0
+ ldrb r0, [r4, #9]
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ mov r9, #8
+ it ge
+ addge r12, r12, #1
+ movrelx r4, X(ff_vp8_dct_cat_prob), r1
+ lsl r9, r9, r12
+ ldr r4, [r4, r12, lsl #2]
+ add r12, r9, #3
+ mov r1, #0
+ ldrb r0, [r4], #1
+1:
+ ldrb r9, [lr, r5]
+ lsl r1, r1, #1
+ rac_get_prob r5, r6, r7, r8, r0, r9, r10
+ ldrb r0, [r4], #1
+ it ge
+ addge r1, r1, #1
+ cmp r0, #0
+ bne 1b
+ ldrb r9, [lr, r5]
+ add r12, r12, r1
+ ldr r1, [sp, #4]
+4:
+ add r4, r3, r3, lsl #5
+ add r4, r4, r2
+ add r4, r4, #22
+ rac_get_128 r5, r6, r7, r8, r9, r10
+ it ge
+ rsbge r12, r12, #0
+ smulbb r12, r12, r11
+ movrel r9, zigzag_scan-1
+ ldrb r9, [r9, r3]
+ cmp r3, #16
+ strh r12, [r1, r9]
+ bge 6b
+ b 5b
+endfunc
+
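+@ 4x4 zigzag scan order stored as byte offsets into the int16_t coefficient
+@ block (position * 2); indexed through zigzag_scan-1 above because the
+@ coefficient counter there is 1-based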
+const zigzag_scan
+ .byte 0, 2, 8, 16
+ .byte 10, 4, 6, 12
+ .byte 18, 24, 26, 20
+ .byte 14, 22, 28, 30
+endconst
diff --git a/ffmpeg/libavcodec/arm/vp8dsp.h b/ffmpeg/libavcodec/arm/vp8dsp.h
new file mode 100644
index 0000000..ce00e4a
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp.h
@@ -0,0 +1,78 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ARM_VP8DSP_H
+#define AVCODEC_ARM_VP8DSP_H
+
+#include "libavcodec/vp8dsp.h"
+
+void ff_vp8dsp_init_armv6(VP8DSPContext *dsp);
+void ff_vp8dsp_init_neon(VP8DSPContext *dsp);
+
+#define VP8_LF_Y(hv, inner, opt) \
+ void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim_E, int flim_I, \
+ int hev_thresh)
+
+#define VP8_LF_UV(hv, inner, opt) \
+ void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \
+ uint8_t *dstV, \
+ ptrdiff_t stride, \
+ int flim_E, int flim_I, \
+ int hev_thresh)
+
+#define VP8_LF_SIMPLE(hv, opt) \
+ void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
+ ptrdiff_t stride, \
+ int flim)
+
+#define VP8_LF_HV(inner, opt) \
+ VP8_LF_Y(h, inner, opt); \
+ VP8_LF_Y(v, inner, opt); \
+ VP8_LF_UV(h, inner, opt); \
+ VP8_LF_UV(v, inner, opt)
+
+#define VP8_LF(opt) \
+ VP8_LF_HV(, opt); \
+ VP8_LF_HV(_inner, opt); \
+ VP8_LF_SIMPLE(h, opt); \
+ VP8_LF_SIMPLE(v, opt)
+
+#define VP8_MC(n, opt) \
+ void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \
+ uint8_t *src, ptrdiff_t srcstride, \
+ int h, int x, int y)
+
+#define VP8_EPEL(w, opt) \
+ VP8_MC(pixels ## w, opt); \
+ VP8_MC(epel ## w ## _h4, opt); \
+ VP8_MC(epel ## w ## _h6, opt); \
+ VP8_MC(epel ## w ## _v4, opt); \
+ VP8_MC(epel ## w ## _h4v4, opt); \
+ VP8_MC(epel ## w ## _h6v4, opt); \
+ VP8_MC(epel ## w ## _v6, opt); \
+ VP8_MC(epel ## w ## _h4v6, opt); \
+ VP8_MC(epel ## w ## _h6v6, opt)
+
+#define VP8_BILIN(w, opt) \
+ VP8_MC(bilin ## w ## _h, opt); \
+ VP8_MC(bilin ## w ## _v, opt); \
+ VP8_MC(bilin ## w ## _hv, opt)
+
+#endif /* AVCODEC_ARM_VP8DSP_H */
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_armv6.S b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S
new file mode 100644
index 0000000..5207758
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_armv6.S
@@ -0,0 +1,1634 @@
+/*
+ * VP8 ARMv6 optimisations
+ *
+ * Copyright (c) 2010 Google Inc.
+ * Copyright (c) 2010 Rob Clark <rob@ti.com>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * This code was partially ported from libvpx, which uses this license:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * * Neither the name of Google nor the names of its contributors may
+ * be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ idct
+
+@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
+function ff_vp8_luma_dc_wht_armv6, export=1
+ push {r4-r10, lr}
+
+ ldm r1, {r2-r9}
+ mov r10, #0
+ mov lr, #0
+ uadd16 r12, r2, r8 @ t0[0,1]
+ usub16 r2, r2, r8 @ t3[0,1]
+ stm r1!, {r10, lr}
+ uadd16 r8, r4, r6 @ t1[0,1]
+ usub16 r4, r4, r6 @ t2[0,1]
+ stm r1!, {r10, lr}
+ uadd16 r6, r12, r8 @ dc0[0,1]
+ usub16 r12, r12, r8 @ dc2[0,1]
+ stm r1!, {r10, lr}
+ uadd16 r8, r2, r4 @ dc1[0,1]
+ usub16 r2, r2, r4 @ dc3[0,1]
+ stm r1!, {r10, lr}
+
+ uadd16 lr, r3, r9 @ t0[2,3]
+ usub16 r3, r3, r9 @ t3[2,3]
+ uadd16 r9, r5, r7 @ t1[2,3]
+ usub16 r5, r5, r7 @ t2[2,3]
+
+ uadd16 r7, lr, r9 @ dc0[2,3]
+ usub16 lr, lr, r9 @ dc2[2,3]
+ uadd16 r9, r3, r5 @ dc1[2,3]
+ usub16 r3, r3, r5 @ dc3[2,3]
+
+ mov r1, #3
+ orr r1, r1, #0x30000 @ 3 | 3 (round)
+
+ pkhbt r4, r6, r8, lsl #16 @ dc{0,1}[0]
+ pkhtb r6, r8, r6, asr #16 @ dc{0,1}[1]
+ pkhbt r5, r12, r2, lsl #16 @ dc{2,3}[0]
+ pkhtb r12, r2, r12, asr #16 @ dc{2,3}[1]
+ pkhbt r8, r7, r9, lsl #16 @ dc{0,1}[2]
+ uadd16 r4, r4, r1
+ uadd16 r5, r5, r1
+ pkhtb r7, r9, r7, asr #16 @ dc{0,1}[3]
+ pkhbt r2, lr, r3, lsl #16 @ dc{2,3}[2]
+ pkhtb lr, r3, lr, asr #16 @ dc{2,3}[3]
+
+ uadd16 r9, r4, r7 @ t0[0,1]
+ uadd16 r3, r5, lr @ t0[2,3]
+ usub16 r4, r4, r7 @ t3[0,1]
+ usub16 r5, r5, lr @ t3[2,3]
+ uadd16 r7, r6, r8 @ t1[0,1]
+ uadd16 lr, r12, r2 @ t1[2,3]
+ usub16 r6, r6, r8 @ t2[0,1]
+ usub16 r12, r12, r2 @ t2[2,3]
+
+ uadd16 r8, r9, r7 @ block[0,1][0]
+ uadd16 r2, r3, lr @ block[2,3][0]
+ usub16 r9, r9, r7 @ block[0,1][2]
+ usub16 r3, r3, lr @ block[2,3][2]
+ uadd16 r7, r4, r6 @ block[0,1][1]
+ uadd16 lr, r5, r12 @ block[2,3][1]
+ usub16 r4, r4, r6 @ block[0,1][3]
+ usub16 r5, r5, r12 @ block[2,3][3]
+
+#if HAVE_ARMV6T2_EXTERNAL
+ sbfx r6, r8, #3, #13
+ sbfx r12, r7, #3, #13
+ sbfx r1, r9, #3, #13
+ sbfx r10, r4, #3, #13
+#else
+ sxth r6, r8
+ sxth r12, r7
+ sxth r1, r9
+ sxth r10, r4
+ asr r6, #3 @ block[0][0]
+ asr r12, #3 @ block[0][1]
+ asr r1, #3 @ block[0][2]
+ asr r10, #3 @ block[0][3]
+#endif
+
+ strh r6, [r0], #32
+ asr r8, r8, #19 @ block[1][0]
+ strh r12, [r0], #32
+ asr r7, r7, #19 @ block[1][1]
+ strh r1, [r0], #32
+ asr r9, r9, #19 @ block[1][2]
+ strh r10, [r0], #32
+ asr r4, r4, #19 @ block[1][3]
+ strh r8, [r0], #32
+ asr r6, r2, #19 @ block[3][0]
+ strh r7, [r0], #32
+ asr r12, lr, #19 @ block[3][1]
+ strh r9, [r0], #32
+ asr r1, r3, #19 @ block[3][2]
+ strh r4, [r0], #32
+ asr r10, r5, #19 @ block[3][3]
+
+#if HAVE_ARMV6T2_EXTERNAL
+ sbfx r2, r2, #3, #13
+ sbfx lr, lr, #3, #13
+ sbfx r3, r3, #3, #13
+ sbfx r5, r5, #3, #13
+#else
+ sxth r2, r2
+ sxth lr, lr
+ sxth r3, r3
+ sxth r5, r5
+ asr r2, #3 @ block[2][0]
+ asr lr, #3 @ block[2][1]
+ asr r3, #3 @ block[2][2]
+ asr r5, #3 @ block[2][3]
+#endif
+
+ strh r2, [r0], #32
+ strh lr, [r0], #32
+ strh r3, [r0], #32
+ strh r5, [r0], #32
+ strh r6, [r0], #32
+ strh r12, [r0], #32
+ strh r1, [r0], #32
+ strh r10, [r0], #32
+
+ pop {r4-r10, pc}
+endfunc
+
+@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
+function ff_vp8_luma_dc_wht_dc_armv6, export=1
+ ldrsh r2, [r1]
+ mov r3, #0
+ add r2, r2, #3
+ strh r3, [r1]
+ asr r2, r2, #3
+ .rept 16
+ strh r2, [r0], #32
+ .endr
+ bx lr
+endfunc
+
+@ void vp8_idct_add(uint8_t *dst, int16_t block[16], int stride)
+function ff_vp8_idct_add_armv6, export=1
+ push {r4-r12, lr}
+ sub sp, sp, #32
+
+ movw r3, #20091 @ cospi8sqrt2minus1
+ movw r4, #35468 @ sinpi8sqrt2
+ mov r5, sp
+1:
+ ldr r6, [r1, #8] @ i5 | i4 = block1[1] | block1[0]
+ ldr lr, [r1, #16] @ i9 | i8 = block2[1] | block2[0]
+ ldr r12, [r1, #24] @ i13 | i12 = block3[1] | block3[0]
+
+ smulwt r9, r3, r6 @ ip[5] * cospi8sqrt2minus1
+ smulwb r7, r3, r6 @ ip[4] * cospi8sqrt2minus1
+ smulwt r10, r4, r6 @ ip[5] * sinpi8sqrt2
+ smulwb r8, r4, r6 @ ip[4] * sinpi8sqrt2
+ pkhbt r7, r7, r9, lsl #16 @ 5c | 4c
+ smulwt r11, r3, r12 @ ip[13] * cospi8sqrt2minus1
+ pkhbt r8, r8, r10, lsl #16 @ 5s | 4s = t2 first half
+ uadd16 r6, r6, r7 @ 5c+5 | 4c+4 = t3 first half
+ smulwb r9, r3, r12 @ ip[12] * cospi8sqrt2minus1
+ smulwt r7, r4, r12 @ ip[13] * sinpi8sqrt2
+ smulwb r10, r4, r12 @ ip[12] * sinpi8sqrt2
+
+ pkhbt r9, r9, r11, lsl #16 @ 13c | 12c
+ ldr r11, [r1] @ i1 | i0
+ pkhbt r10, r10, r7, lsl #16 @ 13s | 12s = t3 second half
+ uadd16 r7, r12, r9 @ 13c+13 | 12c+12 = t2 2nd half
+ uadd16 r6, r6, r10 @ d = t3
+ uadd16 r10, r11, lr @ a = t0
+ usub16 r7, r8, r7 @ c = t2
+ usub16 r8, r11, lr @ b = t1
+ uadd16 r9, r10, r6 @ a+d = tmp{0,1}[0]
+ usub16 r10, r10, r6 @ a-d = tmp{0,1}[3]
+ uadd16 r6, r8, r7 @ b+c = tmp{0,1}[1]
+ usub16 r7, r8, r7 @ b-c = tmp{0,1}[2]
+ mov r8, #0
+ cmp sp, r5
+ str r6, [r5, #8] @ o5 | o4
+ str r7, [r5, #16] @ o9 | o8
+ str r10, [r5, #24] @ o13 | o12
+ str r9, [r5], #4 @ o1 | o0
+ str r8, [r1, #8]
+ str r8, [r1, #16]
+ str r8, [r1, #24]
+ str r8, [r1], #4
+ beq 1b
+
+ mov r5, #2
+2:
+ pop {r1, r6, r12, lr}
+ smulwt r9, r3, r12 @ ip[5] * cospi8sqrt2minus1
+ smulwt r7, r3, r1 @ ip[1] * cospi8sqrt2minus1
+ smulwt r10, r4, r12 @ ip[5] * sinpi8sqrt2
+ smulwt r8, r4, r1 @ ip[1] * sinpi8sqrt2
+ pkhbt r11, r1, r12, lsl #16 @ i4 | i0 = t0/t1 first half
+ pkhtb r1, r12, r1, asr #16 @ i5 | i1
+ pkhbt r7, r7, r9, lsl #16 @ 5c | 1c
+ pkhbt r8, r8, r10, lsl #16 @ 5s | 1s = t2 first half
+ pkhbt r9, r6, lr, lsl #16 @ i6 | i2 = t0/t1 second half
+ pkhtb r12, lr, r6, asr #16 @ i7 | i3
+ uadd16 r1, r7, r1 @ 5c+5 | 1c+1 = t3 first half
+ uadd16 r10, r11, r9 @ a = t0
+ usub16 r9, r11, r9 @ b = t1
+ smulwt r7, r3, r12 @ ip[7] * cospi8sqrt2minus1
+ smulwb lr, r3, r12 @ ip[3] * cospi8sqrt2minus1
+ smulwt r11, r4, r12 @ ip[7] * sinpi8sqrt2
+ smulwb r6, r4, r12 @ ip[3] * sinpi8sqrt2
+ subs r5, r5, #1
+ pkhbt r7, lr, r7, lsl #16 @ 7c | 3c
+ pkhbt r11, r6, r11, lsl #16 @ 7s | 3s = t3 second half
+ mov r6, #0x4
+ orr r6, r6, #0x40000
+ uadd16 r12, r7, r12 @ 7c+7 | 3c+3 = t2 second half
+ uadd16 r10, r10, r6 @ t0 + 4
+ uadd16 r9, r9, r6 @ t1 + 4
+ usub16 lr, r8, r12 @ c (o5 | o1) = t2
+ uadd16 r12, r11, r1 @ d (o7 | o3) = t3
+ usub16 r1, r9, lr @ b-c = dst{0,1}[2]
+ uadd16 r7, r10, r12 @ a+d = dst{0,1}[0]
+ usub16 r12, r10, r12 @ a-d = dst{0,1}[3]
+ uadd16 r10, r9, lr @ b+c = dst{0,1}[1]
+
+ asr lr, r1, #3 @ o[1][2]
+ asr r9, r12, #3 @ o[1][3]
+ pkhtb r8, lr, r7, asr #19 @ o[1][0,2]
+ pkhtb r11, r9, r10, asr #19 @ o[1][1,3]
+ ldr lr, [r0]
+ sxth r12, r12
+ ldr r9, [r0, r2]
+ sxth r1, r1
+#if HAVE_ARMV6T2_EXTERNAL
+ sbfx r7, r7, #3, #13
+ sbfx r10, r10, #3, #13
+#else
+ sxth r7, r7
+ sxth r10, r10
+ asr r7, #3 @ o[0][0]
+ asr r10, #3 @ o[0][1]
+#endif
+ pkhbt r7, r7, r1, lsl #13 @ o[0][0,2]
+ pkhbt r10, r10, r12, lsl #13 @ o[0][1,3]
+
+ uxtab16 r7, r7, lr
+ uxtab16 r10, r10, lr, ror #8
+ uxtab16 r8, r8, r9
+ uxtab16 r11, r11, r9, ror #8
+ usat16 r7, #8, r7
+ usat16 r10, #8, r10
+ usat16 r8, #8, r8
+ usat16 r11, #8, r11
+ orr r7, r7, r10, lsl #8
+ orr r8, r8, r11, lsl #8
+ str r8, [r0, r2]
+ str_post r7, r0, r2, lsl #1
+
+ bne 2b
+
+ pop {r4-r12, pc}
+endfunc
+
+@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], int stride)
+function ff_vp8_idct_dc_add_armv6, export=1
+ push {r4-r6, lr}
+ add r6, r0, r2, lsl #1
+ ldrsh r3, [r1]
+ mov r4, #0
+ add r3, r3, #4
+ strh r4, [r1], #32
+ asr r3, #3
+ ldr r5, [r0]
+ ldr r4, [r0, r2]
+ pkhbt r3, r3, r3, lsl #16
+ uxtab16 lr, r3, r5 @ a1+2 | a1+0
+ uxtab16 r5, r3, r5, ror #8 @ a1+3 | a1+1
+ uxtab16 r12, r3, r4
+ uxtab16 r4, r3, r4, ror #8
+ usat16 lr, #8, lr
+ usat16 r5, #8, r5
+ usat16 r12, #8, r12
+ usat16 r4, #8, r4
+ orr lr, lr, r5, lsl #8
+ ldr r5, [r6]
+ orr r12, r12, r4, lsl #8
+ ldr r4, [r6, r2]
+ str lr, [r0]
+ uxtab16 lr, r3, r5
+ str r12, [r0, r2]
+ uxtab16 r5, r3, r5, ror #8
+ uxtab16 r12, r3, r4
+ uxtab16 r4, r3, r4, ror #8
+ usat16 lr, #8, lr
+ usat16 r5, #8, r5
+ usat16 r12, #8, r12
+ usat16 r4, #8, r4
+ orr lr, lr, r5, lsl #8
+ orr r12, r12, r4, lsl #8
+ str lr, [r6]
+ str r12, [r6, r2]
+ pop {r4-r6, pc}
+endfunc
+
+@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], int stride)
+function ff_vp8_idct_dc_add4uv_armv6, export=1
+ push {r4, lr}
+
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, r2, lsl #2
+ sub r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+
+ pop {r4, pc}
+endfunc
+
+@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], int stride)
+function ff_vp8_idct_dc_add4y_armv6, export=1
+ push {r4, lr}
+
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+ add r0, r0, #4
+ bl ff_vp8_idct_dc_add_armv6
+
+ pop {r4, pc}
+endfunc
+
+@ loopfilter
+
+.macro transpose o3, o2, o1, o0, i0, i1, i2, i3
+ uxtb16 \o1, \i1 @ xx 12 xx 10
+ uxtb16 \o0, \i0 @ xx 02 xx 00
+ uxtb16 \o3, \i3 @ xx 32 xx 30
+ uxtb16 \o2, \i2 @ xx 22 xx 20
+ orr \o1, \o0, \o1, lsl #8 @ 12 02 10 00
+ orr \o3, \o2, \o3, lsl #8 @ 32 22 30 20
+
+ uxtb16 \i1, \i1, ror #8 @ xx 13 xx 11
+ uxtb16 \i3, \i3, ror #8 @ xx 33 xx 31
+ uxtb16 \i0, \i0, ror #8 @ xx 03 xx 01
+ uxtb16 \i2, \i2, ror #8 @ xx 23 xx 21
+ orr \i0, \i0, \i1, lsl #8 @ 13 03 11 01
+ orr \i2, \i2, \i3, lsl #8 @ 33 23 31 21
+
+ pkhtb \o2, \o3, \o1, asr #16 @ 32 22 12 02
+ pkhbt \o0, \o1, \o3, lsl #16 @ 30 20 10 00
+ pkhtb \o3, \i2, \i0, asr #16 @ 33 23 13 03
+ pkhbt \o1, \i0, \i2, lsl #16 @ 31 21 11 01
+.endm
+
+.macro simple_filter
+ uqsub8 r7, r3, r6 @ p1 - q1
+ uqsub8 r8, r6, r3 @ q1 - p1
+ uqsub8 r10, r4, r5 @ p0 - q0
+ uqsub8 r9, r5, r4 @ q0 - p0
+ orr r7, r7, r8 @ abs(p1 - q1)
+ orr r9, r9, r10 @ abs(p0 - q0)
+ uhadd8 r7, r7, lr @ abs(p1 - q1) >> 1
+ uqadd8 r9, r9, r9 @ abs(p0 - q0) * 2
+ uqadd8 r7, r7, r9 @ abs(p0 - q0)*2 + abs(p1-q1)/2
+ mvn r8, #0
+ usub8 r10, r12, r7 @ compare to flimit
+ sel r10, r8, lr @ filter mask: F or 0
+ cmp r10, #0
+ beq 2f
+
+ eor r3, r3, r2 @ ps1
+ eor r6, r6, r2 @ qs1
+ eor r4, r4, r2 @ ps0
+ eor r5, r5, r2 @ qs0
+
+ qsub8 r3, r3, r6 @ vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 @ q0 - p0
+ qadd8 r3, r3, r6 @ += q0 - p0
+ lsr r7, r2, #5 @ 0x04040404
+ qadd8 r3, r3, r6 @ += q0 - p0
+ sub r9, r7, r2, lsr #7 @ 0x03030303
+ qadd8 r3, r3, r6 @ vp8_filter = p1-q1 + 3*(q0-p0)
+ and r3, r3, r10 @ vp8_filter &= mask
+
+ qadd8 r9, r3, r9 @ Filter2 = vp8_filter + 3
+ qadd8 r3, r3, r7 @ Filter1 = vp8_filter + 4
+
+ shadd8 r9, r9, lr
+ shadd8 r3, r3, lr
+ shadd8 r9, r9, lr
+ shadd8 r3, r3, lr
+ shadd8 r9, r9, lr @ Filter2 >>= 3
+ shadd8 r3, r3, lr @ Filter1 >>= 3
+
+ qadd8 r4, r4, r9 @ u = p0 + Filter2
+ qsub8 r5, r5, r3 @ u = q0 - Filter1
+ eor r4, r4, r2 @ *op0 = u ^ 0x80
+ eor r5, r5, r2 @ *oq0 = u ^ 0x80
+.endm
+
+@ void vp8_v_loop_filter16_simple(uint8_t *dst, int stride, int flim)
+function ff_vp8_v_loop_filter16_simple_armv6, export=1
+ push {r4-r11, lr}
+
+ orr r2, r2, r2, lsl #16
+ mov r11, #4
+ mov lr, #0
+ orr r12, r2, r2, lsl #8
+ mov32 r2, 0x80808080
+1:
+ ldr_nreg r3, r0, r1, lsl #1 @ p1
+ ldr_nreg r4, r0, r1 @ p0
+ ldr r5, [r0] @ q0
+ ldr r6, [r0, r1] @ q1
+ simple_filter
+T sub r7, r0, r1
+ str r5, [r0] @ oq0
+A str r4, [r0, -r1] @ op0
+T str r4, [r7]
+2:
+ subs r11, r11, #1
+ add r0, r0, #4
+ bne 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+.macro filter_mask_p
+ uqsub8 r6, r9, r10 @ p3 - p2
+ uqsub8 r7, r10, r9 @ p2 - p3
+ uqsub8 r8, r10, r11 @ p2 - p1
+ uqsub8 r10, r11, r10 @ p1 - p2
+ orr r6, r6, r7 @ abs(p3-p2)
+ orr r8, r8, r10 @ abs(p2-p1)
+ uqsub8 lr, r6, r2 @ compare to limit
+ uqsub8 r8, r8, r2 @ compare to limit
+ uqsub8 r6, r11, r12 @ p1 - p0
+ orr lr, lr, r8
+ uqsub8 r7, r12, r11 @ p0 - p1
+ orr r6, r6, r7 @ abs(p1-p0)
+ uqsub8 r7, r6, r2 @ compare to limit
+ uqsub8 r8, r6, r3 @ compare to thresh
+ orr lr, lr, r7
+.endm
+
+.macro filter_mask_pq
+ uqsub8 r6, r11, r10 @ p1 - q1
+ uqsub8 r7, r10, r11 @ q1 - p1
+ uqsub8 r11, r12, r9 @ p0 - q0
+ uqsub8 r12, r9, r12 @ q0 - p0
+ orr r6, r6, r7 @ abs(p1-q1)
+ orr r12, r11, r12 @ abs(p0-q0)
+ mov32 r7, 0x7f7f7f7f
+ uqadd8 r12, r12, r12 @ abs(p0-q0) * 2
+ and r6, r7, r6, lsr #1 @ abs(p1-q1) / 2
+ uqadd8 r12, r12, r6 @ abs(p0-q0) * 2 + abs(p1-q1)/2
+.endm
+
+.macro filter_mask_v
+ filter_mask_p
+
+ ldr r10, [r0, r1] @ q1
+ ldr_post r9, r0, r1, lsl #1 @ q0
+
+ filter_mask_pq
+
+ ldr r11, [r0] @ q2
+
+ uqsub8 r7, r9, r10 @ q0 - q1
+ uqsub8 r6, r10, r9 @ q1 - q0
+ uqsub8 r12, r12, r4 @ compare to flimit
+ uqsub8 r9, r11, r10 @ q2 - q1
+ uqsub8 r10, r10, r11 @ q1 - q2
+ orr lr, lr, r12
+ ldr r12, [r0, r1] @ q3
+ orr r6, r7, r6 @ abs(q1-q0)
+ orr r10, r9, r10 @ abs(q2-q1)
+ uqsub8 r9, r12, r11 @ q3 - q2
+ uqsub8 r11, r11, r12 @ q2 - q3
+ uqsub8 r7, r6, r2 @ compare to limit
+ uqsub8 r10, r10, r2 @ compare to limit
+ uqsub8 r6, r6, r3 @ compare to thresh
+ orr r9, r9, r11 @ abs(q3-q2)
+ orr lr, lr, r7
+ orr lr, lr, r10
+ uqsub8 r9, r9, r2 @ compare to limit
+ orr lr, lr, r9
+
+ mov r12, #0
+ usub8 lr, r12, lr
+ mvn r11, #0
+ sel lr, r11, r12 @ filter mask
+ sub r0, r0, r1, lsl #1
+.endm
+
+.macro filter_mask_h
+ transpose r12, r11, r10, r9, r6, r7, r8, lr
+
+ filter_mask_p
+
+ stm sp, {r8, r11, r12, lr}
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #4
+
+ ldr r7, [r0, r1]
+ ldr_post r6, r0, r1, lsl #1
+ ldr lr, [r0, r1]
+ ldr r8, [r0]
+
+ transpose r12, r11, r10, r9, r6, r7, r8, lr
+
+ uqsub8 r8, r12, r11 @ q3 - q2
+ uqsub8 lr, r11, r12 @ q2 - q3
+ uqsub8 r7, r9, r10 @ q0 - q1
+ uqsub8 r6, r10, r9 @ q1 - q0
+ uqsub8 r12, r11, r10 @ q2 - q1
+ uqsub8 r11, r10, r11 @ q1 - q2
+ orr r8, r8, lr @ abs(q3-q2)
+ orr r6, r7, r6 @ abs(q1-q0)
+ orr r11, r12, r11 @ abs(q2-q1)
+ ldr lr, [sp, #12] @ load back (f)limit accumulator
+ uqsub8 r8, r8, r2 @ compare to limit
+ uqsub8 r7, r6, r2 @ compare to limit
+ uqsub8 r11, r11, r2 @ compare to limit
+ orr lr, lr, r8
+ uqsub8 r8, r6, r3 @ compare to thresh
+ orr lr, lr, r7
+ ldr r12, [sp, #8] @ p1
+ orr lr, lr, r11
+
+ ldr r11, [sp, #4] @ p0
+
+ filter_mask_pq
+
+ mov r10, #0
+ uqsub8 r12, r12, r4 @ compare to flimit
+ mvn r11, #0
+ orr lr, lr, r12
+ usub8 lr, r10, lr
+ sel lr, r11, r10 @ filter mask
+.endm
+
+.macro filter inner
+ mov32 r12, 0x80808080
+ eor r11, r7, r12 @ ps1
+ eor r8, r8, r12 @ ps0
+ eor r9, r9, r12 @ qs0
+ eor r10, r10, r12 @ qs1
+
+ stm sp, {r8-r11}
+
+ qsub8 r7, r11, r10 @ vp8_signed_char_clamp(ps1-qs1)
+ qsub8 r8, r9, r8 @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
+ .if \inner
+ and r7, r7, r6 @ vp8_filter &= hev
+ .endif
+ qadd8 r7, r7, r8
+ lsr r10, r12, #5 @ 0x04040404
+ qadd8 r7, r7, r8
+ sub r9, r10, r12, lsr #7 @ 0x03030303
+ qadd8 r7, r7, r8
+
+ and r7, r7, lr @ vp8_filter &= mask
+ .if !\inner
+ mov r12, r7 @ Filter2
+ and r7, r7, r6 @ Filter2 &= hev
+ .endif
+ qadd8 lr, r7, r9 @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
+ qadd8 r7, r7, r10 @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)
+
+ mov r9, #0
+ shadd8 lr, lr, r9 @ Filter2 >>= 3
+ shadd8 r7, r7, r9 @ Filter1 >>= 3
+ shadd8 lr, lr, r9
+ shadd8 r7, r7, r9
+ shadd8 lr, lr, r9 @ Filter2
+ shadd8 r7, r7, r9 @ Filter1
+.endm
+
+.macro filter_v inner
+ orr r10, r6, r8 @ calculate vp8_hevmask
+ ldr_nreg r7, r0, r1, lsl #1 @ p1
+ usub8 r10, r12, r10
+ ldr_nreg r8, r0, r1 @ p0
+ sel r6, r12, r11 @ obtain vp8_hevmask
+ ldr r9, [r0] @ q0
+ ldr r10, [r0, r1] @ q1
+ filter \inner
+.endm
+
+.macro filter_h inner
+ orr r9, r6, r8
+ usub8 r9, r12, r9
+ sel r6, r12, r11 @ hev mask
+
+ stm sp, {r6, lr}
+
+ ldr_nreg r12, r0, r1, lsl #1
+ ldr_nreg r11, r0, r1
+ ldr r6, [r0]
+ ldr lr, [r0, r1]
+
+ transpose r10, r9, r8, r7, r12, r11, r6, lr
+
+ ldm sp, {r6, lr}
+ filter \inner
+.endm
+
+.macro filter_inner
+ ldm sp, {r8, r9}
+ lsr r10, r10, #2 @ 0x01010101
+ qadd8 r8, r8, lr @ u = vp8_signed_char_clamp(ps0 + Filter2)
+ mov lr, #0
+ qsub8 r9, r9, r7 @ u = vp8_signed_char_clamp(qs0 - Filter1)
+ sadd8 r7, r7, r10 @ vp8_filter += 1
+ ldr r10, [sp, #8] @ qs1
+ shadd8 r7, r7, lr @ vp8_filter >>= 1
+ eor r8, r8, r12 @ *op0 = u ^ 0x80
+ bic r7, r7, r6 @ vp8_filter &= ~hev
+ qadd8 r11, r11, r7 @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
+ eor r9, r9, r12 @ *oq0 = u ^ 0x80
+ qsub8 r10, r10, r7 @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
+ eor r11, r11, r12 @ *op1 = u ^ 0x80
+ eor r10, r10, r12 @ *oq1 = u ^ 0x80
+.endm
+
+.macro filter_x c0
+ mov lr, \c0
+ mov r7, #63
+
+ sxtb16 r6, r12
+ sxtb16 r10, r12, ror #8
+ smlabb r8, r6, lr, r7
+ smlatb r6, r6, lr, r7
+ smlabb r7, r10, lr, r7
+ smultb r10, r10, lr
+ ssat r8, #8, r8, asr #7
+ ssat r6, #8, r6, asr #7
+ add r10, r10, #63
+ ssat r7, #8, r7, asr #7
+ ssat r10, #8, r10, asr #7
+
+ pkhbt r6, r8, r6, lsl #16
+ pkhbt r10, r7, r10, lsl #16
+ uxtb16 r6, r6
+ uxtb16 r10, r10
+
+ mov32 lr, 0x80808080
+
+ orr r10, r6, r10, lsl #8 @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
+ qsub8 r8, r9, r10 @ s = vp8_signed_char_clamp(qs0 - u)
+ qadd8 r10, r11, r10 @ s = vp8_signed_char_clamp(ps0 + u)
+ eor r8, r8, lr @ *oq0 = s ^ 0x80
+ eor r10, r10, lr @ *op0 = s ^ 0x80
+.endm
+
+.macro filter_1
+ ldm sp, {r8, r9}
+ qadd8 r11, r8, lr
+ qsub8 r9, r9, r7
+ bic r12, r12, r6 @ vp8_filter &= ~hev
+ filter_x #27
+.endm
+
+.macro filter_2
+ ldr r9, [sp, #8] @ qs1
+ ldr r11, [sp, #12] @ ps1
+ filter_x #18
+.endm
+
+.macro filter_3
+ eor r9, r9, lr
+ eor r11, r11, lr
+ filter_x #9
+.endm
+
+function vp8_v_loop_filter_inner_armv6
+ mov r5, #4
+ sub sp, sp, #16
+
+ orr r2, r2, r2, lsl #16
+ orr r3, r3, r3, lsl #16
+ orr r6, r6, r6, lsl #16
+ orr r4, r2, r2, lsl #8 @ flimE
+ orr r2, r3, r3, lsl #8 @ flimI
+ orr r3, r6, r6, lsl #8 @ thresh
+1:
+ sub r0, r0, r1, lsl #2
+ ldr r10, [r0, r1] @ p2
+ ldr_post r9, r0, r1, lsl #1 @ p3
+ ldr r12, [r0, r1] @ p0
+ ldr_post r11, r0, r1, lsl #1 @ p1
+
+ filter_mask_v
+ cmp lr, #0
+ beq 2f
+ filter_v inner=1
+ filter_inner
+
+A str r11, [r0, -r1, lsl #1] @ op1
+A str r8, [r0, -r1] @ op0
+T sub r0, r0, r1, lsl #1
+T str r8, [r0, r1]
+T str_post r11, r0, r1, lsl #1
+ str r9, [r0] @ oq0
+ str r10, [r0, r1] @ oq1
+2:
+ add r0, r0, #4
+ cmp r5, #3
+ it eq
+ ldreq r0, [sp, #16]
+ subs r5, r5, #1
+ bne 1b
+
+ add sp, sp, #16
+ pop {r0, r4-r11, pc}
+endfunc
+
+function ff_vp8_v_loop_filter16_inner_armv6, export=1
+ push {r4-r11, lr}
+ add r12, r0, #8
+ push {r12}
+ ldr r6, [sp, #40]
+ orr r2, r2, r2, lsl #16
+ b vp8_v_loop_filter_inner_armv6
+endfunc
+
+function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
+ push {r1, r4-r11, lr}
+ mov r1, r2
+ orr r2, r3, r3, lsl #16
+ ldr r3, [sp, #40]
+ ldr r6, [sp, #44]
+ b vp8_v_loop_filter_inner_armv6
+endfunc
+
+function vp8_v_loop_filter_armv6
+ mov r5, #4
+ sub sp, sp, #16
+
+ orr r3, r3, r3, lsl #16
+ orr r6, r6, r6, lsl #16
+ orr r4, r2, r2, lsl #8 @ flimE
+ orr r2, r3, r3, lsl #8 @ flimI
+ orr r3, r6, r6, lsl #8 @ thresh
+1:
+ sub r0, r0, r1, lsl #2
+ ldr r10, [r0, r1] @ p2
+ ldr_post r9, r0, r1, lsl #1 @ p3
+ ldr r12, [r0, r1] @ p0
+ ldr_post r11, r0, r1, lsl #1 @ p1
+
+ filter_mask_v
+ cmp lr, #0
+ beq 2f
+
+ filter_v inner=0
+ filter_1
+
+ str r8, [r0] @ *oq0
+A str r10, [r0, -r1] @ *op0
+T sub r0, r0, r1, lsl #1
+T str r10, [r0, r1]
+
+ filter_2
+
+A str r10, [r0, -r1, lsl #1] @ *op1
+T str_post r10, r0, r1, lsl #1
+ str r8, [r0, r1] @ *oq1
+
+ ldr r9, [r0, r1, lsl #1] @ q2
+ add r0, r0, r1
+A ldr r11, [r0, -r1, lsl #2] @ p2
+T ldr_dpre r11, r0, r1, lsl #2
+
+ filter_3
+
+A str r10, [r0, -r1, lsl #2] @ *op2
+T str_post r10, r0, r1, lsl #2
+ str r8, [r0, r1] @ *oq2
+ sub r0, r0, r1
+2:
+ add r0, r0, #4
+ cmp r5, #3
+ it eq
+ ldreq r0, [sp, #16]
+ subs r5, r5, #1
+ bne 1b
+
+ add sp, sp, #16
+ pop {r0, r4-r11, pc}
+endfunc
+
+function ff_vp8_v_loop_filter16_armv6, export=1
+ push {r4-r11, lr}
+ add r12, r0, #8
+ push {r12}
+ ldr r6, [sp, #40]
+ orr r2, r2, r2, lsl #16
+ b vp8_v_loop_filter_armv6
+endfunc
+
+function ff_vp8_v_loop_filter8uv_armv6, export=1
+ push {r1, r4-r11, lr}
+ mov r1, r2
+ orr r2, r3, r3, lsl #16
+ ldr r3, [sp, #40]
+ ldr r6, [sp, #44]
+ b vp8_v_loop_filter_armv6
+endfunc
+
+@ void vp8_h_loop_filter16_simple(uint8_t *dst, int stride, int flim)
+function ff_vp8_h_loop_filter16_simple_armv6, export=1
+ push {r4-r11, lr}
+ orr r12, r2, r2, lsl #16
+ mov32 r2, 0x80808080
+ orr r12, r12, r12, lsl #8
+
+ mov lr, #0
+ mov r11, #4
+1:
+ sub r0, r0, #2
+ ldr r8, [r0, r1]
+ ldr_post r7, r0, r1, lsl #1
+ ldr r10, [r0, r1]
+ ldr_post r9, r0, r1, lsl #1
+ add r0, r0, #2
+ transpose r6, r5, r4, r3, r7, r8, r9, r10
+ simple_filter
+ sub r0, r0, r1, lsl #2
+ sub r0, r0, #1
+
+ uxtb16 r6, r4
+ uxtb16 r8, r5
+ uxtb16 r7, r4, ror #8
+ uxtb16 r9, r5, ror #8
+ orr r6, r6, r8, lsl #8
+ orr r7, r7, r9, lsl #8
+ lsr r4, r6, #16
+ lsr r5, r7, #16
+
+ strh_post r6, r0, r1
+ strh_post r7, r0, r1
+ strh_post r4, r0, r1
+ strh_post r5, r0, r1
+ add r0, r0, #1
+2:
+ subs r11, r11, #1
+ bne 1b
+
+ pop {r4-r11, pc}
+endfunc
+
+function vp8_h_loop_filter_inner_armv6
+ mov r5, #4
+ sub sp, sp, #16
+
+ orr r3, r3, r3, lsl #16
+ orr r9, r9, r9, lsl #16
+ orr r4, r2, r2, lsl #8 @ flimE
+ orr r2, r3, r3, lsl #8 @ flimI
+ orr r3, r9, r9, lsl #8 @ thresh
+ sub r0, r0, #4
+1:
+ ldr r7, [r0, r1]
+ ldr_post r6, r0, r1, lsl #1
+ ldr lr, [r0, r1]
+ ldr_post r8, r0, r1, lsl #1
+
+ filter_mask_h
+
+ cmp lr, #0
+ sub r0, r0, #2
+ beq 2f
+
+ ldr r6, [sp]
+
+ filter_h inner=1
+ filter_inner
+
+ transpose lr, r12, r7, r6, r11, r8, r9, r10
+
+A str r6, [r0, -r1, lsl #1]
+A str r7, [r0, -r1]
+T sub r0, r0, r1, lsl #1
+T str r7, [r0, r1]
+T str_post r6, r0, r1, lsl #1
+ str r12, [r0]
+ str lr, [r0, r1]
+2:
+ sub r0, r0, #2
+ add r0, r0, r1, lsl #1
+ cmp r5, #3
+ it eq
+ ldreq r0, [sp, #16]
+ subs r5, r5, #1
+ bne 1b
+
+ add sp, sp, #16
+ pop {r0, r4-r11, pc}
+endfunc
+
+function ff_vp8_h_loop_filter16_inner_armv6, export=1
+ push {r4-r11, lr}
+ add r12, r0, r1, lsl #3
+ sub r12, r12, #4
+ push {r12}
+ ldr r9, [sp, #40]
+ orr r2, r2, r2, lsl #16
+ b vp8_h_loop_filter_inner_armv6
+endfunc
+
+function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
+ sub r1, r1, #4
+ push {r1, r4-r11, lr}
+ mov r1, r2
+ orr r2, r3, r3, lsl #16
+ ldr r3, [sp, #40]
+ ldr r9, [sp, #44]
+ b vp8_h_loop_filter_inner_armv6
+endfunc
+
+function vp8_h_loop_filter_armv6
+ mov r5, #4
+ sub sp, sp, #16
+
+ orr r3, r3, r3, lsl #16
+ orr r9, r9, r9, lsl #16
+ orr r4, r2, r2, lsl #8 @ flimE
+ orr r2, r3, r3, lsl #8 @ flimI
+ orr r3, r9, r9, lsl #8 @ thresh
+1:
+ sub r0, r0, #4
+ ldr r7, [r0, r1]
+ ldr_post r6, r0, r1, lsl #1
+ ldr lr, [r0, r1]
+ ldr_post r8, r0, r1, lsl #1
+
+ filter_mask_h
+ cmp lr, #0
+ it eq
+ addeq r0, r0, r1, lsl #1
+ beq 2f
+
+ ldr r6, [sp]
+ sub r0, r0, #2
+
+ filter_h inner=0
+ filter_1
+
+ sub r0, r0, r1, lsl #1
+ uxtb16 r6, r10
+ uxtb16 r7, r8
+ uxtb16 r10, r10, ror #8
+ uxtb16 r8, r8, ror #8
+ orr r6, r6, r7, lsl #8
+ orr r10, r10, r8, lsl #8
+ lsr r7, r6, #16
+ lsr r8, r10, #16
+
+ add r0, r0, #1
+ strh_post r6, r0, r1
+ strh_post r10, r0, r1
+ strh_post r7, r0, r1
+ strh_post r8, r0, r1
+
+ filter_2
+
+ sub r0, r0, r1, lsl #2
+ add r0, r0, #3
+
+ ldrb r11, [r0, #-5] @ p2 for 1/7th difference
+ strb r10, [r0, #-4] @ op1
+ strb r8, [r0, #-1] @ oq1
+ ldrb_post r9, r0, r1 @ q2 for 1/7th difference
+
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+
+ ldrb r6, [r0, #-5]
+ strb r10, [r0, #-4]
+ strb r8, [r0, #-1]
+ ldrb_post r7, r0, r1
+
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+ orr r11, r11, r6, lsl #8
+ orr r9, r9, r7, lsl #8
+
+ ldrb r6, [r0, #-5]
+ strb r10, [r0, #-4]
+ strb r8, [r0, #-1]
+ ldrb_post r7, r0, r1
+
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+ orr r11, r11, r6, lsl #16
+ orr r9, r9, r7, lsl #16
+
+ ldrb r6, [r0, #-5]
+ strb r10, [r0, #-4]
+ strb r8, [r0, #-1]
+ ldrb_post r7, r0, r1
+ orr r11, r11, r6, lsl #24
+ orr r9, r9, r7, lsl #24
+
+ filter_3
+
+ sub r0, r0, r1, lsl #2
+ strb r10, [r0, #-5]
+ strb_post r8, r0, r1
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+ strb r10, [r0, #-5]
+ strb_post r8, r0, r1
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+ strb r10, [r0, #-5]
+ strb_post r8, r0, r1
+ lsr r10, r10, #8
+ lsr r8, r8, #8
+ strb r10, [r0, #-5]
+ strb_post r8, r0, r1
+
+ sub r0, r0, #2
+2:
+ cmp r5, #3
+ it eq
+ ldreq r0, [sp, #16]
+ subs r5, r5, #1
+ bne 1b
+
+ add sp, sp, #16
+ pop {r0, r4-r11, pc}
+endfunc
+
+function ff_vp8_h_loop_filter16_armv6, export=1
+ push {r4-r11, lr}
+ add r12, r0, r1, lsl #3
+ push {r12}
+ ldr r9, [sp, #40]
+ orr r2, r2, r2, lsl #16
+ b vp8_h_loop_filter_armv6
+endfunc
+
+function ff_vp8_h_loop_filter8uv_armv6, export=1
+ push {r1, r4-r11, lr}
+ mov r1, r2
+ orr r2, r3, r3, lsl #16
+ ldr r3, [sp, #40]
+ ldr r9, [sp, #44]
+ b vp8_h_loop_filter_armv6
+endfunc
+
+.ltorg
+
+@ MC
+
+@ void put_vp8_pixels16(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int mx, int my)
+function ff_put_vp8_pixels16_armv6, export=1
+ push {r4-r11}
+ ldr r12, [sp, #32] @ h
+1:
+ subs r12, r12, #2
+ ldr r5, [r2, #4]
+ ldr r6, [r2, #8]
+ ldr r7, [r2, #12]
+ ldr_post r4, r2, r3
+ ldr r9, [r2, #4]
+ ldr r10, [r2, #8]
+ ldr r11, [r2, #12]
+ ldr_post r8, r2, r3
+ strd r6, r7, [r0, #8]
+ strd_post r4, r5, r0, r1
+ strd r10, r11, [r0, #8]
+ strd_post r8, r9, r0, r1
+ bgt 1b
+ pop {r4-r11}
+ bx lr
+endfunc
+
+@ void put_vp8_pixels8(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int mx, int my)
+function ff_put_vp8_pixels8_armv6, export=1
+ push {r4-r11}
+ ldr r12, [sp, #32] @ h
+1:
+ subs r12, r12, #4
+ ldr r5, [r2, #4]
+ ldr_post r4, r2, r3
+ ldr r7, [r2, #4]
+ ldr_post r6, r2, r3
+ ldr r9, [r2, #4]
+ ldr_post r8, r2, r3
+ ldr r11, [r2, #4]
+ ldr_post r10, r2, r3
+ strd_post r4, r5, r0, r1
+ strd_post r6, r7, r0, r1
+ strd_post r8, r9, r0, r1
+ strd_post r10, r11, r0, r1
+ bgt 1b
+ pop {r4-r11}
+ bx lr
+endfunc
+
+@ void put_vp8_pixels4(uint8_t *dst, int dststride, uint8_t *src,
+@ int srcstride, int h, int mx, int my)
+function ff_put_vp8_pixels4_armv6, export=1
+ ldr r12, [sp, #0] @ h
+ push {r4-r6,lr}
+1:
+ subs r12, r12, #4
+ ldr_post r4, r2, r3
+ ldr_post r5, r2, r3
+ ldr_post r6, r2, r3
+ ldr_post lr, r2, r3
+ str_post r4, r0, r1
+ str_post r5, r0, r1
+ str_post r6, r0, r1
+ str_post lr, r0, r1
+ bgt 1b
+ pop {r4-r6,pc}
+endfunc
+
+@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
+@ arithmetic can be used to apply filters
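+@ (each filter's taps sum to 128, and 128 * 255 = 32640 = 0x7f80)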
+const sixtap_filters_13245600, align=4
+ .short 2, 108, -11, 36, -8, 1, 0, 0
+ .short 3, 77, -16, 77, -16, 3, 0, 0
+ .short 1, 36, -8, 108, -11, 2, 0, 0
+endconst
+
+const fourtap_filters_1324, align=4
+ .short -6, 12, 123, -1
+ .short -9, 50, 93, -6
+ .short -6, 93, 50, -9
+ .short -1, 123, 12, -6
+endconst
+
+.macro vp8_mc_1 name, size, hv
+function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1
+ sub r1, r1, #\size
+ mov r12, sp
+ push {r1, r4-r11, lr}
+ ldm r12, {r5-r7}
+ mov r4, #\size
+ stm r12, {r4, r5}
+ orr r12, r6, r7
+ b vp8_put_\name\()_\hv\()_armv6 + 4
+endfunc
+.endm
+
+vp8_mc_1 epel, 16, h6
+vp8_mc_1 epel, 16, v6
+vp8_mc_1 epel, 8, h6
+vp8_mc_1 epel, 8, v6
+vp8_mc_1 epel, 8, h4
+vp8_mc_1 epel, 8, v4
+vp8_mc_1 epel, 4, h6
+vp8_mc_1 epel, 4, v6
+vp8_mc_1 epel, 4, h4
+vp8_mc_1 epel, 4, v4
+
+vp8_mc_1 bilin, 16, h
+vp8_mc_1 bilin, 16, v
+vp8_mc_1 bilin, 8, h
+vp8_mc_1 bilin, 8, v
+vp8_mc_1 bilin, 4, h
+vp8_mc_1 bilin, 4, v
+
+/* True relational expressions have the value -1 in the GNU assembler,
+ +1 in Apple's. */
+#ifdef __APPLE__
+# define TMPSIZE \size * (8 + 8*(\size > 4) + \ytaps - 1)
+#else
+# define TMPSIZE \size * (8 - 8*(\size > 4) + \ytaps - 1)
+#endif
+
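+@ vp8_mc_hv: the two-dimensional cases run as two passes. The horizontal
+@ filter writes into a TMPSIZE-byte temporary on the stack (rows packed at
+@ \size bytes each), and the vertical filter then reads from that buffer into
+@ the real destination. With the GNU assembler (\size > 4) evaluates to -1,
+@ so e.g. size=8, ytaps=6 gives TMPSIZE = 8 * (8 + 8 + 6 - 1) = 168 bytes.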
+.macro vp8_mc_hv name, size, h, v, ytaps
+function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
+ push {r0, r1, r4, lr}
+ add r0, sp, #16
+ sub sp, sp, #TMPSIZE+16
+ ldm r0, {r0, r12}
+ mov r4, #\size
+ add lr, r0, #\ytaps-1
+ .if \ytaps > 2
+ sub r2, r2, r3, lsl #\ytaps >> 1 & 1
+ .endif
+ stm sp, {r4, lr}
+ add r0, sp, #16
+ mov r1, #0
+ bl vp8_put_\name\()_\h\()_armv6
+ add r0, sp, #TMPSIZE+16
+ ldr lr, [sp, #TMPSIZE+16+16]
+ ldm r0, {r0, r1}
+ mov r3, #\size
+ ldr r12, [sp, #TMPSIZE+16+16+8]
+ str lr, [sp, #4]
+ add r2, sp, #16 + \size * (\ytaps / 2 - 1)
+ sub r1, r1, #\size
+ bl vp8_put_\name\()_\v\()_armv6
+ add sp, sp, #TMPSIZE+16+8
+ pop {r4, pc}
+endfunc
+.endm
+
+vp8_mc_hv epel, 16, h6, v6, 6
+vp8_mc_hv epel, 8, h6, v6, 6
+vp8_mc_hv epel, 8, h4, v6, 6
+vp8_mc_hv epel, 8, h6, v4, 4
+vp8_mc_hv epel, 8, h4, v4, 4
+vp8_mc_hv epel, 4, h6, v6, 6
+vp8_mc_hv epel, 4, h4, v6, 6
+vp8_mc_hv epel, 4, h6, v4, 4
+vp8_mc_hv epel, 4, h4, v4, 4
+
+vp8_mc_hv bilin, 16, h, v, 2
+vp8_mc_hv bilin, 8, h, v, 2
+vp8_mc_hv bilin, 4, h, v, 2
+
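+@ sat4: each argument holds a filter accumulator that was biased with +0x40
+@ when it was formed; shift the four values down by 7 (i.e. (sum + 64) >> 7),
+@ clamp them to 8 bits and pack them into \r0, with \r0's result in the
+@ lowest byte.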
+.macro sat4 r0, r1, r2, r3
+ asr \r0, \r0, #7
+ asr \r1, \r1, #7
+ pkhbt \r0, \r0, \r2, lsl #9
+ pkhbt \r1, \r1, \r3, lsl #9
+ usat16 \r0, #8, \r0
+ usat16 \r1, #8, \r1
+ orr \r0, \r0, \r1, lsl #8
+.endm
+
+@ Calling convention for the inner MC functions:
+@ r0 dst
+@ r1 dst_stride - block_width
+@ r2 src
+@ r3 src_stride
+@ r4 block_width
+@ r12 filter_index
+@ [sp] block_width
+@ [sp+4] height
+@ [sp+8] scratch
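+@
+@ (the scratch slot is used by some of the inner functions to keep the source
+@ stride, possibly minus block_width, across the row loop)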
+
+function vp8_put_epel_h6_armv6
+ push {r1, r4-r11, lr}
+ sub r2, r2, #2
+ movrel lr, sixtap_filters_13245600 - 16
+ add lr, lr, r12, lsl #3
+ sub r3, r3, r4
+ str r3, [sp, #48]
+ ldm lr, {r1, r3, lr}
+1:
+ ldr r7, [r2, #5] @ src[5-8]
+ ldr r6, [r2, #2] @ src[2-5]
+ ldr r5, [r2], #4 @ src[0-3]
+
+ pkhtb r7, r7, r7, asr #8 @ src[8,7,7,6]
+ uxtb16 r9, r6, ror #8 @ src[5] | src[3]
+ uxtb16 r6, r6 @ src[4] | src[2]
+ uxtb16 r8, r5, ror #8 @ src[3] | src[1]
+ uxtb16 r11, r7, ror #8 @ src[8] | src[7]
+ uxtb16 r7, r7 @ src[7] | src[6]
+ uxtb16 r5, r5 @ src[2] | src[0]
+
+ mov r10, #0x40
+ smlad r5, r5, r1, r10 @ filter[0][0]
+ smlad r11, r11, lr, r10 @ filter[3][2]
+ smlad r12, r7, lr, r10 @ filter[2][2]
+ smlad r10, r8, r1, r10 @ filter[1][0]
+ smlad r5, r8, r3, r5 @ filter[0][1]
+ smlad r11, r9, r1, r11 @ filter[3][0]
+ smlad r12, r9, r3, r12 @ filter[2][1]
+ pkhtb r9, r9, r6, asr #16 @ src[5] | src[4]
+ smlad r10, r6, r3, r10 @ filter[1][1]
+ pkhbt r7, r9, r7, lsl #16 @ src[6] | src[4]
+ smlad r5, r9, lr, r5 @ filter[0][2]
+ pkhtb r8, r7, r9, asr #16 @ src[6] | src[5]
+ smlad r11, r7, r3, r11 @ filter[3][1]
+ smlad r9, r8, lr, r10 @ filter[1][2]
+ smlad r7, r6, r1, r12 @ filter[2][0]
+
+ subs r4, r4, #4
+
+ sat4 r5, r9, r7, r11
+ str r5, [r0], #4
+
+ bne 1b
+
+ add r4, sp, #40
+ ldm r4, {r4, r5, r12}
+ ldr r6, [sp]
+ subs r5, r5, #1
+ add r2, r2, r12
+ str r5, [sp, #44]
+ add r0, r0, r6
+
+ bne 1b
+
+ pop {r1, r4-r11, pc}
+endfunc
+
+function vp8_put_epel_v6_armv6
+ push {r1, r4-r11, lr}
+ movrel lr, sixtap_filters_13245600 - 16
+ add lr, lr, r12, lsl #3
+ str r3, [sp, #48]
+1:
+ add r1, r3, r3, lsl #1 @ stride * 3
+ ldr_nreg r5, r2, r3 @ src[0,1,2,3 + stride * 1]
+ ldr r6, [r2, r3] @ src[0,1,2,3 + stride * 3]
+ ldr r7, [r2, r3, lsl #1] @ src[0,1,2,3 + stride * 4]
+ ldr r8, [r2, r1] @ src[0,1,2,3 + stride * 5]
+
+ uxtb16 r9, r5, ror #8 @ src[3 + s*1] | src[1 + s*1]
+ uxtb16 r10, r6, ror #8 @ src[3 + s*3] | src[1 + s*3]
+ uxtb16 r11, r7, ror #8 @ src[3 + s*4] | src[1 + s*4]
+ uxtb16 r12, r8, ror #8 @ src[3 + s*5] | src[1 + s*5]
+ uxtb16 r5, r5 @ src[2 + s*1] | src[0 + s*1]
+ uxtb16 r6, r6 @ src[2 + s*3] | src[0 + s*3]
+ uxtb16 r7, r7 @ src[2 + s*4] | src[0 + s*4]
+ uxtb16 r8, r8 @ src[2 + s*5] | src[0 + s*5]
+ pkhbt r1, r9, r10, lsl #16 @ src[1 + s*3] | src[1 + s*1]
+ pkhtb r9, r10, r9, asr #16 @ src[3 + s*3] | src[3 + s*1]
+ pkhbt r10, r11, r12, lsl #16 @ src[1 + s*5] | src[1 + s*4]
+ pkhtb r11, r12, r11, asr #16 @ src[3 + s*5] | src[3 + s*4]
+ pkhbt r12, r5, r6, lsl #16 @ src[0 + s*3] | src[0 + s*1]
+ pkhtb r5, r6, r5, asr #16 @ src[2 + s*3] | src[2 + s*1]
+ pkhbt r6, r7, r8, lsl #16 @ src[0 + s*5] | src[0 + s*4]
+ pkhtb r7, r8, r7, asr #16 @ src[2 + s*5] | src[2 + s*4]
+
+ ldr r8, [lr, #4]
+ mov r3, #0x40
+ smlad r12, r12, r8, r3 @ filter[0][1]
+ smlad r1, r1, r8, r3 @ filter[1][1]
+ smlad r5, r5, r8, r3 @ filter[2][1]
+ smlad r9, r9, r8, r3 @ filter[3][1]
+ ldr r8, [lr, #8]
+ ldr r3, [sp, #48]
+ smlad r12, r6, r8, r12 @ filter[0][2]
+ smlad r1, r10, r8, r1 @ filter[1][2]
+ ldr_nreg r6, r2, r3, lsl #1 @ src[0,1,2,3 + stride * 0]
+ ldr r10, [r2], #4 @ src[0,1,2,3 + stride * 2]
+ smlad r5, r7, r8, r5 @ filter[2][2]
+ smlad r9, r11, r8, r9 @ filter[3][2]
+
+ uxtb16 r7, r6, ror #8 @ src[3 + s*0] | src[1 + s*0]
+ uxtb16 r11, r10, ror #8 @ src[3 + s*2] | src[1 + s*2]
+ uxtb16 r6, r6 @ src[2 + s*0] | src[0 + s*0]
+ uxtb16 r10, r10 @ src[2 + s*2] | src[0 + s*2]
+
+ pkhbt r8, r7, r11, lsl #16 @ src[1 + s*2] | src[1 + s*0]
+ pkhtb r7, r11, r7, asr #16 @ src[3 + s*2] | src[3 + s*0]
+ pkhbt r11, r6, r10, lsl #16 @ src[0 + s*2] | src[0 + s*0]
+ pkhtb r6, r10, r6, asr #16 @ src[2 + s*2] | src[2 + s*0]
+
+ ldr r10, [lr]
+ subs r4, r4, #4
+ smlad r12, r11, r10, r12 @ filter[0][0]
+ smlad r1, r8, r10, r1 @ filter[1][0]
+ smlad r5, r6, r10, r5 @ filter[2][0]
+ smlad r9, r7, r10, r9 @ filter[3][0]
+
+ sat4 r12, r1, r5, r9
+ str r12, [r0], #4
+
+ bne 1b
+
+ ldrd r4, r5, [sp, #40]
+ ldr r6, [sp]
+ subs r5, r5, #1
+ sub r2, r2, r4
+ str r5, [sp, #44]
+ add r0, r0, r6
+ add r2, r2, r3
+
+ bne 1b
+
+ pop {r1, r4-r11, pc}
+endfunc
+
+function vp8_put_epel_h4_armv6
+ push {r1, r4-r11, lr}
+ subs r2, r2, #1
+ movrel lr, fourtap_filters_1324 - 4
+ add lr, lr, r12, lsl #2
+ sub r3, r3, r4
+ ldm lr, {r5, r6}
+ ldr lr, [sp, #44]
+1:
+ ldr r9, [r2, #3]
+ ldr r8, [r2, #2]
+ ldr r7, [r2], #4
+
+ uxtb16 r9, r9, ror #8 @ src[6] | src[4]
+ uxtb16 r10, r8, ror #8 @ src[5] | src[3]
+ uxtb16 r8, r8 @ src[4] | src[2]
+ uxtb16 r11, r7, ror #8 @ src[3] | src[1]
+ uxtb16 r7, r7 @ src[2] | src[0]
+
+ mov r12, #0x40
+ smlad r9, r9, r6, r12 @ filter[3][1]
+ smlad r7, r7, r5, r12 @ filter[0][0]
+ smlad r9, r10, r5, r9 @ filter[3][0]
+ smlad r10, r10, r6, r12 @ filter[2][1]
+ smlad r12, r11, r5, r12 @ filter[1][0]
+ smlad r7, r11, r6, r7 @ filter[0][1]
+ smlad r10, r8, r5, r10 @ filter[2][0]
+ smlad r12, r8, r6, r12 @ filter[1][1]
+
+ subs r4, r4, #4
+
+ sat4 r7, r12, r10, r9
+ str r7, [r0], #4
+
+ bne 1b
+
+ subs lr, lr, #1
+ ldr r4, [sp, #40]
+ add r2, r2, r3
+ add r0, r0, r1
+
+ bne 1b
+
+ pop {r1, r4-r11, pc}
+endfunc
+
+function vp8_put_epel_v4_armv6
+ push {r1, r4-r11, lr}
+ movrel lr, fourtap_filters_1324 - 4
+ add lr, lr, r12, lsl #2
+ ldm lr, {r5, r6}
+ str r3, [sp, #48]
+1:
+ ldr lr, [r2, r3, lsl #1]
+ ldr r12, [r2, r3]
+ ldr_nreg r7, r2, r3
+ ldr r11, [r2], #4
+
+ uxtb16 r8, lr, ror #8 @ src[3 + s*3] | src[1 + s*3]
+ uxtb16 r9, r12, ror #8 @ src[3 + s*2] | src[1 + s*2]
+ uxtb16 r3, r7, ror #8 @ src[3 + s*0] | src[1 + s*0]
+ uxtb16 r1, r11, ror #8 @ src[3 + s*1] | src[1 + s*1]
+ uxtb16 lr, lr @ src[2 + s*3] | src[0 + s*3]
+ uxtb16 r12, r12 @ src[2 + s*2] | src[0 + s*2]
+ uxtb16 r7, r7 @ src[2 + s*0] | src[0 + s*0]
+ uxtb16 r11, r11 @ src[2 + s*1] | src[0 + s*1]
+ pkhbt r10, r1, r8, lsl #16 @ src[1 + s*3] | src[1 + s*1]
+ pkhtb r1, r8, r1, asr #16 @ src[3 + s*3] | src[3 + s*1]
+ pkhbt r8, r3, r9, lsl #16 @ src[1 + s*2] | src[1 + s*0]
+ pkhtb r3, r9, r3, asr #16 @ src[3 + s*2] | src[3 + s*0]
+ pkhbt r9, r11, lr, lsl #16 @ src[0 + s*3] | src[0 + s*1]
+ pkhtb r11, lr, r11, asr #16 @ src[2 + s*3] | src[2 + s*1]
+ pkhbt lr, r7, r12, lsl #16 @ src[0 + s*2] | src[0 + s*0]
+ pkhtb r7, r12, r7, asr #16 @ src[2 + s*2] | src[2 + s*0]
+
+ mov r12, #0x40
+ smlad r9, r9, r6, r12 @ filter[0][1]
+ smlad r10, r10, r6, r12 @ filter[1][1]
+ smlad r11, r11, r6, r12 @ filter[2][1]
+ smlad r1, r1, r6, r12 @ filter[3][1]
+ smlad r9, lr, r5, r9 @ filter[0][0]
+ smlad r10, r8, r5, r10 @ filter[1][0]
+ smlad r11, r7, r5, r11 @ filter[2][0]
+ smlad r1, r3, r5, r1 @ filter[3][0]
+
+ subs r4, r4, #4
+ ldr r3, [sp, #48]
+
+ sat4 r9, r10, r11, r1
+ str r9, [r0], #4
+
+ bne 1b
+
+ ldr r4, [sp, #40]
+ ldr r12, [sp, #44]
+ add r2, r2, r3
+ ldr r9, [sp, #0]
+ subs r12, r12, #1
+ sub r2, r2, r4
+ str r12, [sp, #44]
+ add r0, r0, r9
+
+ bne 1b
+
+ pop {r1, r4-r11, pc}
+endfunc
+
+function vp8_put_bilin_h_armv6
+ push {r1, r4-r11, lr}
+ rsb r5, r12, r12, lsl #16
+ ldr r12, [sp, #44]
+ sub r3, r3, r4
+ add r5, r5, #8
+1:
+ ldrb r6, [r2], #1
+ ldrb r7, [r2], #1
+ ldrb r8, [r2], #1
+ ldrb r9, [r2], #1
+ ldrb lr, [r2]
+
+ pkhbt r6, r6, r7, lsl #16 @ src[1] | src[0]
+ pkhbt r7, r7, r8, lsl #16 @ src[2] | src[1]
+ pkhbt r8, r8, r9, lsl #16 @ src[3] | src[2]
+ pkhbt r9, r9, lr, lsl #16 @ src[4] | src[3]
+
+ mov r10, #4
+ smlad r6, r6, r5, r10
+ smlad r7, r7, r5, r10
+ smlad r8, r8, r5, r10
+ smlad r9, r9, r5, r10
+
+ subs r4, r4, #4
+
+ asr r6, #3
+ asr r7, #3
+ pkhbt r6, r6, r8, lsl #13
+ pkhbt r7, r7, r9, lsl #13
+ orr r6, r6, r7, lsl #8
+ str r6, [r0], #4
+
+ bne 1b
+
+ ldr r4, [sp, #40]
+ subs r12, r12, #1
+ add r2, r2, r3
+ add r0, r0, r1
+
+ bne 1b
+
+ pop {r1, r4-r11, pc}
+endfunc
+
+function vp8_put_bilin_v_armv6
+ push {r1, r4-r11, lr}
+ rsb r5, r12, r12, lsl #16
+ ldr r12, [sp, #44]
+ add r5, r5, #8
+1:
+ ldrb r10, [r2, r3]
+ ldrb r6, [r2], #1
+ ldrb r11, [r2, r3]
+ ldrb r7, [r2], #1
+ ldrb lr, [r2, r3]
+ ldrb r8, [r2], #1
+ ldrb r9, [r2, r3]
+ pkhbt r6, r6, r10, lsl #16
+ ldrb r10, [r2], #1
+ pkhbt r7, r7, r11, lsl #16
+ pkhbt r8, r8, lr, lsl #16
+ pkhbt r9, r10, r9, lsl #16
+
+ mov r10, #4
+ smlad r6, r6, r5, r10
+ smlad r7, r7, r5, r10
+ smlad r8, r8, r5, r10
+ smlad r9, r9, r5, r10
+
+ subs r4, r4, #4
+
+ asr r6, #3
+ asr r7, #3
+ pkhbt r6, r6, r8, lsl #13
+ pkhbt r7, r7, r9, lsl #13
+ orr r6, r6, r7, lsl #8
+ str r6, [r0], #4
+
+ bne 1b
+
+ ldr r4, [sp, #40]
+ subs r12, r12, #1
+ add r2, r2, r3
+ add r0, r0, r1
+ sub r2, r2, r4
+
+ bne 1b
+ pop {r1, r4-r11, pc}
+endfunc
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c
new file mode 100644
index 0000000..d360ae3
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_init_arm.c
@@ -0,0 +1,34 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/arm/cpu.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+av_cold void ff_vp8dsp_init_arm(VP8DSPContext *dsp)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_armv6(cpu_flags))
+ ff_vp8dsp_init_armv6(dsp);
+ if (have_neon(cpu_flags))
+ ff_vp8dsp_init_neon(dsp);
+}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c
new file mode 100644
index 0000000..e15e191
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_init_armv6.c
@@ -0,0 +1,120 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+void ff_vp8_luma_dc_wht_armv6(int16_t block[4][4][16], int16_t dc[16]);
+void ff_vp8_luma_dc_wht_dc_armv6(int16_t block[4][4][16], int16_t dc[16]);
+
+void ff_vp8_idct_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_armv6(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_armv6(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+
+VP8_LF(armv6);
+
+VP8_EPEL(16, armv6);
+VP8_EPEL(8, armv6);
+VP8_EPEL(4, armv6);
+
+VP8_BILIN(16, armv6);
+VP8_BILIN(8, armv6);
+VP8_BILIN(4, armv6);
+
+av_cold void ff_vp8dsp_init_armv6(VP8DSPContext *dsp)
+{
+ dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_armv6;
+ dsp->vp8_luma_dc_wht_dc = ff_vp8_luma_dc_wht_dc_armv6;
+
+ dsp->vp8_idct_add = ff_vp8_idct_add_armv6;
+ dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_armv6;
+ dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_armv6;
+ dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_armv6;
+
+ dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_armv6;
+ dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_armv6;
+ dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_armv6;
+ dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_armv6;
+
+ dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_armv6;
+ dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_armv6;
+ dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_armv6;
+ dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_armv6;
+
+ dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_armv6;
+ dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_armv6;
+
+ dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_armv6;
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_armv6;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_armv6;
+
+ dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_armv6;
+ dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_armv6;
+
+ dsp->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_armv6;
+ dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_armv6;
+
+ dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_armv6;
+
+ dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_armv6;
+
+ dsp->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_armv6;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_armv6;
+}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
new file mode 100644
index 0000000..0468181
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_init_neon.c
@@ -0,0 +1,116 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavcodec/vp8dsp.h"
+#include "vp8dsp.h"
+
+void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
+
+void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+
+VP8_LF(neon);
+
+VP8_EPEL(16, neon);
+VP8_EPEL(8, neon);
+VP8_EPEL(4, neon);
+
+VP8_BILIN(16, neon);
+VP8_BILIN(8, neon);
+VP8_BILIN(4, neon);
+
+av_cold void ff_vp8dsp_init_neon(VP8DSPContext *dsp)
+{
+ dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon;
+
+ dsp->vp8_idct_add = ff_vp8_idct_add_neon;
+ dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon;
+ dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon;
+ dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
+
+ dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
+ dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
+ dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon;
+ dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon;
+
+ dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon;
+ dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon;
+ dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon;
+ dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon;
+
+ dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon;
+ dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon;
+
+ dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon;
+
+ dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon;
+
+ dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon;
+ dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon;
+
+ dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon;
+ dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon;
+}
diff --git a/ffmpeg/libavcodec/arm/vp8dsp_neon.S b/ffmpeg/libavcodec/arm/vp8dsp_neon.S
new file mode 100644
index 0000000..04e7c5c
--- /dev/null
+++ b/ffmpeg/libavcodec/arm/vp8dsp_neon.S
@@ -0,0 +1,1867 @@
+/*
+ * VP8 NEON optimisations
+ *
+ * Copyright (c) 2010 Rob Clark <rob@ti.com>
+ * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+function ff_vp8_luma_dc_wht_neon, export=1
+ vld1.16 {q0-q1}, [r1,:128]
+ vmov.i16 q15, #0
+
+ vadd.i16 d4, d0, d3
+ vadd.i16 d6, d1, d2
+ vst1.16 {q15}, [r1,:128]!
+ vsub.i16 d7, d1, d2
+ vsub.i16 d5, d0, d3
+ vst1.16 {q15}, [r1,:128]
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vmov.i16 q8, #3
+
+ vtrn.32 d0, d2
+ vtrn.32 d1, d3
+ vtrn.16 d0, d1
+ vtrn.16 d2, d3
+
+ vadd.i16 d0, d0, d16
+
+ vadd.i16 d4, d0, d3
+ vadd.i16 d6, d1, d2
+ vsub.i16 d7, d1, d2
+ vsub.i16 d5, d0, d3
+ vadd.i16 q0, q2, q3
+ vsub.i16 q1, q2, q3
+
+ vshr.s16 q0, q0, #3
+ vshr.s16 q1, q1, #3
+
+ mov r3, #32
+ vst1.16 {d0[0]}, [r0,:16], r3
+ vst1.16 {d1[0]}, [r0,:16], r3
+ vst1.16 {d2[0]}, [r0,:16], r3
+ vst1.16 {d3[0]}, [r0,:16], r3
+ vst1.16 {d0[1]}, [r0,:16], r3
+ vst1.16 {d1[1]}, [r0,:16], r3
+ vst1.16 {d2[1]}, [r0,:16], r3
+ vst1.16 {d3[1]}, [r0,:16], r3
+ vst1.16 {d0[2]}, [r0,:16], r3
+ vst1.16 {d1[2]}, [r0,:16], r3
+ vst1.16 {d2[2]}, [r0,:16], r3
+ vst1.16 {d3[2]}, [r0,:16], r3
+ vst1.16 {d0[3]}, [r0,:16], r3
+ vst1.16 {d1[3]}, [r0,:16], r3
+ vst1.16 {d2[3]}, [r0,:16], r3
+ vst1.16 {d3[3]}, [r0,:16], r3
+
+ bx lr
+endfunc
+
+function ff_vp8_idct_add_neon, export=1
+ vld1.16 {q0-q1}, [r1,:128]
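+        @ 20091 and 35468 are the VP8 idct constants (sqrt(2)*cos(pi/8) - 1
+        @ and sqrt(2)*sin(pi/8) in Q16); the latter is halved here because
+        @ vqdmulh doubles the product.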
+ movw r3, #20091
+ movt r3, #35468/2
+ vdup.32 d4, r3
+
+ vmull.s16 q12, d1, d4[0]
+ vmull.s16 q13, d3, d4[0]
+ vqdmulh.s16 d20, d1, d4[1]
+ vqdmulh.s16 d23, d3, d4[1]
+ vshrn.s32 d21, q12, #16
+ vshrn.s32 d22, q13, #16
+ vadd.s16 d21, d21, d1
+ vadd.s16 d22, d22, d3
+
+ vadd.s16 d16, d0, d2
+ vsub.s16 d17, d0, d2
+ vadd.s16 d18, d21, d23
+ vsub.s16 d19, d20, d22
+ vadd.s16 q0, q8, q9
+ vsub.s16 q1, q8, q9
+
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+
+ vmov.i16 q15, #0
+ vmull.s16 q12, d1, d4[0]
+ vst1.16 {q15}, [r1,:128]!
+ vmull.s16 q13, d2, d4[0]
+ vst1.16 {q15}, [r1,:128]
+ vqdmulh.s16 d21, d1, d4[1]
+ vqdmulh.s16 d23, d2, d4[1]
+ vshrn.s32 d20, q12, #16
+ vshrn.s32 d22, q13, #16
+ vadd.i16 d20, d20, d1
+ vadd.i16 d22, d22, d2
+
+ vadd.i16 d16, d0, d3
+ vsub.i16 d17, d0, d3
+ vadd.i16 d18, d20, d23
+ vld1.32 {d20[]}, [r0,:32], r2
+ vsub.i16 d19, d21, d22
+ vld1.32 {d22[]}, [r0,:32], r2
+ vadd.s16 q0, q8, q9
+ vld1.32 {d23[]}, [r0,:32], r2
+ vsub.s16 q1, q8, q9
+ vld1.32 {d21[]}, [r0,:32], r2
+ vrshr.s16 q0, q0, #3
+ vtrn.32 q10, q11
+ vrshr.s16 q1, q1, #3
+
+ sub r0, r0, r2, lsl #2
+
+ vtrn.32 d0, d3
+ vtrn.32 d1, d2
+ vtrn.16 d0, d1
+ vtrn.16 d3, d2
+
+ vaddw.u8 q0, q0, d20
+ vaddw.u8 q1, q1, d21
+ vqmovun.s16 d0, q0
+ vqmovun.s16 d1, q1
+
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+
+ bx lr
+endfunc
+
+function ff_vp8_idct_dc_add_neon, export=1
+ mov r3, #0
+ ldrsh r12, [r1]
+ strh r3, [r1]
+ vdup.16 q1, r12
+ vrshr.s16 q1, q1, #3
+ vld1.32 {d0[]}, [r0,:32], r2
+ vld1.32 {d1[]}, [r0,:32], r2
+ vld1.32 {d0[1]}, [r0,:32], r2
+ vld1.32 {d1[1]}, [r0,:32], r2
+ vaddw.u8 q2, q1, d0
+ vaddw.u8 q3, q1, d1
+ sub r0, r0, r2, lsl #2
+ vqmovun.s16 d0, q2
+ vqmovun.s16 d1, q3
+ vst1.32 {d0[0]}, [r0,:32], r2
+ vst1.32 {d1[0]}, [r0,:32], r2
+ vst1.32 {d0[1]}, [r0,:32], r2
+ vst1.32 {d1[1]}, [r0,:32], r2
+ bx lr
+endfunc
+
+function ff_vp8_idct_dc_add4uv_neon, export=1
+ vmov.i16 d0, #0
+ mov r3, #32
+ vld1.16 {d16[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d17[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d18[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d19[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ mov r3, r0
+ vrshr.s16 q8, q8, #3 @ dc >>= 3
+ vld1.8 {d0}, [r0,:64], r2
+ vrshr.s16 q9, q9, #3
+ vld1.8 {d1}, [r0,:64], r2
+ vaddw.u8 q10, q8, d0
+ vld1.8 {d2}, [r0,:64], r2
+ vaddw.u8 q0, q8, d1
+ vld1.8 {d3}, [r0,:64], r2
+ vaddw.u8 q11, q8, d2
+ vld1.8 {d4}, [r0,:64], r2
+ vaddw.u8 q1, q8, d3
+ vld1.8 {d5}, [r0,:64], r2
+ vaddw.u8 q12, q9, d4
+ vld1.8 {d6}, [r0,:64], r2
+ vaddw.u8 q2, q9, d5
+ vld1.8 {d7}, [r0,:64], r2
+ vaddw.u8 q13, q9, d6
+ vqmovun.s16 d20, q10
+ vaddw.u8 q3, q9, d7
+ vqmovun.s16 d21, q0
+ vqmovun.s16 d22, q11
+ vst1.8 {d20}, [r3,:64], r2
+ vqmovun.s16 d23, q1
+ vst1.8 {d21}, [r3,:64], r2
+ vqmovun.s16 d24, q12
+ vst1.8 {d22}, [r3,:64], r2
+ vqmovun.s16 d25, q2
+ vst1.8 {d23}, [r3,:64], r2
+ vqmovun.s16 d26, q13
+ vst1.8 {d24}, [r3,:64], r2
+ vqmovun.s16 d27, q3
+ vst1.8 {d25}, [r3,:64], r2
+ vst1.8 {d26}, [r3,:64], r2
+ vst1.8 {d27}, [r3,:64], r2
+
+ bx lr
+endfunc
+
+function ff_vp8_idct_dc_add4y_neon, export=1
+ vmov.i16 d0, #0
+ mov r3, #32
+ vld1.16 {d16[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d17[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d18[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vld1.16 {d19[]}, [r1,:16]
+ vst1.16 {d0[0]}, [r1,:16], r3
+ vrshr.s16 q8, q8, #3 @ dc >>= 3
+ vld1.8 {q0}, [r0,:128], r2
+ vrshr.s16 q9, q9, #3
+ vld1.8 {q1}, [r0,:128], r2
+ vaddw.u8 q10, q8, d0
+ vld1.8 {q2}, [r0,:128], r2
+ vaddw.u8 q0, q9, d1
+ vld1.8 {q3}, [r0,:128], r2
+ vaddw.u8 q11, q8, d2
+ vaddw.u8 q1, q9, d3
+ vaddw.u8 q12, q8, d4
+ vaddw.u8 q2, q9, d5
+ vaddw.u8 q13, q8, d6
+ vaddw.u8 q3, q9, d7
+ sub r0, r0, r2, lsl #2
+ vqmovun.s16 d20, q10
+ vqmovun.s16 d21, q0
+ vqmovun.s16 d22, q11
+ vqmovun.s16 d23, q1
+ vqmovun.s16 d24, q12
+ vst1.8 {q10}, [r0,:128], r2
+ vqmovun.s16 d25, q2
+ vst1.8 {q11}, [r0,:128], r2
+ vqmovun.s16 d26, q13
+ vst1.8 {q12}, [r0,:128], r2
+ vqmovun.s16 d27, q3
+ vst1.8 {q13}, [r0,:128], r2
+
+ bx lr
+endfunc
+
+@ Register layout:
+@ P3..Q3 -> q0..q7
+@ flim_E -> q14
+@ flim_I -> q15
+@ hev_thresh -> r12
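+@
+@ P3..P0 are the four pixels on one side of the filtered edge and Q0..Q3 the
+@ four pixels on the other side.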
+@
+.macro vp8_loop_filter, inner=0, simple=0
+ .if \simple
+ vabd.u8 q9, q3, q4 @ abs(P0-Q0)
+ vabd.u8 q15, q2, q5 @ abs(P1-Q1)
+ vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
+ vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
+ vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ vmov.i8 q13, #0x80
+ vcle.u8 q8, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
+ .else
+ @ calculate hev and normal_limit:
+ vabd.u8 q12, q2, q3 @ abs(P1-P0)
+ vabd.u8 q13, q5, q4 @ abs(Q1-Q0)
+ vabd.u8 q10, q0, q1 @ abs(P3-P2)
+ vabd.u8 q11, q1, q2 @ abs(P2-P1)
+ vcle.u8 q8, q12, q15 @ abs(P1-P0) <= flim_I
+ vcle.u8 q9, q13, q15 @ abs(Q1-Q0) <= flim_I
+ vcle.u8 q10, q10, q15 @ abs(P3-P2) <= flim_I
+ vcle.u8 q11, q11, q15 @ abs(P2-P1) <= flim_I
+ vand q8, q8, q9
+ vabd.u8 q9, q7, q6 @ abs(Q3-Q2)
+ vand q8, q8, q11
+ vabd.u8 q11, q6, q5 @ abs(Q2-Q1)
+ vand q8, q8, q10
+ vcle.u8 q10, q9, q15 @ abs(Q3-Q2) <= flim_I
+ vcle.u8 q11, q11, q15 @ abs(Q2-Q1) <= flim_I
+ vabd.u8 q9, q3, q4 @ abs(P0-Q0)
+ vabd.u8 q15, q2, q5 @ abs(P1-Q1)
+ vand q8, q8, q10
+ vqadd.u8 q9, q9, q9 @ abs(P0-Q0) * 2
+ vand q8, q8, q11
+ vshr.u8 q10, q15, #1 @ abs(P1-Q1) / 2
+ vdup.8 q15, r12 @ hev_thresh
+ vqadd.u8 q11, q9, q10 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
+ vcgt.u8 q12, q12, q15 @ abs(P1-P0) > hev_thresh
+ vcle.u8 q11, q11, q14 @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
+ vcgt.u8 q14, q13, q15 @ abs(Q1-Q0) > hev_thresh
+ vand q8, q8, q11
+ vmov.i8 q13, #0x80
+ vorr q9, q12, q14
+ .endif
+
+ @ at this point:
+ @ q8: normal_limit
+ @ q9: hev
+
+ @ convert to signed value:
+ veor q3, q3, q13 @ PS0 = P0 ^ 0x80
+ veor q4, q4, q13 @ QS0 = Q0 ^ 0x80
+
+ vmov.i16 q12, #3
+ vsubl.s8 q10, d8, d6 @ QS0 - PS0
+ vsubl.s8 q11, d9, d7 @ (widened to 16bit)
+ veor q2, q2, q13 @ PS1 = P1 ^ 0x80
+ veor q5, q5, q13 @ QS1 = Q1 ^ 0x80
+ vmul.i16 q10, q10, q12 @ w = 3 * (QS0 - PS0)
+ vmul.i16 q11, q11, q12
+
+ vqsub.s8 q12, q2, q5 @ clamp(PS1-QS1)
+ vmov.i8 q14, #4
+ vmov.i8 q15, #3
+ .if \inner
+ vand q12, q12, q9 @ if(hev) w += clamp(PS1-QS1)
+ .endif
+ vaddw.s8 q10, q10, d24 @ w += clamp(PS1-QS1)
+ vaddw.s8 q11, q11, d25
+ vqmovn.s16 d20, q10 @ narrow result back into q10
+ vqmovn.s16 d21, q11
+ .if !\inner && !\simple
+ veor q1, q1, q13 @ PS2 = P2 ^ 0x80
+ veor q6, q6, q13 @ QS2 = Q2 ^ 0x80
+ .endif
+ vand q10, q10, q8 @ w &= normal_limit
+
+ @ registers used at this point..
+ @ q0 -> P3 (don't corrupt)
+ @ q1-q6 -> PS2-QS2
+ @ q7 -> Q3 (don't corrupt)
+ @ q9 -> hev
+ @ q10 -> w
+ @ q13 -> #0x80
+ @ q14 -> #4
+ @ q15 -> #3
+ @ q8, q11, q12 -> unused
+
+ @ filter_common: is4tap==1
+ @ c1 = clamp(w + 4) >> 3;
+ @ c2 = clamp(w + 3) >> 3;
+ @ Q0 = s2u(QS0 - c1);
+ @ P0 = s2u(PS0 + c2);
+
+ .if \simple
+ vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
+ vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ .elseif \inner
+ @ the !is4tap case of filter_common, only used for inner blocks
+ @ c3 = ((c1&~hev) + 1) >> 1;
+ @ Q1 = s2u(QS1 - c3);
+ @ P1 = s2u(PS1 + c3);
+ vqadd.s8 q11, q10, q14 @ c1 = clamp((w&hev)+4)
+ vqadd.s8 q12, q10, q15 @ c2 = clamp((w&hev)+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+ vbic q11, q11, q9 @ c1 & ~hev
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ vrshr.s8 q11, q11, #1 @ c3 >>= 1
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-c3)
+ vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+c3)
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ .else
+ vand q12, q10, q9 @ w & hev
+ vqadd.s8 q11, q12, q14 @ c1 = clamp((w&hev)+4)
+ vqadd.s8 q12, q12, q15 @ c2 = clamp((w&hev)+3)
+ vshr.s8 q11, q11, #3 @ c1 >>= 3
+ vshr.s8 q12, q12, #3 @ c2 >>= 3
+ vbic q10, q10, q9 @ w &= ~hev
+ vqsub.s8 q4, q4, q11 @ QS0 = clamp(QS0-c1)
+ vqadd.s8 q3, q3, q12 @ PS0 = clamp(PS0+c2)
+
+ @ filter_mbedge:
+ @ a = clamp((27*w + 63) >> 7);
+ @ Q0 = s2u(QS0 - a);
+ @ P0 = s2u(PS0 + a);
+ @ a = clamp((18*w + 63) >> 7);
+ @ Q1 = s2u(QS1 - a);
+ @ P1 = s2u(PS1 + a);
+ @ a = clamp((9*w + 63) >> 7);
+ @ Q2 = s2u(QS2 - a);
+ @ P2 = s2u(PS2 + a);
+ vmov.i16 q9, #63
+ vshll.s8 q14, d20, #3
+ vshll.s8 q15, d21, #3
+ vaddw.s8 q14, q14, d20
+ vaddw.s8 q15, q15, d21
+ vadd.s16 q8, q9, q14
+ vadd.s16 q9, q9, q15 @ 9*w + 63
+ vadd.s16 q11, q8, q14
+ vadd.s16 q12, q9, q15 @ 18*w + 63
+ vadd.s16 q14, q11, q14
+ vadd.s16 q15, q12, q15 @ 27*w + 63
+ vqshrn.s16 d16, q8, #7
+ vqshrn.s16 d17, q9, #7 @ clamp(( 9*w + 63)>>7)
+ vqshrn.s16 d22, q11, #7
+ vqshrn.s16 d23, q12, #7 @ clamp((18*w + 63)>>7)
+ vqshrn.s16 d28, q14, #7
+ vqshrn.s16 d29, q15, #7 @ clamp((27*w + 63)>>7)
+ vqadd.s8 q1, q1, q8 @ PS2 = clamp(PS2+a)
+ vqsub.s8 q6, q6, q8 @ QS2 = clamp(QS2-a)
+ vqadd.s8 q2, q2, q11 @ PS1 = clamp(PS1+a)
+ vqsub.s8 q5, q5, q11 @ QS1 = clamp(QS1-a)
+ vqadd.s8 q3, q3, q14 @ PS0 = clamp(PS0+a)
+ vqsub.s8 q4, q4, q14 @ QS0 = clamp(QS0-a)
+ veor q3, q3, q13 @ P0 = PS0 ^ 0x80
+ veor q4, q4, q13 @ Q0 = QS0 ^ 0x80
+ veor q2, q2, q13 @ P1 = PS1 ^ 0x80
+ veor q5, q5, q13 @ Q1 = QS1 ^ 0x80
+ veor q1, q1, q13 @ P2 = PS2 ^ 0x80
+ veor q6, q6, q13 @ Q2 = QS2 ^ 0x80
+ .endif
+.endm
+
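+@ The wrappers below instantiate vp8_loop_filter for the normal, inner and
+@ simple edge filters; the horizontal versions transpose 8x8 blocks so the
+@ same row-oriented filter can be reused, then transpose back before storing.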
+.macro vp8_v_loop_filter16 name, inner=0, simple=0
+function ff_vp8_v_loop_filter16\name\()_neon, export=1
+ vpush {q4-q7}
+ sub r0, r0, r1, lsl #1+!\simple
+
+ @ Load pixels:
+ .if !\simple
+ ldr r12, [sp, #64] @ hev_thresh
+ vld1.8 {q0}, [r0,:128], r1 @ P3
+ vld1.8 {q1}, [r0,:128], r1 @ P2
+ .endif
+ vld1.8 {q2}, [r0,:128], r1 @ P1
+ vld1.8 {q3}, [r0,:128], r1 @ P0
+ vld1.8 {q4}, [r0,:128], r1 @ Q0
+ vld1.8 {q5}, [r0,:128], r1 @ Q1
+ .if !\simple
+ vld1.8 {q6}, [r0,:128], r1 @ Q2
+ vld1.8 {q7}, [r0,:128] @ Q3
+ vdup.8 q15, r3 @ flim_I
+ .endif
+ vdup.8 q14, r2 @ flim_E
+
+ vp8_loop_filter inner=\inner, simple=\simple
+
+ @ back up to P2: dst -= stride * 6
+ sub r0, r0, r1, lsl #2
+ .if !\simple
+ sub r0, r0, r1, lsl #1
+
+ @ Store pixels:
+ vst1.8 {q1}, [r0,:128], r1 @ P2
+ .endif
+ vst1.8 {q2}, [r0,:128], r1 @ P1
+ vst1.8 {q3}, [r0,:128], r1 @ P0
+ vst1.8 {q4}, [r0,:128], r1 @ Q0
+ vst1.8 {q5}, [r0,:128], r1 @ Q1
+ .if !\simple
+ vst1.8 {q6}, [r0,:128] @ Q2
+ .endif
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_v_loop_filter16
+vp8_v_loop_filter16 _inner, inner=1
+vp8_v_loop_filter16 _simple, simple=1
+
+.macro vp8_v_loop_filter8uv name, inner=0
+function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
+ vpush {q4-q7}
+ sub r0, r0, r2, lsl #2
+ sub r1, r1, r2, lsl #2
+ ldr r12, [sp, #64] @ flim_I
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0,:64], r2 @ P3
+ vld1.8 {d1}, [r1,:64], r2 @ P3
+ vld1.8 {d2}, [r0,:64], r2 @ P2
+ vld1.8 {d3}, [r1,:64], r2 @ P2
+ vld1.8 {d4}, [r0,:64], r2 @ P1
+ vld1.8 {d5}, [r1,:64], r2 @ P1
+ vld1.8 {d6}, [r0,:64], r2 @ P0
+ vld1.8 {d7}, [r1,:64], r2 @ P0
+ vld1.8 {d8}, [r0,:64], r2 @ Q0
+ vld1.8 {d9}, [r1,:64], r2 @ Q0
+ vld1.8 {d10}, [r0,:64], r2 @ Q1
+ vld1.8 {d11}, [r1,:64], r2 @ Q1
+ vld1.8 {d12}, [r0,:64], r2 @ Q2
+ vld1.8 {d13}, [r1,:64], r2 @ Q2
+ vld1.8 {d14}, [r0,:64] @ Q3
+ vld1.8 {d15}, [r1,:64] @ Q3
+
+ vdup.8 q14, r3 @ flim_E
+ vdup.8 q15, r12 @ flim_I
+ ldr r12, [sp, #68] @ hev_thresh
+
+ vp8_loop_filter inner=\inner
+
+ @ back up to P2: u,v -= stride * 6
+ sub r0, r0, r2, lsl #2
+ sub r1, r1, r2, lsl #2
+ sub r0, r0, r2, lsl #1
+ sub r1, r1, r2, lsl #1
+
+ @ Store pixels:
+ vst1.8 {d2}, [r0,:64], r2 @ P2
+ vst1.8 {d3}, [r1,:64], r2 @ P2
+ vst1.8 {d4}, [r0,:64], r2 @ P1
+ vst1.8 {d5}, [r1,:64], r2 @ P1
+ vst1.8 {d6}, [r0,:64], r2 @ P0
+ vst1.8 {d7}, [r1,:64], r2 @ P0
+ vst1.8 {d8}, [r0,:64], r2 @ Q0
+ vst1.8 {d9}, [r1,:64], r2 @ Q0
+ vst1.8 {d10}, [r0,:64], r2 @ Q1
+ vst1.8 {d11}, [r1,:64], r2 @ Q1
+ vst1.8 {d12}, [r0,:64] @ Q2
+ vst1.8 {d13}, [r1,:64] @ Q2
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_v_loop_filter8uv
+vp8_v_loop_filter8uv _inner, inner=1
+
+.macro vp8_h_loop_filter16 name, inner=0, simple=0
+function ff_vp8_h_loop_filter16\name\()_neon, export=1
+ vpush {q4-q7}
+ sub r0, r0, #4
+ .if !\simple
+ ldr r12, [sp, #64] @ hev_thresh
+ .endif
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0], r1 @ load first 8-line src data
+ vld1.8 {d2}, [r0], r1
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d6}, [r0], r1
+ vld1.8 {d8}, [r0], r1
+ vld1.8 {d10}, [r0], r1
+ vld1.8 {d12}, [r0], r1
+ vld1.8 {d14}, [r0], r1
+ vld1.8 {d1}, [r0], r1 @ load second 8-line src data
+ vld1.8 {d3}, [r0], r1
+ vld1.8 {d5}, [r0], r1
+ vld1.8 {d7}, [r0], r1
+ vld1.8 {d9}, [r0], r1
+ vld1.8 {d11}, [r0], r1
+ vld1.8 {d13}, [r0], r1
+ vld1.8 {d15}, [r0], r1
+
+ transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
+
+ vdup.8 q14, r2 @ flim_E
+ .if !\simple
+ vdup.8 q15, r3 @ flim_I
+ .endif
+
+ vp8_loop_filter inner=\inner, simple=\simple
+
+        sub             r0,  r0,  r1,  lsl #4   @ back up 16 rows
+
+ transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
+
+ @ Store pixels:
+ vst1.8 {d0}, [r0], r1
+ vst1.8 {d2}, [r0], r1
+ vst1.8 {d4}, [r0], r1
+ vst1.8 {d6}, [r0], r1
+ vst1.8 {d8}, [r0], r1
+ vst1.8 {d10}, [r0], r1
+ vst1.8 {d12}, [r0], r1
+ vst1.8 {d14}, [r0], r1
+ vst1.8 {d1}, [r0], r1
+ vst1.8 {d3}, [r0], r1
+ vst1.8 {d5}, [r0], r1
+ vst1.8 {d7}, [r0], r1
+ vst1.8 {d9}, [r0], r1
+ vst1.8 {d11}, [r0], r1
+ vst1.8 {d13}, [r0], r1
+ vst1.8 {d15}, [r0]
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_h_loop_filter16
+vp8_h_loop_filter16 _inner, inner=1
+vp8_h_loop_filter16 _simple, simple=1
+
+.macro vp8_h_loop_filter8uv name, inner=0
+function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
+ vpush {q4-q7}
+ sub r0, r0, #4
+ sub r1, r1, #4
+ ldr r12, [sp, #64] @ flim_I
+
+ @ Load pixels:
+ vld1.8 {d0}, [r0], r2 @ load u
+ vld1.8 {d1}, [r1], r2 @ load v
+ vld1.8 {d2}, [r0], r2
+ vld1.8 {d3}, [r1], r2
+ vld1.8 {d4}, [r0], r2
+ vld1.8 {d5}, [r1], r2
+ vld1.8 {d6}, [r0], r2
+ vld1.8 {d7}, [r1], r2
+ vld1.8 {d8}, [r0], r2
+ vld1.8 {d9}, [r1], r2
+ vld1.8 {d10}, [r0], r2
+ vld1.8 {d11}, [r1], r2
+ vld1.8 {d12}, [r0], r2
+ vld1.8 {d13}, [r1], r2
+ vld1.8 {d14}, [r0], r2
+ vld1.8 {d15}, [r1], r2
+
+ transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
+
+ vdup.8 q14, r3 @ flim_E
+ vdup.8 q15, r12 @ flim_I
+ ldr r12, [sp, #68] @ hev_thresh
+
+ vp8_loop_filter inner=\inner
+
+        sub             r0,  r0,  r2,  lsl #3   @ back up u 8 rows
+        sub             r1,  r1,  r2,  lsl #3   @ back up v 8 rows
+
+ transpose_8x8 q0, q1, q2, q3, q4, q5, q6, q7
+
+ @ Store pixels:
+ vst1.8 {d0}, [r0], r2
+ vst1.8 {d1}, [r1], r2
+ vst1.8 {d2}, [r0], r2
+ vst1.8 {d3}, [r1], r2
+ vst1.8 {d4}, [r0], r2
+ vst1.8 {d5}, [r1], r2
+ vst1.8 {d6}, [r0], r2
+ vst1.8 {d7}, [r1], r2
+ vst1.8 {d8}, [r0], r2
+ vst1.8 {d9}, [r1], r2
+ vst1.8 {d10}, [r0], r2
+ vst1.8 {d11}, [r1], r2
+ vst1.8 {d12}, [r0], r2
+ vst1.8 {d13}, [r1], r2
+ vst1.8 {d14}, [r0]
+ vst1.8 {d15}, [r1]
+
+ vpop {q4-q7}
+ bx lr
+endfunc
+.endm
+
+vp8_h_loop_filter8uv
+vp8_h_loop_filter8uv _inner, inner=1
+
+function ff_put_vp8_pixels16_neon, export=1
+ ldr r12, [sp, #0] @ h
+1:
+ subs r12, r12, #4
+ vld1.8 {q0}, [r2], r3
+ vld1.8 {q1}, [r2], r3
+ vld1.8 {q2}, [r2], r3
+ vld1.8 {q3}, [r2], r3
+ vst1.8 {q0}, [r0,:128], r1
+ vst1.8 {q1}, [r0,:128], r1
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+ bx lr
+endfunc
+
+function ff_put_vp8_pixels8_neon, export=1
+ ldr r12, [sp, #0] @ h
+1:
+ subs r12, r12, #4
+ vld1.8 {d0}, [r2], r3
+ vld1.8 {d1}, [r2], r3
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vst1.8 {d0}, [r0,:64], r1
+ vst1.8 {d1}, [r0,:64], r1
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ bgt 1b
+ bx lr
+endfunc
+
+/* 4/6-tap 8th-pel MC */
+
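+/* Rough scalar sketch (not FFmpeg code, just for orientation) of what the
+ * 6-tap macros below compute per output pixel, relative to the unadjusted
+ * source position (the callers subtract 2 from src before the loop), with
+ * f[] holding the magnitudes from subpel_filters and taps 2 and 5 applied
+ * with vmls:
+ *
+ *     sum = f[0]*src[-2] - f[1]*src[-1] + f[2]*src[0]
+ *         + f[3]*src[1]  - f[4]*src[2]  + f[5]*src[3];
+ *     dst = clip((sum + 64) >> 7, 0, 255);     // vqrshrun.s16 #7
+ */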
+.macro vp8_epel8_h6 d, a, b
+ vext.8 d27, \a, \b, #1
+ vmovl.u8 q8, \a
+ vext.8 d28, \a, \b, #2
+ vmovl.u8 q9, d27
+ vext.8 d29, \a, \b, #3
+ vmovl.u8 q10, d28
+ vext.8 d30, \a, \b, #4
+ vmovl.u8 q11, d29
+ vext.8 d31, \a, \b, #5
+ vmovl.u8 q12, d30
+ vmul.u16 q10, q10, d0[2]
+ vmovl.u8 q13, d31
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vmla.u16 q10, q8, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d, q11, #7
+.endm
+
+.macro vp8_epel16_h6 d0, d1, s0, s1, s2, q0, q1
+ vext.8 q14, \q0, \q1, #3
+ vext.8 q15, \q0, \q1, #4
+ vmovl.u8 q11, d28
+ vmovl.u8 q14, d29
+ vext.8 q3, \q0, \q1, #2
+ vmovl.u8 q12, d30
+ vmovl.u8 q15, d31
+ vext.8 q8, \q0, \q1, #1
+ vmovl.u8 q10, d6
+ vmovl.u8 q3, d7
+ vext.8 q2, \q0, \q1, #5
+ vmovl.u8 q13, d4
+ vmovl.u8 q2, d5
+ vmovl.u8 q9, d16
+ vmovl.u8 q8, d17
+ vmul.u16 q11, q11, d0[3]
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q3, q3, d0[2]
+ vmul.u16 q14, q14, d0[3]
+ vmls.u16 q11, q12, d1[0]
+ vmovl.u8 q12, \s0
+ vmovl.u8 q1, \s1
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q3, q8, d0[1]
+ vmls.u16 q14, q15, d1[0]
+ vmla.u16 q10, q12, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ vmla.u16 q3, q1, d0[0]
+ vmla.u16 q14, q2, d1[1]
+ vqadd.s16 q11, q10, q11
+ vqadd.s16 q14, q3, q14
+ vqrshrun.s16 \d0, q11, #7
+ vqrshrun.s16 \d1, q14, #7
+.endm
+
+.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5
+ vmovl.u8 q10, \s2
+ vmovl.u8 q11, \s3
+ vmovl.u8 q9, \s1
+ vmovl.u8 q12, \s4
+ vmovl.u8 q8, \s0
+ vmovl.u8 q13, \s5
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vmla.u16 q10, q8, d0[0]
+ vmla.u16 q11, q13, d1[1]
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d0, q11, #7
+.endm
+
+.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
+ vmovl.u8 q10, \s0
+ vmovl.u8 q11, \s3
+ vmovl.u8 q14, \s6
+ vmovl.u8 q9, \s1
+ vmovl.u8 q12, \s4
+ vmovl.u8 q8, \s2
+ vmovl.u8 q13, \s5
+ vmul.u16 q10, q10, d0[0]
+ vmul.u16 q15, q11, d0[3]
+ vmul.u16 q11, q11, d0[2]
+ vmul.u16 q14, q14, d1[1]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q15, q12, d1[0]
+ vmls.u16 q11, q8, d0[1]
+ vmls.u16 q14, q13, d1[0]
+ vmla.u16 q10, q8, d0[2]
+ vmla.u16 q15, q13, d1[1]
+ vmla.u16 q11, q9, d0[0]
+ vmla.u16 q14, q12, d0[3]
+ vqadd.s16 q15, q10, q15
+ vqadd.s16 q14, q11, q14
+ vqrshrun.s16 \d0, q15, #7
+ vqrshrun.s16 \d1, q14, #7
+.endm
+
+.macro vp8_epel8_h4 d, a, b
+ vext.8 d28, \a, \b, #1
+ vmovl.u8 q9, \a
+ vext.8 d29, \a, \b, #2
+ vmovl.u8 q10, d28
+ vext.8 d30, \a, \b, #3
+ vmovl.u8 q11, d29
+ vmovl.u8 q12, d30
+ vmul.u16 q10, q10, d0[2]
+ vmul.u16 q11, q11, d0[3]
+ vmls.u16 q10, q9, d0[1]
+ vmls.u16 q11, q12, d1[0]
+ vqadd.s16 q11, q10, q11
+ vqrshrun.s16 \d, q11, #7
+.endm
+
+.macro vp8_epel8_v4_y2 d0, d1, s0, s1, s2, s3, s4
+ vmovl.u8 q9, \s0
+ vmovl.u8 q10, \s1
+ vmovl.u8 q11, \s2
+ vmovl.u8 q12, \s3
+ vmovl.u8 q13, \s4
+ vmul.u16 q8, q10, d0[2]
+ vmul.u16 q14, q11, d0[3]
+ vmul.u16 q11, q11, d0[2]
+ vmul.u16 q15, q12, d0[3]
+ vmls.u16 q8, q9, d0[1]
+ vmls.u16 q14, q12, d1[0]
+ vmls.u16 q11, q10, d0[1]
+ vmls.u16 q15, q13, d1[0]
+ vqadd.s16 q8, q8, q14
+ vqadd.s16 q11, q11, q15
+ vqrshrun.s16 \d0, q8, #7
+ vqrshrun.s16 \d1, q11, #7
+.endm
+
+function ff_put_vp8_epel16_v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ push {r4,lr}
+ vpush {d8-d15}
+
+ ldr r4, [sp, #80] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #72] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2-d3}, [r2], r3
+ vld1.8 {d4-d5}, [r2], r3
+ vld1.8 {d6-d7}, [r2], r3
+ vld1.8 {d8-d9}, [r2], r3
+ vld1.8 {d10-d11},[r2], r3
+ vld1.8 {d12-d13},[r2], r3
+ vld1.8 {d14-d15},[r2]
+ sub r2, r2, r3, lsl #2
+
+ vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14
+ vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ vst1.8 {d4-d5}, [r0,:128], r1
+ subs r12, r12, #2
+ bne 1b
+
+ vpop {d8-d15}
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel16_h6_neon, export=1
+ sub r2, r2, #2
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2-d4}, [r2], r3
+
+ vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel16_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, #2
+ push {r4,lr}
+ vpush {d8-d9}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #28] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #24] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #336+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5
+ bic lr, lr, #15
+1:
+ vld1.8 {d2,d3,d4}, [r2], r3
+
+ vp8_epel16_h6 d2, d3, d2, d3, d4, q1, q2
+
+ vst1.8 {d2-d3}, [lr,:128]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #336+16+32] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #336+16+24] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d5}, [lr,:128]!
+ vld1.8 {d6-d9}, [lr,:128]!
+ vld1.8 {d28-d31},[lr,:128]
+ sub lr, lr, #48
+
+ vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30
+ vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31
+
+ vst1.8 {d2-d3}, [r0,:128], r1
+ subs r12, r12, #1
+ bne 2b
+
+ add sp, sp, #336+16
+ vpop {d8-d9}
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d6}, [r2], r3
+ vld1.8 {d7}, [r2], r3
+ vld1.8 {d28}, [r2]
+
+ sub r2, r2, r3, lsl #2
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h6_neon, export=1
+ sub r2, r2, #2
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [r0,:64], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, #2
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5
+ bic lr, lr, #15
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d5}, [lr,:128]!
+ vld1.8 {d6-d7}, [lr,:128]!
+ vld1.8 {d30}, [lr,:64]
+ sub lr, lr, #32
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_v4_neon, export=1
+ sub r2, r2, r3
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vld1.8 {d3}, [r2], r3
+ vld1.8 {d4}, [r2], r3
+ vld1.8 {d5}, [r2], r3
+ vld1.8 {d6}, [r2]
+ sub r2, r2, r3, lsl #1
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h4_neon, export=1
+ sub r2, r2, #1
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [r0,:64], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h4v4_neon, export=1
+ sub r2, r2, r3
+ sub r2, r2, #1
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3
+ bic lr, lr, #15
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d5}, [lr,:128]!
+ vld1.8 {d6}, [lr,:64]
+ sub lr, lr, #16
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h6v4_neon, export=1
+ sub r2, r2, r3
+ sub r2, r2, #2
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3
+ bic lr, lr, #15
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h6 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d5}, [lr,:128]!
+ vld1.8 {d6}, [lr,:64]
+ sub lr, lr, #16
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel8_h4v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, #1
+ push {r4,lr}
+
+ @ first pass (horizontal):
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #168+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5
+ bic lr, lr, #15
+1:
+ vld1.8 {d2,d3}, [r2], r3
+
+ vp8_epel8_h4 d2, d2, d3
+
+ vst1.8 {d2}, [lr,:64]!
+ subs r12, r12, #1
+ bne 1b
+
+ @ second pass (vertical):
+ ldr r4, [sp, #168+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #168+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d5}, [lr,:128]!
+ vld1.8 {d6-d7}, [lr,:128]!
+ vld1.8 {d30}, [lr,:64]
+ sub lr, lr, #32
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d30
+
+ vst1.8 {d2}, [r0,:64], r1
+ vst1.8 {d3}, [r0,:64], r1
+ subs r12, r12, #2
+ bne 2b
+
+ add sp, sp, #168+16
+ pop {r4,pc}
+endfunc
+
+.ltorg
+
+function ff_put_vp8_epel4_v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2], r3
+ vld1.32 {d7[]}, [r2], r3
+ vld1.32 {d28[]}, [r2]
+ sub r2, r2, r3, lsl #2
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d6[1]}, [r2], r3
+ vld1.32 {d7[1]}, [r2], r3
+ vld1.32 {d28[1]}, [r2]
+ sub r2, r2, r3, lsl #2
+
+ vp8_epel8_v6_y2 d2, d3, d2, d3, d4, d5, d6, d7, d28
+
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h6_neon, export=1
+ sub r2, r2, #2
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {q1}, [r2], r3
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [r0,:32], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h6v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, #2
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #52+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5
+ bic lr, lr, #15
+1:
+ vld1.8 {q1}, [r2], r3
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #52+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #52+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d3}, [lr,:128]!
+ vld1.8 {d6}, [lr,:64]!
+ vld1.32 {d28[]}, [lr,:32]
+ sub lr, lr, #16
+ vld1.8 {d4-d5}, [lr]!
+ vld1.8 {d7}, [lr,:64]!
+ vld1.32 {d28[1]}, [lr,:32]
+ sub lr, lr, #16
+ vtrn.32 q1, q2
+ vtrn.32 d6, d7
+ vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #52+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h4v6_neon, export=1
+ sub r2, r2, r3, lsl #1
+ sub r2, r2, #1
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #52+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #5
+ bic lr, lr, #15
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d2
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #52+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #52+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d3}, [lr,:128]!
+ vld1.8 {d6}, [lr,:64]!
+ vld1.32 {d28[]}, [lr,:32]
+ sub lr, lr, #16
+ vld1.8 {d4-d5}, [lr]!
+ vld1.8 {d7}, [lr,:64]!
+ vld1.32 {d28[1]}, [lr,:32]
+ sub lr, lr, #16
+ vtrn.32 q1, q2
+ vtrn.32 d6, d7
+ vp8_epel8_v6_y2 d2, d3, d2, d4, d3, d5, d6, d7, d28
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #52+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h6v4_neon, export=1
+ sub r2, r2, r3
+ sub r2, r2, #2
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #44+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3
+ bic lr, lr, #15
+1:
+ vld1.8 {q1}, [r2], r3
+ vp8_epel8_h6 d2, d2, d3
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #44+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #44+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d3}, [lr,:128]!
+ vld1.32 {d6[]}, [lr,:32]
+ sub lr, lr, #8
+ vld1.8 {d4-d5}, [lr]!
+ vld1.32 {d6[1]}, [lr,:32]
+ sub lr, lr, #8
+ vtrn.32 q1, q2
+ vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #44+16
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h4_neon, export=1
+ sub r2, r2, #1
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d2
+ vst1.32 {d2[0]}, [r0,:32], r1
+ subs r12, r12, #1
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_v4_neon, export=1
+ sub r2, r2, r3
+ push {r4,lr}
+
+ ldr r4, [sp, #16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ vld1.16 {q0}, [r4,:128]
+1:
+ vld1.32 {d2[]}, [r2], r3
+ vld1.32 {d3[]}, [r2], r3
+ vld1.32 {d4[]}, [r2], r3
+ vld1.32 {d5[]}, [r2], r3
+ vld1.32 {d6[]}, [r2]
+ sub r2, r2, r3, lsl #1
+ vld1.32 {d2[1]}, [r2], r3
+ vld1.32 {d3[1]}, [r2], r3
+ vld1.32 {d4[1]}, [r2], r3
+ vld1.32 {d5[1]}, [r2], r3
+ vld1.32 {d6[1]}, [r2]
+ sub r2, r2, r3, lsl #1
+
+ vp8_epel8_v4_y2 d2, d3, d2, d3, d4, d5, d6
+
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 1b
+
+ pop {r4,pc}
+endfunc
+
+function ff_put_vp8_epel4_h4v4_neon, export=1
+ sub r2, r2, r3
+ sub r2, r2, #1
+ push {r4,lr}
+
+ ldr r4, [sp, #12] @ mx
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #8] @ h
+ add r4, lr, r4, lsl #4
+ sub sp, sp, #44+16
+ vld1.16 {q0}, [r4,:128]
+ add lr, sp, #15
+ add r12, r12, #3
+ bic lr, lr, #15
+1:
+ vld1.8 {d2}, [r2], r3
+ vp8_epel8_h4 d2, d2, d3
+ vst1.32 {d2[0]}, [lr,:32]!
+ subs r12, r12, #1
+ bne 1b
+
+ ldr r4, [sp, #44+16+16] @ my
+ movrel lr, subpel_filters-16
+ ldr r12, [sp, #44+16+8] @ h
+ add r4, lr, r4, lsl #4
+ add lr, sp, #15
+ vld1.16 {q0}, [r4,:128]
+ bic lr, lr, #15
+2:
+ vld1.8 {d2-d3}, [lr,:128]!
+ vld1.32 {d6[]}, [lr,:32]
+ sub lr, lr, #8
+ vld1.8 {d4-d5}, [lr]!
+ vld1.32 {d6[1]}, [lr,:32]
+ sub lr, lr, #8
+ vtrn.32 q1, q2
+ vp8_epel8_v4_y2 d2, d3, d2, d4, d3, d5, d6
+ vst1.32 {d2[0]}, [r0,:32], r1
+ vst1.32 {d3[0]}, [r0,:32], r1
+ vst1.32 {d2[1]}, [r0,:32], r1
+ vst1.32 {d3[1]}, [r0,:32], r1
+ subs r12, r12, #4
+ bne 2b
+
+ add sp, sp, #44+16
+ pop {r4,pc}
+endfunc
+
+@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80, so 16-bit
+@ arithmetic can be used to apply the filters
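+@ (with the tap signs applied by the vp8_epel* macros, each row below sums to
+@ 128, and 128 * 255 = 0x7f80 = 32640 still fits in a signed halfword)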
+const subpel_filters, align=4
+ .short 0, 6, 123, 12, 1, 0, 0, 0
+ .short 2, 11, 108, 36, 8, 1, 0, 0
+ .short 0, 9, 93, 50, 6, 0, 0, 0
+ .short 3, 16, 77, 77, 16, 3, 0, 0
+ .short 0, 6, 50, 93, 9, 0, 0, 0
+ .short 1, 8, 36, 108, 11, 2, 0, 0
+ .short 0, 1, 12, 123, 6, 0, 0, 0
+endconst
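+
+@ each row above is 8 halfwords (16 bytes); the epel functions address it as
+@ subpel_filters - 16 + (mx << 4), i.e. the row for filter index mx counting
+@ from 1 (see the movrel/add sequence in the function prologues above)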
+
+/* Bilinear MC */
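+
+@ The bilinear functions compute dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
+@ horizontally, and likewise with my across rows: vmull/vmlal by the vdup'ed
+@ 8-bit weights, then a rounding narrowing shift by 3 (vrshrn).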
+
+function ff_put_vp8_bilin16_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2
+ vld1.8 {d2-d4}, [r2], r1
+ vext.8 q2, q1, q2, #1
+ vmull.u8 q8, d2, d1
+ vmlal.u8 q8, d4, d0
+ vld1.8 {d18-d20},[r2], r1
+ vmull.u8 q3, d3, d1
+ vmlal.u8 q3, d5, d0
+ vext.8 q10, q9, q10, #1
+ vmull.u8 q11, d18, d1
+ vmlal.u8 q11, d20, d0
+ vmull.u8 q12, d19, d1
+ vmlal.u8 q12, d21, d0
+ vrshrn.u16 d4, q8, #3
+ vrshrn.u16 d5, q3, #3
+ vrshrn.u16 d6, q11, #3
+ vrshrn.u16 d7, q12, #3
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin16_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+ vld1.8 {q1}, [r2], r1
+1:
+ subs r12, r12, #2
+ vld1.8 {q2}, [r2], r1
+ vmull.u8 q3, d2, d1
+ vmlal.u8 q3, d4, d0
+ vmull.u8 q8, d3, d1
+ vmlal.u8 q8, d5, d0
+ vld1.8 {q1}, [r2], r1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d2, d0
+ vmull.u8 q10, d5, d1
+ vmlal.u8 q10, d3, d0
+ vrshrn.u16 d4, q3, #3
+ vrshrn.u16 d5, q8, #3
+ vrshrn.u16 d6, q9, #3
+ vrshrn.u16 d7, q10, #3
+ vst1.8 {q2}, [r0,:128], r1
+ vst1.8 {q3}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin16_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3
+ vdup.8 d3, r12
+ ldr r12, [sp] @ h
+
+ vld1.8 {d4-d6}, [r2], r1
+ vext.8 q3, q2, q3, #1
+ vmull.u8 q8, d4, d1
+ vmlal.u8 q8, d6, d0
+ vmull.u8 q9, d5, d1
+ vmlal.u8 q9, d7, d0
+ vrshrn.u16 d4, q8, #3
+ vrshrn.u16 d5, q9, #3
+1:
+ subs r12, r12, #2
+ vld1.8 {d18-d20},[r2], r1
+ vext.8 q10, q9, q10, #1
+ vmull.u8 q11, d18, d1
+ vmlal.u8 q11, d20, d0
+ vld1.8 {d26-d28},[r2], r1
+ vmull.u8 q12, d19, d1
+ vmlal.u8 q12, d21, d0
+ vext.8 q14, q13, q14, #1
+ vmull.u8 q8, d26, d1
+ vmlal.u8 q8, d28, d0
+ vmull.u8 q9, d27, d1
+ vmlal.u8 q9, d29, d0
+ vrshrn.u16 d6, q11, #3
+ vrshrn.u16 d7, q12, #3
+ vmull.u8 q12, d4, d3
+ vmlal.u8 q12, d6, d2
+ vmull.u8 q15, d5, d3
+ vmlal.u8 q15, d7, d2
+ vrshrn.u16 d4, q8, #3
+ vrshrn.u16 d5, q9, #3
+ vmull.u8 q10, d6, d3
+ vmlal.u8 q10, d4, d2
+ vmull.u8 q11, d7, d3
+ vmlal.u8 q11, d5, d2
+ vrshrn.u16 d24, q12, #3
+ vrshrn.u16 d25, q15, #3
+ vst1.8 {q12}, [r0,:128], r1
+ vrshrn.u16 d20, q10, #3
+ vrshrn.u16 d21, q11, #3
+ vst1.8 {q10}, [r0,:128], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin8_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2
+ vld1.8 {q1}, [r2], r1
+ vext.8 d3, d2, d3, #1
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vld1.8 {q3}, [r2], r1
+ vext.8 d7, d6, d7, #1
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vrshrn.u16 d4, q2, #3
+ vrshrn.u16 d16, q8, #3
+ vst1.8 {d4}, [r0,:64], r1
+ vst1.8 {d16}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin8_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+ vld1.8 {d2}, [r2], r1
+1:
+ subs r12, r12, #2
+ vld1.8 {d3}, [r2], r1
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vld1.8 {d2}, [r2], r1
+ vmull.u8 q3, d3, d1
+ vmlal.u8 q3, d2, d0
+ vrshrn.u16 d4, q2, #3
+ vrshrn.u16 d6, q3, #3
+ vst1.8 {d4}, [r0,:64], r1
+ vst1.8 {d6}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin8_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3
+ vdup.8 d3, r12
+ ldr r12, [sp] @ h
+
+ vld1.8 {q2}, [r2], r1
+ vext.8 d5, d4, d5, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d22, q9, #3
+1:
+ subs r12, r12, #2
+ vld1.8 {q3}, [r2], r1
+ vext.8 d7, d6, d7, #1
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vld1.8 {q2}, [r2], r1
+ vext.8 d5, d4, d5, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d16, q8, #3
+ vmull.u8 q10, d22, d3
+ vmlal.u8 q10, d16, d2
+ vrshrn.u16 d22, q9, #3
+ vmull.u8 q12, d16, d3
+ vmlal.u8 q12, d22, d2
+ vrshrn.u16 d20, q10, #3
+ vst1.8 {d20}, [r0,:64], r1
+ vrshrn.u16 d23, q12, #3
+ vst1.8 {d23}, [r0,:64], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin4_h_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+1:
+ subs r12, r12, #2
+ vld1.8 {d2}, [r2], r1
+ vext.8 d3, d2, d3, #1
+ vld1.8 {d6}, [r2], r1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 q1, q3
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vrshrn.u16 d4, q2, #3
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin4_v_neon, export=1
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r12, [sp] @ h
+ vld1.32 {d2[]}, [r2], r1
+1:
+ vld1.32 {d3[]}, [r2]
+ vld1.32 {d2[1]}, [r2], r1
+ vld1.32 {d3[1]}, [r2], r1
+ vmull.u8 q2, d2, d1
+ vmlal.u8 q2, d3, d0
+ vtrn.32 d3, d2
+ vrshrn.u16 d4, q2, #3
+ vst1.32 {d4[0]}, [r0,:32], r1
+ vst1.32 {d4[1]}, [r0,:32], r1
+ subs r12, r12, #2
+ bgt 1b
+
+ bx lr
+endfunc
+
+function ff_put_vp8_bilin4_hv_neon, export=1
+ ldr r3, [sp, #4] @ mx
+ rsb r12, r3, #8
+ vdup.8 d0, r3
+ vdup.8 d1, r12
+ ldr r3, [sp, #8] @ my
+ rsb r12, r3, #8
+ vdup.8 d2, r3
+ vdup.8 d3, r12
+ ldr r12, [sp] @ h
+
+ vld1.8 {d4}, [r2], r1
+ vext.8 d5, d4, d4, #1
+ vmull.u8 q9, d4, d1
+ vmlal.u8 q9, d5, d0
+ vrshrn.u16 d22, q9, #3
+1:
+ subs r12, r12, #2
+ vld1.8 {d6}, [r2], r1
+ vext.8 d7, d6, d6, #1
+ vld1.8 {d4}, [r2], r1
+ vext.8 d5, d4, d4, #1
+ vtrn.32 q3, q2
+ vmull.u8 q8, d6, d1
+ vmlal.u8 q8, d7, d0
+ vrshrn.u16 d16, q8, #3
+ vmull.u8 q10, d16, d2
+ vtrn.32 d22, d16
+ vmlal.u8 q10, d22, d3
+ vrev64.32 d22, d16
+ vrshrn.u16 d20, q10, #3
+ vst1.32 {d20[0]}, [r0,:32], r1
+ vst1.32 {d20[1]}, [r0,:32], r1
+ bgt 1b
+
+ bx lr
+endfunc