diff options
Diffstat (limited to 'ffmpeg/libavcodec/arm/fmtconvert_vfp.S')
| -rw-r--r-- | ffmpeg/libavcodec/arm/fmtconvert_vfp.S | 245 |
1 files changed, 194 insertions, 51 deletions
diff --git a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
index 7b012bc..b14af45 100644
--- a/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
+++ b/ffmpeg/libavcodec/arm/fmtconvert_vfp.S
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
  *
  * This file is part of FFmpeg.
  *
@@ -22,57 +22,200 @@
 #include "libavutil/arm/asm.S"
 
 /**
- * ARM VFP optimized float to int16 conversion.
- * Assume that len is a positive number and is multiple of 8, destination
- * buffer is at least 4 bytes aligned (8 bytes alignment is better for
- * performance), little-endian byte sex.
+ * ARM VFP optimised int32 to float conversion; each group of 8 values is scaled by its own multiplier read from mul[].
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
+ * (16 bytes alignment is best for BCM2835), little-endian. NOTE(review): the short path (label 50) does not handle len == 0.
  */
-@ void ff_float_to_int16_vfp(int16_t *dst, const float *src, int len)
-function ff_float_to_int16_vfp, export=1
-        push    {r4-r8,lr}
-        vpush   {d8-d11}
-        vldmia  r1!, {s16-s23}
-        vcvt.s32.f32    s0,  s16
-        vcvt.s32.f32    s1,  s17
-        vcvt.s32.f32    s2,  s18
-        vcvt.s32.f32    s3,  s19
-        vcvt.s32.f32    s4,  s20
-        vcvt.s32.f32    s5,  s21
-        vcvt.s32.f32    s6,  s22
-        vcvt.s32.f32    s7,  s23
+@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
+function ff_int32_to_float_fmul_array8_vfp, export=1
+        push    {lr}                        @ lr is reused as a general register below
+        ldr     a1, [sp, #4]                @ a1 = len (presumably the fifth argument, on the stack; +4 skips the saved lr); incoming a1 (c) is never read
+        subs    lr, a1, #3*8                @ lr = len - 24
+        bcc     50f                         @ too short to pipeline
+        @ Now need to find (len / 8) % 3. The approximation
+        @ x / 24 = (x * 0xAB) >> 12
+        @ is good for x < 4096, which is true for both AC3 and DCA.
+        mov     a1, #0xAB
+        ldr     ip, =0x03070000             @ RunFast mode, short vectors of length 8, stride 1
+        mul     a1, lr, a1                  @ a1 = (len - 24) * 0xAB
+        vpush   {s16-s31}                   @ s16-s31 are overwritten below; restored by the vpop before return
+        mov     a1, a1, lsr #12             @ a1 ~= (len - 24) / 24
+        add     a1, a1, a1, lsl #1          @ a1 *= 3
+        rsb     a1, a1, lr, lsr #3          @ a1 = (len - 24) / 8 - a1, i.e. the group count modulo 3
+        cmp     a1, #1                      @ result is consumed by the beq/blo after the FPSCR swap
+        fmrx    a1, FPSCR                   @ a1 = FPSCR on entry, written back before return
+        fmxr    FPSCR, ip                   @ install the mode loaded into ip above
+        beq     11f                         @ remainder 1
+        blo     10f                         @ remainder 0
+        @ Array is (2 + multiple of 3) x 8 floats long
+        @ drop through...
+        vldmia  a3!, {s16-s23}              @ first group of 8 ints
+        vldmia  a4!, {s2,s3}                @ multipliers for the first two groups
+        vldmia  a3!, {s24-s31}              @ second group of 8 ints
+        vcvt.f32.s32    s16, s16            @ int -> float in place, one register per instruction
+        vcvt.f32.s32    s17, s17
+        vcvt.f32.s32    s18, s18
+        vcvt.f32.s32    s19, s19
+        vcvt.f32.s32    s20, s20
+        vcvt.f32.s32    s21, s21
+        vcvt.f32.s32    s22, s22
+        vcvt.f32.s32    s23, s23
+        vmul.f32        s16, s16, s2        @ with the vector length of 8 set above, this scales s16-s23 by s2
+        @ drop through...
+3:                                          @ loop top: load into s8-s15, finish s24-s31, store s16-s23
+        vldmia  a3!, {s8-s15}               @ next group of 8 ints
+        vldmia  a4!, {s1}                   @ and its multiplier
+        vcvt.f32.s32    s24, s24
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3        @ s24-s31 scaled by s3
+        vstmia  a2!, {s16-s19}              @ store the group scaled in the previous stage
+        vstmia  a2!, {s20-s23}
+2:                                          @ entry for the (1 + multiple of 3) case: load into s16-s23, finish s8-s15, store s24-s31
+        vldmia  a3!, {s16-s23}
+        vldmia  a4!, {s2}
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s1          @ s8-s15 scaled by s1
+        vstmia  a2!, {s24-s27}
+        vstmia  a2!, {s28-s31}
 1:
-        subs    r2, r2, #8
-        vmov    r3, r4, s0, s1
-        vmov    r5, r6, s2, s3
-        vmov    r7, r8, s4, s5
-        vmov    ip, lr, s6, s7
-        it      gt
-        vldmiagt        r1!, {s16-s23}
-        ssat    r4, #16, r4
-        ssat    r3, #16, r3
-        ssat    r6, #16, r6
-        ssat    r5, #16, r5
-        pkhbt   r3, r3, r4, lsl #16
-        pkhbt   r4, r5, r6, lsl #16
-        itttt   gt
-        vcvtgt.s32.f32  s0, s16
-        vcvtgt.s32.f32  s1, s17
-        vcvtgt.s32.f32  s2, s18
-        vcvtgt.s32.f32  s3, s19
-        itttt   gt
-        vcvtgt.s32.f32  s4, s20
-        vcvtgt.s32.f32  s5, s21
-        vcvtgt.s32.f32  s6, s22
-        vcvtgt.s32.f32  s7, s23
-        ssat    r8, #16, r8
-        ssat    r7, #16, r7
-        ssat    lr, #16, lr
-        ssat    ip, #16, ip
-        pkhbt   r5, r7, r8, lsl #16
-        pkhbt   r6, ip, lr, lsl #16
-        stmia   r0!, {r3-r6}
-        bgt     1b
+        vldmia  a3!, {s24-s31}              @ entry for the (multiple of 3) case: load into s24-s31, finish s16-s23, store s8-s15
+        vldmia  a4!, {s3}
+        vcvt.f32.s32    s16, s16
+        vcvt.f32.s32    s17, s17
+        vcvt.f32.s32    s18, s18
+        vcvt.f32.s32    s19, s19
+        vcvt.f32.s32    s20, s20
+        vcvt.f32.s32    s21, s21
+        vcvt.f32.s32    s22, s22
+        vcvt.f32.s32    s23, s23
+        vmul.f32        s16, s16, s2        @ s16-s23 scaled by s2
+        vstmia  a2!, {s8-s11}
+        vstmia  a2!, {s12-s15}
 
-        vpop    {d8-d11}
-        pop     {r4-r8,pc}
+        subs    lr, lr, #8*3                @ one pass from label 3 handles three groups (24 values)
+        bpl     3b                          @ keep looping while the remaining count is non-negative
+
+        vcvt.f32.s32    s24, s24            @ loop finished: convert the last group that was loaded
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3
+        vstmia  a2!, {s16-s19}              @ flush the two groups still held in registers
+        vstmia  a2!, {s20-s23}
+        vstmia  a2!, {s24-s27}
+        vstmia  a2!, {s28-s31}
+
+        fmxr    FPSCR, a1                   @ restore the FPSCR value saved in a1
+        vpop    {s16-s31}
+        pop     {pc}
+
+10:     @ Array is (multiple of 3) x 8 floats long
+        vldmia  a3!, {s8-s15}               @ first group of 8 ints
+        vldmia  a4!, {s1,s2}                @ multipliers for the first two groups
+        vldmia  a3!, {s16-s23}              @ second group of 8 ints
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s1
+        b       1b                          @ join the loop with s8-s15 scaled and s16-s23 loaded
+
+11:     @ Array is (1 + multiple of 3) x 8 floats long
+        vldmia  a3!, {s24-s31}              @ first group of 8 ints
+        vldmia  a4!, {s3}                   @ its multiplier
+        vldmia  a3!, {s8-s15}               @ second group of 8 ints
+        vldmia  a4!, {s1}                   @ its multiplier
+        vcvt.f32.s32    s24, s24
+        vcvt.f32.s32    s25, s25
+        vcvt.f32.s32    s26, s26
+        vcvt.f32.s32    s27, s27
+        vcvt.f32.s32    s28, s28
+        vcvt.f32.s32    s29, s29
+        vcvt.f32.s32    s30, s30
+        vcvt.f32.s32    s31, s31
+        vmul.f32        s24, s24, s3
+        b       2b                          @ join the loop with s24-s31 scaled and s8-s15 loaded
+
+50:                                         @ len < 24: simple loop, one group of 8 per iteration
+        ldr     lr, =0x03070000             @ RunFast mode, short vectors of length 8, stride 1
+        fmrx    ip, FPSCR                   @ ip = FPSCR on entry
+        fmxr    FPSCR, lr
+51:
+        vldmia  a3!, {s8-s15}
+        vldmia  a4!, {s0}                   @ multiplier for this group
+        vcvt.f32.s32    s8, s8
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s0
+        subs    a1, a1, #8                  @ a1 still holds len on this path
+        vstmia  a2!, {s8-s11}
+        vstmia  a2!, {s12-s15}
+        bne     51b                         @ NOTE(review): count is tested only after a group is processed, so len == 0 is not handled
+
+        fmxr    FPSCR, ip                   @ restore the FPSCR value saved in ip
+        pop     {pc}                        @ no vpop here: s16-s31 were not pushed on this path
+endfunc
+
+/**
+ * ARM VFP optimised int32 to float conversion; every value is scaled by the single multiplier mul.
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
+ * (16 bytes alignment is best for BCM2835), little-endian. NOTE(review): VFP/NOVFP prefixes are defined outside this diff; presumably they select hard-float vs soft-float argument registers.
+ * TODO: could be further optimised by unrolling and interleaving, as above
+ */
+@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
+function ff_int32_to_float_fmul_scalar_vfp, export=1
+VFP     tmp     .req    a4
+VFP     len     .req    a3
+NOVFP   tmp     .req    a3
+NOVFP   len     .req    a4
+NOVFP   vmov    s0, a3
+        ldr     tmp, =0x03070000            @ RunFast mode, short vectors of length 8, stride 1
+        fmrx    ip, FPSCR                   @ ip = FPSCR on entry
+        fmxr    FPSCR, tmp
+1:                                          @ one group of 8 values per iteration
+        vldmia  a2!, {s8-s15}               @ a2 = src
+        vcvt.f32.s32    s8, s8              @ int -> float in place, one register per instruction
+        vcvt.f32.s32    s9, s9
+        vcvt.f32.s32    s10, s10
+        vcvt.f32.s32    s11, s11
+        vcvt.f32.s32    s12, s12
+        vcvt.f32.s32    s13, s13
+        vcvt.f32.s32    s14, s14
+        vcvt.f32.s32    s15, s15
+        vmul.f32        s8, s8, s0          @ with the vector length of 8 set above, this scales s8-s15 by s0
+        subs    len, len, #8
+        vstmia  a1!, {s8-s11}               @ a1 = dst
+        vstmia  a1!, {s12-s15}
+        bne     1b                          @ NOTE(review): count is tested only after a group is processed, so len == 0 is not handled
+
+        fmxr    FPSCR, ip                   @ restore the FPSCR value saved in ip
+        bx      lr
 endfunc
+        .unreq  tmp
+        .unreq  len
|
