mirror of
https://github.com/LibreELEC/LibreELEC.tv.git
synced 2025-07-28 13:16:41 +00:00
ffmpeg: update to ffmpeg-1.2.3, sync upstream patches
Signed-off-by: Stephan Raue <stephan@openelec.tv>
This commit is contained in:
parent
badfce90d2
commit
ed2f08ddc6
@ -21,7 +21,7 @@
|
||||
PKG_NAME="ffmpeg"
|
||||
PKG_VERSION="0.10.7"
|
||||
if [ "$XBMC" = "master" ]; then
|
||||
PKG_VERSION="1.2.1"
|
||||
PKG_VERSION="1.2.3"
|
||||
fi
|
||||
PKG_REV="1"
|
||||
PKG_ARCH="any"
|
||||
|
@ -1,22 +0,0 @@
|
||||
Subject: [libav-devel] [PATCH 1/2] vaapi: return early from ff_vaapi_render_picture() without picture
|
||||
From: Janne Grunau janne-libav at jannau.net
|
||||
|
||||
Fixes an assertion when called on uninitialized frame. Spotted after
|
||||
seeking in vlc. (backported from libav mailing list)
|
||||
|
||||
---
|
||||
|
||||
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
|
||||
index a220a9d..94959bf 100644
|
||||
--- a/libavcodec/vaapi.c
|
||||
+++ b/libavcodec/vaapi.c
|
||||
@@ -46,6 +46,9 @@ int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
|
||||
VABufferID va_buffers[3];
|
||||
unsigned int n_va_buffers = 0;
|
||||
|
||||
+ if (!vactx->pic_param_buf_id)
|
||||
+ return 0;
|
||||
+
|
||||
vaUnmapBuffer(vactx->display, vactx->pic_param_buf_id);
|
||||
va_buffers[n_va_buffers++] = vactx->pic_param_buf_id;
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,311 @@
|
||||
From 40daea3c1bafa9cea37b65f856c3c0432767d760 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:09 +0100
|
||||
Subject: [PATCH 39/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of synth_filter_float
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 9295.0 114.9 4853.2 83.5 +91.5%
|
||||
Overall 23699.8 397.6 19285.5 292.0 +22.9%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/Makefile | 1 +
|
||||
libavcodec/arm/fft_init_arm.c | 8 +
|
||||
libavcodec/arm/synth_filter_vfp.S | 243 ++++++++++++++++++++++++++
|
||||
3 files changed, 252 insertions(+)
|
||||
create mode 100644 libavcodec/arm/synth_filter_vfp.S
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index 1c91d62..aee9d73 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -58,6 +58,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
|
||||
arm/dsputil_armv6.o \
|
||||
arm/simple_idct_armv6.o \
|
||||
|
||||
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
|
||||
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
|
||||
index 8c98abc..fe0acc5 100644
|
||||
--- a/libavcodec/arm/fft_init_arm.c
|
||||
+++ b/libavcodec/arm/fft_init_arm.c
|
||||
@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
|
||||
|
||||
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
|
||||
|
||||
+void ff_synth_filter_float_vfp(FFTContext *imdct,
|
||||
+ float *synth_buf_ptr, int *synth_buf_offset,
|
||||
+ float synth_buf2[32], const float window[512],
|
||||
+ float out[32], const float in[32],
|
||||
+ float scale);
|
||||
+
|
||||
void ff_synth_filter_float_neon(FFTContext *imdct,
|
||||
float *synth_buf_ptr, int *synth_buf_offset,
|
||||
float synth_buf2[32], const float window[512],
|
||||
@@ -71,6 +77,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
|
||||
+ s->synth_filter_float = ff_synth_filter_float_vfp;
|
||||
if (have_neon(cpu_flags))
|
||||
s->synth_filter_float = ff_synth_filter_float_neon;
|
||||
}
|
||||
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
|
||||
new file mode 100644
|
||||
index 0000000..c219c41
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/synth_filter_vfp.S
|
||||
@@ -0,0 +1,243 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of Libav.
|
||||
+ *
|
||||
+ * Libav is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * Libav is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with Libav; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+IMDCT .req r0
|
||||
+ORIG_P_SB .req r1
|
||||
+P_SB_OFF .req r2
|
||||
+I .req r0
|
||||
+P_SB2_UP .req r1
|
||||
+OLDFPSCR .req r2
|
||||
+P_SB2_DN .req r3
|
||||
+P_WIN_DN .req r4
|
||||
+P_OUT_DN .req r5
|
||||
+P_SB .req r6
|
||||
+J_WRAP .req r7
|
||||
+P_WIN_UP .req r12
|
||||
+P_OUT_UP .req r14
|
||||
+
|
||||
+SCALE .req s0
|
||||
+SBUF_DAT_REV0 .req s4
|
||||
+SBUF_DAT_REV1 .req s5
|
||||
+SBUF_DAT_REV2 .req s6
|
||||
+SBUF_DAT_REV3 .req s7
|
||||
+VA0 .req s8
|
||||
+VA3 .req s11
|
||||
+VB0 .req s12
|
||||
+VB3 .req s15
|
||||
+VC0 .req s8
|
||||
+VC3 .req s11
|
||||
+VD0 .req s12
|
||||
+VD3 .req s15
|
||||
+SBUF_DAT0 .req s16
|
||||
+SBUF_DAT1 .req s17
|
||||
+SBUF_DAT2 .req s18
|
||||
+SBUF_DAT3 .req s19
|
||||
+SBUF_DAT_ALT0 .req s20
|
||||
+SBUF_DAT_ALT1 .req s21
|
||||
+SBUF_DAT_ALT2 .req s22
|
||||
+SBUF_DAT_ALT3 .req s23
|
||||
+WIN_DN_DAT0 .req s24
|
||||
+WIN_UP_DAT0 .req s28
|
||||
+
|
||||
+
|
||||
+.macro inner_loop half, tail, head
|
||||
+ .if (OFFSET & (64*4)) == 0 @ even numbered call
|
||||
+ SBUF_DAT_THIS0 .req SBUF_DAT0
|
||||
+ SBUF_DAT_THIS1 .req SBUF_DAT1
|
||||
+ SBUF_DAT_THIS2 .req SBUF_DAT2
|
||||
+ SBUF_DAT_THIS3 .req SBUF_DAT3
|
||||
+ .ifnc "\head",""
|
||||
+ vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT
|
||||
+ vldr d9, [P_SB, #OFFSET+8]
|
||||
+ .endif
|
||||
+ .else
|
||||
+ SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
|
||||
+ SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
|
||||
+ SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
|
||||
+ SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
|
||||
+ .ifnc "\head",""
|
||||
+ vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT
|
||||
+ vldr d11, [P_SB, #OFFSET+8]
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ .ifc "\half","ab"
|
||||
+ vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
|
||||
+ .else
|
||||
+ vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT
|
||||
+ vldr d15, [P_WIN_UP, #OFFSET+8]
|
||||
+ vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT
|
||||
+ vldr d13, [P_WIN_DN, #OFFSET+8]
|
||||
+ vmov SBUF_DAT_REV3, SBUF_DAT_THIS0
|
||||
+ vmov SBUF_DAT_REV2, SBUF_DAT_THIS1
|
||||
+ vmov SBUF_DAT_REV1, SBUF_DAT_THIS2
|
||||
+ vmov SBUF_DAT_REV0, SBUF_DAT_THIS3
|
||||
+ .ifc "\half","ab"
|
||||
+ vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
|
||||
+ .else
|
||||
+ vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
|
||||
+ .endif
|
||||
+ teq J_WRAP, #J
|
||||
+ bne 2f @ strongly predictable, so better than cond exec in this case
|
||||
+ sub P_SB, P_SB, #512*4
|
||||
+2:
|
||||
+ .set J, J - 64
|
||||
+ .set OFFSET, OFFSET + 64*4
|
||||
+ .endif
|
||||
+ .unreq SBUF_DAT_THIS0
|
||||
+ .unreq SBUF_DAT_THIS1
|
||||
+ .unreq SBUF_DAT_THIS2
|
||||
+ .unreq SBUF_DAT_THIS3
|
||||
+.endm
|
||||
+
|
||||
+
|
||||
+/* void ff_synth_filter_float_vfp(FFTContext *imdct,
|
||||
+ * float *synth_buf_ptr, int *synth_buf_offset,
|
||||
+ * float synth_buf2[32], const float window[512],
|
||||
+ * float out[32], const float in[32], float scale)
|
||||
+ */
|
||||
+function ff_synth_filter_float_vfp, export=1
|
||||
+ push {r3-r7,lr}
|
||||
+ vpush {s16-s31}
|
||||
+ ldr lr, [P_SB_OFF]
|
||||
+ add a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
|
||||
+ mov P_SB, a2 @ and keep a copy for ourselves
|
||||
+ bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop
|
||||
+ sub lr, lr, #32
|
||||
+ and lr, lr, #512-32
|
||||
+ str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
|
||||
+ ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
|
||||
+VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
|
||||
+ bl ff_imdct_half_vfp
|
||||
+VFP vmov SCALE, s16
|
||||
+
|
||||
+ fmrx OLDFPSCR, FPSCR
|
||||
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
+ fmxr FPSCR, lr
|
||||
+ ldr P_SB2_DN, [sp, #16*4]
|
||||
+ ldr P_WIN_DN, [sp, #(16+6+0)*4]
|
||||
+ ldr P_OUT_DN, [sp, #(16+6+1)*4]
|
||||
+NOVFP vldr SCALE, [sp, #(16+6+3)*4]
|
||||
+
|
||||
+#define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */
|
||||
+ add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range
|
||||
+ add P_SB2_UP, P_SB2_DN, #16*4
|
||||
+ add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
|
||||
+ add P_OUT_UP, P_OUT_DN, #16*4
|
||||
+ add P_SB2_DN, P_SB2_DN, #16*4
|
||||
+ add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
|
||||
+ add P_OUT_DN, P_OUT_DN, #16*4
|
||||
+ mov I, #4
|
||||
+1:
|
||||
+ vldmia P_SB2_UP!, {VB0-VB3}
|
||||
+ vldmdb P_SB2_DN!, {VA0-VA3}
|
||||
+ .set J, 512 - 64
|
||||
+ .set OFFSET, -IMM_OFF_SKEW
|
||||
+ inner_loop ab,, head
|
||||
+ .rept 7
|
||||
+ inner_loop ab, tail, head
|
||||
+ .endr
|
||||
+ inner_loop ab, tail
|
||||
+ add P_WIN_UP, P_WIN_UP, #4*4
|
||||
+ sub P_WIN_DN, P_WIN_DN, #4*4
|
||||
+ vmul.f VB0, VB0, SCALE @ SCALE treated as scalar
|
||||
+ add P_SB, P_SB, #(512+4)*4
|
||||
+ subs I, I, #1
|
||||
+ vmul.f VA0, VA0, SCALE
|
||||
+ vstmia P_OUT_UP!, {VB0-VB3}
|
||||
+ vstmdb P_OUT_DN!, {VA0-VA3}
|
||||
+ bne 1b
|
||||
+
|
||||
+ add P_SB2_DN, P_SB2_DN, #(16+28-12)*4
|
||||
+ sub P_SB2_UP, P_SB2_UP, #(16+16)*4
|
||||
+ add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
|
||||
+ mov I, #4
|
||||
+1:
|
||||
+ vldr.d d4, zero @ d4 = VC0
|
||||
+ vldr.d d5, zero
|
||||
+ vldr.d d6, zero @ d6 = VD0
|
||||
+ vldr.d d7, zero
|
||||
+ .set J, 512 - 64
|
||||
+ .set OFFSET, -IMM_OFF_SKEW
|
||||
+ inner_loop cd,, head
|
||||
+ .rept 7
|
||||
+ inner_loop cd, tail, head
|
||||
+ .endr
|
||||
+ inner_loop cd, tail
|
||||
+ add P_WIN_UP, P_WIN_UP, #4*4
|
||||
+ sub P_WIN_DN, P_WIN_DN, #4*4
|
||||
+ add P_SB, P_SB, #(512+4)*4
|
||||
+ subs I, I, #1
|
||||
+ vstmia P_SB2_UP!, {VC0-VC3}
|
||||
+ vstmdb P_SB2_DN!, {VD0-VD3}
|
||||
+ bne 1b
|
||||
+
|
||||
+ fmxr FPSCR, OLDFPSCR
|
||||
+ vpop {s16-s31}
|
||||
+ pop {r3-r7,pc}
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq IMDCT
|
||||
+ .unreq ORIG_P_SB
|
||||
+ .unreq P_SB_OFF
|
||||
+ .unreq I
|
||||
+ .unreq P_SB2_UP
|
||||
+ .unreq OLDFPSCR
|
||||
+ .unreq P_SB2_DN
|
||||
+ .unreq P_WIN_DN
|
||||
+ .unreq P_OUT_DN
|
||||
+ .unreq P_SB
|
||||
+ .unreq J_WRAP
|
||||
+ .unreq P_WIN_UP
|
||||
+ .unreq P_OUT_UP
|
||||
+
|
||||
+ .unreq SCALE
|
||||
+ .unreq SBUF_DAT_REV0
|
||||
+ .unreq SBUF_DAT_REV1
|
||||
+ .unreq SBUF_DAT_REV2
|
||||
+ .unreq SBUF_DAT_REV3
|
||||
+ .unreq VA0
|
||||
+ .unreq VA3
|
||||
+ .unreq VB0
|
||||
+ .unreq VB3
|
||||
+ .unreq VC0
|
||||
+ .unreq VC3
|
||||
+ .unreq VD0
|
||||
+ .unreq VD3
|
||||
+ .unreq SBUF_DAT0
|
||||
+ .unreq SBUF_DAT1
|
||||
+ .unreq SBUF_DAT2
|
||||
+ .unreq SBUF_DAT3
|
||||
+ .unreq SBUF_DAT_ALT0
|
||||
+ .unreq SBUF_DAT_ALT1
|
||||
+ .unreq SBUF_DAT_ALT2
|
||||
+ .unreq SBUF_DAT_ALT3
|
||||
+ .unreq WIN_DN_DAT0
|
||||
+ .unreq WIN_UP_DAT0
|
||||
+
|
||||
+ .align 3
|
||||
+zero: .word 0, 0
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,102 @@
|
||||
From 8ead63b22d31bf71976fc6964922b43d8e0d660b Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:10 +0100
|
||||
Subject: [PATCH 40/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of int32_to_float_fmul_scalar
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 1175.0 4.4 366.2 18.3 +220.8%
|
||||
Overall 19285.5 292.0 18420.5 489.1 +4.7%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/fmtconvert_init_arm.c | 10 ++++++
|
||||
libavcodec/arm/fmtconvert_vfp.S | 38 +++++++++++++++++++++++
|
||||
2 files changed, 48 insertions(+)
|
||||
|
||||
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
|
||||
index 1d99c97..de3b78b 100644
|
||||
--- a/libavcodec/arm/fmtconvert_init_arm.c
|
||||
+++ b/libavcodec/arm/fmtconvert_init_arm.c
|
||||
@@ -28,6 +28,9 @@
|
||||
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
||||
float mul, int len);
|
||||
|
||||
+void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
|
||||
+ float mul, int len);
|
||||
+
|
||||
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
|
||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||
|
||||
@@ -38,6 +41,13 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
|
||||
+ if (!have_vfpv3(cpu_flags)) {
|
||||
+ // This function doesn't use anything armv6 specific in itself,
|
||||
+ // but ff_float_to_int16_vfp which is in the same assembly source
|
||||
+ // file does, thus the whole file requires armv6 to be built.
|
||||
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
|
||||
+ }
|
||||
+
|
||||
c->float_to_int16 = ff_float_to_int16_vfp;
|
||||
}
|
||||
|
||||
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
|
||||
index 7b012bc..3cc3e56 100644
|
||||
--- a/libavcodec/arm/fmtconvert_vfp.S
|
||||
+++ b/libavcodec/arm/fmtconvert_vfp.S
|
||||
@@ -1,5 +1,6 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
@@ -76,3 +77,40 @@ function ff_float_to_int16_vfp, export=1
|
||||
vpop {d8-d11}
|
||||
pop {r4-r8,pc}
|
||||
endfunc
|
||||
+
|
||||
+/**
|
||||
+ * ARM VFP optimised int32 to float conversion.
|
||||
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
|
||||
+ * (16 bytes alignment is best for BCM2835), little-endian.
|
||||
+ */
|
||||
+@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
|
||||
+function ff_int32_to_float_fmul_scalar_vfp, export=1
|
||||
+VFP tmp .req a4
|
||||
+VFP len .req a3
|
||||
+NOVFP tmp .req a3
|
||||
+NOVFP len .req a4
|
||||
+NOVFP vmov s0, a3
|
||||
+ ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
+ fmrx ip, FPSCR
|
||||
+ fmxr FPSCR, tmp
|
||||
+1:
|
||||
+ vldmia a2!, {s8-s15}
|
||||
+ vcvt.f32.s32 s8, s8
|
||||
+ vcvt.f32.s32 s9, s9
|
||||
+ vcvt.f32.s32 s10, s10
|
||||
+ vcvt.f32.s32 s11, s11
|
||||
+ vcvt.f32.s32 s12, s12
|
||||
+ vcvt.f32.s32 s13, s13
|
||||
+ vcvt.f32.s32 s14, s14
|
||||
+ vcvt.f32.s32 s15, s15
|
||||
+ vmul.f32 s8, s8, s0
|
||||
+ subs len, len, #8
|
||||
+ vstmia a1!, {s8-s11}
|
||||
+ vstmia a1!, {s12-s15}
|
||||
+ bne 1b
|
||||
+
|
||||
+ fmxr FPSCR, ip
|
||||
+ bx lr
|
||||
+endfunc
|
||||
+ .unreq tmp
|
||||
+ .unreq len
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,78 @@
|
||||
From 7901e7216cf6406a2ea430c71af94ebee72f262b Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:11 +0100
|
||||
Subject: [PATCH 41/49] [ffmpeg] - backport - fmtconvert: Add a new method,
|
||||
int32_to_float_fmul_array8
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This is similar to int32_to_float_fmul_scalar, but
|
||||
loads a new scalar multiplier every 8 input samples.
|
||||
This enables the use of much larger input arrays, which
|
||||
is important for pipelining on some CPUs (such as
|
||||
ARMv6).
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/fmtconvert.c | 10 ++++++++++
|
||||
libavcodec/fmtconvert.h | 16 ++++++++++++++++
|
||||
2 files changed, 26 insertions(+)
|
||||
|
||||
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
|
||||
index 79e9645..1c45d35 100644
|
||||
--- a/libavcodec/fmtconvert.c
|
||||
+++ b/libavcodec/fmtconvert.c
|
||||
@@ -30,6 +30,15 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
|
||||
dst[i] = src[i] * mul;
|
||||
}
|
||||
|
||||
+static void int32_to_float_fmul_array8_c(FmtConvertContext *c, float *dst,
|
||||
+ const int32_t *src, const float *mul,
|
||||
+ int len)
|
||||
+{
|
||||
+ int i;
|
||||
+ for (i = 0; i < len; i += 8)
|
||||
+ c->int32_to_float_fmul_scalar(&dst[i], &src[i], *mul++, 8);
|
||||
+}
|
||||
+
|
||||
static av_always_inline int float_to_int16_one(const float *src){
|
||||
return av_clip_int16(lrintf(*src));
|
||||
}
|
||||
@@ -79,6 +88,7 @@ void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
|
||||
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
|
||||
{
|
||||
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
|
||||
+ c->int32_to_float_fmul_array8 = int32_to_float_fmul_array8_c;
|
||||
c->float_to_int16 = float_to_int16_c;
|
||||
c->float_to_int16_interleave = float_to_int16_interleave_c;
|
||||
c->float_interleave = ff_float_interleave_c;
|
||||
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
|
||||
index 3fb9f4e..02468dc 100644
|
||||
--- a/libavcodec/fmtconvert.h
|
||||
+++ b/libavcodec/fmtconvert.h
|
||||
@@ -38,6 +38,22 @@ typedef struct FmtConvertContext {
|
||||
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
|
||||
|
||||
/**
|
||||
+ * Convert an array of int32_t to float and multiply by a float value from another array,
|
||||
+ * stepping along the float array once for each 8 integers.
|
||||
+ * @param c pointer to FmtConvertContext.
|
||||
+ * @param dst destination array of float.
|
||||
+ * constraints: 16-byte aligned
|
||||
+ * @param src source array of int32_t.
|
||||
+ * constraints: 16-byte aligned
|
||||
+ * @param mul source array of float multipliers.
|
||||
+ * @param len number of elements to convert.
|
||||
+ * constraints: multiple of 8
|
||||
+ */
|
||||
+ void (*int32_to_float_fmul_array8)(struct FmtConvertContext *c,
|
||||
+ float *dst, const int32_t *src,
|
||||
+ const float *mul, int len);
|
||||
+
|
||||
+ /**
|
||||
* Convert an array of float to an array of int16_t.
|
||||
*
|
||||
* Convert floats from in the range [-32768.0,32767.0] to ints
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,90 @@
|
||||
From fa755fe82fe4cfbb85b7c57501912da2e1f316bc Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Tue, 16 Jul 2013 15:41:18 +0300
|
||||
Subject: [PATCH 42/49] [ffmpeg] - backport - dcadec: Use
|
||||
int32_to_float_fmul_array8
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/dcadec.c | 23 +++++++++++++++--------
|
||||
1 file changed, 15 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
|
||||
index 1b955e4..b648613 100644
|
||||
--- a/libavcodec/dcadec.c
|
||||
+++ b/libavcodec/dcadec.c
|
||||
@@ -1302,7 +1302,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
|
||||
|
||||
/* FIXME */
|
||||
float (*subband_samples)[DCA_SUBBANDS][8] = s->subband_samples[block_index];
|
||||
- LOCAL_ALIGNED_16(int, block, [8]);
|
||||
+ LOCAL_ALIGNED_16(int, block, [8 * DCA_SUBBANDS]);
|
||||
|
||||
/*
|
||||
* Audio data
|
||||
@@ -1315,6 +1315,8 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
|
||||
quant_step_table = lossy_quant_d;
|
||||
|
||||
for (k = base_channel; k < s->prim_channels; k++) {
|
||||
+ float rscale[DCA_SUBBANDS];
|
||||
+
|
||||
if (get_bits_left(&s->gb) < 0)
|
||||
return AVERROR_INVALIDDATA;
|
||||
|
||||
@@ -1337,11 +1339,12 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
|
||||
* Extract bits from the bit stream
|
||||
*/
|
||||
if (!abits) {
|
||||
- memset(subband_samples[k][l], 0, 8 * sizeof(subband_samples[0][0][0]));
|
||||
+ rscale[l] = 0;
|
||||
+ memset(block + 8 * l, 0, 8 * sizeof(block[0]));
|
||||
} else {
|
||||
/* Deal with transients */
|
||||
int sfi = s->transition_mode[k][l] && subsubframe >= s->transition_mode[k][l];
|
||||
- float rscale = quant_step_size * s->scale_factor[k][l][sfi] *
|
||||
+ rscale[l] = quant_step_size * s->scale_factor[k][l][sfi] *
|
||||
s->scalefactor_adj[k][sel];
|
||||
|
||||
if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
|
||||
@@ -1355,7 +1358,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
|
||||
block_code1 = get_bits(&s->gb, size);
|
||||
block_code2 = get_bits(&s->gb, size);
|
||||
err = decode_blockcodes(block_code1, block_code2,
|
||||
- levels, block);
|
||||
+ levels, block + 8 * l);
|
||||
if (err) {
|
||||
av_log(s->avctx, AV_LOG_ERROR,
|
||||
"ERROR: block code look-up failed\n");
|
||||
@@ -1364,19 +1367,23 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
|
||||
} else {
|
||||
/* no coding */
|
||||
for (m = 0; m < 8; m++)
|
||||
- block[m] = get_sbits(&s->gb, abits - 3);
|
||||
+ block[8 * l + m] = get_sbits(&s->gb, abits - 3);
|
||||
}
|
||||
} else {
|
||||
/* Huffman coded */
|
||||
for (m = 0; m < 8; m++)
|
||||
- block[m] = get_bitalloc(&s->gb,
|
||||
+ block[8 * l + m] = get_bitalloc(&s->gb,
|
||||
&dca_smpl_bitalloc[abits], sel);
|
||||
}
|
||||
|
||||
- s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
|
||||
- block, rscale, 8);
|
||||
}
|
||||
+ }
|
||||
|
||||
+ s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, subband_samples[k][0],
|
||||
+ block, rscale, 8 * s->vq_start_subband[k]);
|
||||
+
|
||||
+ for (l = 0; l < s->vq_start_subband[k]; l++) {
|
||||
+ int m;
|
||||
/*
|
||||
* Inverse ADPCM if in prediction mode
|
||||
*/
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,222 @@
|
||||
From c908a710261f33130569c4360175d8f19a282d67 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:12 +0100
|
||||
Subject: [PATCH 43/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of int32_to_float_fmul_array8
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 366.2 18.3 277.8 13.7 +31.9%
|
||||
Overall 18420.5 489.1 17049.5 408.2 +8.0%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/fmtconvert_init_arm.c | 6 +-
|
||||
libavcodec/arm/fmtconvert_vfp.S | 162 +++++++++++++++++++++++
|
||||
2 files changed, 167 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
|
||||
index de3b78b..92d94a0 100644
|
||||
--- a/libavcodec/arm/fmtconvert_init_arm.c
|
||||
+++ b/libavcodec/arm/fmtconvert_init_arm.c
|
||||
@@ -30,6 +30,9 @@ void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
|
||||
|
||||
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
|
||||
float mul, int len);
|
||||
+void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
|
||||
+ const int32_t *src, const float *mul,
|
||||
+ int len);
|
||||
|
||||
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
|
||||
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
|
||||
@@ -42,10 +45,11 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
|
||||
|
||||
if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
|
||||
if (!have_vfpv3(cpu_flags)) {
|
||||
- // This function doesn't use anything armv6 specific in itself,
|
||||
+ // These functions don't use anything armv6 specific in themselves,
|
||||
// but ff_float_to_int16_vfp which is in the same assembly source
|
||||
// file does, thus the whole file requires armv6 to be built.
|
||||
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
|
||||
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
|
||||
}
|
||||
|
||||
c->float_to_int16 = ff_float_to_int16_vfp;
|
||||
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
|
||||
index 3cc3e56..a6d4ebd 100644
|
||||
--- a/libavcodec/arm/fmtconvert_vfp.S
|
||||
+++ b/libavcodec/arm/fmtconvert_vfp.S
|
||||
@@ -83,6 +83,168 @@ endfunc
|
||||
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
|
||||
* (16 bytes alignment is best for BCM2835), little-endian.
|
||||
*/
|
||||
+@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
|
||||
+function ff_int32_to_float_fmul_array8_vfp, export=1
|
||||
+ push {lr}
|
||||
+ ldr a1, [sp, #4]
|
||||
+ subs lr, a1, #3*8
|
||||
+ bcc 50f @ too short to pipeline
|
||||
+ @ Now need to find (len / 8) % 3. The approximation
|
||||
+ @ x / 24 = (x * 0xAB) >> 12
|
||||
+ @ is good for x < 4096, which is true for both AC3 and DCA.
|
||||
+ mov a1, #0xAB
|
||||
+ ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
+ mul a1, lr, a1
|
||||
+ vpush {s16-s31}
|
||||
+ mov a1, a1, lsr #12
|
||||
+ add a1, a1, a1, lsl #1
|
||||
+ rsb a1, a1, lr, lsr #3
|
||||
+ cmp a1, #1
|
||||
+ fmrx a1, FPSCR
|
||||
+ fmxr FPSCR, ip
|
||||
+ beq 11f
|
||||
+ blo 10f
|
||||
+ @ Array is (2 + multiple of 3) x 8 floats long
|
||||
+ @ drop through...
|
||||
+ vldmia a3!, {s16-s23}
|
||||
+ vldmia a4!, {s2,s3}
|
||||
+ vldmia a3!, {s24-s31}
|
||||
+ vcvt.f32.s32 s16, s16
|
||||
+ vcvt.f32.s32 s17, s17
|
||||
+ vcvt.f32.s32 s18, s18
|
||||
+ vcvt.f32.s32 s19, s19
|
||||
+ vcvt.f32.s32 s20, s20
|
||||
+ vcvt.f32.s32 s21, s21
|
||||
+ vcvt.f32.s32 s22, s22
|
||||
+ vcvt.f32.s32 s23, s23
|
||||
+ vmul.f32 s16, s16, s2
|
||||
+ @ drop through...
|
||||
+3:
|
||||
+ vldmia a3!, {s8-s15}
|
||||
+ vldmia a4!, {s1}
|
||||
+ vcvt.f32.s32 s24, s24
|
||||
+ vcvt.f32.s32 s25, s25
|
||||
+ vcvt.f32.s32 s26, s26
|
||||
+ vcvt.f32.s32 s27, s27
|
||||
+ vcvt.f32.s32 s28, s28
|
||||
+ vcvt.f32.s32 s29, s29
|
||||
+ vcvt.f32.s32 s30, s30
|
||||
+ vcvt.f32.s32 s31, s31
|
||||
+ vmul.f32 s24, s24, s3
|
||||
+ vstmia a2!, {s16-s19}
|
||||
+ vstmia a2!, {s20-s23}
|
||||
+2:
|
||||
+ vldmia a3!, {s16-s23}
|
||||
+ vldmia a4!, {s2}
|
||||
+ vcvt.f32.s32 s8, s8
|
||||
+ vcvt.f32.s32 s9, s9
|
||||
+ vcvt.f32.s32 s10, s10
|
||||
+ vcvt.f32.s32 s11, s11
|
||||
+ vcvt.f32.s32 s12, s12
|
||||
+ vcvt.f32.s32 s13, s13
|
||||
+ vcvt.f32.s32 s14, s14
|
||||
+ vcvt.f32.s32 s15, s15
|
||||
+ vmul.f32 s8, s8, s1
|
||||
+ vstmia a2!, {s24-s27}
|
||||
+ vstmia a2!, {s28-s31}
|
||||
+1:
|
||||
+ vldmia a3!, {s24-s31}
|
||||
+ vldmia a4!, {s3}
|
||||
+ vcvt.f32.s32 s16, s16
|
||||
+ vcvt.f32.s32 s17, s17
|
||||
+ vcvt.f32.s32 s18, s18
|
||||
+ vcvt.f32.s32 s19, s19
|
||||
+ vcvt.f32.s32 s20, s20
|
||||
+ vcvt.f32.s32 s21, s21
|
||||
+ vcvt.f32.s32 s22, s22
|
||||
+ vcvt.f32.s32 s23, s23
|
||||
+ vmul.f32 s16, s16, s2
|
||||
+ vstmia a2!, {s8-s11}
|
||||
+ vstmia a2!, {s12-s15}
|
||||
+
|
||||
+ subs lr, lr, #8*3
|
||||
+ bpl 3b
|
||||
+
|
||||
+ vcvt.f32.s32 s24, s24
|
||||
+ vcvt.f32.s32 s25, s25
|
||||
+ vcvt.f32.s32 s26, s26
|
||||
+ vcvt.f32.s32 s27, s27
|
||||
+ vcvt.f32.s32 s28, s28
|
||||
+ vcvt.f32.s32 s29, s29
|
||||
+ vcvt.f32.s32 s30, s30
|
||||
+ vcvt.f32.s32 s31, s31
|
||||
+ vmul.f32 s24, s24, s3
|
||||
+ vstmia a2!, {s16-s19}
|
||||
+ vstmia a2!, {s20-s23}
|
||||
+ vstmia a2!, {s24-s27}
|
||||
+ vstmia a2!, {s28-s31}
|
||||
+
|
||||
+ fmxr FPSCR, a1
|
||||
+ vpop {s16-s31}
|
||||
+ pop {pc}
|
||||
+
|
||||
+10: @ Array is (multiple of 3) x 8 floats long
|
||||
+ vldmia a3!, {s8-s15}
|
||||
+ vldmia a4!, {s1,s2}
|
||||
+ vldmia a3!, {s16-s23}
|
||||
+ vcvt.f32.s32 s8, s8
|
||||
+ vcvt.f32.s32 s9, s9
|
||||
+ vcvt.f32.s32 s10, s10
|
||||
+ vcvt.f32.s32 s11, s11
|
||||
+ vcvt.f32.s32 s12, s12
|
||||
+ vcvt.f32.s32 s13, s13
|
||||
+ vcvt.f32.s32 s14, s14
|
||||
+ vcvt.f32.s32 s15, s15
|
||||
+ vmul.f32 s8, s8, s1
|
||||
+ b 1b
|
||||
+
|
||||
+11: @ Array is (1 + multiple of 3) x 8 floats long
|
||||
+ vldmia a3!, {s24-s31}
|
||||
+ vldmia a4!, {s3}
|
||||
+ vldmia a3!, {s8-s15}
|
||||
+ vldmia a4!, {s1}
|
||||
+ vcvt.f32.s32 s24, s24
|
||||
+ vcvt.f32.s32 s25, s25
|
||||
+ vcvt.f32.s32 s26, s26
|
||||
+ vcvt.f32.s32 s27, s27
|
||||
+ vcvt.f32.s32 s28, s28
|
||||
+ vcvt.f32.s32 s29, s29
|
||||
+ vcvt.f32.s32 s30, s30
|
||||
+ vcvt.f32.s32 s31, s31
|
||||
+ vmul.f32 s24, s24, s3
|
||||
+ b 2b
|
||||
+
|
||||
+50:
|
||||
+ ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
|
||||
+ fmrx ip, FPSCR
|
||||
+ fmxr FPSCR, lr
|
||||
+51:
|
||||
+ vldmia a3!, {s8-s15}
|
||||
+ vldmia a4!, {s0}
|
||||
+ vcvt.f32.s32 s8, s8
|
||||
+ vcvt.f32.s32 s9, s9
|
||||
+ vcvt.f32.s32 s10, s10
|
||||
+ vcvt.f32.s32 s11, s11
|
||||
+ vcvt.f32.s32 s12, s12
|
||||
+ vcvt.f32.s32 s13, s13
|
||||
+ vcvt.f32.s32 s14, s14
|
||||
+ vcvt.f32.s32 s15, s15
|
||||
+ vmul.f32 s8, s8, s0
|
||||
+ subs a1, a1, #8
|
||||
+ vstmia a2!, {s8-s11}
|
||||
+ vstmia a2!, {s12-s15}
|
||||
+ bne 51b
|
||||
+
|
||||
+ fmxr FPSCR, ip
|
||||
+ pop {pc}
|
||||
+endfunc
|
||||
+
|
||||
+/**
|
||||
+ * ARM VFP optimised int32 to float conversion.
|
||||
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
|
||||
+ * (16 bytes alignment is best for BCM2835), little-endian.
|
||||
+ * TODO: could be further optimised by unrolling and interleaving, as above
|
||||
+ */
|
||||
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
|
||||
function ff_int32_to_float_fmul_scalar_vfp, export=1
|
||||
VFP tmp .req a4
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,274 @@
|
||||
From 15520de67fc951213ab32661b8b368a9439e8b9a Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
|
||||
Date: Fri, 19 Jul 2013 10:59:17 +0300
|
||||
Subject: [PATCH 44/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of imdct_half
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 2653.0 28.5 1108.8 51.4 +139.3%
|
||||
Overall 17049.5 408.2 15973.0 223.2 +6.7%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/Makefile | 1 +
|
||||
libavcodec/arm/fft_init_arm.c | 9 ++
|
||||
libavcodec/arm/mdct_vfp.S | 205 ++++++++++++++++++++++++++++++
|
||||
3 files changed, 215 insertions(+)
|
||||
create mode 100644 libavcodec/arm/mdct_vfp.S
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index aee9d73..27e80d5 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -59,6 +59,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
|
||||
arm/simple_idct_armv6.o \
|
||||
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
+VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
|
||||
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
|
||||
index fe0acc5..a000ea5 100644
|
||||
--- a/libavcodec/arm/fft_init_arm.c
|
||||
+++ b/libavcodec/arm/fft_init_arm.c
|
||||
@@ -26,6 +26,8 @@
|
||||
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
|
||||
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
|
||||
|
||||
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
+
|
||||
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
|
||||
@@ -48,6 +50,13 @@ av_cold void ff_fft_init_arm(FFTContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
+ if (have_vfp(cpu_flags)) {
|
||||
+#if CONFIG_MDCT
|
||||
+ if (!have_vfpv3(cpu_flags))
|
||||
+ s->imdct_half = ff_imdct_half_vfp;
|
||||
+#endif
|
||||
+ }
|
||||
+
|
||||
if (have_neon(cpu_flags)) {
|
||||
#if CONFIG_FFT
|
||||
s->fft_permute = ff_fft_permute_neon;
|
||||
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
|
||||
new file mode 100644
|
||||
index 0000000..0623e96
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/mdct_vfp.S
|
||||
@@ -0,0 +1,205 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of Libav.
|
||||
+ *
|
||||
+ * Libav is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * Libav is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with Libav; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+CONTEXT .req a1
|
||||
+ORIGOUT .req a2
|
||||
+IN .req a3
|
||||
+OUT .req v1
|
||||
+REVTAB .req v2
|
||||
+TCOS .req v3
|
||||
+TSIN .req v4
|
||||
+OLDFPSCR .req v5
|
||||
+J0 .req a2
|
||||
+J1 .req a4
|
||||
+J2 .req ip
|
||||
+J3 .req lr
|
||||
+
|
||||
+.macro prerotation_innerloop
|
||||
+ .set trig_lo, k
|
||||
+ .set trig_hi, n4 - k - 2
|
||||
+ .set in_lo, trig_lo * 2
|
||||
+ .set in_hi, trig_hi * 2
|
||||
+ vldr d8, [TCOS, #trig_lo*4] @ s16,s17
|
||||
+ vldr d9, [TCOS, #trig_hi*4] @ s18,s19
|
||||
+ vldr s0, [IN, #in_hi*4 + 12]
|
||||
+ vldr s1, [IN, #in_hi*4 + 4]
|
||||
+ vldr s2, [IN, #in_lo*4 + 12]
|
||||
+ vldr s3, [IN, #in_lo*4 + 4]
|
||||
+ vmul.f s8, s0, s16 @ vector operation
|
||||
+ vldr d10, [TSIN, #trig_lo*4] @ s20,s21
|
||||
+ vldr d11, [TSIN, #trig_hi*4] @ s22,s23
|
||||
+ vldr s4, [IN, #in_lo*4]
|
||||
+ vldr s5, [IN, #in_lo*4 + 8]
|
||||
+ vldr s6, [IN, #in_hi*4]
|
||||
+ vldr s7, [IN, #in_hi*4 + 8]
|
||||
+ ldr J0, [REVTAB, #trig_lo*2]
|
||||
+ vmul.f s12, s0, s20 @ vector operation
|
||||
+ ldr J2, [REVTAB, #trig_hi*2]
|
||||
+ mov J1, J0, lsr #16
|
||||
+ and J0, J0, #255 @ halfword value will be < n4
|
||||
+ vmls.f s8, s4, s20 @ vector operation
|
||||
+ mov J3, J2, lsr #16
|
||||
+ and J2, J2, #255 @ halfword value will be < n4
|
||||
+ add J0, OUT, J0, lsl #3
|
||||
+ vmla.f s12, s4, s16 @ vector operation
|
||||
+ add J1, OUT, J1, lsl #3
|
||||
+ add J2, OUT, J2, lsl #3
|
||||
+ add J3, OUT, J3, lsl #3
|
||||
+ vstr s8, [J0]
|
||||
+ vstr s9, [J1]
|
||||
+ vstr s10, [J2]
|
||||
+ vstr s11, [J3]
|
||||
+ vstr s12, [J0, #4]
|
||||
+ vstr s13, [J1, #4]
|
||||
+ vstr s14, [J2, #4]
|
||||
+ vstr s15, [J3, #4]
|
||||
+ .set k, k + 2
|
||||
+.endm
|
||||
+
|
||||
+.macro postrotation_innerloop tail, head
|
||||
+ .set trig_lo_head, n8 - k - 2
|
||||
+ .set trig_hi_head, n8 + k
|
||||
+ .set out_lo_head, trig_lo_head * 2
|
||||
+ .set out_hi_head, trig_hi_head * 2
|
||||
+ .set trig_lo_tail, n8 - (k - 2) - 2
|
||||
+ .set trig_hi_tail, n8 + (k - 2)
|
||||
+ .set out_lo_tail, trig_lo_tail * 2
|
||||
+ .set out_hi_tail, trig_hi_tail * 2
|
||||
+ .if (k & 2) == 0
|
||||
+ TCOS_D0_HEAD .req d10 @ s20,s21
|
||||
+ TCOS_D1_HEAD .req d11 @ s22,s23
|
||||
+ TCOS_S0_TAIL .req s24
|
||||
+ .else
|
||||
+ TCOS_D0_HEAD .req d12 @ s24,s25
|
||||
+ TCOS_D1_HEAD .req d13 @ s26,s27
|
||||
+ TCOS_S0_TAIL .req s20
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
|
||||
+ vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
|
||||
+ vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vldr s0, [OUT, #out_lo_head*4]
|
||||
+ vldr s1, [OUT, #out_lo_head*4 + 8]
|
||||
+ vldr s2, [OUT, #out_hi_head*4]
|
||||
+ vldr s3, [OUT, #out_hi_head*4 + 8]
|
||||
+ vldr s4, [OUT, #out_lo_head*4 + 4]
|
||||
+ vldr s5, [OUT, #out_lo_head*4 + 12]
|
||||
+ vldr s6, [OUT, #out_hi_head*4 + 4]
|
||||
+ vldr s7, [OUT, #out_hi_head*4 + 12]
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vstr s8, [OUT, #out_lo_tail*4]
|
||||
+ vstr s9, [OUT, #out_lo_tail*4 + 8]
|
||||
+ vstr s10, [OUT, #out_hi_tail*4]
|
||||
+ vstr s11, [OUT, #out_hi_tail*4 + 8]
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vmul.f s8, s4, s16 @ vector operation
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vstr s12, [OUT, #out_hi_tail*4 + 12]
|
||||
+ vstr s13, [OUT, #out_hi_tail*4 + 4]
|
||||
+ vstr s14, [OUT, #out_lo_tail*4 + 12]
|
||||
+ vstr s15, [OUT, #out_lo_tail*4 + 4]
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vmul.f s12, s0, s16 @ vector operation
|
||||
+ vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
|
||||
+ .endif
|
||||
+ .unreq TCOS_D0_HEAD
|
||||
+ .unreq TCOS_D1_HEAD
|
||||
+ .unreq TCOS_S0_TAIL
|
||||
+ .ifnc "\head",""
|
||||
+ .set k, k + 2
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+
|
||||
+/* void ff_imdct_half_vfp(FFTContext *s,
|
||||
+ * FFTSample *output,
|
||||
+ * const FFTSample *input)
|
||||
+ */
|
||||
+function ff_imdct_half_vfp, export=1
|
||||
+ ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
+ teq ip, #6
|
||||
+ it ne
|
||||
+ bne ff_imdct_half_c @ only case currently accelerated is the one used by DCA
|
||||
+
|
||||
+ .set n, 1<<6
|
||||
+ .set n2, n/2
|
||||
+ .set n4, n/4
|
||||
+ .set n8, n/8
|
||||
+
|
||||
+ push {v1-v5,lr}
|
||||
+ vpush {s16-s27}
|
||||
+ fmrx OLDFPSCR, FPSCR
|
||||
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
+ fmxr FPSCR, lr
|
||||
+ mov OUT, ORIGOUT
|
||||
+ ldr REVTAB, [CONTEXT, #2*4]
|
||||
+ ldr TCOS, [CONTEXT, #6*4]
|
||||
+ ldr TSIN, [CONTEXT, #7*4]
|
||||
+
|
||||
+ .set k, 0
|
||||
+ .rept n8/2
|
||||
+ prerotation_innerloop
|
||||
+ .endr
|
||||
+
|
||||
+ fmxr FPSCR, OLDFPSCR
|
||||
+ mov a1, OUT
|
||||
+ bl ff_fft16_vfp
|
||||
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
+ fmxr FPSCR, lr
|
||||
+
|
||||
+ .set k, 0
|
||||
+ postrotation_innerloop , head
|
||||
+ .rept n8/2 - 1
|
||||
+ postrotation_innerloop tail, head
|
||||
+ .endr
|
||||
+ postrotation_innerloop tail
|
||||
+
|
||||
+ fmxr FPSCR, OLDFPSCR
|
||||
+ vpop {s16-s27}
|
||||
+ pop {v1-v5,pc}
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq CONTEXT
|
||||
+ .unreq ORIGOUT
|
||||
+ .unreq IN
|
||||
+ .unreq OUT
|
||||
+ .unreq REVTAB
|
||||
+ .unreq TCOS
|
||||
+ .unreq TSIN
|
||||
+ .unreq OLDFPSCR
|
||||
+ .unreq J0
|
||||
+ .unreq J1
|
||||
+ .unreq J2
|
||||
+ .unreq J3
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,58 @@
|
||||
From 8e0babd84c7e03cf678aab8bcf7e2106fe2b3de6 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
|
||||
Date: Fri, 19 Jul 2013 11:03:32 +0300
|
||||
Subject: [PATCH 45/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of dca_lfe_fir
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 868.2 33.5 436.0 27.0 +99.1%
|
||||
Overall 15973.0 223.2 15577.5 83.2 +2.5%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/Makefile | 3 ++-
|
||||
libavcodec/arm/dcadsp_init_arm.c | 4 ++++
|
||||
2 files changed, 6 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index 27e80d5..7fe5bb5 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -58,7 +58,8 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
|
||||
arm/dsputil_armv6.o \
|
||||
arm/simple_idct_armv6.o \
|
||||
|
||||
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
|
||||
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
|
||||
+ arm/synth_filter_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
|
||||
index 56568e0..a1efbff 100644
|
||||
--- a/libavcodec/arm/dcadsp_init_arm.c
|
||||
+++ b/libavcodec/arm/dcadsp_init_arm.c
|
||||
@@ -24,6 +24,8 @@
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavcodec/dcadsp.h"
|
||||
|
||||
+void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
+ int decifactor, float scale);
|
||||
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
|
||||
@@ -31,6 +33,8 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
|
||||
+ s->lfe_fir = ff_dca_lfe_fir_vfp;
|
||||
if (have_neon(cpu_flags))
|
||||
s->lfe_fir = ff_dca_lfe_fir_neon;
|
||||
}
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,339 @@
|
||||
From 018b74ea9d8f52788db18ed40838afca05e7b4df Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
|
||||
Date: Fri, 19 Jul 2013 11:23:57 +0300
|
||||
Subject: [PATCH 46/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of fft16
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 1389.3 4.2 967.8 35.1 +43.6%
|
||||
Overall 15577.5 83.2 15400.0 336.4 +1.2%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/Makefile | 1 +
|
||||
libavcodec/arm/fft_vfp.S | 298 +++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 299 insertions(+)
|
||||
create mode 100644 libavcodec/arm/fft_vfp.S
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index 7fe5bb5..7390a8b 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -60,6 +60,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
|
||||
|
||||
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
|
||||
arm/synth_filter_vfp.o
|
||||
+VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
|
||||
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
|
||||
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
|
||||
|
||||
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
|
||||
new file mode 100644
|
||||
index 0000000..7845ebb
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/fft_vfp.S
|
||||
@@ -0,0 +1,298 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of Libav.
|
||||
+ *
|
||||
+ * Libav is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * Libav is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with Libav; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+@ TODO: * FFTs wider than 16
|
||||
+@ * dispatch code
|
||||
+
|
||||
+function fft4_vfp
|
||||
+ vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
|
||||
+ vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
|
||||
+ vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
|
||||
+ vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
|
||||
+ @ stall
|
||||
+ vadd.f s12, s0, s8 @ i0
|
||||
+ vadd.f s13, s1, s9 @ i1
|
||||
+ vadd.f s14, s2, s10 @ i2
|
||||
+ vadd.f s15, s3, s11 @ i3
|
||||
+ vsub.f s8, s0, s8 @ i4
|
||||
+ vsub.f s9, s1, s9 @ i5
|
||||
+ vsub.f s10, s2, s10 @ i6
|
||||
+ vsub.f s11, s3, s11 @ i7
|
||||
+ @ stall
|
||||
+ @ stall
|
||||
+ vadd.f s0, s12, s14 @ z[0].re
|
||||
+ vsub.f s4, s12, s14 @ z[2].re
|
||||
+ vadd.f s1, s13, s15 @ z[0].im
|
||||
+ vsub.f s5, s13, s15 @ z[2].im
|
||||
+ vadd.f s7, s9, s10 @ z[3].im
|
||||
+ vsub.f s3, s9, s10 @ z[1].im
|
||||
+ vadd.f s2, s8, s11 @ z[1].re
|
||||
+ vsub.f s6, s8, s11 @ z[3].re
|
||||
+ @ stall
|
||||
+ @ stall
|
||||
+ vstr d0, [a1, #0*2*4]
|
||||
+ vstr d2, [a1, #2*2*4]
|
||||
+ @ stall
|
||||
+ @ stall
|
||||
+ vstr d1, [a1, #1*2*4]
|
||||
+ vstr d3, [a1, #3*2*4]
|
||||
+
|
||||
+ bx lr
|
||||
+endfunc
|
||||
+
|
||||
+.macro macro_fft8_head
|
||||
+ @ FFT4
|
||||
+ vldr d4, [a1, #0 * 2*4]
|
||||
+ vldr d6, [a1, #1 * 2*4]
|
||||
+ vldr d5, [a1, #2 * 2*4]
|
||||
+ vldr d7, [a1, #3 * 2*4]
|
||||
+ @ BF
|
||||
+ vldr d12, [a1, #4 * 2*4]
|
||||
+ vadd.f s16, s8, s12 @ vector op
|
||||
+ vldr d14, [a1, #5 * 2*4]
|
||||
+ vldr d13, [a1, #6 * 2*4]
|
||||
+ vldr d15, [a1, #7 * 2*4]
|
||||
+ vsub.f s20, s8, s12 @ vector op
|
||||
+ vadd.f s0, s16, s18
|
||||
+ vsub.f s2, s16, s18
|
||||
+ vadd.f s1, s17, s19
|
||||
+ vsub.f s3, s17, s19
|
||||
+ vadd.f s7, s21, s22
|
||||
+ vsub.f s5, s21, s22
|
||||
+ vadd.f s4, s20, s23
|
||||
+ vsub.f s6, s20, s23
|
||||
+ vsub.f s20, s24, s28 @ vector op
|
||||
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
|
||||
+ vstr d1, [a1, #1 * 2*4]
|
||||
+ vldr s0, cos1pi4
|
||||
+ vadd.f s16, s24, s28 @ vector op
|
||||
+ vstr d2, [a1, #2 * 2*4]
|
||||
+ vstr d3, [a1, #3 * 2*4]
|
||||
+ vldr d12, [a1, #0 * 2*4]
|
||||
+ @ TRANSFORM
|
||||
+ vmul.f s20, s20, s0 @ vector x scalar op
|
||||
+ vldr d13, [a1, #1 * 2*4]
|
||||
+ vldr d14, [a1, #2 * 2*4]
|
||||
+ vldr d15, [a1, #3 * 2*4]
|
||||
+ @ BUTTERFLIES
|
||||
+ vadd.f s0, s18, s16
|
||||
+ vadd.f s1, s17, s19
|
||||
+ vsub.f s2, s17, s19
|
||||
+ vsub.f s3, s18, s16
|
||||
+ vadd.f s4, s21, s20
|
||||
+ vsub.f s5, s21, s20
|
||||
+ vadd.f s6, s22, s23
|
||||
+ vsub.f s7, s22, s23
|
||||
+ vadd.f s8, s0, s24 @ vector op
|
||||
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
|
||||
+ vstr d1, [a1, #1 * 2*4]
|
||||
+ vldr d6, [a1, #0 * 2*4]
|
||||
+ vldr d7, [a1, #1 * 2*4]
|
||||
+ vadd.f s1, s5, s6
|
||||
+ vadd.f s0, s7, s4
|
||||
+ vsub.f s2, s5, s6
|
||||
+ vsub.f s3, s7, s4
|
||||
+ vsub.f s12, s24, s12 @ vector op
|
||||
+ vsub.f s5, s29, s1
|
||||
+ vsub.f s4, s28, s0
|
||||
+ vsub.f s6, s30, s2
|
||||
+ vsub.f s7, s31, s3
|
||||
+ vadd.f s16, s0, s28 @ vector op
|
||||
+ vstr d6, [a1, #4 * 2*4]
|
||||
+ vstr d7, [a1, #6 * 2*4]
|
||||
+ vstr d4, [a1, #0 * 2*4]
|
||||
+ vstr d5, [a1, #2 * 2*4]
|
||||
+ vstr d2, [a1, #5 * 2*4]
|
||||
+ vstr d3, [a1, #7 * 2*4]
|
||||
+.endm
|
||||
+
|
||||
+.macro macro_fft8_tail
|
||||
+ vstr d8, [a1, #1 * 2*4]
|
||||
+ vstr d9, [a1, #3 * 2*4]
|
||||
+.endm
|
||||
+
|
||||
+function fft8_vfp
|
||||
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
+ fmrx a2, FPSCR
|
||||
+ fmxr FPSCR, a3
|
||||
+ vpush {s16-s31}
|
||||
+
|
||||
+ macro_fft8_head
|
||||
+ macro_fft8_tail
|
||||
+
|
||||
+ vpop {s16-s31}
|
||||
+ fmxr FPSCR, a2
|
||||
+ bx lr
|
||||
+endfunc
|
||||
+
|
||||
+.align 3
|
||||
+cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
|
||||
+ .float 0.707106769084930419921875
|
||||
+cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
|
||||
+ .float 0.92387950420379638671875
|
||||
+cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
|
||||
+ .float 0.3826834261417388916015625
|
||||
+
|
||||
+function ff_fft16_vfp, export=1
|
||||
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
|
||||
+ fmrx a2, FPSCR
|
||||
+ fmxr FPSCR, a3
|
||||
+ vpush {s16-s31}
|
||||
+
|
||||
+ macro_fft8_head
|
||||
+ @ FFT4(z+8)
|
||||
+ vldr d10, [a1, #8 * 2*4]
|
||||
+ vldr d12, [a1, #9 * 2*4]
|
||||
+ vldr d11, [a1, #10 * 2*4]
|
||||
+ vldr d13, [a1, #11 * 2*4]
|
||||
+ macro_fft8_tail
|
||||
+ vadd.f s16, s20, s24 @ vector op
|
||||
+ @ FFT4(z+12)
|
||||
+ vldr d4, [a1, #12 * 2*4]
|
||||
+ vldr d6, [a1, #13 * 2*4]
|
||||
+ vldr d5, [a1, #14 * 2*4]
|
||||
+ vsub.f s20, s20, s24 @ vector op
|
||||
+ vldr d7, [a1, #15 * 2*4]
|
||||
+ vadd.f s0, s16, s18
|
||||
+ vsub.f s4, s16, s18
|
||||
+ vadd.f s1, s17, s19
|
||||
+ vsub.f s5, s17, s19
|
||||
+ vadd.f s7, s21, s22
|
||||
+ vsub.f s3, s21, s22
|
||||
+ vadd.f s2, s20, s23
|
||||
+ vsub.f s6, s20, s23
|
||||
+ vadd.f s16, s8, s12 @ vector op
|
||||
+ vstr d0, [a1, #8 * 2*4]
|
||||
+ vstr d2, [a1, #10 * 2*4]
|
||||
+ vstr d1, [a1, #9 * 2*4]
|
||||
+ vsub.f s20, s8, s12
|
||||
+ vstr d3, [a1, #11 * 2*4]
|
||||
+ @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
|
||||
+ vldr d12, [a1, #10 * 2*4]
|
||||
+ vadd.f s0, s16, s18
|
||||
+ vadd.f s1, s17, s19
|
||||
+ vsub.f s6, s16, s18
|
||||
+ vsub.f s7, s17, s19
|
||||
+ vsub.f s3, s21, s22
|
||||
+ vadd.f s2, s20, s23
|
||||
+ vadd.f s5, s21, s22
|
||||
+ vsub.f s4, s20, s23
|
||||
+ vstr d0, [a1, #12 * 2*4]
|
||||
+ vmov s0, s6
|
||||
+ @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
|
||||
+ vldr d6, [a1, #9 * 2*4]
|
||||
+ vstr d1, [a1, #13 * 2*4]
|
||||
+ vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
|
||||
+ vstr d2, [a1, #15 * 2*4]
|
||||
+ vldr d7, [a1, #13 * 2*4]
|
||||
+ vadd.f s4, s25, s24
|
||||
+ vsub.f s5, s25, s24
|
||||
+ vsub.f s6, s0, s7
|
||||
+ vadd.f s7, s0, s7
|
||||
+ vmul.f s20, s12, s3 @ vector op
|
||||
+ @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
|
||||
+ vldr d4, [a1, #11 * 2*4]
|
||||
+ vldr d5, [a1, #15 * 2*4]
|
||||
+ vldr s1, cos3pi8
|
||||
+ vmul.f s24, s4, s2 @ vector * scalar op
|
||||
+ vmul.f s28, s12, s1 @ vector * scalar op
|
||||
+ vmul.f s12, s8, s1 @ vector * scalar op
|
||||
+ vadd.f s4, s20, s29
|
||||
+ vsub.f s5, s21, s28
|
||||
+ vsub.f s6, s22, s31
|
||||
+ vadd.f s7, s23, s30
|
||||
+ vmul.f s8, s8, s3 @ vector * scalar op
|
||||
+ vldr d8, [a1, #1 * 2*4]
|
||||
+ vldr d9, [a1, #5 * 2*4]
|
||||
+ vldr d10, [a1, #3 * 2*4]
|
||||
+ vldr d11, [a1, #7 * 2*4]
|
||||
+ vldr d14, [a1, #2 * 2*4]
|
||||
+ vadd.f s0, s6, s4
|
||||
+ vadd.f s1, s5, s7
|
||||
+ vsub.f s2, s5, s7
|
||||
+ vsub.f s3, s6, s4
|
||||
+ vadd.f s4, s12, s9
|
||||
+ vsub.f s5, s13, s8
|
||||
+ vsub.f s6, s14, s11
|
||||
+ vadd.f s7, s15, s10
|
||||
+ vadd.f s12, s0, s16 @ vector op
|
||||
+ vstr d0, [a1, #1 * 2*4]
|
||||
+ vstr d1, [a1, #5 * 2*4]
|
||||
+ vldr d4, [a1, #1 * 2*4]
|
||||
+ vldr d5, [a1, #5 * 2*4]
|
||||
+ vadd.f s0, s6, s4
|
||||
+ vadd.f s1, s5, s7
|
||||
+ vsub.f s2, s5, s7
|
||||
+ vsub.f s3, s6, s4
|
||||
+ vsub.f s8, s16, s8 @ vector op
|
||||
+ vstr d6, [a1, #1 * 2*4]
|
||||
+ vstr d7, [a1, #5 * 2*4]
|
||||
+ vldr d15, [a1, #6 * 2*4]
|
||||
+ vsub.f s4, s20, s0
|
||||
+ vsub.f s5, s21, s1
|
||||
+ vsub.f s6, s22, s2
|
||||
+ vsub.f s7, s23, s3
|
||||
+ vadd.f s20, s0, s20 @ vector op
|
||||
+ vstr d4, [a1, #9 * 2*4]
|
||||
+ @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
|
||||
+ vldr d6, [a1, #8 * 2*4]
|
||||
+ vstr d5, [a1, #13 * 2*4]
|
||||
+ vldr d7, [a1, #12 * 2*4]
|
||||
+ vstr d2, [a1, #11 * 2*4]
|
||||
+ vldr d8, [a1, #0 * 2*4]
|
||||
+ vstr d3, [a1, #15 * 2*4]
|
||||
+ vldr d9, [a1, #4 * 2*4]
|
||||
+ vadd.f s0, s26, s24
|
||||
+ vadd.f s1, s25, s27
|
||||
+ vsub.f s2, s25, s27
|
||||
+ vsub.f s3, s26, s24
|
||||
+ vadd.f s4, s14, s12
|
||||
+ vadd.f s5, s13, s15
|
||||
+ vsub.f s6, s13, s15
|
||||
+ vsub.f s7, s14, s12
|
||||
+ vadd.f s8, s0, s28 @ vector op
|
||||
+ vstr d0, [a1, #3 * 2*4]
|
||||
+ vstr d1, [a1, #7 * 2*4]
|
||||
+ vldr d6, [a1, #3 * 2*4]
|
||||
+ vldr d7, [a1, #7 * 2*4]
|
||||
+ vsub.f s0, s16, s4
|
||||
+ vsub.f s1, s17, s5
|
||||
+ vsub.f s2, s18, s6
|
||||
+ vsub.f s3, s19, s7
|
||||
+ vsub.f s12, s28, s12 @ vector op
|
||||
+ vadd.f s16, s4, s16 @ vector op
|
||||
+ vstr d10, [a1, #3 * 2*4]
|
||||
+ vstr d11, [a1, #7 * 2*4]
|
||||
+ vstr d4, [a1, #2 * 2*4]
|
||||
+ vstr d5, [a1, #6 * 2*4]
|
||||
+ vstr d0, [a1, #8 * 2*4]
|
||||
+ vstr d1, [a1, #12 * 2*4]
|
||||
+ vstr d6, [a1, #10 * 2*4]
|
||||
+ vstr d7, [a1, #14 * 2*4]
|
||||
+ vstr d8, [a1, #0 * 2*4]
|
||||
+ vstr d9, [a1, #4 * 2*4]
|
||||
+
|
||||
+ vpop {s16-s31}
|
||||
+ fmxr FPSCR, a2
|
||||
+ bx lr
|
||||
+endfunc
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,140 @@
|
||||
From ed16009b0a05fbd344832d5ad2e982c169aec42c Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:16 +0100
|
||||
Subject: [PATCH 47/49] [ffmpeg] - backport - dcadsp: Add a new method,
|
||||
qmf_32_subbands
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This does most of the work formerly carried out by
|
||||
the static function qmf_32_subbands() in dcadec.c.
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/dcadec.c | 26 +++++---------------------
|
||||
libavcodec/dcadsp.c | 30 ++++++++++++++++++++++++++++++
|
||||
libavcodec/dcadsp.h | 9 +++++++++
|
||||
3 files changed, 44 insertions(+), 21 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
|
||||
index b648613..4054d63 100644
|
||||
--- a/libavcodec/dcadec.c
|
||||
+++ b/libavcodec/dcadec.c
|
||||
@@ -1108,10 +1108,8 @@ static void qmf_32_subbands(DCAContext *s, int chans,
|
||||
float scale)
|
||||
{
|
||||
const float *prCoeff;
|
||||
- int i;
|
||||
|
||||
int sb_act = s->subband_activity[chans];
|
||||
- int subindex;
|
||||
|
||||
scale *= sqrt(1 / 8.0);
|
||||
|
||||
@@ -1121,25 +1119,11 @@ static void qmf_32_subbands(DCAContext *s, int chans,
|
||||
else /* Perfect reconstruction */
|
||||
prCoeff = fir_32bands_perfect;
|
||||
|
||||
- for (i = sb_act; i < 32; i++)
|
||||
- s->raXin[i] = 0.0;
|
||||
-
|
||||
- /* Reconstructed channel sample index */
|
||||
- for (subindex = 0; subindex < 8; subindex++) {
|
||||
- /* Load in one sample from each subband and clear inactive subbands */
|
||||
- for (i = 0; i < sb_act; i++) {
|
||||
- unsigned sign = (i - 1) & 2;
|
||||
- uint32_t v = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
|
||||
- AV_WN32A(&s->raXin[i], v);
|
||||
- }
|
||||
-
|
||||
- s->synth.synth_filter_float(&s->imdct,
|
||||
- s->subband_fir_hist[chans],
|
||||
- &s->hist_index[chans],
|
||||
- s->subband_fir_noidea[chans], prCoeff,
|
||||
- samples_out, s->raXin, scale);
|
||||
- samples_out += 32;
|
||||
- }
|
||||
+ s->dcadsp.qmf_32_subbands(samples_in, sb_act, &s->synth, &s->imdct,
|
||||
+ s->subband_fir_hist[chans],
|
||||
+ &s->hist_index[chans],
|
||||
+ s->subband_fir_noidea[chans], prCoeff,
|
||||
+ samples_out, s->raXin, scale);
|
||||
}
|
||||
|
||||
static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
|
||||
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
|
||||
index dd4994d..ab63f1b 100644
|
||||
--- a/libavcodec/dcadsp.c
|
||||
+++ b/libavcodec/dcadsp.c
|
||||
@@ -20,6 +20,7 @@
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
+#include "libavutil/intreadwrite.h"
|
||||
#include "dcadsp.h"
|
||||
|
||||
static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
|
||||
@@ -44,8 +45,37 @@ static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
|
||||
}
|
||||
}
|
||||
|
||||
+static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
|
||||
+ SynthFilterContext *synth, FFTContext *imdct,
|
||||
+ float synth_buf_ptr[512],
|
||||
+ int *synth_buf_offset, float synth_buf2[32],
|
||||
+ const float window[512], float *samples_out,
|
||||
+ float raXin[32], float scale)
|
||||
+{
|
||||
+ int i;
|
||||
+ int subindex;
|
||||
+
|
||||
+ for (i = sb_act; i < 32; i++)
|
||||
+ raXin[i] = 0.0;
|
||||
+
|
||||
+ /* Reconstructed channel sample index */
|
||||
+ for (subindex = 0; subindex < 8; subindex++) {
|
||||
+ /* Load in one sample from each subband and clear inactive subbands */
|
||||
+ for (i = 0; i < sb_act; i++) {
|
||||
+ unsigned sign = (i - 1) & 2;
|
||||
+ uint32_t v = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
|
||||
+ AV_WN32A(&raXin[i], v);
|
||||
+ }
|
||||
+
|
||||
+ synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
|
||||
+ synth_buf2, window, samples_out, raXin, scale);
|
||||
+ samples_out += 32;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
void ff_dcadsp_init(DCADSPContext *s)
|
||||
{
|
||||
s->lfe_fir = dca_lfe_fir_c;
|
||||
+ s->qmf_32_subbands = dca_qmf_32_subbands;
|
||||
if (ARCH_ARM) ff_dcadsp_init_arm(s);
|
||||
}
|
||||
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
|
||||
index bb157f7..d86c1f3 100644
|
||||
--- a/libavcodec/dcadsp.h
|
||||
+++ b/libavcodec/dcadsp.h
|
||||
@@ -19,9 +19,18 @@
|
||||
#ifndef AVCODEC_DCADSP_H
|
||||
#define AVCODEC_DCADSP_H
|
||||
|
||||
+#include "avfft.h"
|
||||
+#include "synth_filter.h"
|
||||
+
|
||||
typedef struct DCADSPContext {
|
||||
void (*lfe_fir)(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
+ void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
|
||||
+ SynthFilterContext *synth, FFTContext *imdct,
|
||||
+ float synth_buf_ptr[512],
|
||||
+ int *synth_buf_offset, float synth_buf2[32],
|
||||
+ const float window[512], float *samples_out,
|
||||
+ float raXin[32], float scale);
|
||||
} DCADSPContext;
|
||||
|
||||
void ff_dcadsp_init(DCADSPContext *s);
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,551 @@
|
||||
From a6c273927c5bb212e806be6ae10c81dcd81b2152 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 15 Jul 2013 18:28:17 +0100
|
||||
Subject: [PATCH 48/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
|
||||
of qmf_32_subbands
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
This function 1323.0 98.0 746.2 60.6 +77.3%
|
||||
Overall 15400.0 336.4 14147.5 288.4 +8.9%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/arm/dcadsp_init_arm.c | 10 +-
|
||||
libavcodec/arm/dcadsp_vfp.S | 493 +++++++++++++++++++++++++++
|
||||
2 files changed, 502 insertions(+), 1 deletion(-)
|
||||
create mode 100644 libavcodec/arm/dcadsp_vfp.S
|
||||
|
||||
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
|
||||
index a1efbff..58267a2 100644
|
||||
--- a/libavcodec/arm/dcadsp_init_arm.c
|
||||
+++ b/libavcodec/arm/dcadsp_init_arm.c
|
||||
@@ -26,6 +26,12 @@
|
||||
|
||||
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
|
||||
+ SynthFilterContext *synth, FFTContext *imdct,
|
||||
+ float synth_buf_ptr[512],
|
||||
+ int *synth_buf_offset, float synth_buf2[32],
|
||||
+ const float window[512], float *samples_out,
|
||||
+ float raXin[32], float scale);
|
||||
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
|
||||
int decifactor, float scale);
|
||||
|
||||
@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
- if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
|
||||
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
|
||||
s->lfe_fir = ff_dca_lfe_fir_vfp;
|
||||
+ s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
|
||||
+ }
|
||||
if (have_neon(cpu_flags))
|
||||
s->lfe_fir = ff_dca_lfe_fir_neon;
|
||||
}
|
||||
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
|
||||
new file mode 100644
|
||||
index 0000000..6039e87
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/dcadsp_vfp.S
|
||||
@@ -0,0 +1,493 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of Libav.
|
||||
+ *
|
||||
+ * Libav is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * Libav is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with Libav; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+POUT .req a1
|
||||
+PIN .req a2
|
||||
+PCOEF .req a3
|
||||
+DECIFACTOR .req a4
|
||||
+OLDFPSCR .req a4
|
||||
+COUNTER .req ip
|
||||
+
|
||||
+SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
|
||||
+SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
|
||||
+IN0 .req s4
|
||||
+IN1 .req s5
|
||||
+IN2 .req s6
|
||||
+IN3 .req s7
|
||||
+IN4 .req s0
|
||||
+IN5 .req s1
|
||||
+IN6 .req s2
|
||||
+IN7 .req s3
|
||||
+COEF0 .req s8 @ coefficient elements
|
||||
+COEF1 .req s9
|
||||
+COEF2 .req s10
|
||||
+COEF3 .req s11
|
||||
+COEF4 .req s12
|
||||
+COEF5 .req s13
|
||||
+COEF6 .req s14
|
||||
+COEF7 .req s15
|
||||
+ACCUM0 .req s16 @ double-buffered multiply-accumulate results
|
||||
+ACCUM4 .req s20
|
||||
+POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
|
||||
+POST1 .req s25
|
||||
+POST2 .req s26
|
||||
+POST3 .req s27
|
||||
+
|
||||
+
|
||||
+.macro inner_loop decifactor, dir, tail, head
|
||||
+ .ifc "\dir","up"
|
||||
+ .set X, 0
|
||||
+ .set Y, 4
|
||||
+ .else
|
||||
+ .set X, 4*JMAX*4 - 4
|
||||
+ .set Y, -4
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
|
||||
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
|
||||
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
|
||||
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
|
||||
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
|
||||
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
|
||||
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
|
||||
+ .ifc "\tail",""
|
||||
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
|
||||
+ .endif
|
||||
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
|
||||
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
|
||||
+ .ifnc "\tail",""
|
||||
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
|
||||
+ .endif
|
||||
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
|
||||
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
|
||||
+ .endif
|
||||
+ .ifnc "\tail",""
|
||||
+ vstmia POUT!, {POST0-POST3}
|
||||
+ .endif
|
||||
+ .ifnc "\head",""
|
||||
+ vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
|
||||
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
|
||||
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
|
||||
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
|
||||
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
|
||||
+ vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
|
||||
+ .if \decifactor == 32
|
||||
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
|
||||
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
|
||||
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
|
||||
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
|
||||
+ vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
|
||||
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
|
||||
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
|
||||
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
|
||||
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
|
||||
+ vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
|
||||
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
|
||||
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
|
||||
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
|
||||
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
|
||||
+ vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
|
||||
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
|
||||
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
|
||||
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
|
||||
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
|
||||
+ vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
|
||||
+ .endif
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+.macro dca_lfe_fir decifactor
|
||||
+ .if \decifactor == 32
|
||||
+ .set JMAX, 8
|
||||
+ vpush {s16-s31}
|
||||
+ vmov SCALE32, s0 @ duplicate scalar across vector
|
||||
+ vldr IN4, [PIN, #-4*4]
|
||||
+ vldr IN5, [PIN, #-5*4]
|
||||
+ vldr IN6, [PIN, #-6*4]
|
||||
+ vldr IN7, [PIN, #-7*4]
|
||||
+ .else
|
||||
+ .set JMAX, 4
|
||||
+ vpush {s16-s27}
|
||||
+ .endif
|
||||
+
|
||||
+ mov COUNTER, #\decifactor/4 - 1
|
||||
+ inner_loop \decifactor, up,, head
|
||||
+1: add PCOEF, PCOEF, #4*JMAX*4
|
||||
+ subs COUNTER, COUNTER, #1
|
||||
+ inner_loop \decifactor, up, tail, head
|
||||
+ bne 1b
|
||||
+ inner_loop \decifactor, up, tail
|
||||
+
|
||||
+ mov COUNTER, #\decifactor/4 - 1
|
||||
+ inner_loop \decifactor, down,, head
|
||||
+1: sub PCOEF, PCOEF, #4*JMAX*4
|
||||
+ subs COUNTER, COUNTER, #1
|
||||
+ inner_loop \decifactor, down, tail, head
|
||||
+ bne 1b
|
||||
+ inner_loop \decifactor, down, tail
|
||||
+
|
||||
+ .if \decifactor == 32
|
||||
+ vpop {s16-s31}
|
||||
+ .else
|
||||
+ vpop {s16-s27}
|
||||
+ .endif
|
||||
+ fmxr FPSCR, OLDFPSCR
|
||||
+ bx lr
|
||||
+.endm
|
||||
+
|
||||
+
|
||||
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
|
||||
+ * int decifactor, float scale)
|
||||
+ */
|
||||
+function ff_dca_lfe_fir_vfp, export=1
|
||||
+ teq DECIFACTOR, #32
|
||||
+ fmrx OLDFPSCR, FPSCR
|
||||
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
+ fmxr FPSCR, ip
|
||||
+NOVFP vldr s0, [sp]
|
||||
+ vldr IN0, [PIN, #-0*4]
|
||||
+ vldr IN1, [PIN, #-1*4]
|
||||
+ vldr IN2, [PIN, #-2*4]
|
||||
+ vldr IN3, [PIN, #-3*4]
|
||||
+ beq 32f
|
||||
+64: dca_lfe_fir 64
|
||||
+ .ltorg
|
||||
+32: dca_lfe_fir 32
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq POUT
|
||||
+ .unreq PIN
|
||||
+ .unreq PCOEF
|
||||
+ .unreq DECIFACTOR
|
||||
+ .unreq OLDFPSCR
|
||||
+ .unreq COUNTER
|
||||
+
|
||||
+ .unreq SCALE32
|
||||
+ .unreq SCALE64
|
||||
+ .unreq IN0
|
||||
+ .unreq IN1
|
||||
+ .unreq IN2
|
||||
+ .unreq IN3
|
||||
+ .unreq IN4
|
||||
+ .unreq IN5
|
||||
+ .unreq IN6
|
||||
+ .unreq IN7
|
||||
+ .unreq COEF0
|
||||
+ .unreq COEF1
|
||||
+ .unreq COEF2
|
||||
+ .unreq COEF3
|
||||
+ .unreq COEF4
|
||||
+ .unreq COEF5
|
||||
+ .unreq COEF6
|
||||
+ .unreq COEF7
|
||||
+ .unreq ACCUM0
|
||||
+ .unreq ACCUM4
|
||||
+ .unreq POST0
|
||||
+ .unreq POST1
|
||||
+ .unreq POST2
|
||||
+ .unreq POST3
|
||||
+
|
||||
+
|
||||
+IN .req a1
|
||||
+SBACT .req a2
|
||||
+OLDFPSCR .req a3
|
||||
+IMDCT .req a4
|
||||
+WINDOW .req v1
|
||||
+OUT .req v2
|
||||
+BUF .req v3
|
||||
+SCALEINT .req v4 @ only used in softfp case
|
||||
+COUNT .req v5
|
||||
+
|
||||
+SCALE .req s0
|
||||
+
|
||||
+/* Stack layout differs in softfp and hardfp cases:
|
||||
+ *
|
||||
+ * hardfp
|
||||
+ * fp -> 6 arg words saved by caller
|
||||
+ * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
|
||||
+ * s16-s23 on entry
|
||||
+ * align 16
|
||||
+ * buf -> 8*32*4 bytes buffer
|
||||
+ * s0 on entry
|
||||
+ * sp -> 3 arg words for callee
|
||||
+ *
|
||||
+ * softfp
|
||||
+ * fp -> 7 arg words saved by caller
|
||||
+ * a4,v1-v5,fp,lr on entry
|
||||
+ * s16-s23 on entry
|
||||
+ * align 16
|
||||
+ * buf -> 8*32*4 bytes buffer
|
||||
+ * sp -> 4 arg words for callee
|
||||
+ */
|
||||
+
|
||||
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
|
||||
+ * SynthFilterContext *synth, FFTContext *imdct,
|
||||
+ * float (*synth_buf_ptr)[512],
|
||||
+ * int *synth_buf_offset, float (*synth_buf2)[32],
|
||||
+ * const float (*window)[512], float *samples_out,
|
||||
+ * float (*raXin)[32], float scale);
|
||||
+ */
|
||||
+function ff_dca_qmf_32_subbands_vfp, export=1
|
||||
+VFP push {a3-a4,v1-v3,v5,fp,lr}
|
||||
+NOVFP push {a4,v1-v5,fp,lr}
|
||||
+ add fp, sp, #8*4
|
||||
+ vpush {s16-s23}
|
||||
+ @ The buffer pointed at by raXin isn't big enough for us to do a
|
||||
+ @ complete matrix transposition as we want to, so allocate an
|
||||
+ @ alternative buffer from the stack. Align to 4 words for speed.
|
||||
+ sub BUF, sp, #8*32*4
|
||||
+ bic BUF, BUF, #15
|
||||
+ mov sp, BUF
|
||||
+ ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
|
||||
+ fmrx OLDFPSCR, FPSCR
|
||||
+ fmxr FPSCR, lr
|
||||
+ @ COUNT is used to count down 2 things at once:
|
||||
+ @ bits 0-4 are the number of word pairs remaining in the output row
|
||||
+ @ bits 5-31 are the number of words to copy (with possible negation)
|
||||
+ @ from the source matrix before we start zeroing the remainder
|
||||
+ mov COUNT, #(-4 << 5) + 16
|
||||
+ adds COUNT, COUNT, SBACT, lsl #5
|
||||
+ bmi 2f
|
||||
+1:
|
||||
+ vldr s8, [IN, #(0*8+0)*4]
|
||||
+ vldr s10, [IN, #(0*8+1)*4]
|
||||
+ vldr s12, [IN, #(0*8+2)*4]
|
||||
+ vldr s14, [IN, #(0*8+3)*4]
|
||||
+ vldr s16, [IN, #(0*8+4)*4]
|
||||
+ vldr s18, [IN, #(0*8+5)*4]
|
||||
+ vldr s20, [IN, #(0*8+6)*4]
|
||||
+ vldr s22, [IN, #(0*8+7)*4]
|
||||
+ vneg.f s8, s8
|
||||
+ vldr s9, [IN, #(1*8+0)*4]
|
||||
+ vldr s11, [IN, #(1*8+1)*4]
|
||||
+ vldr s13, [IN, #(1*8+2)*4]
|
||||
+ vldr s15, [IN, #(1*8+3)*4]
|
||||
+ vneg.f s16, s16
|
||||
+ vldr s17, [IN, #(1*8+4)*4]
|
||||
+ vldr s19, [IN, #(1*8+5)*4]
|
||||
+ vldr s21, [IN, #(1*8+6)*4]
|
||||
+ vldr s23, [IN, #(1*8+7)*4]
|
||||
+ vstr d4, [BUF, #(0*32+0)*4]
|
||||
+ vstr d5, [BUF, #(1*32+0)*4]
|
||||
+ vstr d6, [BUF, #(2*32+0)*4]
|
||||
+ vstr d7, [BUF, #(3*32+0)*4]
|
||||
+ vstr d8, [BUF, #(4*32+0)*4]
|
||||
+ vstr d9, [BUF, #(5*32+0)*4]
|
||||
+ vstr d10, [BUF, #(6*32+0)*4]
|
||||
+ vstr d11, [BUF, #(7*32+0)*4]
|
||||
+ vldr s9, [IN, #(3*8+0)*4]
|
||||
+ vldr s11, [IN, #(3*8+1)*4]
|
||||
+ vldr s13, [IN, #(3*8+2)*4]
|
||||
+ vldr s15, [IN, #(3*8+3)*4]
|
||||
+ vldr s17, [IN, #(3*8+4)*4]
|
||||
+ vldr s19, [IN, #(3*8+5)*4]
|
||||
+ vldr s21, [IN, #(3*8+6)*4]
|
||||
+ vldr s23, [IN, #(3*8+7)*4]
|
||||
+ vneg.f s9, s9
|
||||
+ vldr s8, [IN, #(2*8+0)*4]
|
||||
+ vldr s10, [IN, #(2*8+1)*4]
|
||||
+ vldr s12, [IN, #(2*8+2)*4]
|
||||
+ vldr s14, [IN, #(2*8+3)*4]
|
||||
+ vneg.f s17, s17
|
||||
+ vldr s16, [IN, #(2*8+4)*4]
|
||||
+ vldr s18, [IN, #(2*8+5)*4]
|
||||
+ vldr s20, [IN, #(2*8+6)*4]
|
||||
+ vldr s22, [IN, #(2*8+7)*4]
|
||||
+ vstr d4, [BUF, #(0*32+2)*4]
|
||||
+ vstr d5, [BUF, #(1*32+2)*4]
|
||||
+ vstr d6, [BUF, #(2*32+2)*4]
|
||||
+ vstr d7, [BUF, #(3*32+2)*4]
|
||||
+ vstr d8, [BUF, #(4*32+2)*4]
|
||||
+ vstr d9, [BUF, #(5*32+2)*4]
|
||||
+ vstr d10, [BUF, #(6*32+2)*4]
|
||||
+ vstr d11, [BUF, #(7*32+2)*4]
|
||||
+ add IN, IN, #4*8*4
|
||||
+ add BUF, BUF, #4*4
|
||||
+ subs COUNT, COUNT, #(4 << 5) + 2
|
||||
+ bpl 1b
|
||||
+2: @ Now deal with trailing < 4 samples
|
||||
+ adds COUNT, COUNT, #3 << 5
|
||||
+ bmi 4f @ sb_act was a multiple of 4
|
||||
+ bics lr, COUNT, #0x1F
|
||||
+ bne 3f
|
||||
+ @ sb_act was n*4+1
|
||||
+ vldr s8, [IN, #(0*8+0)*4]
|
||||
+ vldr s10, [IN, #(0*8+1)*4]
|
||||
+ vldr s12, [IN, #(0*8+2)*4]
|
||||
+ vldr s14, [IN, #(0*8+3)*4]
|
||||
+ vldr s16, [IN, #(0*8+4)*4]
|
||||
+ vldr s18, [IN, #(0*8+5)*4]
|
||||
+ vldr s20, [IN, #(0*8+6)*4]
|
||||
+ vldr s22, [IN, #(0*8+7)*4]
|
||||
+ vneg.f s8, s8
|
||||
+ vldr s9, zero
|
||||
+ vldr s11, zero
|
||||
+ vldr s13, zero
|
||||
+ vldr s15, zero
|
||||
+ vneg.f s16, s16
|
||||
+ vldr s17, zero
|
||||
+ vldr s19, zero
|
||||
+ vldr s21, zero
|
||||
+ vldr s23, zero
|
||||
+ vstr d4, [BUF, #(0*32+0)*4]
|
||||
+ vstr d5, [BUF, #(1*32+0)*4]
|
||||
+ vstr d6, [BUF, #(2*32+0)*4]
|
||||
+ vstr d7, [BUF, #(3*32+0)*4]
|
||||
+ vstr d8, [BUF, #(4*32+0)*4]
|
||||
+ vstr d9, [BUF, #(5*32+0)*4]
|
||||
+ vstr d10, [BUF, #(6*32+0)*4]
|
||||
+ vstr d11, [BUF, #(7*32+0)*4]
|
||||
+ add BUF, BUF, #2*4
|
||||
+ sub COUNT, COUNT, #1
|
||||
+ b 4f
|
||||
+3: @ sb_act was n*4+2 or n*4+3, so do the first 2
|
||||
+ vldr s8, [IN, #(0*8+0)*4]
|
||||
+ vldr s10, [IN, #(0*8+1)*4]
|
||||
+ vldr s12, [IN, #(0*8+2)*4]
|
||||
+ vldr s14, [IN, #(0*8+3)*4]
|
||||
+ vldr s16, [IN, #(0*8+4)*4]
|
||||
+ vldr s18, [IN, #(0*8+5)*4]
|
||||
+ vldr s20, [IN, #(0*8+6)*4]
|
||||
+ vldr s22, [IN, #(0*8+7)*4]
|
||||
+ vneg.f s8, s8
|
||||
+ vldr s9, [IN, #(1*8+0)*4]
|
||||
+ vldr s11, [IN, #(1*8+1)*4]
|
||||
+ vldr s13, [IN, #(1*8+2)*4]
|
||||
+ vldr s15, [IN, #(1*8+3)*4]
|
||||
+ vneg.f s16, s16
|
||||
+ vldr s17, [IN, #(1*8+4)*4]
|
||||
+ vldr s19, [IN, #(1*8+5)*4]
|
||||
+ vldr s21, [IN, #(1*8+6)*4]
|
||||
+ vldr s23, [IN, #(1*8+7)*4]
|
||||
+ vstr d4, [BUF, #(0*32+0)*4]
|
||||
+ vstr d5, [BUF, #(1*32+0)*4]
|
||||
+ vstr d6, [BUF, #(2*32+0)*4]
|
||||
+ vstr d7, [BUF, #(3*32+0)*4]
|
||||
+ vstr d8, [BUF, #(4*32+0)*4]
|
||||
+ vstr d9, [BUF, #(5*32+0)*4]
|
||||
+ vstr d10, [BUF, #(6*32+0)*4]
|
||||
+ vstr d11, [BUF, #(7*32+0)*4]
|
||||
+ add BUF, BUF, #2*4
|
||||
+ sub COUNT, COUNT, #(2 << 5) + 1
|
||||
+ bics lr, COUNT, #0x1F
|
||||
+ bne 4f
|
||||
+ @ sb_act was n*4+3
|
||||
+ vldr s8, [IN, #(2*8+0)*4]
|
||||
+ vldr s10, [IN, #(2*8+1)*4]
|
||||
+ vldr s12, [IN, #(2*8+2)*4]
|
||||
+ vldr s14, [IN, #(2*8+3)*4]
|
||||
+ vldr s16, [IN, #(2*8+4)*4]
|
||||
+ vldr s18, [IN, #(2*8+5)*4]
|
||||
+ vldr s20, [IN, #(2*8+6)*4]
|
||||
+ vldr s22, [IN, #(2*8+7)*4]
|
||||
+ vldr s9, zero
|
||||
+ vldr s11, zero
|
||||
+ vldr s13, zero
|
||||
+ vldr s15, zero
|
||||
+ vldr s17, zero
|
||||
+ vldr s19, zero
|
||||
+ vldr s21, zero
|
||||
+ vldr s23, zero
|
||||
+ vstr d4, [BUF, #(0*32+0)*4]
|
||||
+ vstr d5, [BUF, #(1*32+0)*4]
|
||||
+ vstr d6, [BUF, #(2*32+0)*4]
|
||||
+ vstr d7, [BUF, #(3*32+0)*4]
|
||||
+ vstr d8, [BUF, #(4*32+0)*4]
|
||||
+ vstr d9, [BUF, #(5*32+0)*4]
|
||||
+ vstr d10, [BUF, #(6*32+0)*4]
|
||||
+ vstr d11, [BUF, #(7*32+0)*4]
|
||||
+ add BUF, BUF, #2*4
|
||||
+ sub COUNT, COUNT, #1
|
||||
+4: @ Now fill the remainder with 0
|
||||
+ vldr s8, zero
|
||||
+ vldr s9, zero
|
||||
+ ands COUNT, COUNT, #0x1F
|
||||
+ beq 6f
|
||||
+5: vstr d4, [BUF, #(0*32+0)*4]
|
||||
+ vstr d4, [BUF, #(1*32+0)*4]
|
||||
+ vstr d4, [BUF, #(2*32+0)*4]
|
||||
+ vstr d4, [BUF, #(3*32+0)*4]
|
||||
+ vstr d4, [BUF, #(4*32+0)*4]
|
||||
+ vstr d4, [BUF, #(5*32+0)*4]
|
||||
+ vstr d4, [BUF, #(6*32+0)*4]
|
||||
+ vstr d4, [BUF, #(7*32+0)*4]
|
||||
+ add BUF, BUF, #2*4
|
||||
+ subs COUNT, COUNT, #1
|
||||
+ bne 5b
|
||||
+6:
|
||||
+ fmxr FPSCR, OLDFPSCR
|
||||
+ ldr WINDOW, [fp, #3*4]
|
||||
+ ldr OUT, [fp, #4*4]
|
||||
+ sub BUF, BUF, #32*4
|
||||
+NOVFP ldr SCALEINT, [fp, #6*4]
|
||||
+ mov COUNT, #8
|
||||
+VFP vpush {SCALE}
|
||||
+VFP sub sp, sp, #3*4
|
||||
+NOVFP sub sp, sp, #4*4
|
||||
+7:
|
||||
+VFP ldr a1, [fp, #-7*4] @ imdct
|
||||
+NOVFP ldr a1, [fp, #-8*4]
|
||||
+ ldmia fp, {a2-a4}
|
||||
+VFP stmia sp, {WINDOW, OUT, BUF}
|
||||
+NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
|
||||
+VFP vldr SCALE, [sp, #3*4]
|
||||
+ bl ff_synth_filter_float_vfp
|
||||
+ add OUT, OUT, #32*4
|
||||
+ add BUF, BUF, #32*4
|
||||
+ subs COUNT, COUNT, #1
|
||||
+ bne 7b
|
||||
+
|
||||
+A sub sp, fp, #(8+8)*4
|
||||
+T sub fp, fp, #(8+8)*4
|
||||
+T mov sp, fp
|
||||
+ vpop {s16-s23}
|
||||
+VFP pop {a3-a4,v1-v3,v5,fp,pc}
|
||||
+NOVFP pop {a4,v1-v5,fp,pc}
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq IN
|
||||
+ .unreq SBACT
|
||||
+ .unreq OLDFPSCR
|
||||
+ .unreq IMDCT
|
||||
+ .unreq WINDOW
|
||||
+ .unreq OUT
|
||||
+ .unreq BUF
|
||||
+ .unreq SCALEINT
|
||||
+ .unreq COUNT
|
||||
+
|
||||
+ .unreq SCALE
|
||||
+
|
||||
+ .align 2
|
||||
+zero: .word 0
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,64 @@
|
||||
From 101f5a2c5db12605c24fe4aa41b3fabacfd3bad3 Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
|
||||
Date: Mon, 22 Jul 2013 12:33:22 +0300
|
||||
Subject: [PATCH 49/49] [ffmpeg] - backport - arm: Mangle external symbols
|
||||
properly in new vfp assembly files
|
||||
|
||||
Reviewed-by: Kostya Shishkov
|
||||
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
|
||||
---
|
||||
libavcodec/arm/dcadsp_vfp.S | 2 +-
|
||||
libavcodec/arm/mdct_vfp.S | 4 ++--
|
||||
libavcodec/arm/synth_filter_vfp.S | 2 +-
|
||||
3 files changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
|
||||
index 6039e87..5892a84 100644
|
||||
--- a/libavcodec/arm/dcadsp_vfp.S
|
||||
+++ b/libavcodec/arm/dcadsp_vfp.S
|
||||
@@ -463,7 +463,7 @@ NOVFP ldr a1, [fp, #-8*4]
|
||||
VFP stmia sp, {WINDOW, OUT, BUF}
|
||||
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
|
||||
VFP vldr SCALE, [sp, #3*4]
|
||||
- bl ff_synth_filter_float_vfp
|
||||
+ bl X(ff_synth_filter_float_vfp)
|
||||
add OUT, OUT, #32*4
|
||||
add BUF, BUF, #32*4
|
||||
subs COUNT, COUNT, #1
|
||||
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
|
||||
index 0623e96..94db24f 100644
|
||||
--- a/libavcodec/arm/mdct_vfp.S
|
||||
+++ b/libavcodec/arm/mdct_vfp.S
|
||||
@@ -151,7 +151,7 @@ function ff_imdct_half_vfp, export=1
|
||||
ldr ip, [CONTEXT, #5*4] @ mdct_bits
|
||||
teq ip, #6
|
||||
it ne
|
||||
- bne ff_imdct_half_c @ only case currently accelerated is the one used by DCA
|
||||
+ bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
|
||||
|
||||
.set n, 1<<6
|
||||
.set n2, n/2
|
||||
@@ -175,7 +175,7 @@ function ff_imdct_half_vfp, export=1
|
||||
|
||||
fmxr FPSCR, OLDFPSCR
|
||||
mov a1, OUT
|
||||
- bl ff_fft16_vfp
|
||||
+ bl X(ff_fft16_vfp)
|
||||
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
|
||||
fmxr FPSCR, lr
|
||||
|
||||
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
|
||||
index c219c41..e6e6408 100644
|
||||
--- a/libavcodec/arm/synth_filter_vfp.S
|
||||
+++ b/libavcodec/arm/synth_filter_vfp.S
|
||||
@@ -132,7 +132,7 @@ function ff_synth_filter_float_vfp, export=1
|
||||
str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
|
||||
ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
|
||||
VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
|
||||
- bl ff_imdct_half_vfp
|
||||
+ bl X(ff_imdct_half_vfp)
|
||||
VFP vmov SCALE, s16
|
||||
|
||||
fmrx OLDFPSCR, FPSCR
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,72 @@
|
||||
From 5ce8f2bf354b7adf904ac3e1438915586c5a0bb1 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 31 Jul 2013 23:46:08 +0100
|
||||
Subject: [PATCH 51/54] [ffmpeg] - backport - avio: Add an internal function
|
||||
for reading without copying
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
As long as there is enough contiguous data in the avio buffer,
|
||||
just return a pointer to it instead of copying it to the caller
|
||||
provided buffer.
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavformat/avio_internal.h | 17 +++++++++++++++++
|
||||
libavformat/aviobuf.c | 12 ++++++++++++
|
||||
2 files changed, 29 insertions(+)
|
||||
|
||||
diff --git a/libavformat/avio_internal.h b/libavformat/avio_internal.h
|
||||
index cf36764..e9ece57 100644
|
||||
--- a/libavformat/avio_internal.h
|
||||
+++ b/libavformat/avio_internal.h
|
||||
@@ -38,6 +38,23 @@ int ffio_init_context(AVIOContext *s,
|
||||
|
||||
|
||||
/**
|
||||
+ * Read size bytes from AVIOContext, returning a pointer.
|
||||
+ * Note that the data pointed at by the returned pointer is only
|
||||
+ * valid until the next call that references the same IO context.
|
||||
+ * @param s IO context
|
||||
+ * @param buf pointer to buffer into which to assemble the requested
|
||||
+ * data if it is not available in contiguous addresses in the
|
||||
+ * underlying buffer
|
||||
+ * @param size number of bytes requested
|
||||
+ * @param data address at which to store pointer: this will be
|
||||
+ * a direct pointer into the underlying buffer if the requested
|
||||
+ * number of bytes are available at contiguous addresses, otherwise
|
||||
+ * will be a copy of buf
|
||||
+ * @return number of bytes read or AVERROR
|
||||
+ */
|
||||
+int ffio_read_indirect(AVIOContext *s, unsigned char *buf, int size, unsigned char **data);
|
||||
+
|
||||
+/**
|
||||
* Read size bytes from AVIOContext into buf.
|
||||
* This reads at most 1 packet. If that is not enough fewer bytes will be
|
||||
* returned.
|
||||
diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c
|
||||
index 7a73a17..465c46d 100644
|
||||
--- a/libavformat/aviobuf.c
|
||||
+++ b/libavformat/aviobuf.c
|
||||
@@ -522,6 +522,18 @@ int avio_read(AVIOContext *s, unsigned char *buf, int size)
|
||||
return size1 - size;
|
||||
}
|
||||
|
||||
+int ffio_read_indirect(AVIOContext *s, unsigned char *buf, int size, unsigned char **data)
|
||||
+{
|
||||
+ if (s->buf_end - s->buf_ptr >= size && !s->write_flag) {
|
||||
+ *data = s->buf_ptr;
|
||||
+ s->buf_ptr += size;
|
||||
+ return size;
|
||||
+ } else {
|
||||
+ *data = buf;
|
||||
+ return avio_read(s, buf, size);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
int ffio_read_partial(AVIOContext *s, unsigned char *buf, int size)
|
||||
{
|
||||
int len;
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,149 @@
|
||||
From 1496d8c12075c0f3783e348a5d73fef9e3000b0f Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 31 Jul 2013 23:46:08 +0100
|
||||
Subject: [PATCH 52/54] [ffmpeg] - backport - mpegts: Remove one memcpy per
|
||||
packet
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This was being performed to ensure that a complete packet was held in
|
||||
contiguous memory, prior to parsing the packet. However, the source buffer
|
||||
is typically large enough that the packet was already contiguous, so it is
|
||||
beneficial to return the packet by reference in most cases.
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
memcpy 720.7 32.7 649.8 25.1 +10.9%
|
||||
Overall 2372.7 46.1 2291.7 21.8 +3.5%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavformat/mpegts.c | 41 ++++++++++++++++++++++++++-------------
|
||||
1 file changed, 28 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
|
||||
index b5f5d63..5307521 100644
|
||||
--- a/libavformat/mpegts.c
|
||||
+++ b/libavformat/mpegts.c
|
||||
@@ -1863,17 +1863,17 @@ static int mpegts_resync(AVFormatContext *s)
|
||||
}
|
||||
|
||||
/* return -1 if error or EOF. Return 0 if OK. */
|
||||
-static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size)
|
||||
+static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size, uint8_t **data)
|
||||
{
|
||||
AVIOContext *pb = s->pb;
|
||||
- int skip, len;
|
||||
+ int len;
|
||||
|
||||
for(;;) {
|
||||
- len = avio_read(pb, buf, TS_PACKET_SIZE);
|
||||
+ len = ffio_read_indirect(pb, buf, TS_PACKET_SIZE, data);
|
||||
if (len != TS_PACKET_SIZE)
|
||||
return len < 0 ? len : AVERROR_EOF;
|
||||
/* check packet sync byte */
|
||||
- if (buf[0] != 0x47) {
|
||||
+ if ((*data)[0] != 0x47) {
|
||||
/* find a new packet start */
|
||||
avio_seek(pb, -TS_PACKET_SIZE, SEEK_CUR);
|
||||
if (mpegts_resync(s) < 0)
|
||||
@@ -1881,19 +1881,25 @@ static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size)
|
||||
else
|
||||
continue;
|
||||
} else {
|
||||
- skip = raw_packet_size - TS_PACKET_SIZE;
|
||||
- if (skip > 0)
|
||||
- avio_skip(pb, skip);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
+static void finished_reading_packet(AVFormatContext *s, int raw_packet_size)
|
||||
+{
|
||||
+ AVIOContext *pb = s->pb;
|
||||
+ int skip = raw_packet_size - TS_PACKET_SIZE;
|
||||
+ if (skip > 0)
|
||||
+ avio_skip(pb, skip);
|
||||
+}
|
||||
+
|
||||
static int handle_packets(MpegTSContext *ts, int nb_packets)
|
||||
{
|
||||
AVFormatContext *s = ts->stream;
|
||||
uint8_t packet[TS_PACKET_SIZE + FF_INPUT_BUFFER_PADDING_SIZE];
|
||||
+ uint8_t *data;
|
||||
int packet_num, ret = 0;
|
||||
|
||||
if (avio_tell(s->pb) != ts->last_pos) {
|
||||
@@ -1926,10 +1932,11 @@ static int handle_packets(MpegTSContext *ts, int nb_packets)
|
||||
if (ts->stop_parse > 0)
|
||||
break;
|
||||
|
||||
- ret = read_packet(s, packet, ts->raw_packet_size);
|
||||
+ ret = read_packet(s, packet, ts->raw_packet_size, &data);
|
||||
if (ret != 0)
|
||||
break;
|
||||
- ret = handle_packet(ts, packet);
|
||||
+ ret = handle_packet(ts, data);
|
||||
+ finished_reading_packet(s, ts->raw_packet_size);
|
||||
if (ret != 0)
|
||||
break;
|
||||
}
|
||||
@@ -2087,6 +2094,7 @@ static int mpegts_read_header(AVFormatContext *s)
|
||||
int64_t pcrs[2], pcr_h;
|
||||
int packet_count[2];
|
||||
uint8_t packet[TS_PACKET_SIZE];
|
||||
+ uint8_t *data;
|
||||
|
||||
/* only read packets */
|
||||
|
||||
@@ -2102,18 +2110,21 @@ static int mpegts_read_header(AVFormatContext *s)
|
||||
nb_pcrs = 0;
|
||||
nb_packets = 0;
|
||||
for(;;) {
|
||||
- ret = read_packet(s, packet, ts->raw_packet_size);
|
||||
+ ret = read_packet(s, packet, ts->raw_packet_size, &data);
|
||||
if (ret < 0)
|
||||
return -1;
|
||||
- pid = AV_RB16(packet + 1) & 0x1fff;
|
||||
+ pid = AV_RB16(data + 1) & 0x1fff;
|
||||
if ((pcr_pid == -1 || pcr_pid == pid) &&
|
||||
- parse_pcr(&pcr_h, &pcr_l, packet) == 0) {
|
||||
+ parse_pcr(&pcr_h, &pcr_l, data) == 0) {
|
||||
+ finished_reading_packet(s, ts->raw_packet_size);
|
||||
pcr_pid = pid;
|
||||
packet_count[nb_pcrs] = nb_packets;
|
||||
pcrs[nb_pcrs] = pcr_h * 300 + pcr_l;
|
||||
nb_pcrs++;
|
||||
if (nb_pcrs >= 2)
|
||||
break;
|
||||
+ } else {
|
||||
+ finished_reading_packet(s, ts->raw_packet_size);
|
||||
}
|
||||
nb_packets++;
|
||||
}
|
||||
@@ -2145,15 +2156,19 @@ static int mpegts_raw_read_packet(AVFormatContext *s,
|
||||
int64_t pcr_h, next_pcr_h, pos;
|
||||
int pcr_l, next_pcr_l;
|
||||
uint8_t pcr_buf[12];
|
||||
+ uint8_t *data;
|
||||
|
||||
if (av_new_packet(pkt, TS_PACKET_SIZE) < 0)
|
||||
return AVERROR(ENOMEM);
|
||||
pkt->pos= avio_tell(s->pb);
|
||||
- ret = read_packet(s, pkt->data, ts->raw_packet_size);
|
||||
+ ret = read_packet(s, pkt->data, ts->raw_packet_size, &data);
|
||||
if (ret < 0) {
|
||||
av_free_packet(pkt);
|
||||
return ret;
|
||||
}
|
||||
+ if (data != pkt->data)
|
||||
+ memcpy(pkt->data, data, ts->raw_packet_size);
|
||||
+ finished_reading_packet(s, ts->raw_packet_size);
|
||||
if (ts->mpeg2ts_compute_pcr) {
|
||||
/* compute exact PCR for each packet */
|
||||
if (parse_pcr(&pcr_h, &pcr_l, pkt->data) == 0) {
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,47 @@
|
||||
From 6aec5772fd5331b3514f308ab0895f6234b60045 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 5 Aug 2013 13:12:51 +0100
|
||||
Subject: [PATCH 53/54] [ffmpeg] - backport - mpegts: Make discard_pid()
|
||||
faster for single-program streams
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
When a stream contains a single program, there's no point in doing a
|
||||
PID -> program lookup. Normally the one and only program isn't disabled,
|
||||
so no packets should be discarded.
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
discard_pid() 73.8 9.4 20.2 1.5 +264.8%
|
||||
Overall 2300.8 28.0 2253.1 20.6 +2.1%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavformat/mpegts.c | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
|
||||
index 5307521..82dd209 100644
|
||||
--- a/libavformat/mpegts.c
|
||||
+++ b/libavformat/mpegts.c
|
||||
@@ -268,6 +268,17 @@ static int discard_pid(MpegTSContext *ts, unsigned int pid)
|
||||
int i, j, k;
|
||||
int used = 0, discarded = 0;
|
||||
struct Program *p;
|
||||
+
|
||||
+ /* If none of the programs have .discard=AVDISCARD_ALL then there's
|
||||
+ * no way we have to discard this packet
|
||||
+ */
|
||||
+ for (k = 0; k < ts->stream->nb_programs; k++) {
|
||||
+ if (ts->stream->programs[k]->discard == AVDISCARD_ALL)
|
||||
+ break;
|
||||
+ }
|
||||
+ if (k == ts->stream->nb_programs)
|
||||
+ return 0;
|
||||
+
|
||||
for(i=0; i<ts->nb_prg; i++) {
|
||||
p = &ts->prg[i];
|
||||
for(j=0; j<p->nb_pids; j++) {
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,76 @@
|
||||
From b79aa2b89ed9027a72a10c1d26ccdf2bb385d57b Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 5 Aug 2013 13:12:49 +0100
|
||||
Subject: [PATCH 54/54] [ffmpeg] - backport - mpegts: Remove one 64-bit
|
||||
integer modulus operation per packet
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
The common case of the pointer having increased by one packet (which results
|
||||
in no change to the modulus) can be detected with a 64-bit subtraction,
|
||||
which is far cheaper than a division on many platforms.
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Change
|
||||
Divisions 248.3 8.8 51.5 7.4 +381.7%
|
||||
Overall 2773.2 25.6 2372.5 43.1 +16.9%
|
||||
|
||||
Signed-off-by: Martin Storsjö <martin@martin.st>
|
||||
---
|
||||
libavcodec/mathops.h | 9 +++++++++
|
||||
libavformat/mpegts.c | 5 ++++-
|
||||
2 files changed, 13 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
|
||||
index 592f5a5..1d57342 100644
|
||||
--- a/libavcodec/mathops.h
|
||||
+++ b/libavcodec/mathops.h
|
||||
@@ -195,6 +195,15 @@ if ((y) < (x)) {\
|
||||
# define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
|
||||
#endif /* FASTDIV */
|
||||
|
||||
+#ifndef MOD_UNLIKELY
|
||||
+# define MOD_UNLIKELY(modulus, dividend, divisor, prev_dividend) \
|
||||
+ do { \
|
||||
+ if ((prev_dividend) == 0 || (dividend) - (prev_dividend) != (divisor)) \
|
||||
+ (modulus) = (dividend) % (divisor); \
|
||||
+ (prev_dividend) = (dividend); \
|
||||
+ } while (0)
|
||||
+#endif
|
||||
+
|
||||
static inline av_const unsigned int ff_sqrt(unsigned int a)
|
||||
{
|
||||
unsigned int b;
|
||||
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
|
||||
index 82dd209..b995f60 100644
|
||||
--- a/libavformat/mpegts.c
|
||||
+++ b/libavformat/mpegts.c
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "libavutil/avassert.h"
|
||||
#include "libavcodec/bytestream.h"
|
||||
#include "libavcodec/get_bits.h"
|
||||
+#include "libavcodec/mathops.h"
|
||||
#include "avformat.h"
|
||||
#include "mpegts.h"
|
||||
#include "internal.h"
|
||||
@@ -99,6 +100,8 @@ struct MpegTSContext {
|
||||
int raw_packet_size;
|
||||
|
||||
int pos47;
|
||||
+ /** position corresponding to pos47, or 0 if pos47 invalid */
|
||||
+ int64_t pos;
|
||||
|
||||
/** if true, all pids are analyzed to find streams */
|
||||
int auto_guess;
|
||||
@@ -1814,7 +1817,7 @@ static int handle_packet(MpegTSContext *ts, const uint8_t *packet)
|
||||
return 0;
|
||||
|
||||
pos = avio_tell(ts->stream->pb);
|
||||
- ts->pos47= pos % ts->raw_packet_size;
|
||||
+ MOD_UNLIKELY(ts->pos47, pos, ts->raw_packet_size, ts->pos);
|
||||
|
||||
if (tss->type == MPEGTS_SECTION) {
|
||||
if (is_start) {
|
||||
--
|
||||
1.7.9.5
|
@ -0,0 +1,110 @@
|
||||
From 8067f55edf3719182aed6e5b57b7863889f80218 Mon Sep 17 00:00:00 2001
|
||||
From: =?utf8?q?Reimar=20D=C3=B6ffinger?= <Reimar.Doeffinger@gmx.de>
|
||||
Date: Sat, 16 Mar 2013 13:36:20 +0100
|
||||
Subject: [PATCH] Fix compilation on ARM with android gcc 4.7
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=utf8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
With the current code it fails due to running out
|
||||
of registers.
|
||||
So code the store offsets manually into the assembler
|
||||
instead.
|
||||
Passes "make fate-dts".
|
||||
|
||||
Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
|
||||
---
|
||||
libavcodec/arm/dca.h | 74 ++++++++++++++++++++++++--------------------------
|
||||
1 file changed, 36 insertions(+), 38 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
|
||||
index 2cfd18a..431b62e 100644
|
||||
--- a/libavcodec/arm/dca.h
|
||||
+++ b/libavcodec/arm/dca.h
|
||||
@@ -34,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
|
||||
{
|
||||
int v0, v1, v2, v3, v4, v5;
|
||||
|
||||
- __asm__ ("smmul %8, %14, %18 \n"
|
||||
- "smmul %11, %15, %18 \n"
|
||||
- "smlabb %14, %8, %17, %14 \n"
|
||||
- "smlabb %15, %11, %17, %15 \n"
|
||||
- "smmul %9, %8, %18 \n"
|
||||
- "smmul %12, %11, %18 \n"
|
||||
- "sub %14, %14, %16, lsr #1 \n"
|
||||
- "sub %15, %15, %16, lsr #1 \n"
|
||||
- "smlabb %8, %9, %17, %8 \n"
|
||||
- "smlabb %11, %12, %17, %11 \n"
|
||||
- "smmul %10, %9, %18 \n"
|
||||
- "smmul %13, %12, %18 \n"
|
||||
- "str %14, %0 \n"
|
||||
- "str %15, %4 \n"
|
||||
- "sub %8, %8, %16, lsr #1 \n"
|
||||
- "sub %11, %11, %16, lsr #1 \n"
|
||||
- "smlabb %9, %10, %17, %9 \n"
|
||||
- "smlabb %12, %13, %17, %12 \n"
|
||||
- "smmul %14, %10, %18 \n"
|
||||
- "smmul %15, %13, %18 \n"
|
||||
- "str %8, %1 \n"
|
||||
- "str %11, %5 \n"
|
||||
- "sub %9, %9, %16, lsr #1 \n"
|
||||
- "sub %12, %12, %16, lsr #1 \n"
|
||||
- "smlabb %10, %14, %17, %10 \n"
|
||||
- "smlabb %13, %15, %17, %13 \n"
|
||||
- "str %9, %2 \n"
|
||||
- "str %12, %6 \n"
|
||||
- "sub %10, %10, %16, lsr #1 \n"
|
||||
- "sub %13, %13, %16, lsr #1 \n"
|
||||
- "str %10, %3 \n"
|
||||
- "str %13, %7 \n"
|
||||
- : "=m"(values[0]), "=m"(values[1]),
|
||||
- "=m"(values[2]), "=m"(values[3]),
|
||||
- "=m"(values[4]), "=m"(values[5]),
|
||||
- "=m"(values[6]), "=m"(values[7]),
|
||||
- "=&r"(v0), "=&r"(v1), "=&r"(v2),
|
||||
+ __asm__ ("smmul %0, %6, %10 \n"
|
||||
+ "smmul %3, %7, %10 \n"
|
||||
+ "smlabb %6, %0, %9, %6 \n"
|
||||
+ "smlabb %7, %3, %9, %7 \n"
|
||||
+ "smmul %1, %0, %10 \n"
|
||||
+ "smmul %4, %3, %10 \n"
|
||||
+ "sub %6, %6, %8, lsr #1 \n"
|
||||
+ "sub %7, %7, %8, lsr #1 \n"
|
||||
+ "smlabb %0, %1, %9, %0 \n"
|
||||
+ "smlabb %3, %4, %9, %3 \n"
|
||||
+ "smmul %2, %1, %10 \n"
|
||||
+ "smmul %5, %4, %10 \n"
|
||||
+ "str %6, [%11, #0] \n"
|
||||
+ "str %7, [%11, #16] \n"
|
||||
+ "sub %0, %0, %8, lsr #1 \n"
|
||||
+ "sub %3, %3, %8, lsr #1 \n"
|
||||
+ "smlabb %1, %2, %9, %1 \n"
|
||||
+ "smlabb %4, %5, %9, %4 \n"
|
||||
+ "smmul %6, %2, %10 \n"
|
||||
+ "smmul %7, %5, %10 \n"
|
||||
+ "str %0, [%11, #4] \n"
|
||||
+ "str %3, [%11, #20] \n"
|
||||
+ "sub %1, %1, %8, lsr #1 \n"
|
||||
+ "sub %4, %4, %8, lsr #1 \n"
|
||||
+ "smlabb %2, %6, %9, %2 \n"
|
||||
+ "smlabb %5, %7, %9, %5 \n"
|
||||
+ "str %1, [%11, #8] \n"
|
||||
+ "str %4, [%11, #24] \n"
|
||||
+ "sub %2, %2, %8, lsr #1 \n"
|
||||
+ "sub %5, %5, %8, lsr #1 \n"
|
||||
+ "str %2, [%11, #12] \n"
|
||||
+ "str %5, [%11, #28] \n"
|
||||
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2),
|
||||
"=&r"(v3), "=&r"(v4), "=&r"(v5),
|
||||
"+&r"(code1), "+&r"(code2)
|
||||
- : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
|
||||
+ : "r"(levels - 1), "r"(-levels),
|
||||
+ "r"(ff_inverse[levels]), "r"(values)
|
||||
+ : "memory");
|
||||
|
||||
return code1 | code2;
|
||||
}
|
||||
--
|
||||
1.7.10.4
|
||||
|
||||
|
@ -1,42 +1,15 @@
|
||||
From 67895a77c9e5f519166dd0ce4a2a98649194b11b Mon Sep 17 00:00:00 2001
|
||||
From: Rainer Hochecker <fernetmenta@online.de>
|
||||
Date: Sat, 8 Oct 2011 16:45:13 +0200
|
||||
Subject: [PATCH] ffmpeg: add xvba hwaccel
|
||||
|
||||
---
|
||||
configure | 11 ++
|
||||
libavcodec/Makefile | 6 ++
|
||||
libavcodec/allcodecs.c | 4 +
|
||||
libavcodec/h264.c | 3 +
|
||||
libavcodec/xvba.c | 66 ++++++++++++
|
||||
libavcodec/xvba.h | 71 +++++++++++++
|
||||
libavcodec/xvba_h264.c | 192 ++++++++++++++++++++++++++++++++++
|
||||
libavcodec/xvba_internal.h | 24 +++++
|
||||
libavcodec/xvba_mpeg2.c | 52 +++++++++
|
||||
libavcodec/xvba_vc1.c | 190 +++++++++++++++++++++++++++++++++
|
||||
libavutil/pixdesc.c | 6 ++
|
||||
libavutil/pixfmt.h | 1 +
|
||||
12 files changed, 626 insertions(+)
|
||||
create mode 100644 libavcodec/xvba.c
|
||||
create mode 100644 libavcodec/xvba.h
|
||||
create mode 100644 libavcodec/xvba_h264.c
|
||||
create mode 100644 libavcodec/xvba_internal.h
|
||||
create mode 100644 libavcodec/xvba_mpeg2.c
|
||||
create mode 100644 libavcodec/xvba_vc1.c
|
||||
|
||||
diff --git a/configure b/configure
|
||||
index 351611d..876a6ea 100755
|
||||
--- a/configure
|
||||
+++ b/configure
|
||||
@@ -144,6 +144,7 @@ Hardware accelerators:
|
||||
--enable-vaapi enable VAAPI code
|
||||
diff -Naur ffmpeg-1.2.3/configure ffmpeg-1.2.3.patch/configure
|
||||
--- ffmpeg-1.2.3/configure 2013-09-09 22:46:04.636832059 +0200
|
||||
+++ ffmpeg-1.2.3.patch/configure 2013-09-09 22:47:15.023872481 +0200
|
||||
@@ -144,6 +144,7 @@
|
||||
--disable-vaapi disable VAAPI code [autodetect]
|
||||
--enable-vda enable VDA code
|
||||
--enable-vdpau enable VDPAU code
|
||||
--disable-vdpau disable VDPAU code [autodetect]
|
||||
+ --disable-xvba disable XVBA code
|
||||
|
||||
Individual component options:
|
||||
--disable-everything disable all components listed below
|
||||
@@ -1197,6 +1198,7 @@ HWACCEL_LIST="
|
||||
@@ -1197,6 +1198,7 @@
|
||||
vaapi
|
||||
vda
|
||||
vdpau
|
||||
@ -44,7 +17,7 @@ index 351611d..876a6ea 100755
|
||||
"
|
||||
|
||||
LIBRARY_LIST="
|
||||
@@ -1827,6 +1829,7 @@ crystalhd_deps="libcrystalhd_libcrystalhd_if_h"
|
||||
@@ -1827,6 +1829,7 @@
|
||||
dxva2_deps="dxva2api_h"
|
||||
vaapi_deps="va_va_h"
|
||||
vda_deps="VideoDecodeAcceleration_VDADecoder_h pthreads"
|
||||
@ -52,7 +25,7 @@ index 351611d..876a6ea 100755
|
||||
vda_extralibs="-framework CoreFoundation -framework VideoDecodeAcceleration -framework QuartzCore"
|
||||
vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h"
|
||||
|
||||
@@ -1847,6 +1850,8 @@ h264_vdpau_decoder_deps="vdpau"
|
||||
@@ -1847,6 +1850,8 @@
|
||||
h264_vdpau_decoder_select="h264_decoder"
|
||||
h264_vdpau_hwaccel_deps="vdpau"
|
||||
h264_vdpau_hwaccel_select="h264_decoder"
|
||||
@ -61,7 +34,7 @@ index 351611d..876a6ea 100755
|
||||
mpeg_vdpau_decoder_deps="vdpau"
|
||||
mpeg_vdpau_decoder_select="mpegvideo_decoder"
|
||||
mpeg1_vdpau_decoder_deps="vdpau"
|
||||
@@ -1859,6 +1864,8 @@ mpeg2_dxva2_hwaccel_select="mpeg2video_decoder"
|
||||
@@ -1859,6 +1864,8 @@
|
||||
mpeg2_vaapi_hwaccel_deps="vaapi"
|
||||
mpeg2_vaapi_hwaccel_select="mpeg2video_decoder"
|
||||
mpeg2_vdpau_hwaccel_deps="vdpau"
|
||||
@ -70,7 +43,7 @@ index 351611d..876a6ea 100755
|
||||
mpeg2_vdpau_hwaccel_select="mpeg2video_decoder"
|
||||
mpeg4_crystalhd_decoder_select="crystalhd"
|
||||
mpeg4_vaapi_hwaccel_deps="vaapi"
|
||||
@@ -1877,11 +1884,14 @@ vc1_vdpau_decoder_deps="vdpau"
|
||||
@@ -1877,11 +1884,14 @@
|
||||
vc1_vdpau_decoder_select="vc1_decoder"
|
||||
vc1_vdpau_hwaccel_deps="vdpau"
|
||||
vc1_vdpau_hwaccel_select="vc1_decoder"
|
||||
@ -85,71 +58,18 @@ index 351611d..876a6ea 100755
|
||||
|
||||
# parsers
|
||||
h264_parser_select="golomb h264chroma h264dsp h264pred h264qpel videodsp"
|
||||
@@ -3832,6 +3842,7 @@ check_header termios.h
|
||||
check_header unistd.h
|
||||
@@ -3836,6 +3846,7 @@
|
||||
check_header vdpau/vdpau.h
|
||||
check_header vdpau/vdpau_x11.h
|
||||
+check_header amd/amdxvba.h
|
||||
check_cpp_condition vdpau/vdpau.h "defined(VDP_DECODER_PROFILE_MPEG4_PART2_SP)" && enable vdpau_mpeg4_support
|
||||
+check_header amd/amdxvba.h
|
||||
|
||||
check_header VideoDecodeAcceleration/VDADecoder.h
|
||||
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
|
||||
index dc065a5..c386923 100644
|
||||
--- a/libavcodec/Makefile
|
||||
+++ b/libavcodec/Makefile
|
||||
@@ -12,6 +12,7 @@ HEADERS = avcodec.h \
|
||||
vdpau.h \
|
||||
version.h \
|
||||
xvmc.h \
|
||||
+ xvba.h \
|
||||
|
||||
OBJS = allcodecs.o \
|
||||
audioconvert.o \
|
||||
@@ -73,6 +74,7 @@ OBJS-$(CONFIG_SHARED) += log2_tab.o
|
||||
OBJS-$(CONFIG_SINEWIN) += sinewin.o
|
||||
OBJS-$(CONFIG_VAAPI) += vaapi.o
|
||||
OBJS-$(CONFIG_VDPAU) += vdpau.o
|
||||
+OBJS-$(CONFIG_XVBA) += xvba.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += videodsp.o
|
||||
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
|
||||
|
||||
@@ -232,6 +234,7 @@ OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o
|
||||
OBJS-$(CONFIG_H264_VDA_HWACCEL) += vda_h264.o
|
||||
OBJS-$(CONFIG_H264_VDA_DECODER) += vda_h264_dec.o
|
||||
OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o
|
||||
+OBJS-$(CONFIG_H264_XVBA_HWACCEL) += xvba_h264.o
|
||||
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o huffyuvdec.o
|
||||
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o huffyuvenc.o
|
||||
OBJS-$(CONFIG_IAC_DECODER) += imc.o
|
||||
@@ -295,6 +298,7 @@ OBJS-$(CONFIG_MPEG1VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
|
||||
OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o
|
||||
+OBJS-$(CONFIG_MPEG2_XVBA_HWACCEL) += xvba_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2VIDEO_DECODER) += mpeg12.o mpeg12data.o
|
||||
OBJS-$(CONFIG_MPEG2VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
|
||||
timecode.o
|
||||
@@ -459,6 +463,7 @@ OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \
|
||||
OBJS-$(CONFIG_VC1_DXVA2_HWACCEL) += dxva2_vc1.o
|
||||
OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o
|
||||
OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o
|
||||
+OBJS-$(CONFIG_VC1_XVBA_HWACCEL) += xvba_vc1.o
|
||||
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
|
||||
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
|
||||
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
|
||||
@@ -788,6 +793,7 @@ SKIPHEADERS-$(CONFIG_LIBSCHROEDINGER) += libschroedinger.h
|
||||
SKIPHEADERS-$(CONFIG_LIBUTVIDEO) += libutvideo.h
|
||||
SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
|
||||
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_internal.h
|
||||
+SKIPHEADERS-$(CONFIG_XVBA) += xvba_internal.h
|
||||
SKIPHEADERS-$(CONFIG_VDA) += vda.h
|
||||
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h
|
||||
SKIPHEADERS-$(HAVE_OS2THREADS) += os2threads.h
|
||||
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
|
||||
index 584446f..7a8f61c 100644
|
||||
--- a/libavcodec/allcodecs.c
|
||||
+++ b/libavcodec/allcodecs.c
|
||||
@@ -79,18 +79,22 @@ void avcodec_register_all(void)
|
||||
check_header windows.h
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/allcodecs.c ffmpeg-1.2.3.patch/libavcodec/allcodecs.c
|
||||
--- ffmpeg-1.2.3/libavcodec/allcodecs.c 2013-08-27 02:13:44.000000000 +0200
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/allcodecs.c 2013-09-09 22:46:40.577852790 +0200
|
||||
@@ -79,18 +79,22 @@
|
||||
REGISTER_HWACCEL(H264_VAAPI, h264_vaapi);
|
||||
REGISTER_HWACCEL(H264_VDA, h264_vda);
|
||||
REGISTER_HWACCEL(H264_VDPAU, h264_vdpau);
|
||||
@ -172,10 +92,9 @@ index 584446f..7a8f61c 100644
|
||||
|
||||
/* video codecs */
|
||||
REGISTER_ENCODER(A64MULTI, a64multi);
|
||||
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
|
||||
index 937ad7a..299039f 100644
|
||||
--- a/libavcodec/h264.c
|
||||
+++ b/libavcodec/h264.c
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/h264.c ffmpeg-1.2.3.patch/libavcodec/h264.c
|
||||
--- ffmpeg-1.2.3/libavcodec/h264.c 2013-09-09 22:46:04.639832061 +0200
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/h264.c 2013-09-09 22:46:40.579852792 +0200
|
||||
@@ -81,6 +81,9 @@
|
||||
#if CONFIG_H264_VDPAU_HWACCEL
|
||||
AV_PIX_FMT_VDPAU,
|
||||
@ -186,11 +105,60 @@ index 937ad7a..299039f 100644
|
||||
AV_PIX_FMT_YUV420P,
|
||||
AV_PIX_FMT_NONE
|
||||
};
|
||||
diff --git a/libavcodec/xvba.c b/libavcodec/xvba.c
|
||||
new file mode 100644
|
||||
index 0000000..be29e5d
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba.c
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/Makefile ffmpeg-1.2.3.patch/libavcodec/Makefile
|
||||
--- ffmpeg-1.2.3/libavcodec/Makefile 2013-08-27 02:13:44.000000000 +0200
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/Makefile 2013-09-09 22:46:40.580852793 +0200
|
||||
@@ -12,6 +12,7 @@
|
||||
vdpau.h \
|
||||
version.h \
|
||||
xvmc.h \
|
||||
+ xvba.h \
|
||||
|
||||
OBJS = allcodecs.o \
|
||||
audioconvert.o \
|
||||
@@ -73,6 +74,7 @@
|
||||
OBJS-$(CONFIG_SINEWIN) += sinewin.o
|
||||
OBJS-$(CONFIG_VAAPI) += vaapi.o
|
||||
OBJS-$(CONFIG_VDPAU) += vdpau.o
|
||||
+OBJS-$(CONFIG_XVBA) += xvba.o
|
||||
OBJS-$(CONFIG_VIDEODSP) += videodsp.o
|
||||
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
|
||||
|
||||
@@ -232,6 +234,7 @@
|
||||
OBJS-$(CONFIG_H264_VDA_HWACCEL) += vda_h264.o
|
||||
OBJS-$(CONFIG_H264_VDA_DECODER) += vda_h264_dec.o
|
||||
OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o
|
||||
+OBJS-$(CONFIG_H264_XVBA_HWACCEL) += xvba_h264.o
|
||||
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o huffyuvdec.o
|
||||
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o huffyuvenc.o
|
||||
OBJS-$(CONFIG_IAC_DECODER) += imc.o
|
||||
@@ -295,6 +298,7 @@
|
||||
OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o
|
||||
+OBJS-$(CONFIG_MPEG2_XVBA_HWACCEL) += xvba_mpeg2.o
|
||||
OBJS-$(CONFIG_MPEG2VIDEO_DECODER) += mpeg12.o mpeg12data.o
|
||||
OBJS-$(CONFIG_MPEG2VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
|
||||
timecode.o
|
||||
@@ -459,6 +463,7 @@
|
||||
OBJS-$(CONFIG_VC1_DXVA2_HWACCEL) += dxva2_vc1.o
|
||||
OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o
|
||||
OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o
|
||||
+OBJS-$(CONFIG_VC1_XVBA_HWACCEL) += xvba_vc1.o
|
||||
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
|
||||
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
|
||||
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
|
||||
@@ -788,6 +793,7 @@
|
||||
SKIPHEADERS-$(CONFIG_LIBUTVIDEO) += libutvideo.h
|
||||
SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
|
||||
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_internal.h
|
||||
+SKIPHEADERS-$(CONFIG_XVBA) += xvba_internal.h
|
||||
SKIPHEADERS-$(CONFIG_VDA) += vda.h
|
||||
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h
|
||||
SKIPHEADERS-$(HAVE_OS2THREADS) += os2threads.h
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba.c ffmpeg-1.2.3.patch/libavcodec/xvba.c
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba.c 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba.c 2013-09-09 22:46:40.580852793 +0200
|
||||
@@ -0,0 +1,66 @@
|
||||
+/*
|
||||
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
|
||||
@ -258,11 +226,9 @@ index 0000000..be29e5d
|
||||
+ render->num_slices++;
|
||||
+}
|
||||
+
|
||||
diff --git a/libavcodec/xvba.h b/libavcodec/xvba.h
|
||||
new file mode 100644
|
||||
index 0000000..9f9ff0c
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba.h
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba.h ffmpeg-1.2.3.patch/libavcodec/xvba.h
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba.h 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba.h 2013-09-09 22:46:40.581852794 +0200
|
||||
@@ -0,0 +1,71 @@
|
||||
+/*
|
||||
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
|
||||
@ -335,11 +301,9 @@ index 0000000..9f9ff0c
|
||||
+};
|
||||
+
|
||||
+#endif /* AVCODEC_XVBA_H */
|
||||
diff --git a/libavcodec/xvba_h264.c b/libavcodec/xvba_h264.c
|
||||
new file mode 100644
|
||||
index 0000000..ae45f3a
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba_h264.c
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_h264.c ffmpeg-1.2.3.patch/libavcodec/xvba_h264.c
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba_h264.c 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_h264.c 2013-09-09 22:46:40.582852794 +0200
|
||||
@@ -0,0 +1,192 @@
|
||||
+/*
|
||||
+ * H.264 HW decode acceleration through XVBA
|
||||
@ -533,11 +497,9 @@ index 0000000..ae45f3a
|
||||
+ .end_frame = end_frame,
|
||||
+ .decode_slice = decode_slice,
|
||||
+};
|
||||
diff --git a/libavcodec/xvba_internal.h b/libavcodec/xvba_internal.h
|
||||
new file mode 100644
|
||||
index 0000000..9653f85
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba_internal.h
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_internal.h ffmpeg-1.2.3.patch/libavcodec/xvba_internal.h
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba_internal.h 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_internal.h 2013-09-09 22:46:40.582852794 +0200
|
||||
@@ -0,0 +1,24 @@
|
||||
+/*
|
||||
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
|
||||
@ -563,11 +525,9 @@ index 0000000..9653f85
|
||||
+
|
||||
+int ff_xvba_translate_profile(int profile);
|
||||
+void ff_xvba_add_slice_data(struct xvba_render_state *render, const uint8_t *buffer, uint32_t size);
|
||||
diff --git a/libavcodec/xvba_mpeg2.c b/libavcodec/xvba_mpeg2.c
|
||||
new file mode 100644
|
||||
index 0000000..0fc7d78
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba_mpeg2.c
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_mpeg2.c ffmpeg-1.2.3.patch/libavcodec/xvba_mpeg2.c
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba_mpeg2.c 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_mpeg2.c 2013-09-09 22:46:40.582852794 +0200
|
||||
@@ -0,0 +1,52 @@
|
||||
+/*
|
||||
+ * MPEG-2 HW decode acceleration through XVBA
|
||||
@ -621,11 +581,9 @@ index 0000000..0fc7d78
|
||||
+ .decode_slice = decode_slice,
|
||||
+ .priv_data_size = 0,
|
||||
+};
|
||||
diff --git a/libavcodec/xvba_vc1.c b/libavcodec/xvba_vc1.c
|
||||
new file mode 100644
|
||||
index 0000000..bf3d9c2
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/xvba_vc1.c
|
||||
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_vc1.c ffmpeg-1.2.3.patch/libavcodec/xvba_vc1.c
|
||||
--- ffmpeg-1.2.3/libavcodec/xvba_vc1.c 1970-01-01 01:00:00.000000000 +0100
|
||||
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_vc1.c 2013-09-09 22:46:40.583852794 +0200
|
||||
@@ -0,0 +1,190 @@
|
||||
+/*
|
||||
+ * VC-1 HW decode acceleration through XVBA
|
||||
@ -817,11 +775,10 @@ index 0000000..bf3d9c2
|
||||
+ .end_frame = end_frame,
|
||||
+ .decode_slice = decode_slice,
|
||||
+};
|
||||
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
|
||||
index 1016dba..53dfec1 100644
|
||||
--- a/libavutil/pixdesc.c
|
||||
+++ b/libavutil/pixdesc.c
|
||||
@@ -1141,6 +1141,12 @@ void av_write_image_line(const uint16_t *src,
|
||||
diff -Naur ffmpeg-1.2.3/libavutil/pixdesc.c ffmpeg-1.2.3.patch/libavutil/pixdesc.c
|
||||
--- ffmpeg-1.2.3/libavutil/pixdesc.c 2013-08-27 02:13:47.000000000 +0200
|
||||
+++ ffmpeg-1.2.3.patch/libavutil/pixdesc.c 2013-09-09 22:46:40.584852795 +0200
|
||||
@@ -1141,6 +1141,12 @@
|
||||
.log2_chroma_h = 1,
|
||||
.flags = PIX_FMT_HWACCEL,
|
||||
},
|
||||
@ -834,11 +791,10 @@ index 1016dba..53dfec1 100644
|
||||
[AV_PIX_FMT_YUV420P9LE] = {
|
||||
.name = "yuv420p9le",
|
||||
.nb_components = 3,
|
||||
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
|
||||
index 1c00ac4..6437e29 100644
|
||||
--- a/libavutil/pixfmt.h
|
||||
+++ b/libavutil/pixfmt.h
|
||||
@@ -124,6 +124,7 @@ enum AVPixelFormat {
|
||||
diff -Naur ffmpeg-1.2.3/libavutil/pixfmt.h ffmpeg-1.2.3.patch/libavutil/pixfmt.h
|
||||
--- ffmpeg-1.2.3/libavutil/pixfmt.h 2013-08-27 02:13:47.000000000 +0200
|
||||
+++ ffmpeg-1.2.3.patch/libavutil/pixfmt.h 2013-09-09 22:46:40.585852796 +0200
|
||||
@@ -124,6 +124,7 @@
|
||||
AV_PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
|
||||
AV_PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
|
||||
AV_PIX_FMT_VAAPI_VLD, ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
|
||||
@ -846,6 +802,3 @@ index 1c00ac4..6437e29 100644
|
||||
|
||||
AV_PIX_FMT_YUV420P16LE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
|
||||
AV_PIX_FMT_YUV420P16BE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
|
||||
--
|
||||
1.8.1.5
|
||||
|
Loading…
x
Reference in New Issue
Block a user