ffmpeg: update to ffmpeg-1.2.3, sync upstream patches

Signed-off-by: Stephan Raue <stephan@openelec.tv>
This commit is contained in:
Stephan Raue 2013-09-10 01:23:52 +02:00
parent badfce90d2
commit ed2f08ddc6
56 changed files with 2782 additions and 1728 deletions

View File

@ -21,7 +21,7 @@
PKG_NAME="ffmpeg"
PKG_VERSION="0.10.7"
if [ "$XBMC" = "master" ]; then
PKG_VERSION="1.2.1"
PKG_VERSION="1.2.3"
fi
PKG_REV="1"
PKG_ARCH="any"

View File

@ -1,22 +0,0 @@
Subject: [libav-devel] [PATCH 1/2] vaapi: return early from ff_vaapi_render_picture() without picture
From: Janne Grunau janne-libav at jannau.net
Fixes an assertion when called on uninitialized frame. Spotted after
seeking in vlc. (backported from libav mailing list)
---
diff --git a/libavcodec/vaapi.c b/libavcodec/vaapi.c
index a220a9d..94959bf 100644
--- a/libavcodec/vaapi.c
+++ b/libavcodec/vaapi.c
@@ -46,6 +46,9 @@ int ff_vaapi_render_picture(struct vaapi_context *vactx, VASurfaceID surface)
VABufferID va_buffers[3];
unsigned int n_va_buffers = 0;
+ if (!vactx->pic_param_buf_id)
+ return 0;
+
vaUnmapBuffer(vactx->display, vactx->pic_param_buf_id);
va_buffers[n_va_buffers++] = vactx->pic_param_buf_id;

View File

@ -0,0 +1,311 @@
From 40daea3c1bafa9cea37b65f856c3c0432767d760 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:09 +0100
Subject: [PATCH 39/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of synth_filter_float
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
                 Before             After
                 Mean      StdDev   Mean      StdDev   Change
This function    9295.0    114.9    4853.2    83.5     +91.5%
Overall          23699.8   397.6    19285.5   292.0    +22.9%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/fft_init_arm.c | 8 +
libavcodec/arm/synth_filter_vfp.S | 243 ++++++++++++++++++++++++++
3 files changed, 252 insertions(+)
create mode 100644 libavcodec/arm/synth_filter_vfp.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 1c91d62..aee9d73 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -58,6 +58,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 8c98abc..fe0acc5 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+ float *synth_buf_ptr, int *synth_buf_offset,
+ float synth_buf2[32], const float window[512],
+ float out[32], const float in[32],
+ float scale);
+
void ff_synth_filter_float_neon(FFTContext *imdct,
float *synth_buf_ptr, int *synth_buf_offset,
float synth_buf2[32], const float window[512],
@@ -71,6 +77,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+ s->synth_filter_float = ff_synth_filter_float_vfp;
if (have_neon(cpu_flags))
s->synth_filter_float = ff_synth_filter_float_neon;
}
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
new file mode 100644
index 0000000..c219c41
--- /dev/null
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+IMDCT .req r0
+ORIG_P_SB .req r1
+P_SB_OFF .req r2
+I .req r0
+P_SB2_UP .req r1
+OLDFPSCR .req r2
+P_SB2_DN .req r3
+P_WIN_DN .req r4
+P_OUT_DN .req r5
+P_SB .req r6
+J_WRAP .req r7
+P_WIN_UP .req r12
+P_OUT_UP .req r14
+
+SCALE .req s0
+SBUF_DAT_REV0 .req s4
+SBUF_DAT_REV1 .req s5
+SBUF_DAT_REV2 .req s6
+SBUF_DAT_REV3 .req s7
+VA0 .req s8
+VA3 .req s11
+VB0 .req s12
+VB3 .req s15
+VC0 .req s8
+VC3 .req s11
+VD0 .req s12
+VD3 .req s15
+SBUF_DAT0 .req s16
+SBUF_DAT1 .req s17
+SBUF_DAT2 .req s18
+SBUF_DAT3 .req s19
+SBUF_DAT_ALT0 .req s20
+SBUF_DAT_ALT1 .req s21
+SBUF_DAT_ALT2 .req s22
+SBUF_DAT_ALT3 .req s23
+WIN_DN_DAT0 .req s24
+WIN_UP_DAT0 .req s28
+
+
+.macro inner_loop half, tail, head
+ .if (OFFSET & (64*4)) == 0 @ even numbered call
+ SBUF_DAT_THIS0 .req SBUF_DAT0
+ SBUF_DAT_THIS1 .req SBUF_DAT1
+ SBUF_DAT_THIS2 .req SBUF_DAT2
+ SBUF_DAT_THIS3 .req SBUF_DAT3
+ .ifnc "\head",""
+ vldr d8, [P_SB, #OFFSET] @ d8 = SBUF_DAT
+ vldr d9, [P_SB, #OFFSET+8]
+ .endif
+ .else
+ SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
+ SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
+ SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
+ SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
+ .ifnc "\head",""
+ vldr d10, [P_SB, #OFFSET] @ d10 = SBUF_DAT_ALT
+ vldr d11, [P_SB, #OFFSET+8]
+ .endif
+ .endif
+ .ifnc "\tail",""
+ .ifc "\half","ab"
+ vmls.f VA0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
+ .else
+ vmla.f VD0, SBUF_DAT_REV0, WIN_DN_DAT0 @ all operands treated as vectors
+ .endif
+ .endif
+ .ifnc "\head",""
+ vldr d14, [P_WIN_UP, #OFFSET] @ d14 = WIN_UP_DAT
+ vldr d15, [P_WIN_UP, #OFFSET+8]
+ vldr d12, [P_WIN_DN, #OFFSET] @ d12 = WIN_DN_DAT
+ vldr d13, [P_WIN_DN, #OFFSET+8]
+ vmov SBUF_DAT_REV3, SBUF_DAT_THIS0
+ vmov SBUF_DAT_REV2, SBUF_DAT_THIS1
+ vmov SBUF_DAT_REV1, SBUF_DAT_THIS2
+ vmov SBUF_DAT_REV0, SBUF_DAT_THIS3
+ .ifc "\half","ab"
+ vmla.f VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
+ .else
+ vmla.f VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
+ .endif
+ teq J_WRAP, #J
+ bne 2f @ strongly predictable, so better than cond exec in this case
+ sub P_SB, P_SB, #512*4
+2:
+ .set J, J - 64
+ .set OFFSET, OFFSET + 64*4
+ .endif
+ .unreq SBUF_DAT_THIS0
+ .unreq SBUF_DAT_THIS1
+ .unreq SBUF_DAT_THIS2
+ .unreq SBUF_DAT_THIS3
+.endm
+
+
+/* void ff_synth_filter_float_vfp(FFTContext *imdct,
+ * float *synth_buf_ptr, int *synth_buf_offset,
+ * float synth_buf2[32], const float window[512],
+ * float out[32], const float in[32], float scale)
+ */
+function ff_synth_filter_float_vfp, export=1
+ push {r3-r7,lr} @ r3 (synth_buf2) is pushed so it can be reloaded after the call
+ vpush {s16-s31}
+ ldr lr, [P_SB_OFF]
+ add a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
+ mov P_SB, a2 @ and keep a copy for ourselves
+ bic J_WRAP, lr, #63 @ mangled to make testing for wrap easier in inner loop
+ sub lr, lr, #32
+ and lr, lr, #512-32
+ str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
+ ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
+VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
+ bl X(ff_imdct_half_vfp) @ X() applies the EXTERN_ASM symbol prefix, matching what export=1 emits
+VFP vmov SCALE, s16
+
+ fmrx OLDFPSCR, FPSCR @ remember the incoming FPSCR
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, lr
+ ldr P_SB2_DN, [sp, #16*4] @ synth_buf2, i.e. the r3 pushed on entry (above the 16 vpushed words)
+ ldr P_WIN_DN, [sp, #(16+6+0)*4] @ window: first stack argument, above 16 vpushed + 6 pushed words
+ ldr P_OUT_DN, [sp, #(16+6+1)*4] @ out: second stack argument
+NOVFP vldr SCALE, [sp, #(16+6+3)*4] @ scale is the fourth stack argument in the NOVFP case
+
+#define IMM_OFF_SKEW 956 /* also valid immediate constant when you add 16*4 */
+ add P_SB, P_SB, #IMM_OFF_SKEW @ so we can use -ve offsets to use full immediate offset range
+ add P_SB2_UP, P_SB2_DN, #16*4
+ add P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
+ add P_OUT_UP, P_OUT_DN, #16*4
+ add P_SB2_DN, P_SB2_DN, #16*4
+ add P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
+ add P_OUT_DN, P_OUT_DN, #16*4
+ mov I, #4 @ first pass: 4 iterations, each storing 4 floats upwards and 4 downwards in out
+1:
+ vldmia P_SB2_UP!, {VB0-VB3}
+ vldmdb P_SB2_DN!, {VA0-VA3}
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+ inner_loop ab,, head
+ .rept 7
+ inner_loop ab, tail, head
+ .endr
+ inner_loop ab, tail
+ add P_WIN_UP, P_WIN_UP, #4*4
+ sub P_WIN_DN, P_WIN_DN, #4*4
+ vmul.f VB0, VB0, SCALE @ SCALE treated as scalar
+ add P_SB, P_SB, #(512+4)*4
+ subs I, I, #1
+ vmul.f VA0, VA0, SCALE
+ vstmia P_OUT_UP!, {VB0-VB3}
+ vstmdb P_OUT_DN!, {VA0-VA3}
+ bne 1b
+
+ add P_SB2_DN, P_SB2_DN, #(16+28-12)*4
+ sub P_SB2_UP, P_SB2_UP, #(16+16)*4
+ add P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
+ mov I, #4 @ second pass: 4 iterations writing fresh accumulators back into synth_buf2
+1:
+ vldr.d d4, zero @ d4 = VC0
+ vldr.d d5, zero
+ vldr.d d6, zero @ d6 = VD0
+ vldr.d d7, zero
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+ inner_loop cd,, head
+ .rept 7
+ inner_loop cd, tail, head
+ .endr
+ inner_loop cd, tail
+ add P_WIN_UP, P_WIN_UP, #4*4
+ sub P_WIN_DN, P_WIN_DN, #4*4
+ add P_SB, P_SB, #(512+4)*4
+ subs I, I, #1
+ vstmia P_SB2_UP!, {VC0-VC3}
+ vstmdb P_SB2_DN!, {VD0-VD3}
+ bne 1b
+
+ fmxr FPSCR, OLDFPSCR @ restore the FPSCR saved above
+ vpop {s16-s31}
+ pop {r3-r7,pc}
+endfunc
+
+ .unreq IMDCT
+ .unreq ORIG_P_SB
+ .unreq P_SB_OFF
+ .unreq I
+ .unreq P_SB2_UP
+ .unreq OLDFPSCR
+ .unreq P_SB2_DN
+ .unreq P_WIN_DN
+ .unreq P_OUT_DN
+ .unreq P_SB
+ .unreq J_WRAP
+ .unreq P_WIN_UP
+ .unreq P_OUT_UP
+
+ .unreq SCALE
+ .unreq SBUF_DAT_REV0
+ .unreq SBUF_DAT_REV1
+ .unreq SBUF_DAT_REV2
+ .unreq SBUF_DAT_REV3
+ .unreq VA0
+ .unreq VA3
+ .unreq VB0
+ .unreq VB3
+ .unreq VC0
+ .unreq VC3
+ .unreq VD0
+ .unreq VD3
+ .unreq SBUF_DAT0
+ .unreq SBUF_DAT1
+ .unreq SBUF_DAT2
+ .unreq SBUF_DAT3
+ .unreq SBUF_DAT_ALT0
+ .unreq SBUF_DAT_ALT1
+ .unreq SBUF_DAT_ALT2
+ .unreq SBUF_DAT_ALT3
+ .unreq WIN_DN_DAT0
+ .unreq WIN_UP_DAT0
+
+ .align 3
+zero: .word 0, 0
--
1.7.9.5

View File

@ -0,0 +1,102 @@
From 8ead63b22d31bf71976fc6964922b43d8e0d660b Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:10 +0100
Subject: [PATCH 40/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of int32_to_float_fmul_scalar
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
                 Before             After
                 Mean      StdDev   Mean      StdDev   Change
This function    1175.0    4.4      366.2     18.3     +220.8%
Overall          19285.5   292.0    18420.5   489.1    +4.7%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/fmtconvert_init_arm.c | 10 ++++++
libavcodec/arm/fmtconvert_vfp.S | 38 +++++++++++++++++++++++
2 files changed, 48 insertions(+)
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index 1d99c97..de3b78b 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -28,6 +28,9 @@
void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
float mul, int len);
+void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
+ float mul, int len);
+
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@@ -38,6 +41,13 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
int cpu_flags = av_get_cpu_flags();
if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
+ if (!have_vfpv3(cpu_flags)) {
+ // This function doesn't use anything armv6 specific in itself,
+ // but ff_float_to_int16_vfp which is in the same assembly source
+ // file does, thus the whole file requires armv6 to be built.
+ c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
+ }
+
c->float_to_int16 = ff_float_to_int16_vfp;
}
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 7b012bc..3cc3e56 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008 Siarhei Siamashka <ssvb@users.sourceforge.net>
+ * Copyright (c) 2013 RISC OS Open Ltd <bavison@riscosopen.org>
*
* This file is part of FFmpeg.
*
@@ -76,3 +77,40 @@ function ff_float_to_int16_vfp, export=1
vpop {d8-d11}
pop {r4-r8,pc}
endfunc
+
+/**
+ * ARM VFP optimised int32 to float conversion.
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
+ * (16 bytes alignment is best for BCM2835), little-endian.
+ */
+@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
+function ff_int32_to_float_fmul_scalar_vfp, export=1
+VFP tmp .req a4
+VFP len .req a3
+NOVFP tmp .req a3
+NOVFP len .req a4
+NOVFP vmov s0, a3 @ NOVFP only: copy mul from a3 into s0 before a3 is reused as tmp; the VFP path uses s0 as-is
+ ldr tmp, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
+ fmrx ip, FPSCR @ keep the incoming FPSCR in ip
+ fmxr FPSCR, tmp @ switch to the vector mode loaded above
+1:
+ vldmia a2!, {s8-s15} @ fetch 8 ints from src, post-incrementing the pointer
+ vcvt.f32.s32 s8, s8
+ vcvt.f32.s32 s9, s9
+ vcvt.f32.s32 s10, s10
+ vcvt.f32.s32 s11, s11
+ vcvt.f32.s32 s12, s12
+ vcvt.f32.s32 s13, s13
+ vcvt.f32.s32 s14, s14
+ vcvt.f32.s32 s15, s15
+ vmul.f32 s8, s8, s0 @ with vector length 8 this scales s8-s15 by the scalar in s0
+ subs len, len, #8 @ 8 samples consumed per iteration
+ vstmia a1!, {s8-s11}
+ vstmia a1!, {s12-s15}
+ bne 1b @ exits only when len hits exactly 0, hence the multiple-of-8 requirement
+
+ fmxr FPSCR, ip @ put back the FPSCR saved on entry
+ bx lr
+endfunc
+ .unreq tmp
+ .unreq len
--
1.7.9.5

View File

@ -0,0 +1,78 @@
From 7901e7216cf6406a2ea430c71af94ebee72f262b Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:11 +0100
Subject: [PATCH 41/49] [ffmpeg] - backport - fmtconvert: Add a new method,
int32_to_float_fmul_array8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This is similar to int32_to_float_fmul_scalar, but
loads a new scalar multiplier every 8 input samples.
This enables the use of much larger input arrays, which
is important for pipelining on some CPUs (such as
ARMv6).
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/fmtconvert.c | 10 ++++++++++
libavcodec/fmtconvert.h | 16 ++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index 79e9645..1c45d35 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -30,6 +30,15 @@ static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul,
dst[i] = src[i] * mul;
}
+static void int32_to_float_fmul_array8_c(FmtConvertContext *c, float *dst,
+ const int32_t *src, const float *mul,
+ int len)
+{
+ int i;
+ for (i = 0; i < len; i += 8) /* one multiplier from mul[] per group of 8 samples */
+ c->int32_to_float_fmul_scalar(&dst[i], &src[i], *mul++, 8); /* dispatched through c, so whatever scalar routine c holds is used */
+}
+
static av_always_inline int float_to_int16_one(const float *src){
return av_clip_int16(lrintf(*src));
}
@@ -79,6 +88,7 @@ void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
+ c->int32_to_float_fmul_array8 = int32_to_float_fmul_array8_c;
c->float_to_int16 = float_to_int16_c;
c->float_to_int16_interleave = float_to_int16_interleave_c;
c->float_interleave = ff_float_interleave_c;
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index 3fb9f4e..02468dc 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -38,6 +38,22 @@ typedef struct FmtConvertContext {
void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
/**
+ * Convert an array of int32_t to float and multiply by a float value from another array,
+ * stepping along the float array once for each 8 integers.
+ * @param c pointer to FmtConvertContext.
+ * @param dst destination array of float.
+ * constraints: 16-byte aligned
+ * @param src source array of int32_t.
+ * constraints: 16-byte aligned
+ * @param mul source array of float multipliers.
+ * @param len number of elements to convert.
+ * constraints: multiple of 8
+ */
+ void (*int32_to_float_fmul_array8)(struct FmtConvertContext *c,
+ float *dst, const int32_t *src,
+ const float *mul, int len);
+
+ /**
* Convert an array of float to an array of int16_t.
*
* Convert floats from in the range [-32768.0,32767.0] to ints
--
1.7.9.5

View File

@ -0,0 +1,90 @@
From fa755fe82fe4cfbb85b7c57501912da2e1f316bc Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Tue, 16 Jul 2013 15:41:18 +0300
Subject: [PATCH 42/49] [ffmpeg] - backport - dcadec: Use
int32_to_float_fmul_array8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/dcadec.c | 23 +++++++++++++++--------
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 1b955e4..b648613 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -1302,7 +1302,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
/* FIXME */
float (*subband_samples)[DCA_SUBBANDS][8] = s->subband_samples[block_index];
- LOCAL_ALIGNED_16(int, block, [8]);
+ LOCAL_ALIGNED_16(int, block, [8 * DCA_SUBBANDS]);
/*
* Audio data
@@ -1315,6 +1315,8 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
quant_step_table = lossy_quant_d;
for (k = base_channel; k < s->prim_channels; k++) {
+ float rscale[DCA_SUBBANDS];
+
if (get_bits_left(&s->gb) < 0)
return AVERROR_INVALIDDATA;
@@ -1337,11 +1339,12 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
* Extract bits from the bit stream
*/
if (!abits) {
- memset(subband_samples[k][l], 0, 8 * sizeof(subband_samples[0][0][0]));
+ rscale[l] = 0;
+ memset(block + 8 * l, 0, 8 * sizeof(block[0]));
} else {
/* Deal with transients */
int sfi = s->transition_mode[k][l] && subsubframe >= s->transition_mode[k][l];
- float rscale = quant_step_size * s->scale_factor[k][l][sfi] *
+ rscale[l] = quant_step_size * s->scale_factor[k][l][sfi] *
s->scalefactor_adj[k][sel];
if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
@@ -1355,7 +1358,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
block_code1 = get_bits(&s->gb, size);
block_code2 = get_bits(&s->gb, size);
err = decode_blockcodes(block_code1, block_code2,
- levels, block);
+ levels, block + 8 * l);
if (err) {
av_log(s->avctx, AV_LOG_ERROR,
"ERROR: block code look-up failed\n");
@@ -1364,19 +1367,23 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
} else {
/* no coding */
for (m = 0; m < 8; m++)
- block[m] = get_sbits(&s->gb, abits - 3);
+ block[8 * l + m] = get_sbits(&s->gb, abits - 3);
}
} else {
/* Huffman coded */
for (m = 0; m < 8; m++)
- block[m] = get_bitalloc(&s->gb,
+ block[8 * l + m] = get_bitalloc(&s->gb,
&dca_smpl_bitalloc[abits], sel);
}
- s->fmt_conv.int32_to_float_fmul_scalar(subband_samples[k][l],
- block, rscale, 8);
}
+ }
+ s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, subband_samples[k][0],
+ block, rscale, 8 * s->vq_start_subband[k]);
+
+ for (l = 0; l < s->vq_start_subband[k]; l++) {
+ int m;
/*
* Inverse ADPCM if in prediction mode
*/
--
1.7.9.5

View File

@ -0,0 +1,222 @@
From c908a710261f33130569c4360175d8f19a282d67 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:12 +0100
Subject: [PATCH 43/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of int32_to_float_fmul_array8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
                 Before             After
                 Mean      StdDev   Mean      StdDev   Change
This function    366.2     18.3     277.8     13.7     +31.9%
Overall          18420.5   489.1    17049.5   408.2    +8.0%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/fmtconvert_init_arm.c | 6 +-
libavcodec/arm/fmtconvert_vfp.S | 162 +++++++++++++++++++++++
2 files changed, 167 insertions(+), 1 deletion(-)
diff --git a/libavcodec/arm/fmtconvert_init_arm.c b/libavcodec/arm/fmtconvert_init_arm.c
index de3b78b..92d94a0 100644
--- a/libavcodec/arm/fmtconvert_init_arm.c
+++ b/libavcodec/arm/fmtconvert_init_arm.c
@@ -30,6 +30,9 @@ void ff_int32_to_float_fmul_scalar_neon(float *dst, const int *src,
void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src,
float mul, int len);
+void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst,
+ const int32_t *src, const float *mul,
+ int len);
void ff_float_to_int16_neon(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_neon(int16_t *, const float **, long, int);
@@ -42,10 +45,11 @@ av_cold void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx
if (have_vfp(cpu_flags) && have_armv6(cpu_flags)) {
if (!have_vfpv3(cpu_flags)) {
- // This function doesn't use anything armv6 specific in itself,
+ // These functions don't use anything armv6 specific in themselves,
// but ff_float_to_int16_vfp which is in the same assembly source
// file does, thus the whole file requires armv6 to be built.
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_vfp;
+ c->int32_to_float_fmul_array8 = ff_int32_to_float_fmul_array8_vfp;
}
c->float_to_int16 = ff_float_to_int16_vfp;
diff --git a/libavcodec/arm/fmtconvert_vfp.S b/libavcodec/arm/fmtconvert_vfp.S
index 3cc3e56..a6d4ebd 100644
--- a/libavcodec/arm/fmtconvert_vfp.S
+++ b/libavcodec/arm/fmtconvert_vfp.S
@@ -83,6 +83,168 @@ endfunc
* Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
* (16 bytes alignment is best for BCM2835), little-endian.
*/
+@ void ff_int32_to_float_fmul_array8_vfp(FmtConvertContext *c, float *dst, const int32_t *src, const float *mul, int len)
+function ff_int32_to_float_fmul_array8_vfp, export=1
+ push {lr}
+ ldr a1, [sp, #4]
+ subs lr, a1, #3*8
+ bcc 50f @ too short to pipeline
+ @ Now need to find (len / 8) % 3. The approximation
+ @ x / 24 = (x * 0xAB) >> 12
+ @ is good for x < 4096, which is true for both AC3 and DCA.
+ mov a1, #0xAB
+ ldr ip, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
+ mul a1, lr, a1
+ vpush {s16-s31}
+ mov a1, a1, lsr #12
+ add a1, a1, a1, lsl #1
+ rsb a1, a1, lr, lsr #3
+ cmp a1, #1
+ fmrx a1, FPSCR
+ fmxr FPSCR, ip
+ beq 11f
+ blo 10f
+ @ Array is (2 + multiple of 3) x 8 floats long
+ @ drop through...
+ vldmia a3!, {s16-s23}
+ vldmia a4!, {s2,s3}
+ vldmia a3!, {s24-s31}
+ vcvt.f32.s32 s16, s16
+ vcvt.f32.s32 s17, s17
+ vcvt.f32.s32 s18, s18
+ vcvt.f32.s32 s19, s19
+ vcvt.f32.s32 s20, s20
+ vcvt.f32.s32 s21, s21
+ vcvt.f32.s32 s22, s22
+ vcvt.f32.s32 s23, s23
+ vmul.f32 s16, s16, s2
+ @ drop through...
+3:
+ vldmia a3!, {s8-s15}
+ vldmia a4!, {s1}
+ vcvt.f32.s32 s24, s24
+ vcvt.f32.s32 s25, s25
+ vcvt.f32.s32 s26, s26
+ vcvt.f32.s32 s27, s27
+ vcvt.f32.s32 s28, s28
+ vcvt.f32.s32 s29, s29
+ vcvt.f32.s32 s30, s30
+ vcvt.f32.s32 s31, s31
+ vmul.f32 s24, s24, s3
+ vstmia a2!, {s16-s19}
+ vstmia a2!, {s20-s23}
+2:
+ vldmia a3!, {s16-s23}
+ vldmia a4!, {s2}
+ vcvt.f32.s32 s8, s8
+ vcvt.f32.s32 s9, s9
+ vcvt.f32.s32 s10, s10
+ vcvt.f32.s32 s11, s11
+ vcvt.f32.s32 s12, s12
+ vcvt.f32.s32 s13, s13
+ vcvt.f32.s32 s14, s14
+ vcvt.f32.s32 s15, s15
+ vmul.f32 s8, s8, s1
+ vstmia a2!, {s24-s27}
+ vstmia a2!, {s28-s31}
+1:
+ vldmia a3!, {s24-s31}
+ vldmia a4!, {s3}
+ vcvt.f32.s32 s16, s16
+ vcvt.f32.s32 s17, s17
+ vcvt.f32.s32 s18, s18
+ vcvt.f32.s32 s19, s19
+ vcvt.f32.s32 s20, s20
+ vcvt.f32.s32 s21, s21
+ vcvt.f32.s32 s22, s22
+ vcvt.f32.s32 s23, s23
+ vmul.f32 s16, s16, s2
+ vstmia a2!, {s8-s11}
+ vstmia a2!, {s12-s15}
+
+ subs lr, lr, #8*3
+ bpl 3b
+
+ vcvt.f32.s32 s24, s24
+ vcvt.f32.s32 s25, s25
+ vcvt.f32.s32 s26, s26
+ vcvt.f32.s32 s27, s27
+ vcvt.f32.s32 s28, s28
+ vcvt.f32.s32 s29, s29
+ vcvt.f32.s32 s30, s30
+ vcvt.f32.s32 s31, s31
+ vmul.f32 s24, s24, s3
+ vstmia a2!, {s16-s19}
+ vstmia a2!, {s20-s23}
+ vstmia a2!, {s24-s27}
+ vstmia a2!, {s28-s31}
+
+ fmxr FPSCR, a1
+ vpop {s16-s31}
+ pop {pc}
+
+10: @ Array is (multiple of 3) x 8 floats long
+ vldmia a3!, {s8-s15}
+ vldmia a4!, {s1,s2}
+ vldmia a3!, {s16-s23}
+ vcvt.f32.s32 s8, s8
+ vcvt.f32.s32 s9, s9
+ vcvt.f32.s32 s10, s10
+ vcvt.f32.s32 s11, s11
+ vcvt.f32.s32 s12, s12
+ vcvt.f32.s32 s13, s13
+ vcvt.f32.s32 s14, s14
+ vcvt.f32.s32 s15, s15
+ vmul.f32 s8, s8, s1
+ b 1b
+
+11: @ Array is (1 + multiple of 3) x 8 floats long
+ vldmia a3!, {s24-s31}
+ vldmia a4!, {s3}
+ vldmia a3!, {s8-s15}
+ vldmia a4!, {s1}
+ vcvt.f32.s32 s24, s24
+ vcvt.f32.s32 s25, s25
+ vcvt.f32.s32 s26, s26
+ vcvt.f32.s32 s27, s27
+ vcvt.f32.s32 s28, s28
+ vcvt.f32.s32 s29, s29
+ vcvt.f32.s32 s30, s30
+ vcvt.f32.s32 s31, s31
+ vmul.f32 s24, s24, s3
+ b 2b
+
+50:
+ ldr lr, =0x03070000 @ RunFast mode, short vectors of length 8, stride 1
+ fmrx ip, FPSCR
+ fmxr FPSCR, lr
+51:
+ vldmia a3!, {s8-s15}
+ vldmia a4!, {s0}
+ vcvt.f32.s32 s8, s8
+ vcvt.f32.s32 s9, s9
+ vcvt.f32.s32 s10, s10
+ vcvt.f32.s32 s11, s11
+ vcvt.f32.s32 s12, s12
+ vcvt.f32.s32 s13, s13
+ vcvt.f32.s32 s14, s14
+ vcvt.f32.s32 s15, s15
+ vmul.f32 s8, s8, s0
+ subs a1, a1, #8
+ vstmia a2!, {s8-s11}
+ vstmia a2!, {s12-s15}
+ bne 51b
+
+ fmxr FPSCR, ip
+ pop {pc}
+endfunc
+
+/**
+ * ARM VFP optimised int32 to float conversion.
+ * Assume len is a multiple of 8, destination buffer is at least 4 bytes aligned
+ * (16 bytes alignment is best for BCM2835), little-endian.
+ * TODO: could be further optimised by unrolling and interleaving, as above
+ */
@ void ff_int32_to_float_fmul_scalar_vfp(float *dst, const int32_t *src, float mul, int len)
function ff_int32_to_float_fmul_scalar_vfp, export=1
VFP tmp .req a4
--
1.7.9.5

View File

@ -0,0 +1,274 @@
From 15520de67fc951213ab32661b8b368a9439e8b9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 19 Jul 2013 10:59:17 +0300
Subject: [PATCH 44/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of imdct_half
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
                 Before             After
                 Mean      StdDev   Mean      StdDev   Change
This function    2653.0    28.5     1108.8    51.4     +139.3%
Overall          17049.5   408.2    15973.0   223.2    +6.7%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/fft_init_arm.c | 9 ++
libavcodec/arm/mdct_vfp.S | 205 ++++++++++++++++++++++++++++++
3 files changed, 215 insertions(+)
create mode 100644 libavcodec/arm/mdct_vfp.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index aee9d73..27e80d5 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -59,6 +59,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
arm/simple_idct_armv6.o \
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
NEON-OBJS-$(CONFIG_FFT) += arm/fft_neon.o \
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index fe0acc5..a000ea5 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -26,6 +26,8 @@
void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
@@ -48,6 +50,13 @@ av_cold void ff_fft_init_arm(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_vfp(cpu_flags)) {
+#if CONFIG_MDCT
+ if (!have_vfpv3(cpu_flags))
+ s->imdct_half = ff_imdct_half_vfp;
+#endif
+ }
+
if (have_neon(cpu_flags)) {
#if CONFIG_FFT
s->fft_permute = ff_fft_permute_neon;
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
new file mode 100644
index 0000000..0623e96
--- /dev/null
+++ b/libavcodec/arm/mdct_vfp.S
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+CONTEXT .req a1
+ORIGOUT .req a2
+IN .req a3
+OUT .req v1
+REVTAB .req v2
+TCOS .req v3
+TSIN .req v4
+OLDFPSCR .req v5
+J0 .req a2
+J1 .req a4
+J2 .req ip
+J3 .req lr
+
+.macro prerotation_innerloop
+ .set trig_lo, k
+ .set trig_hi, n4 - k - 2
+ .set in_lo, trig_lo * 2
+ .set in_hi, trig_hi * 2
+ vldr d8, [TCOS, #trig_lo*4] @ s16,s17
+ vldr d9, [TCOS, #trig_hi*4] @ s18,s19
+ vldr s0, [IN, #in_hi*4 + 12]
+ vldr s1, [IN, #in_hi*4 + 4]
+ vldr s2, [IN, #in_lo*4 + 12]
+ vldr s3, [IN, #in_lo*4 + 4]
+ vmul.f s8, s0, s16 @ vector operation
+ vldr d10, [TSIN, #trig_lo*4] @ s20,s21
+ vldr d11, [TSIN, #trig_hi*4] @ s22,s23
+ vldr s4, [IN, #in_lo*4]
+ vldr s5, [IN, #in_lo*4 + 8]
+ vldr s6, [IN, #in_hi*4]
+ vldr s7, [IN, #in_hi*4 + 8]
+ ldr J0, [REVTAB, #trig_lo*2]
+ vmul.f s12, s0, s20 @ vector operation
+ ldr J2, [REVTAB, #trig_hi*2]
+ mov J1, J0, lsr #16
+ and J0, J0, #255 @ halfword value will be < n4
+ vmls.f s8, s4, s20 @ vector operation
+ mov J3, J2, lsr #16
+ and J2, J2, #255 @ halfword value will be < n4
+ add J0, OUT, J0, lsl #3
+ vmla.f s12, s4, s16 @ vector operation
+ add J1, OUT, J1, lsl #3
+ add J2, OUT, J2, lsl #3
+ add J3, OUT, J3, lsl #3
+ vstr s8, [J0]
+ vstr s9, [J1]
+ vstr s10, [J2]
+ vstr s11, [J3]
+ vstr s12, [J0, #4]
+ vstr s13, [J1, #4]
+ vstr s14, [J2, #4]
+ vstr s15, [J3, #4]
+ .set k, k + 2
+.endm
+
+.macro postrotation_innerloop tail, head
+ .set trig_lo_head, n8 - k - 2
+ .set trig_hi_head, n8 + k
+ .set out_lo_head, trig_lo_head * 2
+ .set out_hi_head, trig_hi_head * 2
+ .set trig_lo_tail, n8 - (k - 2) - 2
+ .set trig_hi_tail, n8 + (k - 2)
+ .set out_lo_tail, trig_lo_tail * 2
+ .set out_hi_tail, trig_hi_tail * 2
+ .if (k & 2) == 0
+ TCOS_D0_HEAD .req d10 @ s20,s21
+ TCOS_D1_HEAD .req d11 @ s22,s23
+ TCOS_S0_TAIL .req s24
+ .else
+ TCOS_D0_HEAD .req d12 @ s24,s25
+ TCOS_D1_HEAD .req d13 @ s26,s27
+ TCOS_S0_TAIL .req s20
+ .endif
+ .ifnc "\tail",""
+ vmls.f s8, s0, TCOS_S0_TAIL @ vector operation
+ .endif
+ .ifnc "\head",""
+ vldr d8, [TSIN, #trig_lo_head*4] @ s16,s17
+ vldr d9, [TSIN, #trig_hi_head*4] @ s18,s19
+ vldr TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
+ .endif
+ .ifnc "\tail",""
+ vmla.f s12, s4, TCOS_S0_TAIL @ vector operation
+ .endif
+ .ifnc "\head",""
+ vldr s0, [OUT, #out_lo_head*4]
+ vldr s1, [OUT, #out_lo_head*4 + 8]
+ vldr s2, [OUT, #out_hi_head*4]
+ vldr s3, [OUT, #out_hi_head*4 + 8]
+ vldr s4, [OUT, #out_lo_head*4 + 4]
+ vldr s5, [OUT, #out_lo_head*4 + 12]
+ vldr s6, [OUT, #out_hi_head*4 + 4]
+ vldr s7, [OUT, #out_hi_head*4 + 12]
+ .endif
+ .ifnc "\tail",""
+ vstr s8, [OUT, #out_lo_tail*4]
+ vstr s9, [OUT, #out_lo_tail*4 + 8]
+ vstr s10, [OUT, #out_hi_tail*4]
+ vstr s11, [OUT, #out_hi_tail*4 + 8]
+ .endif
+ .ifnc "\head",""
+ vmul.f s8, s4, s16 @ vector operation
+ .endif
+ .ifnc "\tail",""
+ vstr s12, [OUT, #out_hi_tail*4 + 12]
+ vstr s13, [OUT, #out_hi_tail*4 + 4]
+ vstr s14, [OUT, #out_lo_tail*4 + 12]
+ vstr s15, [OUT, #out_lo_tail*4 + 4]
+ .endif
+ .ifnc "\head",""
+ vmul.f s12, s0, s16 @ vector operation
+ vldr TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
+ .endif
+ .unreq TCOS_D0_HEAD
+ .unreq TCOS_D1_HEAD
+ .unreq TCOS_S0_TAIL
+ .ifnc "\head",""
+ .set k, k + 2
+ .endif
+.endm
+
+
+/* void ff_imdct_half_vfp(FFTContext *s,
+ * FFTSample *output,
+ * const FFTSample *input)
+ */
+function ff_imdct_half_vfp, export=1
+ ldr ip, [CONTEXT, #5*4] @ mdct_bits
+ teq ip, #6
+ it ne
+ bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
+
+ .set n, 1<<6
+ .set n2, n/2
+ .set n4, n/4
+ .set n8, n/8
+
+ push {v1-v5,lr}
+ vpush {s16-s27}
+ fmrx OLDFPSCR, FPSCR @ remember the incoming FPSCR
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, lr
+ mov OUT, ORIGOUT @ a2 doubles as J0 in the pre-rotation, so the output pointer moves to v1
+ ldr REVTAB, [CONTEXT, #2*4]
+ ldr TCOS, [CONTEXT, #6*4]
+ ldr TSIN, [CONTEXT, #7*4]
+
+ .set k, 0
+ .rept n8/2
+ prerotation_innerloop
+ .endr
+
+ fmxr FPSCR, OLDFPSCR @ saved FPSCR is put back for the duration of the call
+ mov a1, OUT @ the pre-rotated output buffer is the only argument set up for the FFT
+ bl X(ff_fft16_vfp) @ X() applies the EXTERN_ASM symbol prefix, matching what export=1 emits
+ ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, lr
+
+ .set k, 0
+ postrotation_innerloop , head
+ .rept n8/2 - 1
+ postrotation_innerloop tail, head
+ .endr
+ postrotation_innerloop tail
+
+ fmxr FPSCR, OLDFPSCR @ leave with the FPSCR found on entry
+ vpop {s16-s27}
+ pop {v1-v5,pc}
+endfunc
+
+ .unreq CONTEXT
+ .unreq ORIGOUT
+ .unreq IN
+ .unreq OUT
+ .unreq REVTAB
+ .unreq TCOS
+ .unreq TSIN
+ .unreq OLDFPSCR
+ .unreq J0
+ .unreq J1
+ .unreq J2
+ .unreq J3
--
1.7.9.5

View File

@ -0,0 +1,58 @@
From 8e0babd84c7e03cf678aab8bcf7e2106fe2b3de6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 19 Jul 2013 11:03:32 +0300
Subject: [PATCH 45/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of dca_lfe_fir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Before After
Mean StdDev Mean StdDev Change
This function 868.2 33.5 436.0 27.0 +99.1%
Overall 15973.0 223.2 15577.5 83.2 +2.5%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 3 ++-
libavcodec/arm/dcadsp_init_arm.c | 4 ++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 27e80d5..7fe5bb5 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -58,7 +58,8 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
arm/dsputil_armv6.o \
arm/simple_idct_armv6.o \
-VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
+ arm/synth_filter_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index 56568e0..a1efbff 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -24,6 +24,8 @@
#include "libavutil/attributes.h"
#include "libavcodec/dcadsp.h"
+void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ int decifactor, float scale);
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
int decifactor, float scale);
@@ -31,6 +33,8 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+ s->lfe_fir = ff_dca_lfe_fir_vfp;
if (have_neon(cpu_flags))
s->lfe_fir = ff_dca_lfe_fir_neon;
}
--
1.7.9.5

View File

@ -0,0 +1,339 @@
From 018b74ea9d8f52788db18ed40838afca05e7b4df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 19 Jul 2013 11:23:57 +0300
Subject: [PATCH 46/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of fft16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Before After
Mean StdDev Mean StdDev Change
This function 1389.3 4.2 967.8 35.1 +43.6%
Overall 15577.5 83.2 15400.0 336.4 +1.2%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/fft_vfp.S | 298 +++++++++++++++++++++++++++++++++++
2 files changed, 299 insertions(+)
create mode 100644 libavcodec/arm/fft_vfp.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 7fe5bb5..7390a8b 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -60,6 +60,7 @@ ARMV6-OBJS += arm/dsputil_init_armv6.o \
VFP-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_vfp.o \
arm/synth_filter_vfp.o
+VFP-OBJS-$(CONFIG_FFT) += arm/fft_vfp.o
VFP-OBJS-$(CONFIG_MDCT) += arm/mdct_vfp.o
VFP-OBJS-$(HAVE_ARMV6) += arm/fmtconvert_vfp.o
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
new file mode 100644
index 0000000..7845ebb
--- /dev/null
+++ b/libavcodec/arm/fft_vfp.S
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+@ TODO: * FFTs wider than 16
+@ * dispatch code
+
+function fft4_vfp
+ vldr d0, [a1, #0*2*4] @ s0,s1 = z[0]
+ vldr d4, [a1, #1*2*4] @ s8,s9 = z[1]
+ vldr d1, [a1, #2*2*4] @ s2,s3 = z[2]
+ vldr d5, [a1, #3*2*4] @ s10,s11 = z[3]
+ @ stall
+ vadd.f s12, s0, s8 @ i0
+ vadd.f s13, s1, s9 @ i1
+ vadd.f s14, s2, s10 @ i2
+ vadd.f s15, s3, s11 @ i3
+ vsub.f s8, s0, s8 @ i4
+ vsub.f s9, s1, s9 @ i5
+ vsub.f s10, s2, s10 @ i6
+ vsub.f s11, s3, s11 @ i7
+ @ stall
+ @ stall
+ vadd.f s0, s12, s14 @ z[0].re
+ vsub.f s4, s12, s14 @ z[2].re
+ vadd.f s1, s13, s15 @ z[0].im
+ vsub.f s5, s13, s15 @ z[2].im
+ vadd.f s7, s9, s10 @ z[3].im
+ vsub.f s3, s9, s10 @ z[1].im
+ vadd.f s2, s8, s11 @ z[1].re
+ vsub.f s6, s8, s11 @ z[3].re
+ @ stall
+ @ stall
+ vstr d0, [a1, #0*2*4]
+ vstr d2, [a1, #2*2*4]
+ @ stall
+ @ stall
+ vstr d1, [a1, #1*2*4]
+ vstr d3, [a1, #3*2*4]
+
+ bx lr
+endfunc
+
+.macro macro_fft8_head
+ @ FFT4
+ vldr d4, [a1, #0 * 2*4]
+ vldr d6, [a1, #1 * 2*4]
+ vldr d5, [a1, #2 * 2*4]
+ vldr d7, [a1, #3 * 2*4]
+ @ BF
+ vldr d12, [a1, #4 * 2*4]
+ vadd.f s16, s8, s12 @ vector op
+ vldr d14, [a1, #5 * 2*4]
+ vldr d13, [a1, #6 * 2*4]
+ vldr d15, [a1, #7 * 2*4]
+ vsub.f s20, s8, s12 @ vector op
+ vadd.f s0, s16, s18
+ vsub.f s2, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s3, s17, s19
+ vadd.f s7, s21, s22
+ vsub.f s5, s21, s22
+ vadd.f s4, s20, s23
+ vsub.f s6, s20, s23
+ vsub.f s20, s24, s28 @ vector op
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s7 to s24-s31 via memory
+ vstr d1, [a1, #1 * 2*4]
+ vldr s0, cos1pi4
+ vadd.f s16, s24, s28 @ vector op
+ vstr d2, [a1, #2 * 2*4]
+ vstr d3, [a1, #3 * 2*4]
+ vldr d12, [a1, #0 * 2*4]
+ @ TRANSFORM
+ vmul.f s20, s20, s0 @ vector x scalar op
+ vldr d13, [a1, #1 * 2*4]
+ vldr d14, [a1, #2 * 2*4]
+ vldr d15, [a1, #3 * 2*4]
+ @ BUTTERFLIES
+ vadd.f s0, s18, s16
+ vadd.f s1, s17, s19
+ vsub.f s2, s17, s19
+ vsub.f s3, s18, s16
+ vadd.f s4, s21, s20
+ vsub.f s5, s21, s20
+ vadd.f s6, s22, s23
+ vsub.f s7, s22, s23
+ vadd.f s8, s0, s24 @ vector op
+ vstr d0, [a1, #0 * 2*4] @ transfer s0-s3 to s12-s15 via memory
+ vstr d1, [a1, #1 * 2*4]
+ vldr d6, [a1, #0 * 2*4]
+ vldr d7, [a1, #1 * 2*4]
+ vadd.f s1, s5, s6
+ vadd.f s0, s7, s4
+ vsub.f s2, s5, s6
+ vsub.f s3, s7, s4
+ vsub.f s12, s24, s12 @ vector op
+ vsub.f s5, s29, s1
+ vsub.f s4, s28, s0
+ vsub.f s6, s30, s2
+ vsub.f s7, s31, s3
+ vadd.f s16, s0, s28 @ vector op
+ vstr d6, [a1, #4 * 2*4]
+ vstr d7, [a1, #6 * 2*4]
+ vstr d4, [a1, #0 * 2*4]
+ vstr d5, [a1, #2 * 2*4]
+ vstr d2, [a1, #5 * 2*4]
+ vstr d3, [a1, #7 * 2*4]
+.endm
+
+.macro macro_fft8_tail
+ vstr d8, [a1, #1 * 2*4]
+ vstr d9, [a1, #3 * 2*4]
+.endm
+
+function fft8_vfp
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+ fmrx a2, FPSCR
+ fmxr FPSCR, a3
+ vpush {s16-s31}
+
+ macro_fft8_head
+ macro_fft8_tail
+
+ vpop {s16-s31}
+ fmxr FPSCR, a2
+ bx lr
+endfunc
+
+.align 3
+cos1pi4: @ cos(1*pi/4) = sqrt(2)/2
+ .float 0.707106769084930419921875
+cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
+ .float 0.92387950420379638671875
+cos3pi8: @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
+ .float 0.3826834261417388916015625
+
+function ff_fft16_vfp, export=1
+ ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1
+ fmrx a2, FPSCR
+ fmxr FPSCR, a3
+ vpush {s16-s31}
+
+ macro_fft8_head
+ @ FFT4(z+8)
+ vldr d10, [a1, #8 * 2*4]
+ vldr d12, [a1, #9 * 2*4]
+ vldr d11, [a1, #10 * 2*4]
+ vldr d13, [a1, #11 * 2*4]
+ macro_fft8_tail
+ vadd.f s16, s20, s24 @ vector op
+ @ FFT4(z+12)
+ vldr d4, [a1, #12 * 2*4]
+ vldr d6, [a1, #13 * 2*4]
+ vldr d5, [a1, #14 * 2*4]
+ vsub.f s20, s20, s24 @ vector op
+ vldr d7, [a1, #15 * 2*4]
+ vadd.f s0, s16, s18
+ vsub.f s4, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s5, s17, s19
+ vadd.f s7, s21, s22
+ vsub.f s3, s21, s22
+ vadd.f s2, s20, s23
+ vsub.f s6, s20, s23
+ vadd.f s16, s8, s12 @ vector op
+ vstr d0, [a1, #8 * 2*4]
+ vstr d2, [a1, #10 * 2*4]
+ vstr d1, [a1, #9 * 2*4]
+ vsub.f s20, s8, s12
+ vstr d3, [a1, #11 * 2*4]
+ @ TRANSFORM(z[2],z[6],z[10],z[14],cos1pi4,cos1pi4)
+ vldr d12, [a1, #10 * 2*4]
+ vadd.f s0, s16, s18
+ vadd.f s1, s17, s19
+ vsub.f s6, s16, s18
+ vsub.f s7, s17, s19
+ vsub.f s3, s21, s22
+ vadd.f s2, s20, s23
+ vadd.f s5, s21, s22
+ vsub.f s4, s20, s23
+ vstr d0, [a1, #12 * 2*4]
+ vmov s0, s6
+ @ TRANSFORM(z[1],z[5],z[9],z[13],cos1pi8,cos3pi8)
+ vldr d6, [a1, #9 * 2*4]
+ vstr d1, [a1, #13 * 2*4]
+ vldr d1, cos1pi4 @ s2 = cos1pi4, s3 = cos1pi8
+ vstr d2, [a1, #15 * 2*4]
+ vldr d7, [a1, #13 * 2*4]
+ vadd.f s4, s25, s24
+ vsub.f s5, s25, s24
+ vsub.f s6, s0, s7
+ vadd.f s7, s0, s7
+ vmul.f s20, s12, s3 @ vector op
+ @ TRANSFORM(z[3],z[7],z[11],z[15],cos3pi8,cos1pi8)
+ vldr d4, [a1, #11 * 2*4]
+ vldr d5, [a1, #15 * 2*4]
+ vldr s1, cos3pi8
+ vmul.f s24, s4, s2 @ vector * scalar op
+ vmul.f s28, s12, s1 @ vector * scalar op
+ vmul.f s12, s8, s1 @ vector * scalar op
+ vadd.f s4, s20, s29
+ vsub.f s5, s21, s28
+ vsub.f s6, s22, s31
+ vadd.f s7, s23, s30
+ vmul.f s8, s8, s3 @ vector * scalar op
+ vldr d8, [a1, #1 * 2*4]
+ vldr d9, [a1, #5 * 2*4]
+ vldr d10, [a1, #3 * 2*4]
+ vldr d11, [a1, #7 * 2*4]
+ vldr d14, [a1, #2 * 2*4]
+ vadd.f s0, s6, s4
+ vadd.f s1, s5, s7
+ vsub.f s2, s5, s7
+ vsub.f s3, s6, s4
+ vadd.f s4, s12, s9
+ vsub.f s5, s13, s8
+ vsub.f s6, s14, s11
+ vadd.f s7, s15, s10
+ vadd.f s12, s0, s16 @ vector op
+ vstr d0, [a1, #1 * 2*4]
+ vstr d1, [a1, #5 * 2*4]
+ vldr d4, [a1, #1 * 2*4]
+ vldr d5, [a1, #5 * 2*4]
+ vadd.f s0, s6, s4
+ vadd.f s1, s5, s7
+ vsub.f s2, s5, s7
+ vsub.f s3, s6, s4
+ vsub.f s8, s16, s8 @ vector op
+ vstr d6, [a1, #1 * 2*4]
+ vstr d7, [a1, #5 * 2*4]
+ vldr d15, [a1, #6 * 2*4]
+ vsub.f s4, s20, s0
+ vsub.f s5, s21, s1
+ vsub.f s6, s22, s2
+ vsub.f s7, s23, s3
+ vadd.f s20, s0, s20 @ vector op
+ vstr d4, [a1, #9 * 2*4]
+ @ TRANSFORM_ZERO(z[0],z[4],z[8],z[12])
+ vldr d6, [a1, #8 * 2*4]
+ vstr d5, [a1, #13 * 2*4]
+ vldr d7, [a1, #12 * 2*4]
+ vstr d2, [a1, #11 * 2*4]
+ vldr d8, [a1, #0 * 2*4]
+ vstr d3, [a1, #15 * 2*4]
+ vldr d9, [a1, #4 * 2*4]
+ vadd.f s0, s26, s24
+ vadd.f s1, s25, s27
+ vsub.f s2, s25, s27
+ vsub.f s3, s26, s24
+ vadd.f s4, s14, s12
+ vadd.f s5, s13, s15
+ vsub.f s6, s13, s15
+ vsub.f s7, s14, s12
+ vadd.f s8, s0, s28 @ vector op
+ vstr d0, [a1, #3 * 2*4]
+ vstr d1, [a1, #7 * 2*4]
+ vldr d6, [a1, #3 * 2*4]
+ vldr d7, [a1, #7 * 2*4]
+ vsub.f s0, s16, s4
+ vsub.f s1, s17, s5
+ vsub.f s2, s18, s6
+ vsub.f s3, s19, s7
+ vsub.f s12, s28, s12 @ vector op
+ vadd.f s16, s4, s16 @ vector op
+ vstr d10, [a1, #3 * 2*4]
+ vstr d11, [a1, #7 * 2*4]
+ vstr d4, [a1, #2 * 2*4]
+ vstr d5, [a1, #6 * 2*4]
+ vstr d0, [a1, #8 * 2*4]
+ vstr d1, [a1, #12 * 2*4]
+ vstr d6, [a1, #10 * 2*4]
+ vstr d7, [a1, #14 * 2*4]
+ vstr d8, [a1, #0 * 2*4]
+ vstr d9, [a1, #4 * 2*4]
+
+ vpop {s16-s31}
+ fmxr FPSCR, a2
+ bx lr
+endfunc
--
1.7.9.5

View File

@ -0,0 +1,140 @@
From ed16009b0a05fbd344832d5ad2e982c169aec42c Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:16 +0100
Subject: [PATCH 47/49] [ffmpeg] - backport - dcadsp: Add a new method,
qmf_32_subbands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This does most of the work formerly carried out by
the static function qmf_32_subbands() in dcadec.c.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/dcadec.c | 26 +++++---------------------
libavcodec/dcadsp.c | 30 ++++++++++++++++++++++++++++++
libavcodec/dcadsp.h | 9 +++++++++
3 files changed, 44 insertions(+), 21 deletions(-)
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index b648613..4054d63 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -1108,10 +1108,8 @@ static void qmf_32_subbands(DCAContext *s, int chans,
float scale)
{
const float *prCoeff;
- int i;
int sb_act = s->subband_activity[chans];
- int subindex;
scale *= sqrt(1 / 8.0);
@@ -1121,25 +1119,11 @@ static void qmf_32_subbands(DCAContext *s, int chans,
else /* Perfect reconstruction */
prCoeff = fir_32bands_perfect;
- for (i = sb_act; i < 32; i++)
- s->raXin[i] = 0.0;
-
- /* Reconstructed channel sample index */
- for (subindex = 0; subindex < 8; subindex++) {
- /* Load in one sample from each subband and clear inactive subbands */
- for (i = 0; i < sb_act; i++) {
- unsigned sign = (i - 1) & 2;
- uint32_t v = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
- AV_WN32A(&s->raXin[i], v);
- }
-
- s->synth.synth_filter_float(&s->imdct,
- s->subband_fir_hist[chans],
- &s->hist_index[chans],
- s->subband_fir_noidea[chans], prCoeff,
- samples_out, s->raXin, scale);
- samples_out += 32;
- }
+ s->dcadsp.qmf_32_subbands(samples_in, sb_act, &s->synth, &s->imdct,
+ s->subband_fir_hist[chans],
+ &s->hist_index[chans],
+ s->subband_fir_noidea[chans], prCoeff,
+ samples_out, s->raXin, scale);
}
static void lfe_interpolation_fir(DCAContext *s, int decimation_select,
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index dd4994d..ab63f1b 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -20,6 +20,7 @@
*/
#include "config.h"
+#include "libavutil/intreadwrite.h"
#include "dcadsp.h"
static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
@@ -44,8 +45,37 @@ static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
}
}
+static void dca_qmf_32_subbands(float samples_in[32][8], int sb_act,
+ SynthFilterContext *synth, FFTContext *imdct,
+ float synth_buf_ptr[512],
+ int *synth_buf_offset, float synth_buf2[32],
+ const float window[512], float *samples_out,
+ float raXin[32], float scale)
+{
+ int i;
+ int subindex;
+
+ for (i = sb_act; i < 32; i++)
+ raXin[i] = 0.0;
+
+ /* Reconstructed channel sample index */
+ for (subindex = 0; subindex < 8; subindex++) {
+ /* Load in one sample from each subband and clear inactive subbands */
+ for (i = 0; i < sb_act; i++) {
+ unsigned sign = (i - 1) & 2;
+ uint32_t v = AV_RN32A(&samples_in[i][subindex]) ^ sign << 30;
+ AV_WN32A(&raXin[i], v);
+ }
+
+ synth->synth_filter_float(imdct, synth_buf_ptr, synth_buf_offset,
+ synth_buf2, window, samples_out, raXin, scale);
+ samples_out += 32;
+ }
+}
+
void ff_dcadsp_init(DCADSPContext *s)
{
s->lfe_fir = dca_lfe_fir_c;
+ s->qmf_32_subbands = dca_qmf_32_subbands;
if (ARCH_ARM) ff_dcadsp_init_arm(s);
}
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index bb157f7..d86c1f3 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -19,9 +19,18 @@
#ifndef AVCODEC_DCADSP_H
#define AVCODEC_DCADSP_H
+#include "avfft.h"
+#include "synth_filter.h"
+
typedef struct DCADSPContext {
void (*lfe_fir)(float *out, const float *in, const float *coefs,
int decifactor, float scale);
+ void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
+ SynthFilterContext *synth, FFTContext *imdct,
+ float synth_buf_ptr[512],
+ int *synth_buf_offset, float synth_buf2[32],
+ const float window[512], float *samples_out,
+ float raXin[32], float scale);
} DCADSPContext;
void ff_dcadsp_init(DCADSPContext *s);
--
1.7.9.5

View File

@ -0,0 +1,551 @@
From a6c273927c5bb212e806be6ae10c81dcd81b2152 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 15 Jul 2013 18:28:17 +0100
Subject: [PATCH 48/49] [ffmpeg] - backport - arm: Add VFP-accelerated version
of qmf_32_subbands
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Before After
Mean StdDev Mean StdDev Change
This function 1323.0 98.0 746.2 60.6 +77.3%
Overall 15400.0 336.4 14147.5 288.4 +8.9%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/arm/dcadsp_init_arm.c | 10 +-
libavcodec/arm/dcadsp_vfp.S | 493 +++++++++++++++++++++++++++
2 files changed, 502 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/arm/dcadsp_vfp.S
diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
index a1efbff..58267a2 100644
--- a/libavcodec/arm/dcadsp_init_arm.c
+++ b/libavcodec/arm/dcadsp_init_arm.c
@@ -26,6 +26,12 @@
void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
int decifactor, float scale);
+void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ SynthFilterContext *synth, FFTContext *imdct,
+ float synth_buf_ptr[512],
+ int *synth_buf_offset, float synth_buf2[32],
+ const float window[512], float *samples_out,
+ float raXin[32], float scale);
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
int decifactor, float scale);
@@ -33,8 +39,10 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
- if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
+ if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
s->lfe_fir = ff_dca_lfe_fir_vfp;
+ s->qmf_32_subbands = ff_dca_qmf_32_subbands_vfp;
+ }
if (have_neon(cpu_flags))
s->lfe_fir = ff_dca_lfe_fir_neon;
}
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
new file mode 100644
index 0000000..6039e87
--- /dev/null
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -0,0 +1,493 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+POUT .req a1
+PIN .req a2
+PCOEF .req a3
+DECIFACTOR .req a4
+OLDFPSCR .req a4
+COUNTER .req ip
+
+SCALE32 .req s28 @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
+SCALE64 .req s0 @ spare register in scalar bank when decifactor=64 / JMAX=4
+IN0 .req s4
+IN1 .req s5
+IN2 .req s6
+IN3 .req s7
+IN4 .req s0
+IN5 .req s1
+IN6 .req s2
+IN7 .req s3
+COEF0 .req s8 @ coefficient elements
+COEF1 .req s9
+COEF2 .req s10
+COEF3 .req s11
+COEF4 .req s12
+COEF5 .req s13
+COEF6 .req s14
+COEF7 .req s15
+ACCUM0 .req s16 @ double-buffered multiply-accumulate results
+ACCUM4 .req s20
+POST0 .req s24 @ do long-latency post-multiply in this vector in parallel
+POST1 .req s25
+POST2 .req s26
+POST3 .req s27
+
+
+.macro inner_loop decifactor, dir, tail, head
+ .ifc "\dir","up"
+ .set X, 0
+ .set Y, 4
+ .else
+ .set X, 4*JMAX*4 - 4
+ .set Y, -4
+ .endif
+ .ifnc "\head",""
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
+ .endif
+ .ifnc "\tail",""
+ vadd.f POST0, ACCUM0, ACCUM4 @ vector operation
+ .endif
+ .ifnc "\head",""
+ vmul.f ACCUM0, COEF0, IN0 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
+ .endif
+ .ifnc "\tail",""
+ vmul.f POST0, POST0, SCALE\decifactor @ vector operation (SCALE may be scalar)
+ .endif
+ .ifnc "\head",""
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
+ .ifc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
+ .ifnc "\tail",""
+ vmul.f ACCUM4, COEF4, IN1 @ vector operation
+ .endif
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
+ .endif
+ .ifnc "\tail",""
+ vstmia POUT!, {POST0-POST3}
+ .endif
+ .ifnc "\head",""
+ vmla.f ACCUM0, COEF0, IN2 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
+ vmla.f ACCUM4, COEF4, IN3 @ vector = vector * scalar
+ .if \decifactor == 32
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
+ vmla.f ACCUM0, COEF0, IN4 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
+ vmla.f ACCUM4, COEF4, IN5 @ vector = vector * scalar
+ vldr COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
+ vldr COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
+ vldr COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
+ vldr COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
+ vmla.f ACCUM0, COEF0, IN6 @ vector = vector * scalar
+ vldr COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
+ vldr COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
+ vldr COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
+ vldr COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
+ vmla.f ACCUM4, COEF4, IN7 @ vector = vector * scalar
+ .endif
+ .endif
+.endm
+
+.macro dca_lfe_fir decifactor
+ .if \decifactor == 32
+ .set JMAX, 8
+ vpush {s16-s31}
+ vmov SCALE32, s0 @ duplicate scalar across vector
+ vldr IN4, [PIN, #-4*4]
+ vldr IN5, [PIN, #-5*4]
+ vldr IN6, [PIN, #-6*4]
+ vldr IN7, [PIN, #-7*4]
+ .else
+ .set JMAX, 4
+ vpush {s16-s27}
+ .endif
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, up,, head
+1: add PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, up, tail, head
+ bne 1b
+ inner_loop \decifactor, up, tail
+
+ mov COUNTER, #\decifactor/4 - 1
+ inner_loop \decifactor, down,, head
+1: sub PCOEF, PCOEF, #4*JMAX*4
+ subs COUNTER, COUNTER, #1
+ inner_loop \decifactor, down, tail, head
+ bne 1b
+ inner_loop \decifactor, down, tail
+
+ .if \decifactor == 32
+ vpop {s16-s31}
+ .else
+ vpop {s16-s27}
+ .endif
+ fmxr FPSCR, OLDFPSCR
+ bx lr
+.endm
+
+
+/* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
+ * int decifactor, float scale)
+ */
+function ff_dca_lfe_fir_vfp, export=1
+ teq DECIFACTOR, #32
+ fmrx OLDFPSCR, FPSCR
+ ldr ip, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
+ fmxr FPSCR, ip
+NOVFP vldr s0, [sp]
+ vldr IN0, [PIN, #-0*4]
+ vldr IN1, [PIN, #-1*4]
+ vldr IN2, [PIN, #-2*4]
+ vldr IN3, [PIN, #-3*4]
+ beq 32f
+64: dca_lfe_fir 64
+ .ltorg
+32: dca_lfe_fir 32
+endfunc
+
+ .unreq POUT
+ .unreq PIN
+ .unreq PCOEF
+ .unreq DECIFACTOR
+ .unreq OLDFPSCR
+ .unreq COUNTER
+
+ .unreq SCALE32
+ .unreq SCALE64
+ .unreq IN0
+ .unreq IN1
+ .unreq IN2
+ .unreq IN3
+ .unreq IN4
+ .unreq IN5
+ .unreq IN6
+ .unreq IN7
+ .unreq COEF0
+ .unreq COEF1
+ .unreq COEF2
+ .unreq COEF3
+ .unreq COEF4
+ .unreq COEF5
+ .unreq COEF6
+ .unreq COEF7
+ .unreq ACCUM0
+ .unreq ACCUM4
+ .unreq POST0
+ .unreq POST1
+ .unreq POST2
+ .unreq POST3
+
+
+IN .req a1
+SBACT .req a2
+OLDFPSCR .req a3
+IMDCT .req a4
+WINDOW .req v1
+OUT .req v2
+BUF .req v3
+SCALEINT .req v4 @ only used in softfp case
+COUNT .req v5
+
+SCALE .req s0
+
+/* Stack layout differs in softfp and hardfp cases:
+ *
+ * hardfp
+ * fp -> 6 arg words saved by caller
+ * a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
+ * s16-s23 on entry
+ * align 16
+ * buf -> 8*32*4 bytes buffer
+ * s0 on entry
+ * sp -> 3 arg words for callee
+ *
+ * softfp
+ * fp -> 7 arg words saved by caller
+ * a4,v1-v5,fp,lr on entry
+ * s16-s23 on entry
+ * align 16
+ * buf -> 8*32*4 bytes buffer
+ * sp -> 4 arg words for callee
+ */
+
+/* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
+ * SynthFilterContext *synth, FFTContext *imdct,
+ * float synth_buf_ptr[512],
+ * int *synth_buf_offset, float synth_buf2[32],
+ * const float window[512], float *samples_out,
+ * float raXin[32], float scale);
+ */
+function ff_dca_qmf_32_subbands_vfp, export=1
+VFP push {a3-a4,v1-v3,v5,fp,lr}
+NOVFP push {a4,v1-v5,fp,lr}
+ add fp, sp, #8*4
+ vpush {s16-s23}
+ @ The buffer pointed at by raXin isn't big enough for us to do a
+ @ complete matrix transposition as we want to, so allocate an
+ @ alternative buffer from the stack. Align to 4 words for speed.
+ sub BUF, sp, #8*32*4
+ bic BUF, BUF, #15
+ mov sp, BUF
+ ldr lr, =0x03330000 @ RunFast mode, short vectors of length 4, stride 2
+ fmrx OLDFPSCR, FPSCR
+ fmxr FPSCR, lr
+ @ COUNT is used to count down 2 things at once:
+ @ bits 0-4 are the number of word pairs remaining in the output row
+ @ bits 5-31 are the number of words to copy (with possible negation)
+ @ from the source matrix before we start zeroing the remainder
+ mov COUNT, #(-4 << 5) + 16
+ adds COUNT, COUNT, SBACT, lsl #5
+ bmi 2f
+1:
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, [IN, #(1*8+0)*4]
+ vldr s11, [IN, #(1*8+1)*4]
+ vldr s13, [IN, #(1*8+2)*4]
+ vldr s15, [IN, #(1*8+3)*4]
+ vneg.f s16, s16
+ vldr s17, [IN, #(1*8+4)*4]
+ vldr s19, [IN, #(1*8+5)*4]
+ vldr s21, [IN, #(1*8+6)*4]
+ vldr s23, [IN, #(1*8+7)*4]
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ vldr s9, [IN, #(3*8+0)*4]
+ vldr s11, [IN, #(3*8+1)*4]
+ vldr s13, [IN, #(3*8+2)*4]
+ vldr s15, [IN, #(3*8+3)*4]
+ vldr s17, [IN, #(3*8+4)*4]
+ vldr s19, [IN, #(3*8+5)*4]
+ vldr s21, [IN, #(3*8+6)*4]
+ vldr s23, [IN, #(3*8+7)*4]
+ vneg.f s9, s9
+ vldr s8, [IN, #(2*8+0)*4]
+ vldr s10, [IN, #(2*8+1)*4]
+ vldr s12, [IN, #(2*8+2)*4]
+ vldr s14, [IN, #(2*8+3)*4]
+ vneg.f s17, s17
+ vldr s16, [IN, #(2*8+4)*4]
+ vldr s18, [IN, #(2*8+5)*4]
+ vldr s20, [IN, #(2*8+6)*4]
+ vldr s22, [IN, #(2*8+7)*4]
+ vstr d4, [BUF, #(0*32+2)*4]
+ vstr d5, [BUF, #(1*32+2)*4]
+ vstr d6, [BUF, #(2*32+2)*4]
+ vstr d7, [BUF, #(3*32+2)*4]
+ vstr d8, [BUF, #(4*32+2)*4]
+ vstr d9, [BUF, #(5*32+2)*4]
+ vstr d10, [BUF, #(6*32+2)*4]
+ vstr d11, [BUF, #(7*32+2)*4]
+ add IN, IN, #4*8*4
+ add BUF, BUF, #4*4
+ subs COUNT, COUNT, #(4 << 5) + 2
+ bpl 1b
+2: @ Now deal with trailing < 4 samples
+ adds COUNT, COUNT, #3 << 5
+ bmi 4f @ sb_act was a multiple of 4
+ bics lr, COUNT, #0x1F
+ bne 3f
+ @ sb_act was n*4+1
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, zero
+ vldr s11, zero
+ vldr s13, zero
+ vldr s15, zero
+ vneg.f s16, s16
+ vldr s17, zero
+ vldr s19, zero
+ vldr s21, zero
+ vldr s23, zero
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #1
+ b 4f
+3: @ sb_act was n*4+2 or n*4+3, so do the first 2
+ vldr s8, [IN, #(0*8+0)*4]
+ vldr s10, [IN, #(0*8+1)*4]
+ vldr s12, [IN, #(0*8+2)*4]
+ vldr s14, [IN, #(0*8+3)*4]
+ vldr s16, [IN, #(0*8+4)*4]
+ vldr s18, [IN, #(0*8+5)*4]
+ vldr s20, [IN, #(0*8+6)*4]
+ vldr s22, [IN, #(0*8+7)*4]
+ vneg.f s8, s8
+ vldr s9, [IN, #(1*8+0)*4]
+ vldr s11, [IN, #(1*8+1)*4]
+ vldr s13, [IN, #(1*8+2)*4]
+ vldr s15, [IN, #(1*8+3)*4]
+ vneg.f s16, s16
+ vldr s17, [IN, #(1*8+4)*4]
+ vldr s19, [IN, #(1*8+5)*4]
+ vldr s21, [IN, #(1*8+6)*4]
+ vldr s23, [IN, #(1*8+7)*4]
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #(2 << 5) + 1
+ bics lr, COUNT, #0x1F
+ bne 4f
+ @ sb_act was n*4+3
+ vldr s8, [IN, #(2*8+0)*4]
+ vldr s10, [IN, #(2*8+1)*4]
+ vldr s12, [IN, #(2*8+2)*4]
+ vldr s14, [IN, #(2*8+3)*4]
+ vldr s16, [IN, #(2*8+4)*4]
+ vldr s18, [IN, #(2*8+5)*4]
+ vldr s20, [IN, #(2*8+6)*4]
+ vldr s22, [IN, #(2*8+7)*4]
+ vldr s9, zero
+ vldr s11, zero
+ vldr s13, zero
+ vldr s15, zero
+ vldr s17, zero
+ vldr s19, zero
+ vldr s21, zero
+ vldr s23, zero
+ vstr d4, [BUF, #(0*32+0)*4]
+ vstr d5, [BUF, #(1*32+0)*4]
+ vstr d6, [BUF, #(2*32+0)*4]
+ vstr d7, [BUF, #(3*32+0)*4]
+ vstr d8, [BUF, #(4*32+0)*4]
+ vstr d9, [BUF, #(5*32+0)*4]
+ vstr d10, [BUF, #(6*32+0)*4]
+ vstr d11, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ sub COUNT, COUNT, #1
+4: @ Now fill the remainder with 0
+ vldr s8, zero
+ vldr s9, zero
+ ands COUNT, COUNT, #0x1F
+ beq 6f
+5: vstr d4, [BUF, #(0*32+0)*4]
+ vstr d4, [BUF, #(1*32+0)*4]
+ vstr d4, [BUF, #(2*32+0)*4]
+ vstr d4, [BUF, #(3*32+0)*4]
+ vstr d4, [BUF, #(4*32+0)*4]
+ vstr d4, [BUF, #(5*32+0)*4]
+ vstr d4, [BUF, #(6*32+0)*4]
+ vstr d4, [BUF, #(7*32+0)*4]
+ add BUF, BUF, #2*4
+ subs COUNT, COUNT, #1
+ bne 5b
+6:
+ fmxr FPSCR, OLDFPSCR
+ ldr WINDOW, [fp, #3*4]
+ ldr OUT, [fp, #4*4]
+ sub BUF, BUF, #32*4
+NOVFP ldr SCALEINT, [fp, #6*4]
+ mov COUNT, #8
+VFP vpush {SCALE}
+VFP sub sp, sp, #3*4
+NOVFP sub sp, sp, #4*4
+7:
+VFP ldr a1, [fp, #-7*4] @ imdct
+NOVFP ldr a1, [fp, #-8*4]
+ ldmia fp, {a2-a4}
+VFP stmia sp, {WINDOW, OUT, BUF}
+NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
+VFP vldr SCALE, [sp, #3*4]
+ bl ff_synth_filter_float_vfp
+ add OUT, OUT, #32*4
+ add BUF, BUF, #32*4
+ subs COUNT, COUNT, #1
+ bne 7b
+
+A sub sp, fp, #(8+8)*4
+T sub fp, fp, #(8+8)*4
+T mov sp, fp
+ vpop {s16-s23}
+VFP pop {a3-a4,v1-v3,v5,fp,pc}
+NOVFP pop {a4,v1-v5,fp,pc}
+endfunc
+
+ .unreq IN
+ .unreq SBACT
+ .unreq OLDFPSCR
+ .unreq IMDCT
+ .unreq WINDOW
+ .unreq OUT
+ .unreq BUF
+ .unreq SCALEINT
+ .unreq COUNT
+
+ .unreq SCALE
+
+ .align 2
+zero: .word 0
--
1.7.9.5

View File

@ -0,0 +1,64 @@
From 101f5a2c5db12605c24fe4aa41b3fabacfd3bad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Mon, 22 Jul 2013 12:33:22 +0300
Subject: [PATCH 49/49] [ffmpeg] - backport - arm: Mangle external symbols
properly in new vfp assembly files
Reviewed-by: Kostya Shishkov
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
libavcodec/arm/dcadsp_vfp.S | 2 +-
libavcodec/arm/mdct_vfp.S | 4 ++--
libavcodec/arm/synth_filter_vfp.S | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/libavcodec/arm/dcadsp_vfp.S b/libavcodec/arm/dcadsp_vfp.S
index 6039e87..5892a84 100644
--- a/libavcodec/arm/dcadsp_vfp.S
+++ b/libavcodec/arm/dcadsp_vfp.S
@@ -463,7 +463,7 @@ NOVFP ldr a1, [fp, #-8*4]
VFP stmia sp, {WINDOW, OUT, BUF}
NOVFP stmia sp, {WINDOW, OUT, BUF, SCALEINT}
VFP vldr SCALE, [sp, #3*4]
- bl ff_synth_filter_float_vfp
+ bl X(ff_synth_filter_float_vfp)
add OUT, OUT, #32*4
add BUF, BUF, #32*4
subs COUNT, COUNT, #1
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
index 0623e96..94db24f 100644
--- a/libavcodec/arm/mdct_vfp.S
+++ b/libavcodec/arm/mdct_vfp.S
@@ -151,7 +151,7 @@ function ff_imdct_half_vfp, export=1
ldr ip, [CONTEXT, #5*4] @ mdct_bits
teq ip, #6
it ne
- bne ff_imdct_half_c @ only case currently accelerated is the one used by DCA
+ bne X(ff_imdct_half_c) @ only case currently accelerated is the one used by DCA
.set n, 1<<6
.set n2, n/2
@@ -175,7 +175,7 @@ function ff_imdct_half_vfp, export=1
fmxr FPSCR, OLDFPSCR
mov a1, OUT
- bl ff_fft16_vfp
+ bl X(ff_fft16_vfp)
ldr lr, =0x03030000 @ RunFast mode, short vectors of length 4, stride 1
fmxr FPSCR, lr
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index c219c41..e6e6408 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -132,7 +132,7 @@ function ff_synth_filter_float_vfp, export=1
str lr, [P_SB_OFF] @ rotate offset, modulo buffer size, ready for next call
ldr a3, [sp, #(16+6+2)*4] @ fetch in from stack, to pass to imdct_half
VFP vmov s16, SCALE @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
- bl ff_imdct_half_vfp
+ bl X(ff_imdct_half_vfp)
VFP vmov SCALE, s16
fmrx OLDFPSCR, FPSCR
--
1.7.9.5

View File

@ -0,0 +1,72 @@
From 5ce8f2bf354b7adf904ac3e1438915586c5a0bb1 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 31 Jul 2013 23:46:08 +0100
Subject: [PATCH 51/54] [ffmpeg] - backport - avio: Add an internal function
for reading without copying
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
As long as there is enough contiguous data in the avio buffer,
just return a pointer to it instead of copying it to the caller
provided buffer.
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavformat/avio_internal.h | 17 +++++++++++++++++
libavformat/aviobuf.c | 12 ++++++++++++
2 files changed, 29 insertions(+)
diff --git a/libavformat/avio_internal.h b/libavformat/avio_internal.h
index cf36764..e9ece57 100644
--- a/libavformat/avio_internal.h
+++ b/libavformat/avio_internal.h
@@ -38,6 +38,23 @@ int ffio_init_context(AVIOContext *s,
/**
+ * Read size bytes from AVIOContext, returning a pointer.
+ * Note that the data pointed at by the returned pointer is only
+ * valid until the next call that references the same IO context.
+ * @param s IO context
+ * @param buf pointer to buffer into which to assemble the requested
+ * data if it is not available in contiguous addresses in the
+ * underlying buffer
+ * @param size number of bytes requested
+ * @param data address at which to store pointer: this will be a
+ * direct pointer into the underlying buffer if the requested
+ * number of bytes are available at contiguous addresses, otherwise
+ * it will be set to buf
+ * @return number of bytes read or AVERROR
+ */
+int ffio_read_indirect(AVIOContext *s, unsigned char *buf, int size, unsigned char **data);
+
+/**
* Read size bytes from AVIOContext into buf.
* This reads at most 1 packet. If that is not enough fewer bytes will be
* returned.
diff --git a/libavformat/aviobuf.c b/libavformat/aviobuf.c
index 7a73a17..465c46d 100644
--- a/libavformat/aviobuf.c
+++ b/libavformat/aviobuf.c
@@ -522,6 +522,18 @@ int avio_read(AVIOContext *s, unsigned char *buf, int size)
return size1 - size;
}
+int ffio_read_indirect(AVIOContext *s, unsigned char *buf, int size, unsigned char **data)
+{
+ if (s->buf_end - s->buf_ptr >= size && !s->write_flag) { /* size bytes already buffered and write_flag is clear */
+ *data = s->buf_ptr; /* return a pointer into the context's buffer, no copy */
+ s->buf_ptr += size; /* consume the bytes */
+ return size;
+ } else {
+ *data = buf; /* otherwise let avio_read() fill the caller's buffer */
+ return avio_read(s, buf, size);
+ }
+}
+
int ffio_read_partial(AVIOContext *s, unsigned char *buf, int size)
{
int len;
--
1.7.9.5

View File

@ -0,0 +1,149 @@
From 1496d8c12075c0f3783e348a5d73fef9e3000b0f Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 31 Jul 2013 23:46:08 +0100
Subject: [PATCH 52/54] [ffmpeg] - backport - mpegts: Remove one memcpy per
packet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This was being performed to ensure that a complete packet was held in
contiguous memory, prior to parsing the packet. However, the source buffer
is typically large enough that the packet was already contiguous, so it is
beneficial to return the packet by reference in most cases.
Before After
Mean StdDev Mean StdDev Change
memcpy 720.7 32.7 649.8 25.1 +10.9%
Overall 2372.7 46.1 2291.7 21.8 +3.5%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavformat/mpegts.c | 41 ++++++++++++++++++++++++++-------------
1 file changed, 28 insertions(+), 13 deletions(-)
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index b5f5d63..5307521 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -1863,17 +1863,17 @@ static int mpegts_resync(AVFormatContext *s)
}
/* return -1 if error or EOF. Return 0 if OK. */
-static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size)
+static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size, uint8_t **data)
{
AVIOContext *pb = s->pb;
- int skip, len;
+ int len;
for(;;) {
- len = avio_read(pb, buf, TS_PACKET_SIZE);
+ len = ffio_read_indirect(pb, buf, TS_PACKET_SIZE, data);
if (len != TS_PACKET_SIZE)
return len < 0 ? len : AVERROR_EOF;
/* check packet sync byte */
- if (buf[0] != 0x47) {
+ if ((*data)[0] != 0x47) {
/* find a new packet start */
avio_seek(pb, -TS_PACKET_SIZE, SEEK_CUR);
if (mpegts_resync(s) < 0)
@@ -1881,19 +1881,25 @@ static int read_packet(AVFormatContext *s, uint8_t *buf, int raw_packet_size)
else
continue;
} else {
- skip = raw_packet_size - TS_PACKET_SIZE;
- if (skip > 0)
- avio_skip(pb, skip);
break;
}
}
return 0;
}
+static void finished_reading_packet(AVFormatContext *s, int raw_packet_size)
+{
+ AVIOContext *pb = s->pb;
+ int skip = raw_packet_size - TS_PACKET_SIZE; /* bytes of the raw packet beyond TS_PACKET_SIZE */
+ if (skip > 0)
+ avio_skip(pb, skip); /* skip them; this step was moved out of read_packet() */
+}
+
static int handle_packets(MpegTSContext *ts, int nb_packets)
{
AVFormatContext *s = ts->stream;
uint8_t packet[TS_PACKET_SIZE + FF_INPUT_BUFFER_PADDING_SIZE];
+ uint8_t *data;
int packet_num, ret = 0;
if (avio_tell(s->pb) != ts->last_pos) {
@@ -1926,10 +1932,11 @@ static int handle_packets(MpegTSContext *ts, int nb_packets)
if (ts->stop_parse > 0)
break;
- ret = read_packet(s, packet, ts->raw_packet_size);
+ ret = read_packet(s, packet, ts->raw_packet_size, &data);
if (ret != 0)
break;
- ret = handle_packet(ts, packet);
+ ret = handle_packet(ts, data);
+ finished_reading_packet(s, ts->raw_packet_size);
if (ret != 0)
break;
}
@@ -2087,6 +2094,7 @@ static int mpegts_read_header(AVFormatContext *s)
int64_t pcrs[2], pcr_h;
int packet_count[2];
uint8_t packet[TS_PACKET_SIZE];
+ uint8_t *data;
/* only read packets */
@@ -2102,18 +2110,21 @@ static int mpegts_read_header(AVFormatContext *s)
nb_pcrs = 0;
nb_packets = 0;
for(;;) {
- ret = read_packet(s, packet, ts->raw_packet_size);
+ ret = read_packet(s, packet, ts->raw_packet_size, &data);
if (ret < 0)
return -1;
- pid = AV_RB16(packet + 1) & 0x1fff;
+ pid = AV_RB16(data + 1) & 0x1fff;
if ((pcr_pid == -1 || pcr_pid == pid) &&
- parse_pcr(&pcr_h, &pcr_l, packet) == 0) {
+ parse_pcr(&pcr_h, &pcr_l, data) == 0) {
+ finished_reading_packet(s, ts->raw_packet_size);
pcr_pid = pid;
packet_count[nb_pcrs] = nb_packets;
pcrs[nb_pcrs] = pcr_h * 300 + pcr_l;
nb_pcrs++;
if (nb_pcrs >= 2)
break;
+ } else {
+ finished_reading_packet(s, ts->raw_packet_size);
}
nb_packets++;
}
@@ -2145,15 +2156,19 @@ static int mpegts_raw_read_packet(AVFormatContext *s,
int64_t pcr_h, next_pcr_h, pos;
int pcr_l, next_pcr_l;
uint8_t pcr_buf[12];
+ uint8_t *data;
if (av_new_packet(pkt, TS_PACKET_SIZE) < 0)
return AVERROR(ENOMEM);
pkt->pos= avio_tell(s->pb);
- ret = read_packet(s, pkt->data, ts->raw_packet_size);
+ ret = read_packet(s, pkt->data, ts->raw_packet_size, &data);
if (ret < 0) {
av_free_packet(pkt);
return ret;
}
+ if (data != pkt->data)
+ memcpy(pkt->data, data, ts->raw_packet_size);
+ finished_reading_packet(s, ts->raw_packet_size);
if (ts->mpeg2ts_compute_pcr) {
/* compute exact PCR for each packet */
if (parse_pcr(&pcr_h, &pcr_l, pkt->data) == 0) {
--
1.7.9.5

View File

@ -0,0 +1,47 @@
From 6aec5772fd5331b3514f308ab0895f6234b60045 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 5 Aug 2013 13:12:51 +0100
Subject: [PATCH 53/54] [ffmpeg] - backport - mpegts: Make discard_pid()
faster for single-program streams
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
When a stream contains a single program, there's no point in doing a
PID -> program lookup. Normally the one and only program isn't disabled,
so no packets should be discarded.
Before After
Mean StdDev Mean StdDev Change
discard_pid() 73.8 9.4 20.2 1.5 +264.8%
Overall 2300.8 28.0 2253.1 20.6 +2.1%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavformat/mpegts.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 5307521..82dd209 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -268,6 +268,17 @@ static int discard_pid(MpegTSContext *ts, unsigned int pid)
int i, j, k;
int used = 0, discarded = 0;
struct Program *p;
+
+ /* If none of the programs have .discard=AVDISCARD_ALL then there's
+ * no way we have to discard this packet
+ */
+ for (k = 0; k < ts->stream->nb_programs; k++) {
+ if (ts->stream->programs[k]->discard == AVDISCARD_ALL)
+ break;
+ }
+ if (k == ts->stream->nb_programs)
+ return 0;
+
for(i=0; i<ts->nb_prg; i++) {
p = &ts->prg[i];
for(j=0; j<p->nb_pids; j++) {
--
1.7.9.5

View File

@ -0,0 +1,76 @@
From b79aa2b89ed9027a72a10c1d26ccdf2bb385d57b Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 5 Aug 2013 13:12:49 +0100
Subject: [PATCH 54/54] [ffmpeg] - backport - mpegts: Remove one 64-bit
integer modulus operation per packet
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The common case of the pointer having increased by one packet (which results
in no change to the modulus) can be detected with a 64-bit subtraction,
which is far cheaper than a division on many platforms.
Before After
Mean StdDev Mean StdDev Change
Divisions 248.3 8.8 51.5 7.4 +381.7%
Overall 2773.2 25.6 2372.5 43.1 +16.9%
Signed-off-by: Martin Storsjö <martin@martin.st>
---
libavcodec/mathops.h | 9 +++++++++
libavformat/mpegts.c | 5 ++++-
2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/libavcodec/mathops.h b/libavcodec/mathops.h
index 592f5a5..1d57342 100644
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -195,6 +195,15 @@ if ((y) < (x)) {\
# define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
#endif /* FASTDIV */
+#ifndef MOD_UNLIKELY
+# define MOD_UNLIKELY(modulus, dividend, divisor, prev_dividend) /* note: arguments are evaluated more than once */ \
+ do { \
+ if ((prev_dividend) == 0 || (dividend) - (prev_dividend) != (divisor)) /* recompute unless dividend advanced by exactly divisor */ \
+ (modulus) = (dividend) % (divisor); \
+ (prev_dividend) = (dividend); /* remember the dividend for the next invocation */ \
+ } while (0)
+#endif
+
static inline av_const unsigned int ff_sqrt(unsigned int a)
{
unsigned int b;
diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
index 82dd209..b995f60 100644
--- a/libavformat/mpegts.c
+++ b/libavformat/mpegts.c
@@ -28,6 +28,7 @@
#include "libavutil/avassert.h"
#include "libavcodec/bytestream.h"
#include "libavcodec/get_bits.h"
+#include "libavcodec/mathops.h"
#include "avformat.h"
#include "mpegts.h"
#include "internal.h"
@@ -99,6 +100,8 @@ struct MpegTSContext {
int raw_packet_size;
int pos47;
+ /** position corresponding to pos47, or 0 if pos47 invalid */
+ int64_t pos;
/** if true, all pids are analyzed to find streams */
int auto_guess;
@@ -1814,7 +1817,7 @@ static int handle_packet(MpegTSContext *ts, const uint8_t *packet)
return 0;
pos = avio_tell(ts->stream->pb);
- ts->pos47= pos % ts->raw_packet_size;
+ MOD_UNLIKELY(ts->pos47, pos, ts->raw_packet_size, ts->pos);
if (tss->type == MPEGTS_SECTION) {
if (is_start) {
--
1.7.9.5

View File

@ -0,0 +1,110 @@
From 8067f55edf3719182aed6e5b57b7863889f80218 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Reimar=20D=C3=B6ffinger?= <Reimar.Doeffinger@gmx.de>
Date: Sat, 16 Mar 2013 13:36:20 +0100
Subject: [PATCH] Fix compilation on ARM with android gcc 4.7
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit
With the current code it fails due to running out
of registers.
So code the store offsets manually into the assembler
instead.
Passes "make fate-dts".
Signed-off-by: Reimar Döffinger <Reimar.Doeffinger@gmx.de>
---
libavcodec/arm/dca.h | 74 ++++++++++++++++++++++++--------------------------
1 file changed, 36 insertions(+), 38 deletions(-)
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 2cfd18a..431b62e 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -34,46 +34,44 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
{
int v0, v1, v2, v3, v4, v5;
- __asm__ ("smmul %8, %14, %18 \n"
- "smmul %11, %15, %18 \n"
- "smlabb %14, %8, %17, %14 \n"
- "smlabb %15, %11, %17, %15 \n"
- "smmul %9, %8, %18 \n"
- "smmul %12, %11, %18 \n"
- "sub %14, %14, %16, lsr #1 \n"
- "sub %15, %15, %16, lsr #1 \n"
- "smlabb %8, %9, %17, %8 \n"
- "smlabb %11, %12, %17, %11 \n"
- "smmul %10, %9, %18 \n"
- "smmul %13, %12, %18 \n"
- "str %14, %0 \n"
- "str %15, %4 \n"
- "sub %8, %8, %16, lsr #1 \n"
- "sub %11, %11, %16, lsr #1 \n"
- "smlabb %9, %10, %17, %9 \n"
- "smlabb %12, %13, %17, %12 \n"
- "smmul %14, %10, %18 \n"
- "smmul %15, %13, %18 \n"
- "str %8, %1 \n"
- "str %11, %5 \n"
- "sub %9, %9, %16, lsr #1 \n"
- "sub %12, %12, %16, lsr #1 \n"
- "smlabb %10, %14, %17, %10 \n"
- "smlabb %13, %15, %17, %13 \n"
- "str %9, %2 \n"
- "str %12, %6 \n"
- "sub %10, %10, %16, lsr #1 \n"
- "sub %13, %13, %16, lsr #1 \n"
- "str %10, %3 \n"
- "str %13, %7 \n"
- : "=m"(values[0]), "=m"(values[1]),
- "=m"(values[2]), "=m"(values[3]),
- "=m"(values[4]), "=m"(values[5]),
- "=m"(values[6]), "=m"(values[7]),
- "=&r"(v0), "=&r"(v1), "=&r"(v2),
+ __asm__ ("smmul %0, %6, %10 \n"
+ "smmul %3, %7, %10 \n"
+ "smlabb %6, %0, %9, %6 \n"
+ "smlabb %7, %3, %9, %7 \n"
+ "smmul %1, %0, %10 \n"
+ "smmul %4, %3, %10 \n"
+ "sub %6, %6, %8, lsr #1 \n"
+ "sub %7, %7, %8, lsr #1 \n"
+ "smlabb %0, %1, %9, %0 \n"
+ "smlabb %3, %4, %9, %3 \n"
+ "smmul %2, %1, %10 \n"
+ "smmul %5, %4, %10 \n"
+ "str %6, [%11, #0] \n"
+ "str %7, [%11, #16] \n"
+ "sub %0, %0, %8, lsr #1 \n"
+ "sub %3, %3, %8, lsr #1 \n"
+ "smlabb %1, %2, %9, %1 \n"
+ "smlabb %4, %5, %9, %4 \n"
+ "smmul %6, %2, %10 \n"
+ "smmul %7, %5, %10 \n"
+ "str %0, [%11, #4] \n"
+ "str %3, [%11, #20] \n"
+ "sub %1, %1, %8, lsr #1 \n"
+ "sub %4, %4, %8, lsr #1 \n"
+ "smlabb %2, %6, %9, %2 \n"
+ "smlabb %5, %7, %9, %5 \n"
+ "str %1, [%11, #8] \n"
+ "str %4, [%11, #24] \n"
+ "sub %2, %2, %8, lsr #1 \n"
+ "sub %5, %5, %8, lsr #1 \n"
+ "str %2, [%11, #12] \n"
+ "str %5, [%11, #28] \n"
+ : "=&r"(v0), "=&r"(v1), "=&r"(v2),
"=&r"(v3), "=&r"(v4), "=&r"(v5),
"+&r"(code1), "+&r"(code2)
- : "r"(levels - 1), "r"(-levels), "r"(ff_inverse[levels]));
+ : "r"(levels - 1), "r"(-levels),
+ "r"(ff_inverse[levels]), "r"(values)
+ : "memory");
return code1 | code2;
}
--
1.7.10.4

View File

@ -1,42 +1,15 @@
From 67895a77c9e5f519166dd0ce4a2a98649194b11b Mon Sep 17 00:00:00 2001
From: Rainer Hochecker <fernetmenta@online.de>
Date: Sat, 8 Oct 2011 16:45:13 +0200
Subject: [PATCH] ffmpeg: add xvba hwaccel
---
configure | 11 ++
libavcodec/Makefile | 6 ++
libavcodec/allcodecs.c | 4 +
libavcodec/h264.c | 3 +
libavcodec/xvba.c | 66 ++++++++++++
libavcodec/xvba.h | 71 +++++++++++++
libavcodec/xvba_h264.c | 192 ++++++++++++++++++++++++++++++++++
libavcodec/xvba_internal.h | 24 +++++
libavcodec/xvba_mpeg2.c | 52 +++++++++
libavcodec/xvba_vc1.c | 190 +++++++++++++++++++++++++++++++++
libavutil/pixdesc.c | 6 ++
libavutil/pixfmt.h | 1 +
12 files changed, 626 insertions(+)
create mode 100644 libavcodec/xvba.c
create mode 100644 libavcodec/xvba.h
create mode 100644 libavcodec/xvba_h264.c
create mode 100644 libavcodec/xvba_internal.h
create mode 100644 libavcodec/xvba_mpeg2.c
create mode 100644 libavcodec/xvba_vc1.c
diff --git a/configure b/configure
index 351611d..876a6ea 100755
--- a/configure
+++ b/configure
@@ -144,6 +144,7 @@ Hardware accelerators:
--enable-vaapi enable VAAPI code
diff -Naur ffmpeg-1.2.3/configure ffmpeg-1.2.3.patch/configure
--- ffmpeg-1.2.3/configure 2013-09-09 22:46:04.636832059 +0200
+++ ffmpeg-1.2.3.patch/configure 2013-09-09 22:47:15.023872481 +0200
@@ -144,6 +144,7 @@
--disable-vaapi disable VAAPI code [autodetect]
--enable-vda enable VDA code
--enable-vdpau enable VDPAU code
--disable-vdpau disable VDPAU code [autodetect]
+ --disable-xvba disable XVBA code
Individual component options:
--disable-everything disable all components listed below
@@ -1197,6 +1198,7 @@ HWACCEL_LIST="
@@ -1197,6 +1198,7 @@
vaapi
vda
vdpau
@ -44,7 +17,7 @@ index 351611d..876a6ea 100755
"
LIBRARY_LIST="
@@ -1827,6 +1829,7 @@ crystalhd_deps="libcrystalhd_libcrystalhd_if_h"
@@ -1827,6 +1829,7 @@
dxva2_deps="dxva2api_h"
vaapi_deps="va_va_h"
vda_deps="VideoDecodeAcceleration_VDADecoder_h pthreads"
@ -52,7 +25,7 @@ index 351611d..876a6ea 100755
vda_extralibs="-framework CoreFoundation -framework VideoDecodeAcceleration -framework QuartzCore"
vdpau_deps="vdpau_vdpau_h vdpau_vdpau_x11_h"
@@ -1847,6 +1850,8 @@ h264_vdpau_decoder_deps="vdpau"
@@ -1847,6 +1850,8 @@
h264_vdpau_decoder_select="h264_decoder"
h264_vdpau_hwaccel_deps="vdpau"
h264_vdpau_hwaccel_select="h264_decoder"
@ -61,7 +34,7 @@ index 351611d..876a6ea 100755
mpeg_vdpau_decoder_deps="vdpau"
mpeg_vdpau_decoder_select="mpegvideo_decoder"
mpeg1_vdpau_decoder_deps="vdpau"
@@ -1859,6 +1864,8 @@ mpeg2_dxva2_hwaccel_select="mpeg2video_decoder"
@@ -1859,6 +1864,8 @@
mpeg2_vaapi_hwaccel_deps="vaapi"
mpeg2_vaapi_hwaccel_select="mpeg2video_decoder"
mpeg2_vdpau_hwaccel_deps="vdpau"
@ -70,7 +43,7 @@ index 351611d..876a6ea 100755
mpeg2_vdpau_hwaccel_select="mpeg2video_decoder"
mpeg4_crystalhd_decoder_select="crystalhd"
mpeg4_vaapi_hwaccel_deps="vaapi"
@@ -1877,11 +1884,14 @@ vc1_vdpau_decoder_deps="vdpau"
@@ -1877,11 +1884,14 @@
vc1_vdpau_decoder_select="vc1_decoder"
vc1_vdpau_hwaccel_deps="vdpau"
vc1_vdpau_hwaccel_select="vc1_decoder"
@ -85,71 +58,18 @@ index 351611d..876a6ea 100755
# parsers
h264_parser_select="golomb h264chroma h264dsp h264pred h264qpel videodsp"
@@ -3832,6 +3842,7 @@ check_header termios.h
check_header unistd.h
@@ -3836,6 +3846,7 @@
check_header vdpau/vdpau.h
check_header vdpau/vdpau_x11.h
+check_header amd/amdxvba.h
check_cpp_condition vdpau/vdpau.h "defined(VDP_DECODER_PROFILE_MPEG4_PART2_SP)" && enable vdpau_mpeg4_support
+check_header amd/amdxvba.h
check_header VideoDecodeAcceleration/VDADecoder.h
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index dc065a5..c386923 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -12,6 +12,7 @@ HEADERS = avcodec.h \
vdpau.h \
version.h \
xvmc.h \
+ xvba.h \
OBJS = allcodecs.o \
audioconvert.o \
@@ -73,6 +74,7 @@ OBJS-$(CONFIG_SHARED) += log2_tab.o
OBJS-$(CONFIG_SINEWIN) += sinewin.o
OBJS-$(CONFIG_VAAPI) += vaapi.o
OBJS-$(CONFIG_VDPAU) += vdpau.o
+OBJS-$(CONFIG_XVBA) += xvba.o
OBJS-$(CONFIG_VIDEODSP) += videodsp.o
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
@@ -232,6 +234,7 @@ OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o
OBJS-$(CONFIG_H264_VDA_HWACCEL) += vda_h264.o
OBJS-$(CONFIG_H264_VDA_DECODER) += vda_h264_dec.o
OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o
+OBJS-$(CONFIG_H264_XVBA_HWACCEL) += xvba_h264.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o huffyuvdec.o
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o huffyuvenc.o
OBJS-$(CONFIG_IAC_DECODER) += imc.o
@@ -295,6 +298,7 @@ OBJS-$(CONFIG_MPEG1VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o
OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o
OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG2_XVBA_HWACCEL) += xvba_mpeg2.o
OBJS-$(CONFIG_MPEG2VIDEO_DECODER) += mpeg12.o mpeg12data.o
OBJS-$(CONFIG_MPEG2VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
timecode.o
@@ -459,6 +463,7 @@ OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \
OBJS-$(CONFIG_VC1_DXVA2_HWACCEL) += dxva2_vc1.o
OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o
OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o
+OBJS-$(CONFIG_VC1_XVBA_HWACCEL) += xvba_vc1.o
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
@@ -788,6 +793,7 @@ SKIPHEADERS-$(CONFIG_LIBSCHROEDINGER) += libschroedinger.h
SKIPHEADERS-$(CONFIG_LIBUTVIDEO) += libutvideo.h
SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_internal.h
+SKIPHEADERS-$(CONFIG_XVBA) += xvba_internal.h
SKIPHEADERS-$(CONFIG_VDA) += vda.h
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h
SKIPHEADERS-$(HAVE_OS2THREADS) += os2threads.h
diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
index 584446f..7a8f61c 100644
--- a/libavcodec/allcodecs.c
+++ b/libavcodec/allcodecs.c
@@ -79,18 +79,22 @@ void avcodec_register_all(void)
check_header windows.h
diff -Naur ffmpeg-1.2.3/libavcodec/allcodecs.c ffmpeg-1.2.3.patch/libavcodec/allcodecs.c
--- ffmpeg-1.2.3/libavcodec/allcodecs.c 2013-08-27 02:13:44.000000000 +0200
+++ ffmpeg-1.2.3.patch/libavcodec/allcodecs.c 2013-09-09 22:46:40.577852790 +0200
@@ -79,18 +79,22 @@
REGISTER_HWACCEL(H264_VAAPI, h264_vaapi);
REGISTER_HWACCEL(H264_VDA, h264_vda);
REGISTER_HWACCEL(H264_VDPAU, h264_vdpau);
@ -172,10 +92,9 @@ index 584446f..7a8f61c 100644
/* video codecs */
REGISTER_ENCODER(A64MULTI, a64multi);
diff --git a/libavcodec/h264.c b/libavcodec/h264.c
index 937ad7a..299039f 100644
--- a/libavcodec/h264.c
+++ b/libavcodec/h264.c
diff -Naur ffmpeg-1.2.3/libavcodec/h264.c ffmpeg-1.2.3.patch/libavcodec/h264.c
--- ffmpeg-1.2.3/libavcodec/h264.c 2013-09-09 22:46:04.639832061 +0200
+++ ffmpeg-1.2.3.patch/libavcodec/h264.c 2013-09-09 22:46:40.579852792 +0200
@@ -81,6 +81,9 @@
#if CONFIG_H264_VDPAU_HWACCEL
AV_PIX_FMT_VDPAU,
@ -186,11 +105,60 @@ index 937ad7a..299039f 100644
AV_PIX_FMT_YUV420P,
AV_PIX_FMT_NONE
};
diff --git a/libavcodec/xvba.c b/libavcodec/xvba.c
new file mode 100644
index 0000000..be29e5d
--- /dev/null
+++ b/libavcodec/xvba.c
diff -Naur ffmpeg-1.2.3/libavcodec/Makefile ffmpeg-1.2.3.patch/libavcodec/Makefile
--- ffmpeg-1.2.3/libavcodec/Makefile 2013-08-27 02:13:44.000000000 +0200
+++ ffmpeg-1.2.3.patch/libavcodec/Makefile 2013-09-09 22:46:40.580852793 +0200
@@ -12,6 +12,7 @@
vdpau.h \
version.h \
xvmc.h \
+ xvba.h \
OBJS = allcodecs.o \
audioconvert.o \
@@ -73,6 +74,7 @@
OBJS-$(CONFIG_SINEWIN) += sinewin.o
OBJS-$(CONFIG_VAAPI) += vaapi.o
OBJS-$(CONFIG_VDPAU) += vdpau.o
+OBJS-$(CONFIG_XVBA) += xvba.o
OBJS-$(CONFIG_VIDEODSP) += videodsp.o
OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
@@ -232,6 +234,7 @@
OBJS-$(CONFIG_H264_VDA_HWACCEL) += vda_h264.o
OBJS-$(CONFIG_H264_VDA_DECODER) += vda_h264_dec.o
OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o
+OBJS-$(CONFIG_H264_XVBA_HWACCEL) += xvba_h264.o
OBJS-$(CONFIG_HUFFYUV_DECODER) += huffyuv.o huffyuvdec.o
OBJS-$(CONFIG_HUFFYUV_ENCODER) += huffyuv.o huffyuvenc.o
OBJS-$(CONFIG_IAC_DECODER) += imc.o
@@ -295,6 +298,7 @@
OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o
OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o
OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o
+OBJS-$(CONFIG_MPEG2_XVBA_HWACCEL) += xvba_mpeg2.o
OBJS-$(CONFIG_MPEG2VIDEO_DECODER) += mpeg12.o mpeg12data.o
OBJS-$(CONFIG_MPEG2VIDEO_ENCODER) += mpeg12enc.o mpeg12.o \
timecode.o
@@ -459,6 +463,7 @@
OBJS-$(CONFIG_VC1_DXVA2_HWACCEL) += dxva2_vc1.o
OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o
OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += vdpau_vc1.o
+OBJS-$(CONFIG_VC1_XVBA_HWACCEL) += xvba_vc1.o
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
@@ -788,6 +793,7 @@
SKIPHEADERS-$(CONFIG_LIBUTVIDEO) += libutvideo.h
SKIPHEADERS-$(CONFIG_MPEG_XVMC_DECODER) += xvmc.h
SKIPHEADERS-$(CONFIG_VAAPI) += vaapi_internal.h
+SKIPHEADERS-$(CONFIG_XVBA) += xvba_internal.h
SKIPHEADERS-$(CONFIG_VDA) += vda.h
SKIPHEADERS-$(CONFIG_VDPAU) += vdpau.h
SKIPHEADERS-$(HAVE_OS2THREADS) += os2threads.h
diff -Naur ffmpeg-1.2.3/libavcodec/xvba.c ffmpeg-1.2.3.patch/libavcodec/xvba.c
--- ffmpeg-1.2.3/libavcodec/xvba.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba.c 2013-09-09 22:46:40.580852793 +0200
@@ -0,0 +1,66 @@
+/*
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
@ -258,11 +226,9 @@ index 0000000..be29e5d
+ render->num_slices++;
+}
+
diff --git a/libavcodec/xvba.h b/libavcodec/xvba.h
new file mode 100644
index 0000000..9f9ff0c
--- /dev/null
+++ b/libavcodec/xvba.h
diff -Naur ffmpeg-1.2.3/libavcodec/xvba.h ffmpeg-1.2.3.patch/libavcodec/xvba.h
--- ffmpeg-1.2.3/libavcodec/xvba.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba.h 2013-09-09 22:46:40.581852794 +0200
@@ -0,0 +1,71 @@
+/*
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
@ -335,11 +301,9 @@ index 0000000..9f9ff0c
+};
+
+#endif /* AVCODEC_XVBA_H */
diff --git a/libavcodec/xvba_h264.c b/libavcodec/xvba_h264.c
new file mode 100644
index 0000000..ae45f3a
--- /dev/null
+++ b/libavcodec/xvba_h264.c
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_h264.c ffmpeg-1.2.3.patch/libavcodec/xvba_h264.c
--- ffmpeg-1.2.3/libavcodec/xvba_h264.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_h264.c 2013-09-09 22:46:40.582852794 +0200
@@ -0,0 +1,192 @@
+/*
+ * H.264 HW decode acceleration through XVBA
@ -533,11 +497,9 @@ index 0000000..ae45f3a
+ .end_frame = end_frame,
+ .decode_slice = decode_slice,
+};
diff --git a/libavcodec/xvba_internal.h b/libavcodec/xvba_internal.h
new file mode 100644
index 0000000..9653f85
--- /dev/null
+++ b/libavcodec/xvba_internal.h
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_internal.h ffmpeg-1.2.3.patch/libavcodec/xvba_internal.h
--- ffmpeg-1.2.3/libavcodec/xvba_internal.h 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_internal.h 2013-09-09 22:46:40.582852794 +0200
@@ -0,0 +1,24 @@
+/*
+ * HW decode acceleration for MPEG-2, H.264 and VC-1
@ -563,11 +525,9 @@ index 0000000..9653f85
+
+int ff_xvba_translate_profile(int profile);
+void ff_xvba_add_slice_data(struct xvba_render_state *render, const uint8_t *buffer, uint32_t size);
diff --git a/libavcodec/xvba_mpeg2.c b/libavcodec/xvba_mpeg2.c
new file mode 100644
index 0000000..0fc7d78
--- /dev/null
+++ b/libavcodec/xvba_mpeg2.c
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_mpeg2.c ffmpeg-1.2.3.patch/libavcodec/xvba_mpeg2.c
--- ffmpeg-1.2.3/libavcodec/xvba_mpeg2.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_mpeg2.c 2013-09-09 22:46:40.582852794 +0200
@@ -0,0 +1,52 @@
+/*
+ * MPEG-2 HW decode acceleration through XVBA
@ -621,11 +581,9 @@ index 0000000..0fc7d78
+ .decode_slice = decode_slice,
+ .priv_data_size = 0,
+};
diff --git a/libavcodec/xvba_vc1.c b/libavcodec/xvba_vc1.c
new file mode 100644
index 0000000..bf3d9c2
--- /dev/null
+++ b/libavcodec/xvba_vc1.c
diff -Naur ffmpeg-1.2.3/libavcodec/xvba_vc1.c ffmpeg-1.2.3.patch/libavcodec/xvba_vc1.c
--- ffmpeg-1.2.3/libavcodec/xvba_vc1.c 1970-01-01 01:00:00.000000000 +0100
+++ ffmpeg-1.2.3.patch/libavcodec/xvba_vc1.c 2013-09-09 22:46:40.583852794 +0200
@@ -0,0 +1,190 @@
+/*
+ * VC-1 HW decode acceleration through XVBA
@ -817,11 +775,10 @@ index 0000000..bf3d9c2
+ .end_frame = end_frame,
+ .decode_slice = decode_slice,
+};
diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
index 1016dba..53dfec1 100644
--- a/libavutil/pixdesc.c
+++ b/libavutil/pixdesc.c
@@ -1141,6 +1141,12 @@ void av_write_image_line(const uint16_t *src,
diff -Naur ffmpeg-1.2.3/libavutil/pixdesc.c ffmpeg-1.2.3.patch/libavutil/pixdesc.c
--- ffmpeg-1.2.3/libavutil/pixdesc.c 2013-08-27 02:13:47.000000000 +0200
+++ ffmpeg-1.2.3.patch/libavutil/pixdesc.c 2013-09-09 22:46:40.584852795 +0200
@@ -1141,6 +1141,12 @@
.log2_chroma_h = 1,
.flags = PIX_FMT_HWACCEL,
},
@ -834,11 +791,10 @@ index 1016dba..53dfec1 100644
[AV_PIX_FMT_YUV420P9LE] = {
.name = "yuv420p9le",
.nb_components = 3,
diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
index 1c00ac4..6437e29 100644
--- a/libavutil/pixfmt.h
+++ b/libavutil/pixfmt.h
@@ -124,6 +124,7 @@ enum AVPixelFormat {
diff -Naur ffmpeg-1.2.3/libavutil/pixfmt.h ffmpeg-1.2.3.patch/libavutil/pixfmt.h
--- ffmpeg-1.2.3/libavutil/pixfmt.h 2013-08-27 02:13:47.000000000 +0200
+++ ffmpeg-1.2.3.patch/libavutil/pixfmt.h 2013-09-09 22:46:40.585852796 +0200
@@ -124,6 +124,7 @@
AV_PIX_FMT_VAAPI_MOCO, ///< HW acceleration through VA API at motion compensation entry-point, Picture.data[3] contains a vaapi_render_state struct which contains macroblocks as well as various fields extracted from headers
AV_PIX_FMT_VAAPI_IDCT, ///< HW acceleration through VA API at IDCT entry-point, Picture.data[3] contains a vaapi_render_state struct which contains fields extracted from headers
AV_PIX_FMT_VAAPI_VLD, ///< HW decoding through VA API, Picture.data[3] contains a vaapi_render_state struct which contains the bitstream of the slices as well as various fields extracted from headers
@ -846,6 +802,3 @@ index 1c00ac4..6437e29 100644
AV_PIX_FMT_YUV420P16LE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), little-endian
AV_PIX_FMT_YUV420P16BE, ///< planar YUV 4:2:0, 24bpp, (1 Cr & Cb sample per 2x2 Y samples), big-endian
--
1.8.1.5