projects/RPi/patches/ffmpeg: add RPi specific patches

Signed-off-by: Stephan Raue <stephan@openelec.tv>
This commit is contained in:
Stephan Raue 2014-05-01 21:37:30 +02:00
parent cc971f66ea
commit 5084896ac1
10 changed files with 3294 additions and 0 deletions

View File

@ -0,0 +1,752 @@
From 8cdb3bf2837a3fb4fff3c6586316f81ae5f7b6cd Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 16 Apr 2014 01:51:31 +0100
Subject: [PATCH 1/3] h264: Move start code search functions into separate
source files.
This permits re-use with parsers for codecs which use similar start codes.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
libavcodec/Makefile | 2 +-
libavcodec/arm/Makefile | 2 +-
libavcodec/arm/h264dsp_armv6.S | 253 --------------------------------------
libavcodec/arm/h264dsp_init_arm.c | 4 +-
libavcodec/arm/startcode_armv6.S | 253 ++++++++++++++++++++++++++++++++++++++
libavcodec/h264dsp.c | 31 +----
libavcodec/startcode.c | 57 +++++++++
libavcodec/startcode.h | 35 ++++++
8 files changed, 351 insertions(+), 286 deletions(-)
delete mode 100644 libavcodec/arm/h264dsp_armv6.S
create mode 100644 libavcodec/arm/startcode_armv6.S
create mode 100644 libavcodec/startcode.c
create mode 100644 libavcodec/startcode.h
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index b56ecd1..19caf11 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -49,7 +49,7 @@ OBJS-$(CONFIG_FFT) += avfft.o fft_fixed.o fft_float.o \
OBJS-$(CONFIG_GOLOMB) += golomb.o
OBJS-$(CONFIG_H263DSP) += h263dsp.o
OBJS-$(CONFIG_H264CHROMA) += h264chroma.o
-OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o
+OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o startcode.o
OBJS-$(CONFIG_H264PRED) += h264pred.o
OBJS-$(CONFIG_H264QPEL) += h264qpel.o
OBJS-$(CONFIG_HPELDSP) += hpeldsp.o
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a8446b2..b6410b2 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -47,7 +47,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \
arm/simple_idct_armv6.o \
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
-ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
+ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
deleted file mode 100644
index 2758262..0000000
--- a/libavcodec/arm/h264dsp_armv6.S
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Copyright (c) 2013 RISC OS Open Ltd
- * Author: Ben Avison <bavison@riscosopen.org>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/arm/asm.S"
-
-RESULT .req a1
-BUF .req a1
-SIZE .req a2
-PATTERN .req a3
-PTR .req a4
-DAT0 .req v1
-DAT1 .req v2
-DAT2 .req v3
-DAT3 .req v4
-TMP0 .req v5
-TMP1 .req v6
-TMP2 .req ip
-TMP3 .req lr
-
-#define PRELOAD_DISTANCE 4
-
-.macro innerloop4
- ldr DAT0, [PTR], #4
- subs SIZE, SIZE, #4 @ C flag survives rest of macro
- sub TMP0, DAT0, PATTERN, lsr #14
- bic TMP0, TMP0, DAT0
- ands TMP0, TMP0, PATTERN
-.endm
-
-.macro innerloop16 decrement, do_preload
- ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
- .ifnc "\do_preload",""
- pld [PTR, #PRELOAD_DISTANCE*32]
- .endif
- .ifnc "\decrement",""
- subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
- .endif
- sub TMP0, DAT0, PATTERN, lsr #14
- sub TMP1, DAT1, PATTERN, lsr #14
- bic TMP0, TMP0, DAT0
- bic TMP1, TMP1, DAT1
- sub TMP2, DAT2, PATTERN, lsr #14
- sub TMP3, DAT3, PATTERN, lsr #14
- ands TMP0, TMP0, PATTERN
- bic TMP2, TMP2, DAT2
- it eq
- andseq TMP1, TMP1, PATTERN
- bic TMP3, TMP3, DAT3
- itt eq
- andseq TMP2, TMP2, PATTERN
- andseq TMP3, TMP3, PATTERN
-.endm
-
-/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
-function ff_h264_find_start_code_candidate_armv6, export=1
- push {v1-v6,lr}
- mov PTR, BUF
- @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
- @ before using code that does preloads
- cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
- blo 60f
-
- @ Get to word-alignment, 1 byte at a time
- tst PTR, #3
- beq 2f
-1: ldrb DAT0, [PTR], #1
- sub SIZE, SIZE, #1
- teq DAT0, #0
- beq 90f
- tst PTR, #3
- bne 1b
-2: @ Get to 4-word alignment, 1 word at a time
- ldr PATTERN, =0x80008000
- setend be
- tst PTR, #12
- beq 4f
-3: innerloop4
- bne 91f
- tst PTR, #12
- bne 3b
-4: @ Get to cacheline (8-word) alignment
- tst PTR, #16
- beq 5f
- innerloop16 16
- bne 93f
-5: @ Check complete cachelines, with preloading
- @ We need to stop when there are still (PRELOAD_DISTANCE+1)
- @ complete cachelines to go
- sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
-6: innerloop16 , do_preload
- bne 93f
- innerloop16 32
- bne 93f
- bcs 6b
- @ Preload trailing part-cacheline, if any
- tst SIZE, #31
- beq 7f
- pld [PTR, #(PRELOAD_DISTANCE+1)*32]
- @ Check remaining data without doing any more preloads. First
- @ do in chunks of 4 words:
-7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
- bmi 9f
-8: innerloop16 16
- bne 93f
- bcs 8b
- @ Then in words:
-9: adds SIZE, SIZE, #16 - 4
- bmi 11f
-10: innerloop4
- bne 91f
- bcs 10b
-11: setend le
- @ Check second byte of final halfword
- ldrb DAT0, [PTR, #-1]
- teq DAT0, #0
- beq 90f
- @ Check any remaining bytes
- tst SIZE, #3
- beq 13f
-12: ldrb DAT0, [PTR], #1
- sub SIZE, SIZE, #1
- teq DAT0, #0
- beq 90f
- tst SIZE, #3
- bne 12b
- @ No candidate found
-13: sub RESULT, PTR, BUF
- b 99f
-
-60: @ Small buffer - simply check by looping over bytes
- subs SIZE, SIZE, #1
- bcc 99f
-61: ldrb DAT0, [PTR], #1
- subs SIZE, SIZE, #1
- teq DAT0, #0
- beq 90f
- bcs 61b
- @ No candidate found
- sub RESULT, PTR, BUF
- b 99f
-
-90: @ Found a candidate at the preceding byte
- sub RESULT, PTR, BUF
- sub RESULT, RESULT, #1
- b 99f
-
-91: @ Found a candidate somewhere in the preceding 4 bytes
- sub RESULT, PTR, BUF
- sub RESULT, RESULT, #4
- sub TMP0, DAT0, #0x20000
- bics TMP0, TMP0, DAT0
- itt pl
- ldrbpl DAT0, [PTR, #-3]
- addpl RESULT, RESULT, #2
- bpl 92f
- teq RESULT, #0
- beq 98f @ don't look back a byte if found at first byte in buffer
- ldrb DAT0, [PTR, #-5]
-92: teq DAT0, #0
- it eq
- subeq RESULT, RESULT, #1
- b 98f
-
-93: @ Found a candidate somewhere in the preceding 16 bytes
- sub RESULT, PTR, BUF
- sub RESULT, RESULT, #16
- teq TMP0, #0
- beq 95f @ not in first 4 bytes
- sub TMP0, DAT0, #0x20000
- bics TMP0, TMP0, DAT0
- itt pl
- ldrbpl DAT0, [PTR, #-15]
- addpl RESULT, RESULT, #2
- bpl 94f
- teq RESULT, #0
- beq 98f @ don't look back a byte if found at first byte in buffer
- ldrb DAT0, [PTR, #-17]
-94: teq DAT0, #0
- it eq
- subeq RESULT, RESULT, #1
- b 98f
-95: add RESULT, RESULT, #4
- teq TMP1, #0
- beq 96f @ not in next 4 bytes
- sub TMP1, DAT1, #0x20000
- bics TMP1, TMP1, DAT1
- itee mi
- ldrbmi DAT0, [PTR, #-13]
- ldrbpl DAT0, [PTR, #-11]
- addpl RESULT, RESULT, #2
- teq DAT0, #0
- it eq
- subeq RESULT, RESULT, #1
- b 98f
-96: add RESULT, RESULT, #4
- teq TMP2, #0
- beq 97f @ not in next 4 bytes
- sub TMP2, DAT2, #0x20000
- bics TMP2, TMP2, DAT2
- itee mi
- ldrbmi DAT0, [PTR, #-9]
- ldrbpl DAT0, [PTR, #-7]
- addpl RESULT, RESULT, #2
- teq DAT0, #0
- it eq
- subeq RESULT, RESULT, #1
- b 98f
-97: add RESULT, RESULT, #4
- sub TMP3, DAT3, #0x20000
- bics TMP3, TMP3, DAT3
- itee mi
- ldrbmi DAT0, [PTR, #-5]
- ldrbpl DAT0, [PTR, #-3]
- addpl RESULT, RESULT, #2
- teq DAT0, #0
- it eq
- subeq RESULT, RESULT, #1
- @ drop through to 98f
-98: setend le
-99: pop {v1-v6,pc}
-endfunc
-
- .unreq RESULT
- .unreq BUF
- .unreq SIZE
- .unreq PATTERN
- .unreq PTR
- .unreq DAT0
- .unreq DAT1
- .unreq DAT2
- .unreq DAT3
- .unreq TMP0
- .unreq TMP1
- .unreq TMP2
- .unreq TMP3
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index a0418fd..eb6c514 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -24,7 +24,7 @@
#include "libavutil/arm/cpu.h"
#include "libavcodec/h264dsp.h"
-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
@@ -109,7 +109,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
int cpu_flags = av_get_cpu_flags();
if (have_armv6(cpu_flags))
- c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
+ c->h264_find_start_code_candidate = ff_startcode_find_candidate_armv6;
if (have_neon(cpu_flags))
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
}
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
new file mode 100644
index 0000000..a46f009
--- /dev/null
+++ b/libavcodec/arm/startcode_armv6.S
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+RESULT .req a1
+BUF .req a1
+SIZE .req a2
+PATTERN .req a3
+PTR .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+TMP0 .req v5
+TMP1 .req v6
+TMP2 .req ip
+TMP3 .req lr
+
+#define PRELOAD_DISTANCE 4
+
+.macro innerloop4
+ ldr DAT0, [PTR], #4
+ subs SIZE, SIZE, #4 @ C flag survives rest of macro
+ sub TMP0, DAT0, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ ands TMP0, TMP0, PATTERN
+.endm
+
+.macro innerloop16 decrement, do_preload
+ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
+ .ifnc "\do_preload",""
+ pld [PTR, #PRELOAD_DISTANCE*32]
+ .endif
+ .ifnc "\decrement",""
+ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
+ .endif
+ sub TMP0, DAT0, PATTERN, lsr #14
+ sub TMP1, DAT1, PATTERN, lsr #14
+ bic TMP0, TMP0, DAT0
+ bic TMP1, TMP1, DAT1
+ sub TMP2, DAT2, PATTERN, lsr #14
+ sub TMP3, DAT3, PATTERN, lsr #14
+ ands TMP0, TMP0, PATTERN
+ bic TMP2, TMP2, DAT2
+ it eq
+ andseq TMP1, TMP1, PATTERN
+ bic TMP3, TMP3, DAT3
+ itt eq
+ andseq TMP2, TMP2, PATTERN
+ andseq TMP3, TMP3, PATTERN
+.endm
+
+/* int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size) */
+function ff_startcode_find_candidate_armv6, export=1
+ push {v1-v6,lr}
+ mov PTR, BUF
+ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+ @ before using code that does preloads
+ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+ blo 60f
+
+ @ Get to word-alignment, 1 byte at a time
+ tst PTR, #3
+ beq 2f
+1: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst PTR, #3
+ bne 1b
+2: @ Get to 4-word alignment, 1 word at a time
+ ldr PATTERN, =0x80008000
+ setend be
+ tst PTR, #12
+ beq 4f
+3: innerloop4
+ bne 91f
+ tst PTR, #12
+ bne 3b
+4: @ Get to cacheline (8-word) alignment
+ tst PTR, #16
+ beq 5f
+ innerloop16 16
+ bne 93f
+5: @ Check complete cachelines, with preloading
+ @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+ @ complete cachelines to go
+ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+6: innerloop16 , do_preload
+ bne 93f
+ innerloop16 32
+ bne 93f
+ bcs 6b
+ @ Preload trailing part-cacheline, if any
+ tst SIZE, #31
+ beq 7f
+ pld [PTR, #(PRELOAD_DISTANCE+1)*32]
+ @ Check remaining data without doing any more preloads. First
+ @ do in chunks of 4 words:
+7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+ bmi 9f
+8: innerloop16 16
+ bne 93f
+ bcs 8b
+ @ Then in words:
+9: adds SIZE, SIZE, #16 - 4
+ bmi 11f
+10: innerloop4
+ bne 91f
+ bcs 10b
+11: setend le
+ @ Check second byte of final halfword
+ ldrb DAT0, [PTR, #-1]
+ teq DAT0, #0
+ beq 90f
+ @ Check any remaining bytes
+ tst SIZE, #3
+ beq 13f
+12: ldrb DAT0, [PTR], #1
+ sub SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ tst SIZE, #3
+ bne 12b
+ @ No candidate found
+13: sub RESULT, PTR, BUF
+ b 99f
+
+60: @ Small buffer - simply check by looping over bytes
+ subs SIZE, SIZE, #1
+ bcc 99f
+61: ldrb DAT0, [PTR], #1
+ subs SIZE, SIZE, #1
+ teq DAT0, #0
+ beq 90f
+ bcs 61b
+ @ No candidate found
+ sub RESULT, PTR, BUF
+ b 99f
+
+90: @ Found a candidate at the preceding byte
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #1
+ b 99f
+
+91: @ Found a candidate somewhere in the preceding 4 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #4
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ bpl 92f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-5]
+92: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+
+93: @ Found a candidate somewhere in the preceding 16 bytes
+ sub RESULT, PTR, BUF
+ sub RESULT, RESULT, #16
+ teq TMP0, #0
+ beq 95f @ not in first 4 bytes
+ sub TMP0, DAT0, #0x20000
+ bics TMP0, TMP0, DAT0
+ itt pl
+ ldrbpl DAT0, [PTR, #-15]
+ addpl RESULT, RESULT, #2
+ bpl 94f
+ teq RESULT, #0
+ beq 98f @ don't look back a byte if found at first byte in buffer
+ ldrb DAT0, [PTR, #-17]
+94: teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+95: add RESULT, RESULT, #4
+ teq TMP1, #0
+ beq 96f @ not in next 4 bytes
+ sub TMP1, DAT1, #0x20000
+ bics TMP1, TMP1, DAT1
+ itee mi
+ ldrbmi DAT0, [PTR, #-13]
+ ldrbpl DAT0, [PTR, #-11]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+96: add RESULT, RESULT, #4
+ teq TMP2, #0
+ beq 97f @ not in next 4 bytes
+ sub TMP2, DAT2, #0x20000
+ bics TMP2, TMP2, DAT2
+ itee mi
+ ldrbmi DAT0, [PTR, #-9]
+ ldrbpl DAT0, [PTR, #-7]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ b 98f
+97: add RESULT, RESULT, #4
+ sub TMP3, DAT3, #0x20000
+ bics TMP3, TMP3, DAT3
+ itee mi
+ ldrbmi DAT0, [PTR, #-5]
+ ldrbpl DAT0, [PTR, #-3]
+ addpl RESULT, RESULT, #2
+ teq DAT0, #0
+ it eq
+ subeq RESULT, RESULT, #1
+ @ drop through to 98f
+98: setend le
+99: pop {v1-v6,pc}
+endfunc
+
+ .unreq RESULT
+ .unreq BUF
+ .unreq SIZE
+ .unreq PATTERN
+ .unreq PTR
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq TMP0
+ .unreq TMP1
+ .unreq TMP2
+ .unreq TMP3
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
index a2a4aba..a4da776 100644
--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
@@ -33,6 +33,7 @@
#include "avcodec.h"
#include "h264dsp.h"
#include "h264idct.h"
+#include "startcode.h"
#include "libavutil/common.h"
#define BIT_DEPTH 8
@@ -63,34 +64,6 @@
#include "h264addpx_template.c"
#undef BIT_DEPTH
-static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
-{
- int i = 0;
-#if HAVE_FAST_UNALIGNED
- /* we check i < size instead of i + 3 / 7 because it is
- * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
- * bytes at the end.
- */
-# if HAVE_FAST_64BIT
- while (i < size &&
- !((~*(const uint64_t *)(buf + i) &
- (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
- 0x8080808080808080ULL))
- i += 8;
-# else
- while (i < size &&
- !((~*(const uint32_t *)(buf + i) &
- (*(const uint32_t *)(buf + i) - 0x01010101U)) &
- 0x80808080U))
- i += 4;
-# endif
-#endif
- for (; i < size; i++)
- if (!buf[i])
- break;
- return i;
-}
-
av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
@@ -178,7 +151,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
H264_DSP(8);
break;
}
- c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
+ c->h264_find_start_code_candidate = ff_startcode_find_candidate_c;
if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc);
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
new file mode 100644
index 0000000..5df7695
--- /dev/null
+++ b/libavcodec/startcode.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "startcode.h"
+#include "config.h"
+
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
+{
+ int i = 0;
+#if HAVE_FAST_UNALIGNED
+ /* Only i < size is tested here, rather than i + 3 / 7 < size: that is
+ * simpler, and the wide loads that run past the end are acceptable
+ * because there must be FF_INPUT_BUFFER_PADDING_SIZE bytes at the end.
+ */
+# if HAVE_FAST_64BIT
+ while (i < size &&
+ !((~*(const uint64_t *)(buf + i) &
+ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+ 0x8080808080808080ULL))
+ i += 8;
+# else
+ while (i < size &&
+ !((~*(const uint32_t *)(buf + i) &
+ (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+ 0x80808080U))
+ i += 4;
+# endif
+#endif
+ for (; i < size; i++)
+ if (!buf[i])
+ break;
+ return i;
+}
diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
new file mode 100644
index 0000000..cc55d5f
--- /dev/null
+++ b/libavcodec/startcode.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_STARTCODE_H
+#define AVCODEC_STARTCODE_H
+
+#include <stdint.h>
+
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size);
+
+#endif /* AVCODEC_STARTCODE_H */
--
1.9.1

View File

@ -0,0 +1,65 @@
From 425d69b993d25489e4830766507d9d8f6c819802 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 19 Mar 2014 17:26:19 +0000
Subject: [PATCH 1/6] truehd: tune VLC decoding for ARM.
Profiling on a Raspberry Pi revealed the best performance to correspond
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
in particular are as follows:
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
libavcodec/mlpdec.c | 13 ++++++++++---
1 file changed, 10 insertions(+), 3 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 93ed552..cbd9000 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -37,9 +37,16 @@
#include "mlp_parser.h"
#include "mlpdsp.h"
#include "mlp.h"
+#include "config.h"
/** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
+#define VLC_BITS 5
+#define VLC_STATIC_SIZE 64
+#else
#define VLC_BITS 9
+#define VLC_STATIC_SIZE 512
+#endif
typedef struct SubStream {
/// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
@@ -193,13 +200,13 @@ static av_cold void init_static(void)
if (!huff_vlc[0].bits) {
INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
&ff_mlp_huffman_tables[0][0][1], 2, 1,
- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
&ff_mlp_huffman_tables[1][0][1], 2, 1,
- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
&ff_mlp_huffman_tables[2][0][1], 2, 1,
- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
+ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
}
ff_mlp_init_crc();
--
1.9.1

View File

@ -0,0 +1,557 @@
From bfe3d8c8e4e046163dc314aa16207413e377283f Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 3 Mar 2014 19:44:23 +0000
Subject: [PATCH 2/6] truehd: add hand-scheduled ARM asm version of
mlp_filter_channel.
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
function in particular are as follows:
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%
Experiments with adding preload instructions to this function yielded no
useful benefit, so these have not been included.
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 2 +
libavcodec/arm/mlpdsp_arm.S | 433 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 36 ++++
libavcodec/mlpdsp.c | 2 +
libavcodec/mlpdsp.h | 1 +
5 files changed, 474 insertions(+)
create mode 100644 libavcodec/arm/mlpdsp_arm.S
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index a8446b2..ba673b1 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -22,6 +22,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
+ arm/mlpdsp_arm.o
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
new file mode 100644
index 0000000..615819d
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define MAX_CHANNELS 8
+#define MAX_FIR_ORDER 8
+#define MAX_IIR_ORDER 4
+#define MAX_RATEFACTOR 4
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
+
+PST .req a1
+PCO .req a2
+AC0 .req a3
+AC1 .req a4
+CO0 .req v1
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+ST0 .req v5
+ST1 .req v6
+ST2 .req sl
+ST3 .req fp
+I .req ip
+PSAMP .req lr
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+.macro load group, index, base, offset
+ .altmacro
+ load_ \group, %(\index), \base, \offset
+ .noaltmacro
+.endm
+
+.macro load_ group, index, base, offset
+ ldr \group\index, [\base, #\offset]
+.endm
+
+.macro loadd group, index, base, offset
+ .altmacro
+ loadd_ \group, %(\index), %(\index+1), \base, \offset
+ .noaltmacro
+.endm
+
+.macro loadd_ group, index0, index1, base, offset
+A .if \offset >= 256 // ARM-mode ldrd only encodes an 8-bit immediate offset
+A ldr \group\index0, [\base, #\offset]
+A ldr \group\index1, [\base, #(\offset) + 4]
+A .else
+ ldrd \group\index0, \group\index1, [\base, #\offset]
+A .endif
+.endm
+
+.macro multiply index, accumulate, long
+ .altmacro
+ multiply_ %(\index), \accumulate, \long
+ .noaltmacro
+.endm
+
+.macro multiply_ index, accumulate, long
+ .if \long
+ .if \accumulate
+ smlal AC0, AC1, CO\index, ST\index
+ .else
+ smull AC0, AC1, CO\index, ST\index
+ .endif
+ .else
+ .if \accumulate
+ mla AC0, CO\index, ST\index, AC0
+ .else
+ mul AC0, CO\index, ST\index
+ .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+
+.macro inc howmany
+ .set LOAD_REG, (LOAD_REG + \howmany) & 3
+ .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+ .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+ .if FIR_REMAIN > 0
+ .set FIR_REMAIN, FIR_REMAIN - \howmany
+ .if FIR_REMAIN == 0
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .endif
+ .elseif IIR_REMAIN > 0
+ .set IIR_REMAIN, IIR_REMAIN - \howmany
+ .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+ .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+ // Deal with register allocation...
+ .set DEFINED_SHIFT, 0
+ .set DEFINED_MASK, 0
+ .set SHUFFLE_SHIFT, 0
+ .set SHUFFLE_MASK, 0
+ .set SPILL_SHIFT, 0
+ .set SPILL_MASK, 0
+ .if TOTAL_TAPS == 0
+ // Little register pressure in this case - just keep MASK where it was
+ .if !\mask_minus1
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+ .else
+ .if \shift_0
+ .if !\mask_minus1
+ // AC1 is unused with shift 0
+ MASK .req AC1
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif \shift_8
+ .if !\mask_minus1
+ .if TOTAL_TAPS <= 4
+ // All coefficients are preloaded (so pointer not needed)
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .else
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .else // shift not 0 or 8
+ .if TOTAL_TAPS <= 3
+ // All coefficients are preloaded, and at least one CO register is unused
+ .if \fir_taps & 1
+ SHIFT .req CO0
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .else
+ SHIFT .req CO3
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .endif
+ .if !\mask_minus1
+ MASK .req PCO
+ .set DEFINED_MASK, 1
+ .set SHUFFLE_MASK, 1
+ .endif
+ .elseif TOTAL_TAPS == 4
+ // All coefficients are preloaded
+ SHIFT .req PCO
+ .set DEFINED_SHIFT, 1
+ .set SHUFFLE_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .else
+ .set SPILL_SHIFT, 1
+ .if !\mask_minus1
+ .set SPILL_MASK, 1
+ .endif
+ .endif
+ .endif
+ .endif
+ .if SPILL_SHIFT
+ SHIFT .req ST0
+ .set DEFINED_SHIFT, 1
+ .endif
+ .if SPILL_MASK
+ MASK .req ST1
+ .set DEFINED_MASK, 1
+ .endif
+
+ // Preload coefficients if possible
+ .if TOTAL_TAPS <= 4
+ .set OFFSET_CO, 0
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .rept \fir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .rept \iir_taps
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .set LOAD_REG, (LOAD_REG + 1) & 3
+ .set OFFSET_CO, OFFSET_CO + 4
+ .endr
+ .endif
+
+ // Move mask/shift to final positions if necessary
+ // Need to do this after preloading, because in some cases we
+ // reuse the coefficient pointer register
+ .if SHUFFLE_SHIFT
+ mov SHIFT, ST0
+ .endif
+ .if SHUFFLE_MASK
+ mov MASK, ST1
+ .endif
+
+ // Begin loop
+01:
+ .if TOTAL_TAPS == 0
+ // Things simplify a lot in this case
+ // In fact this could be pipelined further if it's worth it...
+ ldr ST0, [PSAMP]
+ subs I, I, #1
+ .if !\mask_minus1
+ and ST0, ST0, MASK
+ .endif
+ str ST0, [PST, #-4]!
+ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST0, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .else
+ .if \fir_taps & 1
+ .set LOAD_REG, 1
+ .else
+ .set LOAD_REG, 0
+ .endif
+ .set LOAD_BANK, 0
+ .set FIR_REMAIN, \fir_taps
+ .set IIR_REMAIN, \iir_taps
+ .if FIR_REMAIN == 0 // only IIR terms
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+ .else
+ .set OFFSET_CO, 0
+ .set OFFSET_ST, 0
+ .endif
+ .set MUL_REG, LOAD_REG
+ .set COUNTER, 0
+ .rept TOTAL_TAPS + 2
+ // Do load(s)
+ .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+ .if COUNTER == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+ .if TOTAL_TAPS > 4
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .elseif LOAD_BANK == 0
+ .if TOTAL_TAPS > 4
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load CO, LOAD_REG, PCO, OFFSET_CO
+ .else
+ loadd CO, LOAD_REG, PCO, OFFSET_CO
+ .endif
+ .endif
+ .set LOAD_BANK, 1
+ .else
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+ load ST, LOAD_REG, PST, OFFSET_ST
+ inc 1
+ .else
+ loadd ST, LOAD_REG, PST, OFFSET_ST
+ inc 2
+ .endif
+ .set LOAD_BANK, 0
+ .endif
+ .endif
+
+ // Do interleaved multiplies, slightly delayed
+ .if COUNTER >= 2
+ multiply MUL_REG, COUNTER > 2, !\shift_0
+ .set MUL_REG, (MUL_REG + 1) & 3
+ .endif
+ .set COUNTER, COUNTER + 1
+ .endr
+
+ // Post-process the result of the multiplies
+ .if SPILL_SHIFT
+ ldr SHIFT, [sp, #9*4 + 0*4]
+ .endif
+ .if SPILL_MASK
+ ldr MASK, [sp, #9*4 + 1*4]
+ .endif
+ ldr ST2, [PSAMP]
+ subs I, I, #1
+ .if \shift_8
+ mov AC0, AC0, lsr #8
+ orr AC0, AC0, AC1, lsl #24
+ .elseif !\shift_0
+ rsb ST3, SHIFT, #32
+ mov AC0, AC0, lsr SHIFT
+A orr AC0, AC0, AC1, lsl ST3
+T mov AC1, AC1, lsl ST3
+T orr AC0, AC0, AC1
+ .endif
+ .if \mask_minus1
+ add ST3, ST2, AC0
+ .else
+ add ST2, ST2, AC0
+ and ST3, ST2, MASK
+ sub ST2, ST3, AC0
+ .endif
+ str ST3, [PST, #-4]!
+ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+ str ST3, [PSAMP], #4 * MAX_CHANNELS
+ bne 01b
+ .endif
+ b 99f
+
+ .if DEFINED_SHIFT
+ .unreq SHIFT
+ .endif
+ .if DEFINED_MASK
+ .unreq MASK
+ .endif
+.endm
+
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
+A ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+T tbh [pc, a3, lsl #1]
+0:
+A .word 0, 70f, 71f, 72f, 73f, 74f
+T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
+ .if \iir_taps <= 3
+A .word 75f
+T .hword (75f - 0b) / 2
+ .if \iir_taps <= 2
+A .word 76f
+T .hword (76f - 0b) / 2
+ .if \iir_taps <= 1
+A .word 77f
+T .hword (77f - 0b) / 2
+ .if \iir_taps == 0
+A .word 78f
+T .hword (78f - 0b) / 2
+ .endif
+ .endif
+ .endif
+ .endif
+70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+ .if \iir_taps <= 3
+75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+ .if \iir_taps <= 2
+76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+ .if \iir_taps <= 1
+77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+ .if \iir_taps == 0
+78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+ .endif
+ .endif
+ .endif
+ .endif
+.endm
+
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
+A ldr pc, [pc, a4, LSL #2] // iirorder is in range 0-4
+T tbh [pc, a4, lsl #1]
+0:
+A .word 0, 60f, 61f, 62f, 63f, 64f
+T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
+60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
+61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
+62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
+63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
+64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ * int firorder, int iirorder,
+ * unsigned int filter_shift, int32_t mask,
+ * int blocksize, int32_t *sample_buffer);
+ */
+function ff_mlp_filter_channel_arm, export=1
+ push {v1-fp,lr}
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {ST0,ST1,I,PSAMP}
+ cmp ST1, #-1
+ bne 30f
+ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 20f
+ bcs 10f
+ switch_on_iir_taps 1, 1, 0
+10: switch_on_iir_taps 1, 0, 1
+20: switch_on_iir_taps 1, 0, 0
+30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+ bne 50f
+ bcs 40f
+ switch_on_iir_taps 0, 1, 0
+40: switch_on_iir_taps 0, 0, 1
+50: switch_on_iir_taps 0, 0, 0
+99: pop {v1-fp,pc}
+endfunc
+
+ .unreq PST
+ .unreq PCO
+ .unreq AC0
+ .unreq AC1
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq ST0
+ .unreq ST1
+ .unreq ST2
+ .unreq ST3
+ .unreq I
+ .unreq PSAMP
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
new file mode 100644
index 0000000..9a14815
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mlpdsp.h"
+
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ int firorder, int iirorder,
+ unsigned int filter_shift, int32_t mask,
+ int blocksize, int32_t *sample_buffer);
+
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+{
+ c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index b413e86..4b403b8 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ if (ARCH_ARM)
+ ff_mlpdsp_init_arm(c);
if (ARCH_X86)
ff_mlpdsp_init_x86(c);
}
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index 84a8aa3..129bcfe 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
} MLPDSPContext;
void ff_mlpdsp_init(MLPDSPContext *c);
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
void ff_mlpdsp_init_x86(MLPDSPContext *c);
#endif /* AVCODEC_MLPDSP_H */
--
1.9.1

View File

@ -0,0 +1,143 @@
From a60747132a1a6652ac0d18f3f110a20ea637ac30 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 16 Apr 2014 01:51:32 +0100
Subject: [PATCH 2/3] vc-1: Add platform-specific start code search routine to
VC1DSPContext.
Initialise VC1DSPContext for parser as well as for decoder.
Note, the VC-1 code doesn't actually use the function pointer yet.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
libavcodec/Makefile | 6 +++---
libavcodec/arm/Makefile | 2 ++
libavcodec/arm/vc1dsp_init_arm.c | 4 ++++
libavcodec/vc1.c | 2 ++
libavcodec/vc1dec.c | 1 -
libavcodec/vc1dsp.c | 3 +++
libavcodec/vc1dsp.h | 8 ++++++++
7 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
index 19caf11..120f85a 100644
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -458,7 +458,7 @@ OBJS-$(CONFIG_VB_DECODER) += vb.o
OBJS-$(CONFIG_VBLE_DECODER) += vble.o
OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \
msmpeg4dec.o msmpeg4.o msmpeg4data.o \
- wmv2dsp.o
+ wmv2dsp.o startcode.o
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
@@ -783,9 +783,9 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o
OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o
OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o
OBJS-$(CONFIG_TAK_PARSER) += tak_parser.o tak.o
-OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o \
+OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o vc1dsp.o \
msmpeg4.o msmpeg4data.o mpeg4video.o \
- h263.o
+ h263.o startcode.o
OBJS-$(CONFIG_VORBIS_PARSER) += vorbis_parser.o xiph.o
OBJS-$(CONFIG_VP3_PARSER) += vp3_parser.o
OBJS-$(CONFIG_VP8_PARSER) += vp8_parser.o
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index b6410b2..fa2b18e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -51,6 +51,8 @@ ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
+ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o
+ARMV6-OBJS-$(CONFIG_VC1_PARSER) += arm/startcode_armv6.o
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
arm/vp8dsp_armv6.o
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
index 47d4126..4a84848 100644
--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
@@ -23,10 +23,14 @@
#include "libavcodec/vc1dsp.h"
#include "vc1dsp.h"
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
+
av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
+ if (have_armv6(cpu_flags))
+ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_armv6;
if (have_neon(cpu_flags))
ff_vc1dsp_init_neon(dsp);
}
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 49d4885..cb941dd 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -1706,5 +1706,7 @@ av_cold int ff_vc1_init_common(VC1Context *v)
v->pq = -1;
v->mvrange = 0; /* 7.1.1.18, p80 */
+ ff_vc1dsp_init(&v->vc1dsp);
+
return 0;
}
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
index 30fee47..67cda42 100644
--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
@@ -5631,7 +5631,6 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
ff_vc1_decode_end(avctx);
ff_h264chroma_init(&v->h264chroma, 8);
- ff_vc1dsp_init(&v->vc1dsp);
if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
int count = 0;
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index ec9c17b..09a9006 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -30,6 +30,7 @@
#include "h264chroma.h"
#include "rnd_avg.h"
#include "vc1dsp.h"
+#include "startcode.h"
/* Apply overlap transform to horizontal edge */
static void vc1_v_overlap_c(uint8_t *src, int stride)
@@ -947,6 +948,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_c;
+
if (ARCH_AARCH64)
ff_vc1dsp_init_aarch64(dsp);
if (ARCH_ARM)
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index 990fbc3..6a90eed 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -74,6 +74,14 @@ typedef struct VC1DSPContext {
void (*sprite_v_double_twoscale)(uint8_t *dst, const uint8_t *src1a, const uint8_t *src1b, int offset1,
const uint8_t *src2a, const uint8_t *src2b, int offset2,
int alpha, int width);
+
+ /**
+ * Search buf from the start for up to size bytes. Return the index
+ * of a zero byte, or >= size if not found. Ideally, use lookahead
+ * to filter out any zero bytes that are known to not be followed by
+ * one or more further zero bytes and a one byte.
+ */
+ int (*vc1_find_start_code_candidate)(const uint8_t *buf, int size);
} VC1DSPContext;
void ff_vc1dsp_init(VC1DSPContext* c);
--
1.9.1

View File

@ -0,0 +1,158 @@
From bb74fc44081fb6d7923ce1b7ed3e3e6514695f3e Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 5 Mar 2014 21:01:28 +0000
Subject: [PATCH 3/6] truehd: break out part of rematrix_channels into
platform-specific callback.
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
3 files changed, 68 insertions(+), 25 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index cbd9000..01ded5c 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -1024,7 +1024,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
{
SubStream *s = &m->substream[substr];
- unsigned int mat, src_ch, i;
+ unsigned int mat;
unsigned int maxchan;
maxchan = s->max_matrix_channel;
@@ -1036,31 +1036,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
}
for (mat = 0; mat < s->num_primitive_matrices; mat++) {
- int matrix_noise_shift = s->matrix_noise_shift[mat];
unsigned int dest_ch = s->matrix_out_ch[mat];
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
- int32_t *coeffs = s->matrix_coeff[mat];
- int index = s->num_primitive_matrices - mat;
- int index2 = 2 * index + 1;
-
- /* TODO: DSPContext? */
-
- for (i = 0; i < s->blockpos; i++) {
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
- int32_t *samples = m->sample_buffer[i];
- int64_t accum = 0;
-
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
-
- if (matrix_noise_shift) {
- index &= m->access_unit_size_pow2 - 1;
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
- index += index2;
- }
-
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
- }
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+ s->matrix_coeff[mat],
+ &m->bypassed_lsbs[0][mat],
+ m->noise_buffer,
+ s->num_primitive_matrices - mat,
+ dest_ch,
+ s->blockpos,
+ maxchan,
+ s->matrix_noise_shift[mat],
+ m->access_unit_size_pow2,
+ MSB_MASK(s->quant_step_size[dest_ch]));
}
}
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 4b403b8..7a359b0 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
}
}
+void ff_mlp_rematrix_channel(int32_t *samples, /* first sample row; rows are MAX_CHANNELS apart and dest_ch is overwritten in each */
+ const int32_t *coeffs, /* one coefficient per source channel 0..maxchan */
+ const uint8_t *bypassed_lsbs, /* first LSB value to add back; stepped by MAX_CHANNELS per sample */
+ const int8_t *noise_buffer, /* signed noise values */
+ int index, /* initial position in noise_buffer */
+ unsigned int dest_ch, /* channel that receives the result */
+ uint16_t blockpos, /* number of samples to process */
+ unsigned int maxchan, /* highest source channel, inclusive */
+ int matrix_noise_shift, /* 0 disables noise; otherwise noise is scaled by 2^(shift + 7) */
+ int access_unit_size_pow2, /* noise index wraps at this size; expected to be a power of two (used as index &= size - 1) */
+ int32_t mask) /* applied to accum >> 14 before the bypassed LSBs are added */
+{
+ unsigned int src_ch, i;
+ int index2 = 2 * index + 1; /* amount the noise index advances per sample */
+ for (i = 0; i < blockpos; i++) {
+ int64_t accum = 0;
+
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+
+ if (matrix_noise_shift) {
+ index &= access_unit_size_pow2 - 1;
+ accum += noise_buffer[index] * (1 << (matrix_noise_shift + 7)); /* multiply, not <<: left-shifting a negative noise value is undefined */
+ index += index2;
+ }
+
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+ bypassed_lsbs += MAX_CHANNELS;
+ samples += MAX_CHANNELS;
+ }
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index 129bcfe..f98e9be 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -24,11 +24,34 @@
#include <stdint.h>
+void ff_mlp_rematrix_channel(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+ void (*mlp_rematrix_channel)(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
} MLPDSPContext;
void ff_mlpdsp_init(MLPDSPContext *c);
--
1.9.1

View File

@ -0,0 +1,401 @@
From c39df43eae03768427243668c040de8437c4f79c Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 23 Apr 2014 01:41:04 +0100
Subject: [PATCH 3/3] vc-1: Optimise parser (with special attention to ARM)
The previous implementation of the parser made four passes over each input
buffer (reduced to two if the container format already guaranteed the input
buffer corresponded to frames, such as with MKV). But these buffers are
often 200K in size, certainly enough to flush the data out of L1 cache, and
for many CPUs, all the way out to main memory. The passes were:
1) locate frame boundaries (not needed for MKV etc)
2) copy the data into a contiguous block (not needed for MKV etc)
3) locate the start codes within each frame
4) unescape the data between start codes
After this, the unescaped data was parsed to extract certain header fields,
but because the unescape operation was so large, this was usually also
effectively operating on uncached memory. Most of the unescaped data was
simply thrown away and never processed further. Only step 2 - because it
used memcpy - was using prefetch, making things even worse.
This patch reorganises these steps so that, aside from the copying, the
operations are performed in parallel, maximising cache utilisation. No more
than the worst-case number of bytes needed for header parsing is unescaped.
Most of the data is, in practice, only read in order to search for a start
code, for which optimised implementations already existed in the H264 codec
(notably the ARM version uses prefetch, so we end up doing both remaining
passes at maximum speed). For MKV files, we know when we've found the last
start code of interest in a given frame, so we are able to avoid doing even
that one remaining pass for most of the buffer.
In some use-cases (such as the Raspberry Pi) video decode is handled by the
GPU, but the entire elementary stream is still fed through the parser to
pick out certain elements of the header which are necessary to manage the
decode process. As you might expect, in these cases, the performance of the
parser is significant.
To measure parser performance, I used the same VC-1 elementary stream in
either an MPEG-2 transport stream or an MKV file, and fed it through ffmpeg
with -c:v copy -c:a copy -f null. These are the gperftools counts for
those streams, both filtered to only include vc1_parse() and its callees,
and unfiltered (to include the whole binary). Lower numbers are better:
Before After
File Filtered Mean StdDev Mean StdDev Confidence Change
M2TS No 861.7 8.2 650.5 8.1 100.0% +32.5%
MKV No 868.9 7.4 731.7 9.0 100.0% +18.8%
M2TS Yes 250.0 11.2 27.2 3.4 100.0% +817.9%
MKV Yes 149.0 12.8 1.7 0.8 100.0% +8526.3%
Yes, that last case shows vc1_parse() running 86 times faster! The M2TS
case does show a larger absolute improvement though, since it was worse
to begin with.
This patch has been tested with the FATE suite (albeit on x86 for speed).
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
---
libavcodec/vc1_parser.c | 284 ++++++++++++++++++++++++++++++------------------
1 file changed, 180 insertions(+), 104 deletions(-)
diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
index cc29ce1..4ed14bc 100644
--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
@@ -30,122 +30,88 @@
#include "vc1.h"
#include "get_bits.h"
+/** The maximum number of bytes of a sequence, entry point or
+ * frame header whose values we pay any attention to */
+#define UNESCAPED_THRESHOLD 37
+
+/** The maximum number of bytes of a sequence, entry point or
+ * frame header which must be valid memory (because they are
+ * used to update the bitstream cache in skip_bits() calls)
+ */
+#define UNESCAPED_LIMIT 144
+
+typedef enum {
+ NO_MATCH, /* no part of a start code prefix is pending */
+ ONE_ZERO, /* the previous byte was a single 0x00 */
+ TWO_ZEROS, /* two or more consecutive 0x00 bytes have been seen */
+ ONE /* 00 00 01 has been seen; the next byte is the start code value */
+} VC1ParseSearchState;
+
typedef struct {
ParseContext pc;
VC1Context v;
+ uint8_t prev_start_code; /* final byte of the most recent start code seen */
+ size_t bytes_to_skip; /* input offset at which the next vc1_parse() call starts scanning */
+ uint8_t unesc_buffer[UNESCAPED_LIMIT]; /* unescaped leading bytes of the header currently being collected */
+ size_t unesc_index; /* number of bytes currently held in unesc_buffer */
+ VC1ParseSearchState search_state; /* start code matcher state, preserved between calls */
} VC1ParseContext;
-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx,
- const uint8_t *buf, int buf_size)
+static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
+ const uint8_t *buf, int buf_size)
{
+ /* Parse the header we just finished unescaping; vpc->prev_start_code says which kind of header it is */
VC1ParseContext *vpc = s->priv_data;
GetBitContext gb;
- const uint8_t *start, *end, *next;
- uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
-
+ int ret; /* result of frame header parsing; only assigned in the VC1_CODE_FRAME case */
vpc->v.s.avctx = avctx;
vpc->v.parse_only = 1;
- vpc->v.first_pic_header_flag = 1;
- next = buf;
- s->repeat_pict = 0;
-
- for(start = buf, end = buf + buf_size; next < end; start = next){
- int buf2_size, size;
- int ret;
-
- next = find_next_marker(start + 4, end);
- size = next - start - 4;
- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
- init_get_bits(&gb, buf2, buf2_size * 8);
- if(size <= 0) continue;
- switch(AV_RB32(start)){
- case VC1_CODE_SEQHDR:
- ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
- break;
- case VC1_CODE_ENTRYPOINT:
- ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
- break;
- case VC1_CODE_FRAME:
- if(vpc->v.profile < PROFILE_ADVANCED)
- ret = ff_vc1_parse_frame_header (&vpc->v, &gb);
- else
- ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
-
- if (ret < 0)
- break;
-
- /* keep AV_PICTURE_TYPE_BI internal to VC1 */
- if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
- s->pict_type = AV_PICTURE_TYPE_B;
- else
- s->pict_type = vpc->v.s.pict_type;
-
- if (avctx->ticks_per_frame > 1){
- // process pulldown flags
- s->repeat_pict = 1;
- // Pulldown flags are only valid when 'broadcast' has been set.
- // So ticks_per_frame will be 2
- if (vpc->v.rff){
- // repeat field
- s->repeat_pict = 2;
- }else if (vpc->v.rptfrm){
- // repeat frames
- s->repeat_pict = vpc->v.rptfrm * 2 + 1;
- }
- }
-
- if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
- s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
- else
- s->field_order = AV_FIELD_PROGRESSIVE;
+ init_get_bits(&gb, buf, buf_size * 8);
+ switch (vpc->prev_start_code) { /* only the low byte of each start code is compared; codes without a case are ignored */
+ case VC1_CODE_SEQHDR & 0xFF:
+ ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
+ break;
+ case VC1_CODE_ENTRYPOINT & 0xFF:
+ ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
+ break;
+ case VC1_CODE_FRAME & 0xFF:
+ if(vpc->v.profile < PROFILE_ADVANCED)
+ ret = ff_vc1_parse_frame_header (&vpc->v, &gb);
+ else
+ ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+ if (ret < 0) /* frame header did not parse: leave the picture info in s untouched */
break;
- }
- }
- av_free(buf2);
-}
+ /* keep AV_PICTURE_TYPE_BI internal to VC1 */
+ if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
+ s->pict_type = AV_PICTURE_TYPE_B;
+ else
+ s->pict_type = vpc->v.s.pict_type;
-/**
- * Find the end of the current frame in the bitstream.
- * @return the position of the first byte of the next frame, or -1
- */
-static int vc1_find_frame_end(ParseContext *pc, const uint8_t *buf,
- int buf_size) {
- int pic_found, i;
- uint32_t state;
-
- pic_found= pc->frame_start_found;
- state= pc->state;
-
- i=0;
- if(!pic_found){
- for(i=0; i<buf_size; i++){
- state= (state<<8) | buf[i];
- if(state == VC1_CODE_FRAME || state == VC1_CODE_FIELD){
- i++;
- pic_found=1;
- break;
+ if (avctx->ticks_per_frame > 1){
+ // process pulldown flags
+ s->repeat_pict = 1;
+ // Pulldown flags are only valid when 'broadcast' has been set.
+ // So ticks_per_frame will be 2
+ if (vpc->v.rff){
+ // repeat field
+ s->repeat_pict = 2;
+ }else if (vpc->v.rptfrm){
+ // repeat frames
+ s->repeat_pict = vpc->v.rptfrm * 2 + 1;
}
+ }else{
+ s->repeat_pict = 0; /* ticks_per_frame <= 1: no pulldown flags to process */
}
- }
- if(pic_found){
- /* EOF considered as end of frame */
- if (buf_size == 0)
- return 0;
- for(; i<buf_size; i++){
- state= (state<<8) | buf[i];
- if(IS_MARKER(state) && state != VC1_CODE_FIELD && state != VC1_CODE_SLICE){
- pc->frame_start_found=0;
- pc->state=-1;
- return i-3;
- }
- }
+ if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
+ s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
+ else
+ s->field_order = AV_FIELD_PROGRESSIVE;
+
+ break;
}
- pc->frame_start_found= pic_found;
- pc->state= state;
- return END_NOT_FOUND;
}
static int vc1_parse(AVCodecParserContext *s,
@@ -153,22 +119,127 @@ static int vc1_parse(AVCodecParserContext *s,
const uint8_t **poutbuf, int *poutbuf_size,
const uint8_t *buf, int buf_size)
{
+ /* Here we do the searching for frame boundaries and headers at
+ * the same time. Only a minimal amount at the start of each
+ * header is unescaped. */
VC1ParseContext *vpc = s->priv_data;
- int next;
+ int pic_found = vpc->pc.frame_start_found;
+ uint8_t *unesc_buffer = vpc->unesc_buffer;
+ size_t unesc_index = vpc->unesc_index;
+ VC1ParseSearchState search_state = vpc->search_state;
+ int next = END_NOT_FOUND;
+ int i = vpc->bytes_to_skip;
+
+ if (pic_found && buf_size == 0) {
+ /* EOF considered as end of frame */
+ memset(unesc_buffer + unesc_index, 0, UNESCAPED_THRESHOLD - unesc_index);
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+ next = 0;
+ }
+ while (i < buf_size) {
+ int start_code_found = 0;
+ uint8_t b;
+ while (i < buf_size && unesc_index < UNESCAPED_THRESHOLD) {
+ b = buf[i++];
+ unesc_buffer[unesc_index++] = b;
+ if (search_state <= ONE_ZERO)
+ search_state = b ? NO_MATCH : search_state + 1;
+ else if (search_state == TWO_ZEROS) {
+ if (b == 1)
+ search_state = ONE;
+ else if (b > 1) {
+ if (b == 3)
+ unesc_index--; // swallow emulation prevention byte
+ search_state = NO_MATCH;
+ }
+ }
+ else { // search_state == ONE
+ // Header unescaping terminates early due to detection of next start code
+ search_state = NO_MATCH;
+ start_code_found = 1;
+ break;
+ }
+ }
+ if ((s->flags & PARSER_FLAG_COMPLETE_FRAMES) &&
+ unesc_index >= UNESCAPED_THRESHOLD &&
+ vpc->prev_start_code == (VC1_CODE_FRAME & 0xFF))
+ {
+ // No need to keep scanning the rest of the buffer for
+ // start codes if we know it contains a complete frame and
+ // we've already unescaped all we need of the frame header
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+ break;
+ }
+ if (unesc_index >= UNESCAPED_THRESHOLD && !start_code_found) {
+ while (i < buf_size) {
+ if (search_state == NO_MATCH) {
+ i += vpc->v.vc1dsp.vc1_find_start_code_candidate(buf + i, buf_size - i);
+ if (i < buf_size) {
+ search_state = ONE_ZERO;
+ }
+ i++;
+ } else {
+ b = buf[i++];
+ if (search_state == ONE_ZERO)
+ search_state = b ? NO_MATCH : TWO_ZEROS;
+ else if (search_state == TWO_ZEROS) {
+ if (b >= 1)
+ search_state = b == 1 ? ONE : NO_MATCH;
+ }
+ else { // search_state == ONE
+ search_state = NO_MATCH;
+ start_code_found = 1;
+ break;
+ }
+ }
+ }
+ }
+ if (start_code_found) {
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+
+ vpc->prev_start_code = b;
+ unesc_index = 0;
+
+ if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) {
+ if (!pic_found && (b == (VC1_CODE_FRAME & 0xFF) || b == (VC1_CODE_FIELD & 0xFF))) {
+ pic_found = 1;
+ }
+ else if (pic_found && b != (VC1_CODE_FIELD & 0xFF) && b != (VC1_CODE_SLICE & 0xFF)) {
+ next = i - 4;
+ pic_found = b == (VC1_CODE_FRAME & 0xFF);
+ break;
+ }
+ }
+ }
+ }
- if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
- next= buf_size;
- }else{
- next= vc1_find_frame_end(&vpc->pc, buf, buf_size);
+ vpc->pc.frame_start_found = pic_found;
+ vpc->unesc_index = unesc_index;
+ vpc->search_state = search_state;
+ if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+ next = buf_size;
+ } else {
if (ff_combine_frame(&vpc->pc, next, &buf, &buf_size) < 0) {
+ vpc->bytes_to_skip = 0;
*poutbuf = NULL;
*poutbuf_size = 0;
return buf_size;
}
}
- vc1_extract_headers(s, avctx, buf, buf_size);
+ vpc->v.first_pic_header_flag = 1;
+
+ /* If we return with a valid pointer to a combined frame buffer
+ * then on the next call we'll have been unhelpfully rewound
+ * by up to 4 bytes (depending upon whether the start code
+ * overlapped the input buffer, and if so by how much). We don't
+ * want this: it will either cause spurious second detections of
+ * the start code we've already seen, or cause extra bytes to be
+ * inserted at the start of the unescaped buffer. */
+ vpc->bytes_to_skip = 4;
+ if (next < 0)
+ vpc->bytes_to_skip += next;
*poutbuf = buf;
*poutbuf_size = buf_size;
@@ -199,6 +270,11 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
{
VC1ParseContext *vpc = s->priv_data;
vpc->v.s.slice_context_count = 1;
+ vpc->v.first_pic_header_flag = 1;
+ vpc->prev_start_code = 0;
+ vpc->bytes_to_skip = 0;
+ vpc->unesc_index = 0;
+ vpc->search_state = NO_MATCH;
return ff_vc1_init_common(&vpc->v);
}
--
1.9.1

View File

@ -0,0 +1,285 @@
From 98428a8cf593587b403076bb54b46cc70ed17ff2 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Mon, 10 Mar 2014 14:42:05 +0000
Subject: [PATCH 4/6] truehd: add hand-scheduled ARM asm version of
ff_mlp_rematrix_channel.
Profiling results for overall audio decode and the rematrix_channels function
in particular are as follows:
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3%
6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant)
8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant)
8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant)
6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9%
6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3%
8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9%
8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3%
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/mlpdsp_arm.S | 222 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 12 +++
2 files changed, 234 insertions(+)
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
index 615819d..9b51d0c 100644
--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
@@ -431,3 +431,225 @@ endfunc
.unreq ST3
.unreq I
.unreq PSAMP
+
+/********************************************************************/
+
+PSA .req a1 // samples
+PCO .req a2 // coeffs
+PBL .req a3 // bypassed_lsbs
+INDEX .req a4 // packed index, index2 and loop counter; bit layout is described below
+CO0 .req v1 // CO0-CO3: coefficient values
+CO1 .req v2
+CO2 .req v3
+CO3 .req v4
+SA0 .req v5 // SA0-SA3: sample values
+SA1 .req v6
+SA2 .req sl
+SA3 .req fp
+AC0 .req ip // 64-bit accumulator, low word
+AC1 .req lr // 64-bit accumulator, high word
+NOISE .req SA0 // this and the aliases below reuse sample registers
+LSB .req SA1
+DCH .req SA2 // dest_ch
+MASK .req SA3
+
+ // INDEX is used as follows:
+ // bits 0..6 index2 (values up to 17, but wider so that we can
+ // add to index field without needing to mask)
+ // bits 7..14 i (values up to 160)
+ // bit 15 underflow detect for i
+ // bits 25..31 (if access_unit_size_pow2 == 128) \ index
+ // bits 26..31 (if access_unit_size_pow2 == 64) /
+
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan // one specialised per-sample loop; leaves via 98f when done
+ .if \maxchan == 1
+ // We can just leave the coefficients in registers in this case
+ ldrd CO0, CO1, [PCO]
+ .endif
+1: // top of the per-sample loop
+ .if \maxchan == 1
+ ldrd SA0, SA1, [PSA]
+ smull AC0, AC1, CO0, SA0 // start the 64-bit accumulator with the first coefficient * sample product
+ .elseif \maxchan == 5
+ ldr CO0, [PCO, #0]
+ ldr SA0, [PSA, #0]
+ ldr CO1, [PCO, #4]
+ ldr SA1, [PSA, #4]
+ ldrd CO2, CO3, [PCO, #8]
+ smull AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #8]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #16]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #16]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .else // \maxchan == 7
+ ldr CO2, [PCO, #0]
+ ldr SA2, [PSA, #0]
+ ldr CO3, [PCO, #4]
+ ldr SA3, [PSA, #4]
+ ldrd CO0, CO1, [PCO, #8]
+ smull AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #8]
+ smlal AC0, AC1, CO3, SA3
+ ldrd CO2, CO3, [PCO, #16]
+ smlal AC0, AC1, CO0, SA0
+ ldrd SA2, SA3, [PSA, #16]
+ smlal AC0, AC1, CO1, SA1
+ ldrd CO0, CO1, [PCO, #24]
+ smlal AC0, AC1, CO2, SA2
+ ldrd SA0, SA1, [PSA, #24]
+ smlal AC0, AC1, CO3, SA3
+ smlal AC0, AC1, CO0, SA0
+ .endif
+ ldm sp, {NOISE, DCH, MASK} // reload the three words kept at the top of the stack (these aliases overlap SA0, SA2 and SA3)
+ smlal AC0, AC1, CO1, SA1 // last product; CO1 and SA1 are not overwritten by the reload above
+ .if \shift != 0
+ .if \index_mask == 63
+ add NOISE, NOISE, INDEX, lsr #32-6 // address of noise_buffer[index]; index is held in the top 6 bits of INDEX
+ ldrb LSB, [PBL], #MAX_CHANNELS // fetch the bypassed LSBs and step PBL to the next sample
+ ldrsb NOISE, [NOISE] // sign-extending load of the noise byte
+ add INDEX, INDEX, INDEX, lsl #32-6 // index += index2, wrapping within the top 6 bits
+ .else // \index_mask == 127
+ add NOISE, NOISE, INDEX, lsr #32-7
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ ldrsb NOISE, [NOISE]
+ add INDEX, INDEX, INDEX, lsl #32-7
+ .endif
+ sub INDEX, INDEX, #1<<7 // decrement the loop counter field
+ adds AC0, AC0, NOISE, lsl #\shift + 7 // add the scaled noise to the low word
+ adc AC1, AC1, NOISE, asr #31 // high word: carry plus the sign extension of the noise
+ .else
+ ldrb LSB, [PBL], #MAX_CHANNELS
+ sub INDEX, INDEX, #1<<7
+ .endif
+ add PSA, PSA, #MAX_CHANNELS*4 // advance to the next sample row
+ mov AC0, AC0, lsr #14
+ orr AC0, AC0, AC1, lsl #18 // AC0 = low 32 bits of (accumulator >> 14)
+ .if !\mask_minus1
+ and AC0, AC0, MASK // omitted in the variants selected when mask == -1
+ .endif
+ add AC0, AC0, LSB
+ tst INDEX, #1<<15 // has the loop counter borrowed into bit 15?
+ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
+ beq 1b // no: process the next sample
+ b 98f // yes: all samples done
+.endm
+
+.macro switch_on_maxchan shift, index_mask, mask_minus1 // pick the loop variant for maxchan (v4)
+ cmp v4, #5
+ blo 51f // below 5: the maxchan == 1 variant
+ beq 50f // exactly 5
+ implement_rematrix \shift, \index_mask, \mask_minus1, 7 // otherwise the maxchan == 7 variant
+50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
+51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
+.endm
+
+.macro switch_on_mask shift, index_mask // specialise on whether mask (sl) is -1
+ cmp sl, #-1
+ bne 40f
+ switch_on_maxchan \shift, \index_mask, 1 // mask == -1
+40: switch_on_maxchan \shift, \index_mask, 0 // any other mask
+.endm
+
+.macro switch_on_au_size shift // specialise on access_unit_size_pow2 (v6) and seed the index field of INDEX
+ .if \shift == 0
+ switch_on_mask \shift, undefined // the shift == 0 loops add no noise, so index_mask is not needed
+ .else
+ teq v6, #64
+ bne 30f // not 64: use the 7-bit (128-entry) layout
+ orr INDEX, INDEX, v1, lsl #32-6 // place index (v1) in the top 6 bits
+ switch_on_mask \shift, 63
+30: orr INDEX, INDEX, v1, lsl #32-7 // place index (v1) in the top 7 bits
+ switch_on_mask \shift, 127
+ .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ * const int32_t *coeffs,
+ * const uint8_t *bypassed_lsbs,
+ * const int8_t *noise_buffer,
+ * int index,
+ * unsigned int dest_ch,
+ * uint16_t blockpos,
+ * unsigned int maxchan,
+ * int matrix_noise_shift,
+ * int access_unit_size_pow2,
+ * int32_t mask);
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+ push {v1-fp,lr} // nine words pushed, matching the #9*4 offset used below
+ add v1, sp, #9*4 // point at arguments on stack
+ ldm v1, {v1-sl} // the seven stack arguments in prototype order: v1 = index ... v6 = access_unit_size_pow2, sl = mask
+ teq v4, #1
+ itt ne
+ teqne v4, #5
+ teqne v4, #7
+ bne 99f // maxchan is not 1, 5 or 7
+ teq v6, #64
+ it ne
+ teqne v6, #128
+ bne 99f // access_unit_size_pow2 is neither 64 nor 128
+ sub v2, v2, #MAX_CHANNELS // bias dest_ch because the loop advances PSA before storing
+ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
+ movs INDEX, v3, lsl #7 // loop counter field = blockpos
+ beq 98f // just in case, do nothing if blockpos = 0
+ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+ adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
+ orr INDEX, INDEX, lr
+ // Switch on matrix_noise_shift: values 0 and 1 are
+ // disproportionately common so do those in a form the branch
+ // predictor can accelerate. Values can only go up to 15.
+ cmp v5, #1
+ beq 11f
+ blo 10f
+A ldr pc, [pc, v5, lsl #2] // shifts 2-15 are dispatched through the table below
+T tbh [pc, v5, lsl #1]
+0:
+A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
+T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
+T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
+T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
+10: switch_on_au_size 0
+11: switch_on_au_size 1
+12: switch_on_au_size 2
+13: switch_on_au_size 3
+14: switch_on_au_size 4
+15: switch_on_au_size 5
+16: switch_on_au_size 6
+17: switch_on_au_size 7
+18: switch_on_au_size 8
+19: switch_on_au_size 9
+20: switch_on_au_size 10
+21: switch_on_au_size 11
+22: switch_on_au_size 12
+23: switch_on_au_size 13
+24: switch_on_au_size 14
+25: switch_on_au_size 15
+
+98: add sp, sp, #3*4 // discard the three words pushed for NOISE, DCH and MASK
+ pop {v1-fp,pc}
+99: // Can't handle these parameters, drop back to C
+ pop {v1-fp,lr} // restore saved registers; a1-a4 and the stack arguments are still as on entry
+ b X(ff_mlp_rematrix_channel) // tail call into the C implementation
+endfunc
+
+ .unreq PSA // release the .req register aliases now that the rematrix code is complete
+ .unreq PCO
+ .unreq PBL
+ .unreq INDEX
+ .unreq CO0
+ .unreq CO1
+ .unreq CO2
+ .unreq CO3
+ .unreq SA0
+ .unreq SA1
+ .unreq SA2
+ .unreq SA3
+ .unreq AC0
+ .unreq AC1
+ .unreq NOISE
+ .unreq LSB
+ .unreq DCH
+ .unreq MASK
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 9a14815..1bb2276 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer);
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ const int32_t *coeffs,
+ const uint8_t *bypassed_lsbs,
+ const int8_t *noise_buffer,
+ int index,
+ unsigned int dest_ch,
+ uint16_t blockpos,
+ unsigned int maxchan,
+ int matrix_noise_shift,
+ int access_unit_size_pow2,
+ int32_t mask);
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
}
--
1.9.1

View File

@ -0,0 +1,197 @@
From 5bfcb7a691eb63c56f1485b60f399d79ff943799 Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Wed, 12 Mar 2014 18:18:39 +0000
Subject: [PATCH 5/6] truehd: break out part of output_data into
platform-specific callback.
Verified with profiling that this doesn't have a measurable effect upon
overall performance.
---
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
3 files changed, 83 insertions(+), 17 deletions(-)
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
index 01ded5c..061dabc 100644
--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
@@ -363,6 +363,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
else
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
+ m->substream[m->max_decoded_substream].output_shift,
+ m->substream[m->max_decoded_substream].max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
m->params_valid = 1;
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
@@ -612,6 +616,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
if (substr == m->max_decoded_substream) {
m->avctx->channels = s->max_matrix_channel + 1;
m->avctx->channel_layout = s->ch_layout;
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
@@ -857,9 +865,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
return ret;
if (s->param_presence_flags & PARAM_OUTSHIFT)
- if (get_bits1(gbp))
+ if (get_bits1(gbp)) {
for (ch = 0; ch <= s->max_matrix_channel; ch++)
s->output_shift[ch] = get_sbits(gbp, 4);
+ if (substr == m->max_decoded_substream)
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+ }
if (s->param_presence_flags & PARAM_QUANTSTEP)
if (get_bits1(gbp))
@@ -1058,9 +1072,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
{
AVCodecContext *avctx = m->avctx;
SubStream *s = &m->substream[substr];
- unsigned int i, out_ch = 0;
- int32_t *data_32;
- int16_t *data_16;
int ret;
int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
@@ -1078,19 +1089,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
frame->nb_samples = s->blockpos;
if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
return ret;
- data_32 = (int32_t *)frame->data[0];
- data_16 = (int16_t *)frame->data[0];
-
- for (i = 0; i < s->blockpos; i++) {
- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
- int mat_ch = s->ch_assign[out_ch];
- int32_t sample = m->sample_buffer[i][mat_ch]
- << s->output_shift[mat_ch];
- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
- if (is32) *data_32++ = sample << 8;
- else *data_16++ = sample >> 8;
- }
- }
+ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
+ s->blockpos,
+ m->sample_buffer,
+ frame->data[0],
+ s->ch_assign,
+ s->output_shift,
+ s->max_matrix_channel,
+ is32);
/* Update matrix encoding side data */
if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
index 7a359b0..3ae8c37 100644
--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples,
}
}
+static int32_t (*mlp_select_pack_output(uint8_t *ch_assign, /* generic selector: all four arguments are ignored */
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ return ff_mlp_pack_output; /* always the portable C packer */
+}
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data, /* running check word; the updated value is returned */
+ uint16_t blockpos, /* number of sample_buffer rows to pack */
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data, /* destination, written as int32_t or int16_t depending on is32 */
+ uint8_t *ch_assign, /* output channel index -> matrix channel index */
+ int8_t *output_shift, /* left shift, indexed by matrix channel */
+ uint8_t max_matrix_channel, /* highest output channel index, inclusive */
+ int is32)
+{
+ unsigned int i, out_ch = 0;
+ int32_t *data_32 = data;
+ int16_t *data_16 = data;
+ /* Left shifts are written as unsigned multiplies: shifting a negative int left is undefined in C. */
+ for (i = 0; i < blockpos; i++) {
+ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
+ int mat_ch = ch_assign[out_ch];
+ int32_t sample = sample_buffer[i][mat_ch]
+ * (1U << output_shift[mat_ch]); /* same bits as the former << output_shift[mat_ch] */
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+ if (is32)
+ *data_32++ = sample * 256U; /* same bits as the former sample << 8 */
+ else
+ *data_16++ = sample >> 8;
+ }
+ }
+ return lossless_check_data;
+}
+
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
{
c->mlp_filter_channel = mlp_filter_channel;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+ c->mlp_select_pack_output = mlp_select_pack_output;
+ c->mlp_pack_output = ff_mlp_pack_output;
if (ARCH_ARM)
ff_mlpdsp_init_arm(c);
if (ARCH_X86)
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
index f98e9be..a0edeb7 100644
--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
@@ -23,6 +23,7 @@
#define AVCODEC_MLPDSP_H
#include <stdint.h>
+#include "mlp.h"
void ff_mlp_rematrix_channel(int32_t *samples,
const int32_t *coeffs,
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
+
typedef struct MLPDSPContext {
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
int matrix_noise_shift,
int access_unit_size_pow2,
int32_t mask);
+ int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+ uint16_t blockpos,
+ int32_t (*sample_buffer)[MAX_CHANNELS],
+ void *data,
+ uint8_t *ch_assign,
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32);
} MLPDSPContext;
void ff_mlpdsp_init(MLPDSPContext *c);
--
1.9.1

View File

@ -0,0 +1,689 @@
From c647209386bd811cc1c33b4fc8ec17a00f8c8ded Mon Sep 17 00:00:00 2001
From: Ben Avison <bavison@riscosopen.org>
Date: Thu, 13 Mar 2014 00:21:55 +0000
Subject: [PATCH 6/6] truehd: add hand-scheduled ARM asm version of
ff_mlp_pack_output.
Profiling results for overall decode and the output_data function in
particular are as follows:
Before After
Mean StdDev Mean StdDev Confidence Change
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%
The assembly version has also been tested with a fuzz tester to ensure that
any combinations of inputs not exercised by my available test streams still
generate mathematically identical results to the C version.
---
libavcodec/arm/Makefile | 1 +
libavcodec/arm/mlpdsp_armv6.S | 530 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++
3 files changed, 627 insertions(+)
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index ba673b1..7b2f923 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -52,6 +52,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
arm/hpeldsp_armv6.o
+ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
arm/vp8dsp_init_armv6.o \
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
new file mode 100644
index 0000000..05a2c85
--- /dev/null
+++ b/libavcodec/arm/mlpdsp_armv6.S
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro loadregoffsh2 group, index, base, offgroup, offindex // wrapper: index and offindex may be expressions
+ .altmacro
+ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex) // %() passes each expression as its numeric value
+ .noaltmacro
+.endm
+
+.macro loadregoffsh2_ group, index, base, offgroup, offindex // worker: pastes prefix and number into register names
+ ldr \group\index, [\base, \offgroup\offindex, lsl #2] // word load from base + offset register * 4
+.endm
+
+.macro eorlslreg check, data, group, index // wrapper: index may be an expression
+ .altmacro
+ eorlslreg_ \check, \data, \group, %(\index) // %() passes the expression as its numeric value
+ .noaltmacro
+.endm
+
+.macro eorlslreg_ check, data, group, index // worker: pastes group and index into one register name
+ eor \check, \check, \data, lsl \group\index // check ^= data << (value of that register)
+.endm
+
+.macro decr_modulo var, by, modulus // assembly-time counter: var -= by, and a result of 0 wraps to modulus
+ .set \var, \var - \by
+ .if \var == 0
+ .set \var, \modulus
+ .endif
+.endm
+
+ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0 // load the first 2 or 4 words of a 4-word group from IN
+ .if \size == 2
+ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4 // post-increment also skips 8 - channels words (row stride appears to be 8 words)
+ .else // size == 4
+ .if IDX1 > 4 || \channels==8
+ ldm IN!, {\r0, \r1, \r2, \r3}
+ .else
+ ldm IN, {\r0, \r1, \r2, \r3}
+ .if !\pointer_dead // caller sets pointer_dead=1 when IN is not used again
+ add IN, IN, #(4 + 8 - \channels) * 4
+ .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels // IDX1 tracks the channels still to load in the current sample
+ .endm
+
+ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0 // load the last 2 words of a group into r2/r3; emits no load unless size == 2
+ .if \size == 2
+ .if IDX1 > 2
+ ldm IN!, {\r2, \r3}
+ .else
+//A .ifc \r2, ip
+//A .if \pointer_dead
+//A ldm IN, {\r2, \r3}
+//A .else
+//A ldr \r2, [IN], #4
+//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
+//A .endif
+//A .else
+ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4 // post-increment also skips 8 - channels words
+//A .endif
+ .endif
+ .endif
+ decr_modulo IDX1, \size, \channels // with size == 0 this leaves IDX1 unchanged
+ .endm
+
+.macro implement_pack inorder, channels, shift
+.if \inorder
+.ifc \shift, mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+SHIFT0 .req v5
+SHIFT1 .req v6
+SHIFT2 .req sl
+SHIFT3 .req fp
+SHIFT4 .req ip
+SHIFT5 .req lr
+
+ .macro output4words // load, shift, checksum and store four output words (in-order, per-channel shifts)
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1 // the two group sizes always sum to 4
+ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
+ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
+ .if \channels == 2
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .elseif \channels == 6
+ .if IDX2 == 6 // IDX2 selects which four of the six shift registers apply to this group
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .elseif IDX2 == 2
+ lsl DAT0, SHIFT4
+ lsl DAT1, SHIFT5
+ lsl DAT2, SHIFT0
+ lsl DAT3, SHIFT1
+ .else // IDX2 == 4
+ lsl DAT0, SHIFT2
+ lsl DAT1, SHIFT3
+ lsl DAT2, SHIFT4
+ lsl DAT3, SHIFT5
+ .endif
+ .elseif \channels == 8
+ .if IDX2 == 8 // SHIFT4 and SHIFT5 each hold four packed byte shifts; unpack the four needed here
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ .else
+ uxtb SHIFT0, SHIFT5, ror #0
+ uxtb SHIFT1, SHIFT5, ror #8
+ uxtb SHIFT2, SHIFT5, ror #16
+ uxtb SHIFT3, SHIFT5, ror #24
+ .endif
+ lsl DAT0, SHIFT0
+ lsl DAT1, SHIFT1
+ lsl DAT2, SHIFT2
+ lsl DAT3, SHIFT3
+ .endif
+ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2) // data is already shifted left by 8 extra bits, so lsr #(8 - ch) leaves a net left shift of ch
+ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2) // same for channel ch + 1
+ decr_modulo IDX2, 2, \channels
+ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1 // in-order channels, per-channel shifts
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0 // nothing to do for an empty block
+ it eq
+ bxeq lr
+ push {v1-v6,sl,fp,lr} // 9 registers, hence the 9 in the stack offset below
+ ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
+ ldr SHIFT1, =0x08080808
+ ldr SHIFT4, [SHIFT0] // first four shift bytes in one word
+ .if \channels == 2
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ .else
+ ldr SHIFT5, [SHIFT0, #4] // next four shift bytes
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+ uadd8 SHIFT5, SHIFT5, SHIFT1
+ .if \channels == 6 // six shifts fit in SHIFT0-5; the 8-channel case keeps them packed and unpacks in output4words
+ uxtb SHIFT0, SHIFT4, ror #0
+ uxtb SHIFT1, SHIFT4, ror #8
+ uxtb SHIFT2, SHIFT4, ror #16
+ uxtb SHIFT3, SHIFT4, ror #24
+ uxtb SHIFT4, SHIFT5, ror #0
+ uxtb SHIFT5, SHIFT5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4 // unrolled so each iteration covers a whole number of samples
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc} // CHECK is a1, so the updated check word is the return value
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq SHIFT0
+ .unreq SHIFT1
+ .unreq SHIFT2
+ .unreq SHIFT3
+ .unreq SHIFT4
+ .unreq SHIFT5
+
+.else // not mixed
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+DAT4 .req v5
+DAT5 .req v6
+DAT6 .req sl // use these rather than the otherwise unused
+DAT7 .req fp // ip and lr so that we can load them using LDRD
+
+ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0 // head: load and shift r0-r3; tail: checksum and store r4-r7
+ .if \head
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2) // r4-r7 were shifted left by 8 + shift in an earlier head step
+ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ .endif
+ .if \head
+ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+ .endif
+ .if \tail
+ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
+ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ stm OUT!, {\r4, \r5, \r6, \r7}
+ .endif
+ .if \head
+ lsl \r0, #8 + \shift // the same immediate shift for every channel in this variant
+ lsl \r1, #8 + \shift
+ lsl \r2, #8 + \shift
+ lsl \r3, #8 + \shift
+ .endif
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1 // in-order channels, one shift for all
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP // flags from this subtract are also tested by the beq at label 0
+ it lo
+ bxlo lr // return if COUNT was below SAMPLES_PER_LOOP
+ push {v1-v6,sl,fp,lr}
+ .set IDX1, \channels
+ .set IDX2, \channels
+ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 // prologue: head only, loads the first group
+0: beq 1f // leave the loop when the remaining count is zero
+ .rept WORDS_PER_LOOP / 8 // each pair alternates the two register sets, 8 words per pair
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+1:
+ .rept WORDS_PER_LOOP / 8 - 1 // epilogue: final iteration, with the last pair written out below
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1 // last load: IN need not be advanced
+ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 // tail only: store the final group
+ pop {v1-v6,sl,fp,pc} // CHECK is a1, so the updated check word is the return value
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq DAT4
+ .unreq DAT5
+ .unreq DAT6
+ .unreq DAT7
+
+.endif // mixed
+.else // not inorder
+.ifc \shift, mixed
+
+// This case not currently handled
+
+.else // not mixed
+
+#if !CONFIG_THUMB
+
+CHECK .req a1
+COUNT .req a2
+IN .req a3
+OUT .req a4
+DAT0 .req v1
+DAT1 .req v2
+DAT2 .req v3
+DAT3 .req v4
+CHAN0 .req v5
+CHAN1 .req v6
+CHAN2 .req sl
+CHAN3 .req fp
+CHAN4 .req ip
+CHAN5 .req lr
+
+ .macro output4words // load four words via the channel map, shift, checksum and store (out-of-order, fixed shift)
+ .if \channels == 8
+ .if IDX1 == 8 // CHAN4 and CHAN5 each hold four packed channel bytes; unpack the four needed here
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ .else
+ uxtb CHAN0, CHAN5, ror #0
+ uxtb CHAN1, CHAN5, ror #8
+ uxtb CHAN2, CHAN5, ror #16
+ uxtb CHAN3, CHAN5, ror #24
+ .endif
+ ldr DAT0, [IN, CHAN0, lsl #2]
+ ldr DAT1, [IN, CHAN1, lsl #2]
+ ldr DAT2, [IN, CHAN2, lsl #2]
+ ldr DAT3, [IN, CHAN3, lsl #2]
+ .if IDX1 == 4
+ add IN, IN, #8*4 // second half of the sample done: advance IN by 8 words
+ .endif
+ decr_modulo IDX1, 4, \channels
+ .else
+ .set SIZE_GROUP1, IDX1
+ .if SIZE_GROUP1 > 4
+ .set SIZE_GROUP1, 4
+ .endif
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
+ .if SIZE_GROUP1 == 2
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ add IN, IN, #8*4 // these were the last two channels of the sample
+ .else // SIZE_GROUP1 == 4
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
+ .if IDX1 == 4
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP1, \channels
+ .if SIZE_GROUP2 == 2
+ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
+ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
+ .if IDX1 == 2
+ add IN, IN, #8*4
+ .endif
+ .endif
+ decr_modulo IDX1, SIZE_GROUP2, \channels
+ .endif
+ .if \channels == 8 // in this case we can corrupt CHAN0-3
+ rsb CHAN0, CHAN0, #8 // turn each channel number ch into 8 - ch, used as a right-shift count below
+ rsb CHAN1, CHAN1, #8
+ rsb CHAN2, CHAN2, #8
+ rsb CHAN3, CHAN3, #8
+ lsl DAT0, #8 + \shift
+ lsl DAT1, #8 + \shift
+ lsl DAT2, #8 + \shift
+ lsl DAT3, #8 + \shift
+ eor CHECK, CHECK, DAT0, lsr CHAN0 // net effect: data shifted left by ch relative to the unshifted-by-8 value
+ eor CHECK, CHECK, DAT1, lsr CHAN1
+ eor CHECK, CHECK, DAT2, lsr CHAN2
+ eor CHECK, CHECK, DAT3, lsr CHAN3
+ .else
+ .if \shift != 0
+ lsl DAT0, #\shift
+ lsl DAT1, #\shift
+ lsl DAT2, #\shift
+ lsl DAT3, #\shift
+ .endif
+ bic DAT0, DAT0, #0xff000000 // keep the low 24 bits before they are folded into CHECK
+ bic DAT1, DAT1, #0xff000000
+ bic DAT2, DAT2, #0xff000000
+ bic DAT3, DAT3, #0xff000000
+ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
+ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
+ decr_modulo IDX2, 2, \channels
+ lsl DAT0, #8 // final output word is the 24-bit value shifted left by 8
+ lsl DAT1, #8
+ lsl DAT2, #8
+ lsl DAT3, #8
+ .endif
+ stm OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1 // channel map applied, one shift for all
+ .if SAMPLES_PER_LOOP > 1
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
+ .endif
+ teq COUNT, #0 // nothing to do for an empty block
+ it eq
+ bxeq lr
+ push {v1-v6,sl,fp,lr} // 9 registers, hence the 9 in the stack offset below
+ ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
+ ldr CHAN4, [CHAN0] // first four channel bytes in one word
+ .if \channels == 2
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ .else
+ ldr CHAN5, [CHAN0, #4] // next four channel bytes
+ .if \channels == 6 // six entries fit in CHAN0-5; the 8-channel case keeps them packed and unpacks in output4words
+ uxtb CHAN0, CHAN4, ror #0
+ uxtb CHAN1, CHAN4, ror #8
+ uxtb CHAN2, CHAN4, ror #16
+ uxtb CHAN3, CHAN4, ror #24
+ uxtb CHAN4, CHAN5, ror #0
+ uxtb CHAN5, CHAN5, ror #8
+ .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4 // unrolled so each iteration covers a whole number of samples
+ output4words
+ .endr
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
+ bne 0b
+ pop {v1-v6,sl,fp,pc} // CHECK is a1, so the updated check word is the return value
+ .ltorg
+endfunc
+ .purgem output4words
+
+ .unreq CHECK
+ .unreq COUNT
+ .unreq IN
+ .unreq OUT
+ .unreq DAT0
+ .unreq DAT1
+ .unreq DAT2
+ .unreq DAT3
+ .unreq CHAN0
+ .unreq CHAN1
+ .unreq CHAN2
+ .unreq CHAN3
+ .unreq CHAN4
+ .unreq CHAN5
+
+#endif // !CONFIG_THUMB
+
+.endif // mixed
+.endif // inorder
+.endm // implement_pack
+
+.macro pack_channels inorder, channels // instantiate every shift variant (0-5 and mixed) for one channel count
+ implement_pack \inorder, \channels, 0
+ implement_pack \inorder, \channels, 1
+ implement_pack \inorder, \channels, 2
+ implement_pack \inorder, \channels, 3
+ implement_pack \inorder, \channels, 4
+ implement_pack \inorder, \channels, 5
+ implement_pack \inorder, \channels, mixed
+.endm
+
+.macro pack_order inorder // instantiate the 2-, 6- and 8-channel variants for one channel ordering
+ pack_channels \inorder, 2
+ pack_channels \inorder, 6
+ pack_channels \inorder, 8
+.endm
+
+ pack_order 0 // out-of-order variants; the mixed-shift ones expand to no code
+ pack_order 1 // in-order variants
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
index 1bb2276..10ec316 100644
--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
@@ -41,8 +41,104 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
int access_unit_size_pow2,
int32_t mask);
+#define DECLARE_PACK(order,channels,shift) \
+ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+#define ENUMERATE_PACK(order,channels,shift) \
+ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
+#define PACK_CHANNELS(macro,order,channels) \
+ macro(order,channels,0) \
+ macro(order,channels,1) \
+ macro(order,channels,2) \
+ macro(order,channels,3) \
+ macro(order,channels,4) \
+ macro(order,channels,5) \
+ macro(order,channels,mixed)
+#define PACK_ORDER(macro,order) \
+ PACK_CHANNELS(macro,order,2) \
+ PACK_CHANNELS(macro,order,6) \
+ PACK_CHANNELS(macro,order,8)
+#define PACK_ALL(macro) \
+ PACK_ORDER(macro,outof) \
+ PACK_ORDER(macro,in)
+PACK_ALL(DECLARE_PACK)
+
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
+#if CONFIG_THUMB
+#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
+#endif
+
+static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign, /* returns an asm packer matching the layout, else the C packer */
+ int8_t *output_shift,
+ uint8_t max_matrix_channel,
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+ int ch_index;
+ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0]; /* 0-5: candidate uniform shift; 6: the "mixed" table slot */
+ int inorder = 1;
+ static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = { /* laid out as [order][channel count][shift] */
+ PACK_ALL(ENUMERATE_PACK)
+ };
+ int i;
+
+ if (!is32) // don't support 16-bit output (it's not used by TrueHD)
+ return ff_mlp_pack_output;
+
+ switch (max_matrix_channel) { /* only 2, 6 and 8 channel layouts have asm versions */
+ case 1:
+ ch_index = 0;
+ break;
+ case 5:
+ ch_index = 1;
+ break;
+ case 7:
+ ch_index = 2;
+ break;
+ default:
+ return ff_mlp_pack_output;
+ }
+
+ for (i = 0; i <= max_matrix_channel; i++) {
+ if (shift != 6 && output_shift[i] != shift)
+ shift = 6; // indicate mixed shifts
+ if (ch_assign[i] != i) /* any entry that is not the identity mapping makes the layout out-of-order */
+ inorder = 0;
+ }
+#if CONFIG_THUMB
+ if (!inorder)
+ return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
+#else
+ if (shift == 6 && !inorder)
+ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
+#endif
+
+ return routine[(inorder*3+ch_index)*7+shift]; /* the early returns above keep the zero-defined table slots from being returned */
+}
+
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
{
+ int cpu_flags = av_get_cpu_flags();
+
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ if (cpu_flags & AV_CPU_FLAG_ARMV6) /* only override the packer selector when the ARMv6 flag is reported */
+ c->mlp_select_pack_output = mlp_select_pack_output_armv6;
}
--
1.9.1

View File

@ -0,0 +1,47 @@
commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5
Author: popcornmix <popcornmix@gmail.com>
Date: Tue Mar 25 19:43:07 2014 +0000
[ffmpeg] Speed up wtv index creation
The index creation is O(N^2) with number of entries (typically thousands).
On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
By replacing with an O(N) loop, this takes virtually zero time
diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c
index e423370..70898bd 100644
--- a/libavformat/wtvdec.c
+++ b/libavformat/wtvdec.c
@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s)
pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16);
if (pb) {
int i;
+ AVIndexEntry *e = wtv->index_entries;
+ AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1;
+ uint64_t last_position = 0;
while (1) {
uint64_t frame_nb = avio_rl64(pb);
uint64_t position = avio_rl64(pb);
+ while (e <= e_end && frame_nb > e->size) { /* bound check first, so e->size is never read past the last entry */
+ e->pos = last_position; /* entries below this frame number get the previous event position */
+ e++;
+ }
if (url_feof(pb))
break;
- for (i = wtv->nb_index_entries - 1; i >= 0; i--) {
- AVIndexEntry *e = wtv->index_entries + i;
- if (frame_nb > e->size)
- break;
- if (position > e->pos)
- e->pos = position;
- }
+ last_position = position;
}
+ e_end->pos = last_position;
wtvfile_close(pb);
- st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp;
+ st->duration = e_end->timestamp;
}
}
}