mirror of
https://github.com/LibreELEC/LibreELEC.tv.git
synced 2025-07-24 11:16:51 +00:00
projects/RPi/patches/ffmpeg: add RPi specific patches
Signed-off-by: Stephan Raue <stephan@openelec.tv>
This commit is contained in:
parent
cc971f66ea
commit
5084896ac1
@ -0,0 +1,752 @@
|
||||
From 8cdb3bf2837a3fb4fff3c6586316f81ae5f7b6cd Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 16 Apr 2014 01:51:31 +0100
|
||||
Subject: [PATCH 1/3] h264: Move start code search functions into separate
|
||||
source files.
|
||||
|
||||
This permits re-use with parsers for codecs which use similar start codes.
|
||||
|
||||
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
|
||||
---
|
||||
libavcodec/Makefile | 2 +-
|
||||
libavcodec/arm/Makefile | 2 +-
|
||||
libavcodec/arm/h264dsp_armv6.S | 253 --------------------------------------
|
||||
libavcodec/arm/h264dsp_init_arm.c | 4 +-
|
||||
libavcodec/arm/startcode_armv6.S | 253 ++++++++++++++++++++++++++++++++++++++
|
||||
libavcodec/h264dsp.c | 31 +----
|
||||
libavcodec/startcode.c | 57 +++++++++
|
||||
libavcodec/startcode.h | 35 ++++++
|
||||
8 files changed, 351 insertions(+), 286 deletions(-)
|
||||
delete mode 100644 libavcodec/arm/h264dsp_armv6.S
|
||||
create mode 100644 libavcodec/arm/startcode_armv6.S
|
||||
create mode 100644 libavcodec/startcode.c
|
||||
create mode 100644 libavcodec/startcode.h
|
||||
|
||||
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
|
||||
index b56ecd1..19caf11 100644
|
||||
--- a/libavcodec/Makefile
|
||||
+++ b/libavcodec/Makefile
|
||||
@@ -49,7 +49,7 @@ OBJS-$(CONFIG_FFT) += avfft.o fft_fixed.o fft_float.o \
|
||||
OBJS-$(CONFIG_GOLOMB) += golomb.o
|
||||
OBJS-$(CONFIG_H263DSP) += h263dsp.o
|
||||
OBJS-$(CONFIG_H264CHROMA) += h264chroma.o
|
||||
-OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o
|
||||
+OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o startcode.o
|
||||
OBJS-$(CONFIG_H264PRED) += h264pred.o
|
||||
OBJS-$(CONFIG_H264QPEL) += h264qpel.o
|
||||
OBJS-$(CONFIG_HPELDSP) += hpeldsp.o
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index a8446b2..b6410b2 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -47,7 +47,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \
|
||||
arm/simple_idct_armv6.o \
|
||||
|
||||
ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
|
||||
-ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
|
||||
+ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
|
||||
arm/hpeldsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
|
||||
diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
|
||||
deleted file mode 100644
|
||||
index 2758262..0000000
|
||||
--- a/libavcodec/arm/h264dsp_armv6.S
|
||||
+++ /dev/null
|
||||
@@ -1,253 +0,0 @@
|
||||
-/*
|
||||
- * Copyright (c) 2013 RISC OS Open Ltd
|
||||
- * Author: Ben Avison <bavison@riscosopen.org>
|
||||
- *
|
||||
- * This file is part of FFmpeg.
|
||||
- *
|
||||
- * FFmpeg is free software; you can redistribute it and/or
|
||||
- * modify it under the terms of the GNU Lesser General Public
|
||||
- * License as published by the Free Software Foundation; either
|
||||
- * version 2.1 of the License, or (at your option) any later version.
|
||||
- *
|
||||
- * FFmpeg is distributed in the hope that it will be useful,
|
||||
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- * Lesser General Public License for more details.
|
||||
- *
|
||||
- * You should have received a copy of the GNU Lesser General Public
|
||||
- * License along with FFmpeg; if not, write to the Free Software
|
||||
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
- */
|
||||
-
|
||||
-#include "libavutil/arm/asm.S"
|
||||
-
|
||||
-RESULT .req a1
|
||||
-BUF .req a1
|
||||
-SIZE .req a2
|
||||
-PATTERN .req a3
|
||||
-PTR .req a4
|
||||
-DAT0 .req v1
|
||||
-DAT1 .req v2
|
||||
-DAT2 .req v3
|
||||
-DAT3 .req v4
|
||||
-TMP0 .req v5
|
||||
-TMP1 .req v6
|
||||
-TMP2 .req ip
|
||||
-TMP3 .req lr
|
||||
-
|
||||
-#define PRELOAD_DISTANCE 4
|
||||
-
|
||||
-.macro innerloop4
|
||||
- ldr DAT0, [PTR], #4
|
||||
- subs SIZE, SIZE, #4 @ C flag survives rest of macro
|
||||
- sub TMP0, DAT0, PATTERN, lsr #14
|
||||
- bic TMP0, TMP0, DAT0
|
||||
- ands TMP0, TMP0, PATTERN
|
||||
-.endm
|
||||
-
|
||||
-.macro innerloop16 decrement, do_preload
|
||||
- ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
|
||||
- .ifnc "\do_preload",""
|
||||
- pld [PTR, #PRELOAD_DISTANCE*32]
|
||||
- .endif
|
||||
- .ifnc "\decrement",""
|
||||
- subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
|
||||
- .endif
|
||||
- sub TMP0, DAT0, PATTERN, lsr #14
|
||||
- sub TMP1, DAT1, PATTERN, lsr #14
|
||||
- bic TMP0, TMP0, DAT0
|
||||
- bic TMP1, TMP1, DAT1
|
||||
- sub TMP2, DAT2, PATTERN, lsr #14
|
||||
- sub TMP3, DAT3, PATTERN, lsr #14
|
||||
- ands TMP0, TMP0, PATTERN
|
||||
- bic TMP2, TMP2, DAT2
|
||||
- it eq
|
||||
- andseq TMP1, TMP1, PATTERN
|
||||
- bic TMP3, TMP3, DAT3
|
||||
- itt eq
|
||||
- andseq TMP2, TMP2, PATTERN
|
||||
- andseq TMP3, TMP3, PATTERN
|
||||
-.endm
|
||||
-
|
||||
-/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
|
||||
-function ff_h264_find_start_code_candidate_armv6, export=1
|
||||
- push {v1-v6,lr}
|
||||
- mov PTR, BUF
|
||||
- @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
|
||||
- @ before using code that does preloads
|
||||
- cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
|
||||
- blo 60f
|
||||
-
|
||||
- @ Get to word-alignment, 1 byte at a time
|
||||
- tst PTR, #3
|
||||
- beq 2f
|
||||
-1: ldrb DAT0, [PTR], #1
|
||||
- sub SIZE, SIZE, #1
|
||||
- teq DAT0, #0
|
||||
- beq 90f
|
||||
- tst PTR, #3
|
||||
- bne 1b
|
||||
-2: @ Get to 4-word alignment, 1 word at a time
|
||||
- ldr PATTERN, =0x80008000
|
||||
- setend be
|
||||
- tst PTR, #12
|
||||
- beq 4f
|
||||
-3: innerloop4
|
||||
- bne 91f
|
||||
- tst PTR, #12
|
||||
- bne 3b
|
||||
-4: @ Get to cacheline (8-word) alignment
|
||||
- tst PTR, #16
|
||||
- beq 5f
|
||||
- innerloop16 16
|
||||
- bne 93f
|
||||
-5: @ Check complete cachelines, with preloading
|
||||
- @ We need to stop when there are still (PRELOAD_DISTANCE+1)
|
||||
- @ complete cachelines to go
|
||||
- sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
|
||||
-6: innerloop16 , do_preload
|
||||
- bne 93f
|
||||
- innerloop16 32
|
||||
- bne 93f
|
||||
- bcs 6b
|
||||
- @ Preload trailing part-cacheline, if any
|
||||
- tst SIZE, #31
|
||||
- beq 7f
|
||||
- pld [PTR, #(PRELOAD_DISTANCE+1)*32]
|
||||
- @ Check remaining data without doing any more preloads. First
|
||||
- @ do in chunks of 4 words:
|
||||
-7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
|
||||
- bmi 9f
|
||||
-8: innerloop16 16
|
||||
- bne 93f
|
||||
- bcs 8b
|
||||
- @ Then in words:
|
||||
-9: adds SIZE, SIZE, #16 - 4
|
||||
- bmi 11f
|
||||
-10: innerloop4
|
||||
- bne 91f
|
||||
- bcs 10b
|
||||
-11: setend le
|
||||
- @ Check second byte of final halfword
|
||||
- ldrb DAT0, [PTR, #-1]
|
||||
- teq DAT0, #0
|
||||
- beq 90f
|
||||
- @ Check any remaining bytes
|
||||
- tst SIZE, #3
|
||||
- beq 13f
|
||||
-12: ldrb DAT0, [PTR], #1
|
||||
- sub SIZE, SIZE, #1
|
||||
- teq DAT0, #0
|
||||
- beq 90f
|
||||
- tst SIZE, #3
|
||||
- bne 12b
|
||||
- @ No candidate found
|
||||
-13: sub RESULT, PTR, BUF
|
||||
- b 99f
|
||||
-
|
||||
-60: @ Small buffer - simply check by looping over bytes
|
||||
- subs SIZE, SIZE, #1
|
||||
- bcc 99f
|
||||
-61: ldrb DAT0, [PTR], #1
|
||||
- subs SIZE, SIZE, #1
|
||||
- teq DAT0, #0
|
||||
- beq 90f
|
||||
- bcs 61b
|
||||
- @ No candidate found
|
||||
- sub RESULT, PTR, BUF
|
||||
- b 99f
|
||||
-
|
||||
-90: @ Found a candidate at the preceding byte
|
||||
- sub RESULT, PTR, BUF
|
||||
- sub RESULT, RESULT, #1
|
||||
- b 99f
|
||||
-
|
||||
-91: @ Found a candidate somewhere in the preceding 4 bytes
|
||||
- sub RESULT, PTR, BUF
|
||||
- sub RESULT, RESULT, #4
|
||||
- sub TMP0, DAT0, #0x20000
|
||||
- bics TMP0, TMP0, DAT0
|
||||
- itt pl
|
||||
- ldrbpl DAT0, [PTR, #-3]
|
||||
- addpl RESULT, RESULT, #2
|
||||
- bpl 92f
|
||||
- teq RESULT, #0
|
||||
- beq 98f @ don't look back a byte if found at first byte in buffer
|
||||
- ldrb DAT0, [PTR, #-5]
|
||||
-92: teq DAT0, #0
|
||||
- it eq
|
||||
- subeq RESULT, RESULT, #1
|
||||
- b 98f
|
||||
-
|
||||
-93: @ Found a candidate somewhere in the preceding 16 bytes
|
||||
- sub RESULT, PTR, BUF
|
||||
- sub RESULT, RESULT, #16
|
||||
- teq TMP0, #0
|
||||
- beq 95f @ not in first 4 bytes
|
||||
- sub TMP0, DAT0, #0x20000
|
||||
- bics TMP0, TMP0, DAT0
|
||||
- itt pl
|
||||
- ldrbpl DAT0, [PTR, #-15]
|
||||
- addpl RESULT, RESULT, #2
|
||||
- bpl 94f
|
||||
- teq RESULT, #0
|
||||
- beq 98f @ don't look back a byte if found at first byte in buffer
|
||||
- ldrb DAT0, [PTR, #-17]
|
||||
-94: teq DAT0, #0
|
||||
- it eq
|
||||
- subeq RESULT, RESULT, #1
|
||||
- b 98f
|
||||
-95: add RESULT, RESULT, #4
|
||||
- teq TMP1, #0
|
||||
- beq 96f @ not in next 4 bytes
|
||||
- sub TMP1, DAT1, #0x20000
|
||||
- bics TMP1, TMP1, DAT1
|
||||
- itee mi
|
||||
- ldrbmi DAT0, [PTR, #-13]
|
||||
- ldrbpl DAT0, [PTR, #-11]
|
||||
- addpl RESULT, RESULT, #2
|
||||
- teq DAT0, #0
|
||||
- it eq
|
||||
- subeq RESULT, RESULT, #1
|
||||
- b 98f
|
||||
-96: add RESULT, RESULT, #4
|
||||
- teq TMP2, #0
|
||||
- beq 97f @ not in next 4 bytes
|
||||
- sub TMP2, DAT2, #0x20000
|
||||
- bics TMP2, TMP2, DAT2
|
||||
- itee mi
|
||||
- ldrbmi DAT0, [PTR, #-9]
|
||||
- ldrbpl DAT0, [PTR, #-7]
|
||||
- addpl RESULT, RESULT, #2
|
||||
- teq DAT0, #0
|
||||
- it eq
|
||||
- subeq RESULT, RESULT, #1
|
||||
- b 98f
|
||||
-97: add RESULT, RESULT, #4
|
||||
- sub TMP3, DAT3, #0x20000
|
||||
- bics TMP3, TMP3, DAT3
|
||||
- itee mi
|
||||
- ldrbmi DAT0, [PTR, #-5]
|
||||
- ldrbpl DAT0, [PTR, #-3]
|
||||
- addpl RESULT, RESULT, #2
|
||||
- teq DAT0, #0
|
||||
- it eq
|
||||
- subeq RESULT, RESULT, #1
|
||||
- @ drop through to 98f
|
||||
-98: setend le
|
||||
-99: pop {v1-v6,pc}
|
||||
-endfunc
|
||||
-
|
||||
- .unreq RESULT
|
||||
- .unreq BUF
|
||||
- .unreq SIZE
|
||||
- .unreq PATTERN
|
||||
- .unreq PTR
|
||||
- .unreq DAT0
|
||||
- .unreq DAT1
|
||||
- .unreq DAT2
|
||||
- .unreq DAT3
|
||||
- .unreq TMP0
|
||||
- .unreq TMP1
|
||||
- .unreq TMP2
|
||||
- .unreq TMP3
|
||||
diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
|
||||
index a0418fd..eb6c514 100644
|
||||
--- a/libavcodec/arm/h264dsp_init_arm.c
|
||||
+++ b/libavcodec/arm/h264dsp_init_arm.c
|
||||
@@ -24,7 +24,7 @@
|
||||
#include "libavutil/arm/cpu.h"
|
||||
#include "libavcodec/h264dsp.h"
|
||||
|
||||
-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
|
||||
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
|
||||
|
||||
void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
@@ -109,7 +109,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_armv6(cpu_flags))
|
||||
- c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
|
||||
+ c->h264_find_start_code_candidate = ff_startcode_find_candidate_armv6;
|
||||
if (have_neon(cpu_flags))
|
||||
h264dsp_init_neon(c, bit_depth, chroma_format_idc);
|
||||
}
|
||||
diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
|
||||
new file mode 100644
|
||||
index 0000000..a46f009
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/startcode_armv6.S
|
||||
@@ -0,0 +1,253 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2013 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+RESULT .req a1
|
||||
+BUF .req a1
|
||||
+SIZE .req a2
|
||||
+PATTERN .req a3
|
||||
+PTR .req a4
|
||||
+DAT0 .req v1
|
||||
+DAT1 .req v2
|
||||
+DAT2 .req v3
|
||||
+DAT3 .req v4
|
||||
+TMP0 .req v5
|
||||
+TMP1 .req v6
|
||||
+TMP2 .req ip
|
||||
+TMP3 .req lr
|
||||
+
|
||||
+#define PRELOAD_DISTANCE 4
|
||||
+
|
||||
+.macro innerloop4
|
||||
+ ldr DAT0, [PTR], #4
|
||||
+ subs SIZE, SIZE, #4 @ C flag survives rest of macro
|
||||
+ sub TMP0, DAT0, PATTERN, lsr #14
|
||||
+ bic TMP0, TMP0, DAT0
|
||||
+ ands TMP0, TMP0, PATTERN
|
||||
+.endm
|
||||
+
|
||||
+.macro innerloop16 decrement, do_preload
|
||||
+ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3}
|
||||
+ .ifnc "\do_preload",""
|
||||
+ pld [PTR, #PRELOAD_DISTANCE*32]
|
||||
+ .endif
|
||||
+ .ifnc "\decrement",""
|
||||
+ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro
|
||||
+ .endif
|
||||
+ sub TMP0, DAT0, PATTERN, lsr #14
|
||||
+ sub TMP1, DAT1, PATTERN, lsr #14
|
||||
+ bic TMP0, TMP0, DAT0
|
||||
+ bic TMP1, TMP1, DAT1
|
||||
+ sub TMP2, DAT2, PATTERN, lsr #14
|
||||
+ sub TMP3, DAT3, PATTERN, lsr #14
|
||||
+ ands TMP0, TMP0, PATTERN
|
||||
+ bic TMP2, TMP2, DAT2
|
||||
+ it eq
|
||||
+ andseq TMP1, TMP1, PATTERN
|
||||
+ bic TMP3, TMP3, DAT3
|
||||
+ itt eq
|
||||
+ andseq TMP2, TMP2, PATTERN
|
||||
+ andseq TMP3, TMP3, PATTERN
|
||||
+.endm
|
||||
+
|
||||
+/* int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size) */
|
||||
+function ff_startcode_find_candidate_armv6, export=1
|
||||
+ push {v1-v6,lr}
|
||||
+ mov PTR, BUF
|
||||
+ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
|
||||
+ @ before using code that does preloads
|
||||
+ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
|
||||
+ blo 60f
|
||||
+
|
||||
+ @ Get to word-alignment, 1 byte at a time
|
||||
+ tst PTR, #3
|
||||
+ beq 2f
|
||||
+1: ldrb DAT0, [PTR], #1
|
||||
+ sub SIZE, SIZE, #1
|
||||
+ teq DAT0, #0
|
||||
+ beq 90f
|
||||
+ tst PTR, #3
|
||||
+ bne 1b
|
||||
+2: @ Get to 4-word alignment, 1 word at a time
|
||||
+ ldr PATTERN, =0x80008000
|
||||
+ setend be
|
||||
+ tst PTR, #12
|
||||
+ beq 4f
|
||||
+3: innerloop4
|
||||
+ bne 91f
|
||||
+ tst PTR, #12
|
||||
+ bne 3b
|
||||
+4: @ Get to cacheline (8-word) alignment
|
||||
+ tst PTR, #16
|
||||
+ beq 5f
|
||||
+ innerloop16 16
|
||||
+ bne 93f
|
||||
+5: @ Check complete cachelines, with preloading
|
||||
+ @ We need to stop when there are still (PRELOAD_DISTANCE+1)
|
||||
+ @ complete cachelines to go
|
||||
+ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
|
||||
+6: innerloop16 , do_preload
|
||||
+ bne 93f
|
||||
+ innerloop16 32
|
||||
+ bne 93f
|
||||
+ bcs 6b
|
||||
+ @ Preload trailing part-cacheline, if any
|
||||
+ tst SIZE, #31
|
||||
+ beq 7f
|
||||
+ pld [PTR, #(PRELOAD_DISTANCE+1)*32]
|
||||
+ @ Check remaining data without doing any more preloads. First
|
||||
+ @ do in chunks of 4 words:
|
||||
+7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
|
||||
+ bmi 9f
|
||||
+8: innerloop16 16
|
||||
+ bne 93f
|
||||
+ bcs 8b
|
||||
+ @ Then in words:
|
||||
+9: adds SIZE, SIZE, #16 - 4
|
||||
+ bmi 11f
|
||||
+10: innerloop4
|
||||
+ bne 91f
|
||||
+ bcs 10b
|
||||
+11: setend le
|
||||
+ @ Check second byte of final halfword
|
||||
+ ldrb DAT0, [PTR, #-1]
|
||||
+ teq DAT0, #0
|
||||
+ beq 90f
|
||||
+ @ Check any remaining bytes
|
||||
+ tst SIZE, #3
|
||||
+ beq 13f
|
||||
+12: ldrb DAT0, [PTR], #1
|
||||
+ sub SIZE, SIZE, #1
|
||||
+ teq DAT0, #0
|
||||
+ beq 90f
|
||||
+ tst SIZE, #3
|
||||
+ bne 12b
|
||||
+ @ No candidate found
|
||||
+13: sub RESULT, PTR, BUF
|
||||
+ b 99f
|
||||
+
|
||||
+60: @ Small buffer - simply check by looping over bytes
|
||||
+ subs SIZE, SIZE, #1
|
||||
+ bcc 99f
|
||||
+61: ldrb DAT0, [PTR], #1
|
||||
+ subs SIZE, SIZE, #1
|
||||
+ teq DAT0, #0
|
||||
+ beq 90f
|
||||
+ bcs 61b
|
||||
+ @ No candidate found
|
||||
+ sub RESULT, PTR, BUF
|
||||
+ b 99f
|
||||
+
|
||||
+90: @ Found a candidate at the preceding byte
|
||||
+ sub RESULT, PTR, BUF
|
||||
+ sub RESULT, RESULT, #1
|
||||
+ b 99f
|
||||
+
|
||||
+91: @ Found a candidate somewhere in the preceding 4 bytes
|
||||
+ sub RESULT, PTR, BUF
|
||||
+ sub RESULT, RESULT, #4
|
||||
+ sub TMP0, DAT0, #0x20000
|
||||
+ bics TMP0, TMP0, DAT0
|
||||
+ itt pl
|
||||
+ ldrbpl DAT0, [PTR, #-3]
|
||||
+ addpl RESULT, RESULT, #2
|
||||
+ bpl 92f
|
||||
+ teq RESULT, #0
|
||||
+ beq 98f @ don't look back a byte if found at first byte in buffer
|
||||
+ ldrb DAT0, [PTR, #-5]
|
||||
+92: teq DAT0, #0
|
||||
+ it eq
|
||||
+ subeq RESULT, RESULT, #1
|
||||
+ b 98f
|
||||
+
|
||||
+93: @ Found a candidate somewhere in the preceding 16 bytes
|
||||
+ sub RESULT, PTR, BUF
|
||||
+ sub RESULT, RESULT, #16
|
||||
+ teq TMP0, #0
|
||||
+ beq 95f @ not in first 4 bytes
|
||||
+ sub TMP0, DAT0, #0x20000
|
||||
+ bics TMP0, TMP0, DAT0
|
||||
+ itt pl
|
||||
+ ldrbpl DAT0, [PTR, #-15]
|
||||
+ addpl RESULT, RESULT, #2
|
||||
+ bpl 94f
|
||||
+ teq RESULT, #0
|
||||
+ beq 98f @ don't look back a byte if found at first byte in buffer
|
||||
+ ldrb DAT0, [PTR, #-17]
|
||||
+94: teq DAT0, #0
|
||||
+ it eq
|
||||
+ subeq RESULT, RESULT, #1
|
||||
+ b 98f
|
||||
+95: add RESULT, RESULT, #4
|
||||
+ teq TMP1, #0
|
||||
+ beq 96f @ not in next 4 bytes
|
||||
+ sub TMP1, DAT1, #0x20000
|
||||
+ bics TMP1, TMP1, DAT1
|
||||
+ itee mi
|
||||
+ ldrbmi DAT0, [PTR, #-13]
|
||||
+ ldrbpl DAT0, [PTR, #-11]
|
||||
+ addpl RESULT, RESULT, #2
|
||||
+ teq DAT0, #0
|
||||
+ it eq
|
||||
+ subeq RESULT, RESULT, #1
|
||||
+ b 98f
|
||||
+96: add RESULT, RESULT, #4
|
||||
+ teq TMP2, #0
|
||||
+ beq 97f @ not in next 4 bytes
|
||||
+ sub TMP2, DAT2, #0x20000
|
||||
+ bics TMP2, TMP2, DAT2
|
||||
+ itee mi
|
||||
+ ldrbmi DAT0, [PTR, #-9]
|
||||
+ ldrbpl DAT0, [PTR, #-7]
|
||||
+ addpl RESULT, RESULT, #2
|
||||
+ teq DAT0, #0
|
||||
+ it eq
|
||||
+ subeq RESULT, RESULT, #1
|
||||
+ b 98f
|
||||
+97: add RESULT, RESULT, #4
|
||||
+ sub TMP3, DAT3, #0x20000
|
||||
+ bics TMP3, TMP3, DAT3
|
||||
+ itee mi
|
||||
+ ldrbmi DAT0, [PTR, #-5]
|
||||
+ ldrbpl DAT0, [PTR, #-3]
|
||||
+ addpl RESULT, RESULT, #2
|
||||
+ teq DAT0, #0
|
||||
+ it eq
|
||||
+ subeq RESULT, RESULT, #1
|
||||
+ @ drop through to 98f
|
||||
+98: setend le
|
||||
+99: pop {v1-v6,pc}
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq RESULT
|
||||
+ .unreq BUF
|
||||
+ .unreq SIZE
|
||||
+ .unreq PATTERN
|
||||
+ .unreq PTR
|
||||
+ .unreq DAT0
|
||||
+ .unreq DAT1
|
||||
+ .unreq DAT2
|
||||
+ .unreq DAT3
|
||||
+ .unreq TMP0
|
||||
+ .unreq TMP1
|
||||
+ .unreq TMP2
|
||||
+ .unreq TMP3
|
||||
diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
|
||||
index a2a4aba..a4da776 100644
|
||||
--- a/libavcodec/h264dsp.c
|
||||
+++ b/libavcodec/h264dsp.c
|
||||
@@ -33,6 +33,7 @@
|
||||
#include "avcodec.h"
|
||||
#include "h264dsp.h"
|
||||
#include "h264idct.h"
|
||||
+#include "startcode.h"
|
||||
#include "libavutil/common.h"
|
||||
|
||||
#define BIT_DEPTH 8
|
||||
@@ -63,34 +64,6 @@
|
||||
#include "h264addpx_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
||||
-static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
|
||||
-{
|
||||
- int i = 0;
|
||||
-#if HAVE_FAST_UNALIGNED
|
||||
- /* we check i < size instead of i + 3 / 7 because it is
|
||||
- * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
|
||||
- * bytes at the end.
|
||||
- */
|
||||
-# if HAVE_FAST_64BIT
|
||||
- while (i < size &&
|
||||
- !((~*(const uint64_t *)(buf + i) &
|
||||
- (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
|
||||
- 0x8080808080808080ULL))
|
||||
- i += 8;
|
||||
-# else
|
||||
- while (i < size &&
|
||||
- !((~*(const uint32_t *)(buf + i) &
|
||||
- (*(const uint32_t *)(buf + i) - 0x01010101U)) &
|
||||
- 0x80808080U))
|
||||
- i += 4;
|
||||
-# endif
|
||||
-#endif
|
||||
- for (; i < size; i++)
|
||||
- if (!buf[i])
|
||||
- break;
|
||||
- return i;
|
||||
-}
|
||||
-
|
||||
av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc)
|
||||
{
|
||||
@@ -178,7 +151,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
H264_DSP(8);
|
||||
break;
|
||||
}
|
||||
- c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
|
||||
+ c->h264_find_start_code_candidate = ff_startcode_find_candidate_c;
|
||||
|
||||
if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc);
|
||||
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
|
||||
diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
|
||||
new file mode 100644
|
||||
index 0000000..5df7695
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/startcode.c
|
||||
@@ -0,0 +1,57 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+/**
|
||||
+ * @file
|
||||
+ * Accelerated start code search function for start codes common to
|
||||
+ * MPEG-1/2/4 video, VC-1, H.264/5
|
||||
+ * @author Michael Niedermayer <michaelni@gmx.at>
|
||||
+ */
|
||||
+
|
||||
+#include "startcode.h"
|
||||
+#include "config.h"
|
||||
+
|
||||
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
|
||||
+{
|
||||
+ int i = 0;
|
||||
+#if HAVE_FAST_UNALIGNED
|
||||
+ /* we check i < size instead of i + 3 / 7 because it is
|
||||
+ * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
|
||||
+ * bytes at the end.
|
||||
+ */
|
||||
+# if HAVE_FAST_64BIT
|
||||
+ while (i < size &&
|
||||
+ !((~*(const uint64_t *)(buf + i) &
|
||||
+ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
|
||||
+ 0x8080808080808080ULL))
|
||||
+ i += 8;
|
||||
+# else
|
||||
+ while (i < size &&
|
||||
+ !((~*(const uint32_t *)(buf + i) &
|
||||
+ (*(const uint32_t *)(buf + i) - 0x01010101U)) &
|
||||
+ 0x80808080U))
|
||||
+ i += 4;
|
||||
+# endif
|
||||
+#endif
|
||||
+ for (; i < size; i++)
|
||||
+ if (!buf[i])
|
||||
+ break;
|
||||
+ return i;
|
||||
+}
|
||||
diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
|
||||
new file mode 100644
|
||||
index 0000000..cc55d5f
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/startcode.h
|
||||
@@ -0,0 +1,35 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+/**
|
||||
+ * @file
|
||||
+ * Accelerated start code search function for start codes common to
|
||||
+ * MPEG-1/2/4 video, VC-1, H.264/5
|
||||
+ * @author Michael Niedermayer <michaelni@gmx.at>
|
||||
+ */
|
||||
+
|
||||
+#ifndef AVCODEC_STARTCODE_H
|
||||
+#define AVCODEC_STARTCODE_H
|
||||
+
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size);
|
||||
+
|
||||
+#endif /* AVCODEC_STARTCODE_H */
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,65 @@
|
||||
From 425d69b993d25489e4830766507d9d8f6c819802 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 19 Mar 2014 17:26:19 +0000
|
||||
Subject: [PATCH 1/6] truehd: tune VLC decoding for ARM.
|
||||
|
||||
Profiling on a Raspberry Pi revealed the best performance to correspond
|
||||
with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
|
||||
in particular are as follows:
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Confidence Change
|
||||
6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant)
|
||||
6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5%
|
||||
8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5%
|
||||
8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6%
|
||||
6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6%
|
||||
6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1%
|
||||
8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4%
|
||||
8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1%
|
||||
|
||||
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
|
||||
---
|
||||
libavcodec/mlpdec.c | 13 ++++++++++---
|
||||
1 file changed, 10 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
|
||||
index 93ed552..cbd9000 100644
|
||||
--- a/libavcodec/mlpdec.c
|
||||
+++ b/libavcodec/mlpdec.c
|
||||
@@ -37,9 +37,16 @@
|
||||
#include "mlp_parser.h"
|
||||
#include "mlpdsp.h"
|
||||
#include "mlp.h"
|
||||
+#include "config.h"
|
||||
|
||||
/** number of bits used for VLC lookup - longest Huffman code is 9 */
|
||||
+#if ARCH_ARM == 1
|
||||
+#define VLC_BITS 5
|
||||
+#define VLC_STATIC_SIZE 64
|
||||
+#else
|
||||
#define VLC_BITS 9
|
||||
+#define VLC_STATIC_SIZE 512
|
||||
+#endif
|
||||
|
||||
typedef struct SubStream {
|
||||
/// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
|
||||
@@ -193,13 +200,13 @@ static av_cold void init_static(void)
|
||||
if (!huff_vlc[0].bits) {
|
||||
INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
|
||||
&ff_mlp_huffman_tables[0][0][1], 2, 1,
|
||||
- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
|
||||
+ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
|
||||
INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
|
||||
&ff_mlp_huffman_tables[1][0][1], 2, 1,
|
||||
- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
|
||||
+ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
|
||||
INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
|
||||
&ff_mlp_huffman_tables[2][0][1], 2, 1,
|
||||
- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
|
||||
+ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
|
||||
}
|
||||
|
||||
ff_mlp_init_crc();
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,557 @@
|
||||
From bfe3d8c8e4e046163dc314aa16207413e377283f Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 3 Mar 2014 19:44:23 +0000
|
||||
Subject: [PATCH 2/6] truehd: add hand-scheduled ARM asm version of
|
||||
mlp_filter_channel.
|
||||
|
||||
Profiling results for overall audio decode and the mlp_filter_channel(_arm)
|
||||
function in particular are as follows:
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Confidence Change
|
||||
6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant)
|
||||
6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8%
|
||||
8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant)
|
||||
8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8%
|
||||
6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9%
|
||||
6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4%
|
||||
8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6%
|
||||
8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2%
|
||||
|
||||
Experiments with adding preload instructions to this function yielded no
|
||||
useful benefit, so these have not been included.
|
||||
|
||||
The assembly version has also been tested with a fuzz tester to ensure that
|
||||
any combinations of inputs not exercised by my available test streams still
|
||||
generate mathematically identical results to the C version.
|
||||
---
|
||||
libavcodec/arm/Makefile | 2 +
|
||||
libavcodec/arm/mlpdsp_arm.S | 433 +++++++++++++++++++++++++++++++++++++++
|
||||
libavcodec/arm/mlpdsp_init_arm.c | 36 ++++
|
||||
libavcodec/mlpdsp.c | 2 +
|
||||
libavcodec/mlpdsp.h | 1 +
|
||||
5 files changed, 474 insertions(+)
|
||||
create mode 100644 libavcodec/arm/mlpdsp_arm.S
|
||||
create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index a8446b2..ba673b1 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -22,6 +22,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o
|
||||
OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o
|
||||
OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \
|
||||
arm/hpeldsp_arm.o
|
||||
+OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \
|
||||
+ arm/mlpdsp_arm.o
|
||||
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o
|
||||
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o
|
||||
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o
|
||||
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
|
||||
new file mode 100644
|
||||
index 0000000..615819d
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/mlpdsp_arm.S
|
||||
@@ -0,0 +1,433 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2014 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+#define MAX_CHANNELS 8
|
||||
+#define MAX_FIR_ORDER 8
|
||||
+#define MAX_IIR_ORDER 4
|
||||
+#define MAX_RATEFACTOR 4
|
||||
+#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR)
|
||||
+
|
||||
+PST .req a1
|
||||
+PCO .req a2
|
||||
+AC0 .req a3
|
||||
+AC1 .req a4
|
||||
+CO0 .req v1
|
||||
+CO1 .req v2
|
||||
+CO2 .req v3
|
||||
+CO3 .req v4
|
||||
+ST0 .req v5
|
||||
+ST1 .req v6
|
||||
+ST2 .req sl
|
||||
+ST3 .req fp
|
||||
+I .req ip
|
||||
+PSAMP .req lr
|
||||
+
|
||||
+
|
||||
+// Some macros that do loads/multiplies where the register number is determined
|
||||
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
|
||||
+
|
||||
+.macro load group, index, base, offset
|
||||
+ .altmacro
|
||||
+ load_ \group, %(\index), \base, \offset
|
||||
+ .noaltmacro
|
||||
+.endm
|
||||
+
|
||||
+.macro load_ group, index, base, offset
|
||||
+ ldr \group\index, [\base, #\offset]
|
||||
+.endm
|
||||
+
|
||||
+.macro loadd group, index, base, offset
|
||||
+ .altmacro
|
||||
+ loadd_ \group, %(\index), %(\index+1), \base, \offset
|
||||
+ .noaltmacro
|
||||
+.endm
|
||||
+
|
||||
+.macro loadd_ group, index0, index1, base, offset
|
||||
+A .if offset >= 256
|
||||
+A ldr \group\index0, [\base, #\offset]
|
||||
+A ldr \group\index1, [\base, #(\offset) + 4]
|
||||
+A .else
|
||||
+ ldrd \group\index0, \group\index1, [\base, #\offset]
|
||||
+A .endif
|
||||
+.endm
|
||||
+
|
||||
+.macro multiply index, accumulate, long
|
||||
+ .altmacro
|
||||
+ multiply_ %(\index), \accumulate, \long
|
||||
+ .noaltmacro
|
||||
+.endm
|
||||
+
|
||||
+.macro multiply_ index, accumulate, long
|
||||
+ .if \long
|
||||
+ .if \accumulate
|
||||
+ smlal AC0, AC1, CO\index, ST\index
|
||||
+ .else
|
||||
+ smull AC0, AC1, CO\index, ST\index
|
||||
+ .endif
|
||||
+ .else
|
||||
+ .if \accumulate
|
||||
+ mla AC0, CO\index, ST\index, AC0
|
||||
+ .else
|
||||
+ mul AC0, CO\index, ST\index
|
||||
+ .endif
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+// A macro to update the load register number and load offsets
|
||||
+
|
||||
+.macro inc howmany
|
||||
+ .set LOAD_REG, (LOAD_REG + \howmany) & 3
|
||||
+ .set OFFSET_CO, OFFSET_CO + 4 * \howmany
|
||||
+ .set OFFSET_ST, OFFSET_ST + 4 * \howmany
|
||||
+ .if FIR_REMAIN > 0
|
||||
+ .set FIR_REMAIN, FIR_REMAIN - \howmany
|
||||
+ .if FIR_REMAIN == 0
|
||||
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
|
||||
+ .endif
|
||||
+ .elseif IIR_REMAIN > 0
|
||||
+ .set IIR_REMAIN, IIR_REMAIN - \howmany
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+// Macro to implement the inner loop for one specific combination of parameters
|
||||
+
|
||||
+.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps
|
||||
+ .set TOTAL_TAPS, \iir_taps + \fir_taps
|
||||
+
|
||||
+ // Deal with register allocation...
|
||||
+ .set DEFINED_SHIFT, 0
|
||||
+ .set DEFINED_MASK, 0
|
||||
+ .set SHUFFLE_SHIFT, 0
|
||||
+ .set SHUFFLE_MASK, 0
|
||||
+ .set SPILL_SHIFT, 0
|
||||
+ .set SPILL_MASK, 0
|
||||
+ .if TOTAL_TAPS == 0
|
||||
+ // Little register pressure in this case - just keep MASK where it was
|
||||
+ .if !\mask_minus1
|
||||
+ MASK .req ST1
|
||||
+ .set DEFINED_MASK, 1
|
||||
+ .endif
|
||||
+ .else
|
||||
+ .if \shift_0
|
||||
+ .if !\mask_minus1
|
||||
+ // AC1 is unused with shift 0
|
||||
+ MASK .req AC1
|
||||
+ .set DEFINED_MASK, 1
|
||||
+ .set SHUFFLE_MASK, 1
|
||||
+ .endif
|
||||
+ .elseif \shift_8
|
||||
+ .if !\mask_minus1
|
||||
+ .if TOTAL_TAPS <= 4
|
||||
+ // All coefficients are preloaded (so pointer not needed)
|
||||
+ MASK .req PCO
|
||||
+ .set DEFINED_MASK, 1
|
||||
+ .set SHUFFLE_MASK, 1
|
||||
+ .else
|
||||
+ .set SPILL_MASK, 1
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .else // shift not 0 or 8
|
||||
+ .if TOTAL_TAPS <= 3
|
||||
+ // All coefficients are preloaded, and at least one CO register is unused
|
||||
+ .if \fir_taps & 1
|
||||
+ SHIFT .req CO0
|
||||
+ .set DEFINED_SHIFT, 1
|
||||
+ .set SHUFFLE_SHIFT, 1
|
||||
+ .else
|
||||
+ SHIFT .req CO3
|
||||
+ .set DEFINED_SHIFT, 1
|
||||
+ .set SHUFFLE_SHIFT, 1
|
||||
+ .endif
|
||||
+ .if !\mask_minus1
|
||||
+ MASK .req PCO
|
||||
+ .set DEFINED_MASK, 1
|
||||
+ .set SHUFFLE_MASK, 1
|
||||
+ .endif
|
||||
+ .elseif TOTAL_TAPS == 4
|
||||
+ // All coefficients are preloaded
|
||||
+ SHIFT .req PCO
|
||||
+ .set DEFINED_SHIFT, 1
|
||||
+ .set SHUFFLE_SHIFT, 1
|
||||
+ .if !\mask_minus1
|
||||
+ .set SPILL_MASK, 1
|
||||
+ .endif
|
||||
+ .else
|
||||
+ .set SPILL_SHIFT, 1
|
||||
+ .if !\mask_minus1
|
||||
+ .set SPILL_MASK, 1
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .if SPILL_SHIFT
|
||||
+ SHIFT .req ST0
|
||||
+ .set DEFINED_SHIFT, 1
|
||||
+ .endif
|
||||
+ .if SPILL_MASK
|
||||
+ MASK .req ST1
|
||||
+ .set DEFINED_MASK, 1
|
||||
+ .endif
|
||||
+
|
||||
+ // Preload coefficients if possible
|
||||
+ .if TOTAL_TAPS <= 4
|
||||
+ .set OFFSET_CO, 0
|
||||
+ .if \fir_taps & 1
|
||||
+ .set LOAD_REG, 1
|
||||
+ .else
|
||||
+ .set LOAD_REG, 0
|
||||
+ .endif
|
||||
+ .rept \fir_taps
|
||||
+ load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .set LOAD_REG, (LOAD_REG + 1) & 3
|
||||
+ .set OFFSET_CO, OFFSET_CO + 4
|
||||
+ .endr
|
||||
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
+ .rept \iir_taps
|
||||
+ load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .set LOAD_REG, (LOAD_REG + 1) & 3
|
||||
+ .set OFFSET_CO, OFFSET_CO + 4
|
||||
+ .endr
|
||||
+ .endif
|
||||
+
|
||||
+ // Move mask/shift to final positions if necessary
|
||||
+ // Need to do this after preloading, because in some cases we
|
||||
+ // reuse the coefficient pointer register
|
||||
+ .if SHUFFLE_SHIFT
|
||||
+ mov SHIFT, ST0
|
||||
+ .endif
|
||||
+ .if SHUFFLE_MASK
|
||||
+ mov MASK, ST1
|
||||
+ .endif
|
||||
+
|
||||
+ // Begin loop
|
||||
+01:
|
||||
+ .if TOTAL_TAPS == 0
|
||||
+ // Things simplify a lot in this case
|
||||
+ // In fact this could be pipelined further if it's worth it...
|
||||
+ ldr ST0, [PSAMP]
|
||||
+ subs I, I, #1
|
||||
+ .if !\mask_minus1
|
||||
+ and ST0, ST0, MASK
|
||||
+ .endif
|
||||
+ str ST0, [PST, #-4]!
|
||||
+ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
|
||||
+ str ST0, [PSAMP], #4 * MAX_CHANNELS
|
||||
+ bne 01b
|
||||
+ .else
|
||||
+ .if \fir_taps & 1
|
||||
+ .set LOAD_REG, 1
|
||||
+ .else
|
||||
+ .set LOAD_REG, 0
|
||||
+ .endif
|
||||
+ .set LOAD_BANK, 0
|
||||
+ .set FIR_REMAIN, \fir_taps
|
||||
+ .set IIR_REMAIN, \iir_taps
|
||||
+ .if FIR_REMAIN == 0 // only IIR terms
|
||||
+ .set OFFSET_CO, 4 * MAX_FIR_ORDER
|
||||
+ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
|
||||
+ .else
|
||||
+ .set OFFSET_CO, 0
|
||||
+ .set OFFSET_ST, 0
|
||||
+ .endif
|
||||
+ .set MUL_REG, LOAD_REG
|
||||
+ .set COUNTER, 0
|
||||
+ .rept TOTAL_TAPS + 2
|
||||
+ // Do load(s)
|
||||
+ .if FIR_REMAIN != 0 || IIR_REMAIN != 0
|
||||
+ .if COUNTER == 0
|
||||
+ .if TOTAL_TAPS > 4
|
||||
+ load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .endif
|
||||
+ load ST, LOAD_REG, PST, OFFSET_ST
|
||||
+ inc 1
|
||||
+ .elseif COUNTER == 1 && (\fir_taps & 1) == 0
|
||||
+ .if TOTAL_TAPS > 4
|
||||
+ load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .endif
|
||||
+ load ST, LOAD_REG, PST, OFFSET_ST
|
||||
+ inc 1
|
||||
+ .elseif LOAD_BANK == 0
|
||||
+ .if TOTAL_TAPS > 4
|
||||
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
|
||||
+ load CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .else
|
||||
+ loadd CO, LOAD_REG, PCO, OFFSET_CO
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .set LOAD_BANK, 1
|
||||
+ .else
|
||||
+ .if FIR_REMAIN == 0 && IIR_REMAIN == 1
|
||||
+ load ST, LOAD_REG, PST, OFFSET_ST
|
||||
+ inc 1
|
||||
+ .else
|
||||
+ loadd ST, LOAD_REG, PST, OFFSET_ST
|
||||
+ inc 2
|
||||
+ .endif
|
||||
+ .set LOAD_BANK, 0
|
||||
+ .endif
|
||||
+ .endif
|
||||
+
|
||||
+ // Do interleaved multiplies, slightly delayed
|
||||
+ .if COUNTER >= 2
|
||||
+ multiply MUL_REG, COUNTER > 2, !\shift_0
|
||||
+ .set MUL_REG, (MUL_REG + 1) & 3
|
||||
+ .endif
|
||||
+ .set COUNTER, COUNTER + 1
|
||||
+ .endr
|
||||
+
|
||||
+ // Post-process the result of the multiplies
|
||||
+ .if SPILL_SHIFT
|
||||
+ ldr SHIFT, [sp, #9*4 + 0*4]
|
||||
+ .endif
|
||||
+ .if SPILL_MASK
|
||||
+ ldr MASK, [sp, #9*4 + 1*4]
|
||||
+ .endif
|
||||
+ ldr ST2, [PSAMP]
|
||||
+ subs I, I, #1
|
||||
+ .if \shift_8
|
||||
+ mov AC0, AC0, lsr #8
|
||||
+ orr AC0, AC0, AC1, lsl #24
|
||||
+ .elseif !\shift_0
|
||||
+ rsb ST3, SHIFT, #32
|
||||
+ mov AC0, AC0, lsr SHIFT
|
||||
+A orr AC0, AC0, AC1, lsl ST3
|
||||
+T mov AC1, AC1, lsl ST3
|
||||
+T orr AC0, AC0, AC1
|
||||
+ .endif
|
||||
+ .if \mask_minus1
|
||||
+ add ST3, ST2, AC0
|
||||
+ .else
|
||||
+ add ST2, ST2, AC0
|
||||
+ and ST3, ST2, MASK
|
||||
+ sub ST2, ST3, AC0
|
||||
+ .endif
|
||||
+ str ST3, [PST, #-4]!
|
||||
+ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
|
||||
+ str ST3, [PSAMP], #4 * MAX_CHANNELS
|
||||
+ bne 01b
|
||||
+ .endif
|
||||
+ b 99f
|
||||
+
|
||||
+ .if DEFINED_SHIFT
|
||||
+ .unreq SHIFT
|
||||
+ .endif
|
||||
+ .if DEFINED_MASK
|
||||
+ .unreq MASK
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps
|
||||
+A ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
|
||||
+T tbh [pc, a3, lsl #1]
|
||||
+0:
|
||||
+A .word 0, 70f, 71f, 72f, 73f, 74f
|
||||
+T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
|
||||
+ .if \iir_taps <= 3
|
||||
+A .word 75f
|
||||
+T .hword (75f - 0b) / 2
|
||||
+ .if \iir_taps <= 2
|
||||
+A .word 76f
|
||||
+T .hword (76f - 0b) / 2
|
||||
+ .if \iir_taps <= 1
|
||||
+A .word 77f
|
||||
+T .hword (77f - 0b) / 2
|
||||
+ .if \iir_taps == 0
|
||||
+A .word 78f
|
||||
+T .hword (78f - 0b) / 2
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
|
||||
+71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
|
||||
+72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
|
||||
+73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
|
||||
+74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
|
||||
+ .if \iir_taps <= 3
|
||||
+75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
|
||||
+ .if \iir_taps <= 2
|
||||
+76: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
|
||||
+ .if \iir_taps <= 1
|
||||
+77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
|
||||
+ .if \iir_taps == 0
|
||||
+78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+.macro switch_on_iir_taps mask_minus1, shift_0, shift_8
|
||||
+A ldr pc, [pc, a4, LSL #2] // iirorder is in range 0-4
|
||||
+T tbh [pc, a4, lsl #1]
|
||||
+0:
|
||||
+A .word 0, 60f, 61f, 62f, 63f, 64f
|
||||
+T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
|
||||
+60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0
|
||||
+61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1
|
||||
+62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2
|
||||
+63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3
|
||||
+64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4
|
||||
+.endm
|
||||
+
|
||||
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
||||
+ * int firorder, int iirorder,
|
||||
+ * unsigned int filter_shift, int32_t mask,
|
||||
+ * int blocksize, int32_t *sample_buffer);
|
||||
+ */
|
||||
+function ff_mlp_filter_channel_arm, export=1
|
||||
+ push {v1-fp,lr}
|
||||
+ add v1, sp, #9*4 // point at arguments on stack
|
||||
+ ldm v1, {ST0,ST1,I,PSAMP}
|
||||
+ cmp ST1, #-1
|
||||
+ bne 30f
|
||||
+ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
|
||||
+ bne 20f
|
||||
+ bcs 10f
|
||||
+ switch_on_iir_taps 1, 1, 0
|
||||
+10: switch_on_iir_taps 1, 0, 1
|
||||
+20: switch_on_iir_taps 1, 0, 0
|
||||
+30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
|
||||
+ bne 50f
|
||||
+ bcs 40f
|
||||
+ switch_on_iir_taps 0, 1, 0
|
||||
+40: switch_on_iir_taps 0, 0, 1
|
||||
+50: switch_on_iir_taps 0, 0, 0
|
||||
+99: pop {v1-fp,pc}
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq PST
|
||||
+ .unreq PCO
|
||||
+ .unreq AC0
|
||||
+ .unreq AC1
|
||||
+ .unreq CO0
|
||||
+ .unreq CO1
|
||||
+ .unreq CO2
|
||||
+ .unreq CO3
|
||||
+ .unreq ST0
|
||||
+ .unreq ST1
|
||||
+ .unreq ST2
|
||||
+ .unreq ST3
|
||||
+ .unreq I
|
||||
+ .unreq PSAMP
|
||||
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
new file mode 100644
|
||||
index 0000000..9a14815
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
@@ -0,0 +1,36 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2014 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include <stdint.h>
|
||||
+
|
||||
+#include "libavutil/arm/cpu.h"
|
||||
+#include "libavutil/attributes.h"
|
||||
+#include "libavcodec/mlpdsp.h"
|
||||
+
|
||||
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
||||
+ int firorder, int iirorder,
|
||||
+ unsigned int filter_shift, int32_t mask,
|
||||
+ int blocksize, int32_t *sample_buffer);
|
||||
+
|
||||
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
|
||||
+{
|
||||
+ c->mlp_filter_channel = ff_mlp_filter_channel_arm;
|
||||
+}
|
||||
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
|
||||
index b413e86..4b403b8 100644
|
||||
--- a/libavcodec/mlpdsp.c
|
||||
+++ b/libavcodec/mlpdsp.c
|
||||
@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
|
||||
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
|
||||
{
|
||||
c->mlp_filter_channel = mlp_filter_channel;
|
||||
+ if (ARCH_ARM)
|
||||
+ ff_mlpdsp_init_arm(c);
|
||||
if (ARCH_X86)
|
||||
ff_mlpdsp_init_x86(c);
|
||||
}
|
||||
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
|
||||
index 84a8aa3..129bcfe 100644
|
||||
--- a/libavcodec/mlpdsp.h
|
||||
+++ b/libavcodec/mlpdsp.h
|
||||
@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
|
||||
} MLPDSPContext;
|
||||
|
||||
void ff_mlpdsp_init(MLPDSPContext *c);
|
||||
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
|
||||
void ff_mlpdsp_init_x86(MLPDSPContext *c);
|
||||
|
||||
#endif /* AVCODEC_MLPDSP_H */
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,143 @@
|
||||
From a60747132a1a6652ac0d18f3f110a20ea637ac30 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 16 Apr 2014 01:51:32 +0100
|
||||
Subject: [PATCH 2/3] vc-1: Add platform-specific start code search routine to
|
||||
VC1DSPContext.
|
||||
|
||||
Initialise VC1DSPContext for parser as well as for decoder.
|
||||
Note, the VC-1 code doesn't actually use the function pointer yet.
|
||||
|
||||
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
|
||||
---
|
||||
libavcodec/Makefile | 6 +++---
|
||||
libavcodec/arm/Makefile | 2 ++
|
||||
libavcodec/arm/vc1dsp_init_arm.c | 4 ++++
|
||||
libavcodec/vc1.c | 2 ++
|
||||
libavcodec/vc1dec.c | 1 -
|
||||
libavcodec/vc1dsp.c | 3 +++
|
||||
libavcodec/vc1dsp.h | 8 ++++++++
|
||||
7 files changed, 22 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/Makefile b/libavcodec/Makefile
|
||||
index 19caf11..120f85a 100644
|
||||
--- a/libavcodec/Makefile
|
||||
+++ b/libavcodec/Makefile
|
||||
@@ -458,7 +458,7 @@ OBJS-$(CONFIG_VB_DECODER) += vb.o
|
||||
OBJS-$(CONFIG_VBLE_DECODER) += vble.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \
|
||||
msmpeg4dec.o msmpeg4.o msmpeg4data.o \
|
||||
- wmv2dsp.o
|
||||
+ wmv2dsp.o startcode.o
|
||||
OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o
|
||||
OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o
|
||||
OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o
|
||||
@@ -783,9 +783,9 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o
|
||||
OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o
|
||||
OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o
|
||||
OBJS-$(CONFIG_TAK_PARSER) += tak_parser.o tak.o
|
||||
-OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o \
|
||||
+OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o vc1dsp.o \
|
||||
msmpeg4.o msmpeg4data.o mpeg4video.o \
|
||||
- h263.o
|
||||
+ h263.o startcode.o
|
||||
OBJS-$(CONFIG_VORBIS_PARSER) += vorbis_parser.o xiph.o
|
||||
OBJS-$(CONFIG_VP3_PARSER) += vp3_parser.o
|
||||
OBJS-$(CONFIG_VP8_PARSER) += vp8_parser.o
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index b6410b2..fa2b18e 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -51,6 +51,8 @@ ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
|
||||
arm/hpeldsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
|
||||
+ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o
|
||||
+ARMV6-OBJS-$(CONFIG_VC1_PARSER) += arm/startcode_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
|
||||
arm/vp8dsp_init_armv6.o \
|
||||
arm/vp8dsp_armv6.o
|
||||
diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
|
||||
index 47d4126..4a84848 100644
|
||||
--- a/libavcodec/arm/vc1dsp_init_arm.c
|
||||
+++ b/libavcodec/arm/vc1dsp_init_arm.c
|
||||
@@ -23,10 +23,14 @@
|
||||
#include "libavcodec/vc1dsp.h"
|
||||
#include "vc1dsp.h"
|
||||
|
||||
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
|
||||
+
|
||||
av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
+ if (have_armv6(cpu_flags))
|
||||
+ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_armv6;
|
||||
if (have_neon(cpu_flags))
|
||||
ff_vc1dsp_init_neon(dsp);
|
||||
}
|
||||
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
|
||||
index 49d4885..cb941dd 100644
|
||||
--- a/libavcodec/vc1.c
|
||||
+++ b/libavcodec/vc1.c
|
||||
@@ -1706,5 +1706,7 @@ av_cold int ff_vc1_init_common(VC1Context *v)
|
||||
v->pq = -1;
|
||||
v->mvrange = 0; /* 7.1.1.18, p80 */
|
||||
|
||||
+ ff_vc1dsp_init(&v->vc1dsp);
|
||||
+
|
||||
return 0;
|
||||
}
|
||||
diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
|
||||
index 30fee47..67cda42 100644
|
||||
--- a/libavcodec/vc1dec.c
|
||||
+++ b/libavcodec/vc1dec.c
|
||||
@@ -5631,7 +5631,6 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
|
||||
ff_vc1_decode_end(avctx);
|
||||
|
||||
ff_h264chroma_init(&v->h264chroma, 8);
|
||||
- ff_vc1dsp_init(&v->vc1dsp);
|
||||
|
||||
if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
|
||||
int count = 0;
|
||||
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
|
||||
index ec9c17b..09a9006 100644
|
||||
--- a/libavcodec/vc1dsp.c
|
||||
+++ b/libavcodec/vc1dsp.c
|
||||
@@ -30,6 +30,7 @@
|
||||
#include "h264chroma.h"
|
||||
#include "rnd_avg.h"
|
||||
#include "vc1dsp.h"
|
||||
+#include "startcode.h"
|
||||
|
||||
/* Apply overlap transform to horizontal edge */
|
||||
static void vc1_v_overlap_c(uint8_t *src, int stride)
|
||||
@@ -947,6 +948,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
|
||||
dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
|
||||
#endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
|
||||
|
||||
+ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_c;
|
||||
+
|
||||
if (ARCH_AARCH64)
|
||||
ff_vc1dsp_init_aarch64(dsp);
|
||||
if (ARCH_ARM)
|
||||
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
|
||||
index 990fbc3..6a90eed 100644
|
||||
--- a/libavcodec/vc1dsp.h
|
||||
+++ b/libavcodec/vc1dsp.h
|
||||
@@ -74,6 +74,14 @@ typedef struct VC1DSPContext {
|
||||
void (*sprite_v_double_twoscale)(uint8_t *dst, const uint8_t *src1a, const uint8_t *src1b, int offset1,
|
||||
const uint8_t *src2a, const uint8_t *src2b, int offset2,
|
||||
int alpha, int width);
|
||||
+
|
||||
+ /**
|
||||
+ * Search buf from the start for up to size bytes. Return the index
|
||||
+ * of a zero byte, or >= size if not found. Ideally, use lookahead
|
||||
+ * to filter out any zero bytes that are known to not be followed by
|
||||
+ * one or more further zero bytes and a one byte.
|
||||
+ */
|
||||
+ int (*vc1_find_start_code_candidate)(const uint8_t *buf, int size);
|
||||
} VC1DSPContext;
|
||||
|
||||
void ff_vc1dsp_init(VC1DSPContext* c);
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,158 @@
|
||||
From bb74fc44081fb6d7923ce1b7ed3e3e6514695f3e Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 5 Mar 2014 21:01:28 +0000
|
||||
Subject: [PATCH 3/6] truehd: break out part of rematrix_channels into
|
||||
platform-specific callback.
|
||||
|
||||
Verified with profiling that this doesn't have a measurable effect upon
|
||||
overall performance.
|
||||
---
|
||||
libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
|
||||
libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
|
||||
libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
|
||||
3 files changed, 68 insertions(+), 25 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
|
||||
index cbd9000..01ded5c 100644
|
||||
--- a/libavcodec/mlpdec.c
|
||||
+++ b/libavcodec/mlpdec.c
|
||||
@@ -1024,7 +1024,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
|
||||
static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
|
||||
{
|
||||
SubStream *s = &m->substream[substr];
|
||||
- unsigned int mat, src_ch, i;
|
||||
+ unsigned int mat;
|
||||
unsigned int maxchan;
|
||||
|
||||
maxchan = s->max_matrix_channel;
|
||||
@@ -1036,31 +1036,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
|
||||
}
|
||||
|
||||
for (mat = 0; mat < s->num_primitive_matrices; mat++) {
|
||||
- int matrix_noise_shift = s->matrix_noise_shift[mat];
|
||||
unsigned int dest_ch = s->matrix_out_ch[mat];
|
||||
- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
|
||||
- int32_t *coeffs = s->matrix_coeff[mat];
|
||||
- int index = s->num_primitive_matrices - mat;
|
||||
- int index2 = 2 * index + 1;
|
||||
-
|
||||
- /* TODO: DSPContext? */
|
||||
-
|
||||
- for (i = 0; i < s->blockpos; i++) {
|
||||
- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
|
||||
- int32_t *samples = m->sample_buffer[i];
|
||||
- int64_t accum = 0;
|
||||
-
|
||||
- for (src_ch = 0; src_ch <= maxchan; src_ch++)
|
||||
- accum += (int64_t) samples[src_ch] * coeffs[src_ch];
|
||||
-
|
||||
- if (matrix_noise_shift) {
|
||||
- index &= m->access_unit_size_pow2 - 1;
|
||||
- accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
|
||||
- index += index2;
|
||||
- }
|
||||
-
|
||||
- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
|
||||
- }
|
||||
+ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
|
||||
+ s->matrix_coeff[mat],
|
||||
+ &m->bypassed_lsbs[0][mat],
|
||||
+ m->noise_buffer,
|
||||
+ s->num_primitive_matrices - mat,
|
||||
+ dest_ch,
|
||||
+ s->blockpos,
|
||||
+ maxchan,
|
||||
+ s->matrix_noise_shift[mat],
|
||||
+ m->access_unit_size_pow2,
|
||||
+ MSB_MASK(s->quant_step_size[dest_ch]));
|
||||
}
|
||||
}
|
||||
|
||||
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
|
||||
index 4b403b8..7a359b0 100644
|
||||
--- a/libavcodec/mlpdsp.c
|
||||
+++ b/libavcodec/mlpdsp.c
|
||||
@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
|
||||
}
|
||||
}
|
||||
|
||||
+void ff_mlp_rematrix_channel(int32_t *samples,
|
||||
+ const int32_t *coeffs,
|
||||
+ const uint8_t *bypassed_lsbs,
|
||||
+ const int8_t *noise_buffer,
|
||||
+ int index,
|
||||
+ unsigned int dest_ch,
|
||||
+ uint16_t blockpos,
|
||||
+ unsigned int maxchan,
|
||||
+ int matrix_noise_shift,
|
||||
+ int access_unit_size_pow2,
|
||||
+ int32_t mask)
|
||||
+{
|
||||
+ unsigned int src_ch, i;
|
||||
+ int index2 = 2 * index + 1;
|
||||
+ for (i = 0; i < blockpos; i++) {
|
||||
+ int64_t accum = 0;
|
||||
+
|
||||
+ for (src_ch = 0; src_ch <= maxchan; src_ch++)
|
||||
+ accum += (int64_t) samples[src_ch] * coeffs[src_ch];
|
||||
+
|
||||
+ if (matrix_noise_shift) {
|
||||
+ index &= access_unit_size_pow2 - 1;
|
||||
+ accum += noise_buffer[index] << (matrix_noise_shift + 7);
|
||||
+ index += index2;
|
||||
+ }
|
||||
+
|
||||
+ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
|
||||
+ bypassed_lsbs += MAX_CHANNELS;
|
||||
+ samples += MAX_CHANNELS;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
|
||||
{
|
||||
c->mlp_filter_channel = mlp_filter_channel;
|
||||
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
|
||||
if (ARCH_ARM)
|
||||
ff_mlpdsp_init_arm(c);
|
||||
if (ARCH_X86)
|
||||
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
|
||||
index 129bcfe..f98e9be 100644
|
||||
--- a/libavcodec/mlpdsp.h
|
||||
+++ b/libavcodec/mlpdsp.h
|
||||
@@ -24,11 +24,34 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
+void ff_mlp_rematrix_channel(int32_t *samples,
|
||||
+ const int32_t *coeffs,
|
||||
+ const uint8_t *bypassed_lsbs,
|
||||
+ const int8_t *noise_buffer,
|
||||
+ int index,
|
||||
+ unsigned int dest_ch,
|
||||
+ uint16_t blockpos,
|
||||
+ unsigned int maxchan,
|
||||
+ int matrix_noise_shift,
|
||||
+ int access_unit_size_pow2,
|
||||
+ int32_t mask);
|
||||
+
|
||||
typedef struct MLPDSPContext {
|
||||
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
|
||||
int firorder, int iirorder,
|
||||
unsigned int filter_shift, int32_t mask,
|
||||
int blocksize, int32_t *sample_buffer);
|
||||
+ void (*mlp_rematrix_channel)(int32_t *samples,
|
||||
+ const int32_t *coeffs,
|
||||
+ const uint8_t *bypassed_lsbs,
|
||||
+ const int8_t *noise_buffer,
|
||||
+ int index,
|
||||
+ unsigned int dest_ch,
|
||||
+ uint16_t blockpos,
|
||||
+ unsigned int maxchan,
|
||||
+ int matrix_noise_shift,
|
||||
+ int access_unit_size_pow2,
|
||||
+ int32_t mask);
|
||||
} MLPDSPContext;
|
||||
|
||||
void ff_mlpdsp_init(MLPDSPContext *c);
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,401 @@
|
||||
From c39df43eae03768427243668c040de8437c4f79c Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 23 Apr 2014 01:41:04 +0100
|
||||
Subject: [PATCH 3/3] vc-1: Optimise parser (with special attention to ARM)
|
||||
|
||||
The previous implementation of the parser made four passes over each input
|
||||
buffer (reduced to two if the container format already guaranteed the input
|
||||
buffer corresponded to frames, such as with MKV). But these buffers are
|
||||
often 200K in size, certainly enough to flush the data out of L1 cache, and
|
||||
for many CPUs, all the way out to main memory. The passes were:
|
||||
|
||||
1) locate frame boundaries (not needed for MKV etc)
|
||||
2) copy the data into a contiguous block (not needed for MKV etc)
|
||||
3) locate the start codes within each frame
|
||||
4) unescape the data between start codes
|
||||
|
||||
After this, the unescaped data was parsed to extract certain header fields,
|
||||
but because the unescape operation was so large, this was usually also
|
||||
effectively operating on uncached memory. Most of the unescaped data was
|
||||
simply thrown away and never processed further. Only step 2 - because it
|
||||
used memcpy - was using prefetch, making things even worse.
|
||||
|
||||
This patch reorganises these steps so that, aside from the copying, the
|
||||
operations are performed in parallel, maximising cache utilisation. No more
|
||||
than the worst-case number of bytes needed for header parsing is unescaped.
|
||||
Most of the data is, in practice, only read in order to search for a start
|
||||
code, for which optimised implementations already existed in the H264 codec
|
||||
(notably the ARM version uses prefetch, so we end up doing both remaining
|
||||
passes at maximum speed). For MKV files, we know when we've found the last
|
||||
start code of interest in a given frame, so we are able to avoid doing even
|
||||
that one remaining pass for most of the buffer.
|
||||
|
||||
In some use-cases (such as the Raspberry Pi) video decode is handled by the
|
||||
GPU, but the entire elementary stream is still fed through the parser to
|
||||
pick out certain elements of the header which are necessary to manage the
|
||||
decode process. As you might expect, in these cases, the performance of the
|
||||
parser is significant.
|
||||
|
||||
To measure parser performance, I used the same VC-1 elementary stream in
|
||||
either an MPEG-2 transport stream or an MKV file, and fed it through ffmpeg
|
||||
with -c:v copy -c:a copy -f null. These are the gperftools counts for
|
||||
those streams, both filtered to only include vc1_parse() and its callees,
|
||||
and unfiltered (to include the whole binary). Lower numbers are better:
|
||||
|
||||
Before After
|
||||
File Filtered Mean StdDev Mean StdDev Confidence Change
|
||||
M2TS No 861.7 8.2 650.5 8.1 100.0% +32.5%
|
||||
MKV No 868.9 7.4 731.7 9.0 100.0% +18.8%
|
||||
M2TS Yes 250.0 11.2 27.2 3.4 100.0% +817.9%
|
||||
MKV Yes 149.0 12.8 1.7 0.8 100.0% +8526.3%
|
||||
|
||||
Yes, that last case shows vc1_parse() running 86 times faster! The M2TS
|
||||
case does show a larger absolute improvement though, since it was worse
|
||||
to begin with.
|
||||
|
||||
This patch has been tested with the FATE suite (albeit on x86 for speed).
|
||||
|
||||
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
|
||||
---
|
||||
libavcodec/vc1_parser.c | 284 ++++++++++++++++++++++++++++++------------------
|
||||
1 file changed, 180 insertions(+), 104 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
|
||||
index cc29ce1..4ed14bc 100644
|
||||
--- a/libavcodec/vc1_parser.c
|
||||
+++ b/libavcodec/vc1_parser.c
|
||||
@@ -30,122 +30,88 @@
|
||||
#include "vc1.h"
|
||||
#include "get_bits.h"
|
||||
|
||||
+/** The maximum number of bytes of a sequence, entry point or
|
||||
+ * frame header whose values we pay any attention to */
|
||||
+#define UNESCAPED_THRESHOLD 37
|
||||
+
|
||||
+/** The maximum number of bytes of a sequence, entry point or
|
||||
+ * frame header which must be valid memory (because they are
|
||||
+ * used to update the bitstream cache in skip_bits() calls)
|
||||
+ */
|
||||
+#define UNESCAPED_LIMIT 144
|
||||
+
|
||||
+typedef enum {
|
||||
+ NO_MATCH,
|
||||
+ ONE_ZERO,
|
||||
+ TWO_ZEROS,
|
||||
+ ONE
|
||||
+} VC1ParseSearchState;
|
||||
+
|
||||
typedef struct {
|
||||
ParseContext pc;
|
||||
VC1Context v;
|
||||
+ uint8_t prev_start_code;
|
||||
+ size_t bytes_to_skip;
|
||||
+ uint8_t unesc_buffer[UNESCAPED_LIMIT];
|
||||
+ size_t unesc_index;
|
||||
+ VC1ParseSearchState search_state;
|
||||
} VC1ParseContext;
|
||||
|
||||
-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx,
|
||||
- const uint8_t *buf, int buf_size)
|
||||
+static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
|
||||
+ const uint8_t *buf, int buf_size)
|
||||
{
|
||||
+ /* Parse the header we just finished unescaping */
|
||||
VC1ParseContext *vpc = s->priv_data;
|
||||
GetBitContext gb;
|
||||
- const uint8_t *start, *end, *next;
|
||||
- uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
|
||||
-
|
||||
+ int ret;
|
||||
vpc->v.s.avctx = avctx;
|
||||
vpc->v.parse_only = 1;
|
||||
- vpc->v.first_pic_header_flag = 1;
|
||||
- next = buf;
|
||||
- s->repeat_pict = 0;
|
||||
-
|
||||
- for(start = buf, end = buf + buf_size; next < end; start = next){
|
||||
- int buf2_size, size;
|
||||
- int ret;
|
||||
-
|
||||
- next = find_next_marker(start + 4, end);
|
||||
- size = next - start - 4;
|
||||
- buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
|
||||
- init_get_bits(&gb, buf2, buf2_size * 8);
|
||||
- if(size <= 0) continue;
|
||||
- switch(AV_RB32(start)){
|
||||
- case VC1_CODE_SEQHDR:
|
||||
- ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
|
||||
- break;
|
||||
- case VC1_CODE_ENTRYPOINT:
|
||||
- ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
|
||||
- break;
|
||||
- case VC1_CODE_FRAME:
|
||||
- if(vpc->v.profile < PROFILE_ADVANCED)
|
||||
- ret = ff_vc1_parse_frame_header (&vpc->v, &gb);
|
||||
- else
|
||||
- ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
|
||||
-
|
||||
- if (ret < 0)
|
||||
- break;
|
||||
-
|
||||
- /* keep AV_PICTURE_TYPE_BI internal to VC1 */
|
||||
- if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
|
||||
- s->pict_type = AV_PICTURE_TYPE_B;
|
||||
- else
|
||||
- s->pict_type = vpc->v.s.pict_type;
|
||||
-
|
||||
- if (avctx->ticks_per_frame > 1){
|
||||
- // process pulldown flags
|
||||
- s->repeat_pict = 1;
|
||||
- // Pulldown flags are only valid when 'broadcast' has been set.
|
||||
- // So ticks_per_frame will be 2
|
||||
- if (vpc->v.rff){
|
||||
- // repeat field
|
||||
- s->repeat_pict = 2;
|
||||
- }else if (vpc->v.rptfrm){
|
||||
- // repeat frames
|
||||
- s->repeat_pict = vpc->v.rptfrm * 2 + 1;
|
||||
- }
|
||||
- }
|
||||
-
|
||||
- if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
|
||||
- s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
|
||||
- else
|
||||
- s->field_order = AV_FIELD_PROGRESSIVE;
|
||||
+ init_get_bits(&gb, buf, buf_size * 8);
|
||||
+ switch (vpc->prev_start_code) {
|
||||
+ case VC1_CODE_SEQHDR & 0xFF:
|
||||
+ ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
|
||||
+ break;
|
||||
+ case VC1_CODE_ENTRYPOINT & 0xFF:
|
||||
+ ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
|
||||
+ break;
|
||||
+ case VC1_CODE_FRAME & 0xFF:
|
||||
+ if(vpc->v.profile < PROFILE_ADVANCED)
|
||||
+ ret = ff_vc1_parse_frame_header (&vpc->v, &gb);
|
||||
+ else
|
||||
+ ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
|
||||
|
||||
+ if (ret < 0)
|
||||
break;
|
||||
- }
|
||||
- }
|
||||
|
||||
- av_free(buf2);
|
||||
-}
|
||||
+ /* keep AV_PICTURE_TYPE_BI internal to VC1 */
|
||||
+ if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
|
||||
+ s->pict_type = AV_PICTURE_TYPE_B;
|
||||
+ else
|
||||
+ s->pict_type = vpc->v.s.pict_type;
|
||||
|
||||
-/**
|
||||
- * Find the end of the current frame in the bitstream.
|
||||
- * @return the position of the first byte of the next frame, or -1
|
||||
- */
|
||||
-static int vc1_find_frame_end(ParseContext *pc, const uint8_t *buf,
|
||||
- int buf_size) {
|
||||
- int pic_found, i;
|
||||
- uint32_t state;
|
||||
-
|
||||
- pic_found= pc->frame_start_found;
|
||||
- state= pc->state;
|
||||
-
|
||||
- i=0;
|
||||
- if(!pic_found){
|
||||
- for(i=0; i<buf_size; i++){
|
||||
- state= (state<<8) | buf[i];
|
||||
- if(state == VC1_CODE_FRAME || state == VC1_CODE_FIELD){
|
||||
- i++;
|
||||
- pic_found=1;
|
||||
- break;
|
||||
+ if (avctx->ticks_per_frame > 1){
|
||||
+ // process pulldown flags
|
||||
+ s->repeat_pict = 1;
|
||||
+ // Pulldown flags are only valid when 'broadcast' has been set.
|
||||
+ // So ticks_per_frame will be 2
|
||||
+ if (vpc->v.rff){
|
||||
+ // repeat field
|
||||
+ s->repeat_pict = 2;
|
||||
+ }else if (vpc->v.rptfrm){
|
||||
+ // repeat frames
|
||||
+ s->repeat_pict = vpc->v.rptfrm * 2 + 1;
|
||||
}
|
||||
+ }else{
|
||||
+ s->repeat_pict = 0;
|
||||
}
|
||||
- }
|
||||
|
||||
- if(pic_found){
|
||||
- /* EOF considered as end of frame */
|
||||
- if (buf_size == 0)
|
||||
- return 0;
|
||||
- for(; i<buf_size; i++){
|
||||
- state= (state<<8) | buf[i];
|
||||
- if(IS_MARKER(state) && state != VC1_CODE_FIELD && state != VC1_CODE_SLICE){
|
||||
- pc->frame_start_found=0;
|
||||
- pc->state=-1;
|
||||
- return i-3;
|
||||
- }
|
||||
- }
|
||||
+ if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
|
||||
+ s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
|
||||
+ else
|
||||
+ s->field_order = AV_FIELD_PROGRESSIVE;
|
||||
+
|
||||
+ break;
|
||||
}
|
||||
- pc->frame_start_found= pic_found;
|
||||
- pc->state= state;
|
||||
- return END_NOT_FOUND;
|
||||
}
|
||||
|
||||
static int vc1_parse(AVCodecParserContext *s,
|
||||
@@ -153,22 +119,127 @@ static int vc1_parse(AVCodecParserContext *s,
|
||||
const uint8_t **poutbuf, int *poutbuf_size,
|
||||
const uint8_t *buf, int buf_size)
|
||||
{
|
||||
+ /* Here we do the searching for frame boundaries and headers at
|
||||
+ * the same time. Only a minimal amount at the start of each
|
||||
+ * header is unescaped. */
|
||||
VC1ParseContext *vpc = s->priv_data;
|
||||
- int next;
|
||||
+ int pic_found = vpc->pc.frame_start_found;
|
||||
+ uint8_t *unesc_buffer = vpc->unesc_buffer;
|
||||
+ size_t unesc_index = vpc->unesc_index;
|
||||
+ VC1ParseSearchState search_state = vpc->search_state;
|
||||
+ int next = END_NOT_FOUND;
|
||||
+ int i = vpc->bytes_to_skip;
|
||||
+
|
||||
+ if (pic_found && buf_size == 0) {
|
||||
+ /* EOF considered as end of frame */
|
||||
+ memset(unesc_buffer + unesc_index, 0, UNESCAPED_THRESHOLD - unesc_index);
|
||||
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
|
||||
+ next = 0;
|
||||
+ }
|
||||
+ while (i < buf_size) {
|
||||
+ int start_code_found = 0;
|
||||
+ uint8_t b;
|
||||
+ while (i < buf_size && unesc_index < UNESCAPED_THRESHOLD) {
|
||||
+ b = buf[i++];
|
||||
+ unesc_buffer[unesc_index++] = b;
|
||||
+ if (search_state <= ONE_ZERO)
|
||||
+ search_state = b ? NO_MATCH : search_state + 1;
|
||||
+ else if (search_state == TWO_ZEROS) {
|
||||
+ if (b == 1)
|
||||
+ search_state = ONE;
|
||||
+ else if (b > 1) {
|
||||
+ if (b == 3)
|
||||
+ unesc_index--; // swallow emulation prevention byte
|
||||
+ search_state = NO_MATCH;
|
||||
+ }
|
||||
+ }
|
||||
+ else { // search_state == ONE
|
||||
+ // Header unescaping terminates early due to detection of next start code
|
||||
+ search_state = NO_MATCH;
|
||||
+ start_code_found = 1;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ if ((s->flags & PARSER_FLAG_COMPLETE_FRAMES) &&
|
||||
+ unesc_index >= UNESCAPED_THRESHOLD &&
|
||||
+ vpc->prev_start_code == (VC1_CODE_FRAME & 0xFF))
|
||||
+ {
|
||||
+ // No need to keep scanning the rest of the buffer for
|
||||
+ // start codes if we know it contains a complete frame and
|
||||
+ // we've already unescaped all we need of the frame header
|
||||
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
|
||||
+ break;
|
||||
+ }
|
||||
+ if (unesc_index >= UNESCAPED_THRESHOLD && !start_code_found) {
|
||||
+ while (i < buf_size) {
|
||||
+ if (search_state == NO_MATCH) {
|
||||
+ i += vpc->v.vc1dsp.vc1_find_start_code_candidate(buf + i, buf_size - i);
|
||||
+ if (i < buf_size) {
|
||||
+ search_state = ONE_ZERO;
|
||||
+ }
|
||||
+ i++;
|
||||
+ } else {
|
||||
+ b = buf[i++];
|
||||
+ if (search_state == ONE_ZERO)
|
||||
+ search_state = b ? NO_MATCH : TWO_ZEROS;
|
||||
+ else if (search_state == TWO_ZEROS) {
|
||||
+ if (b >= 1)
|
||||
+ search_state = b == 1 ? ONE : NO_MATCH;
|
||||
+ }
|
||||
+ else { // search_state == ONE
|
||||
+ search_state = NO_MATCH;
|
||||
+ start_code_found = 1;
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ if (start_code_found) {
|
||||
+ vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
|
||||
+
|
||||
+ vpc->prev_start_code = b;
|
||||
+ unesc_index = 0;
|
||||
+
|
||||
+ if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) {
|
||||
+ if (!pic_found && (b == (VC1_CODE_FRAME & 0xFF) || b == (VC1_CODE_FIELD & 0xFF))) {
|
||||
+ pic_found = 1;
|
||||
+ }
|
||||
+ else if (pic_found && b != (VC1_CODE_FIELD & 0xFF) && b != (VC1_CODE_SLICE & 0xFF)) {
|
||||
+ next = i - 4;
|
||||
+ pic_found = b == (VC1_CODE_FRAME & 0xFF);
|
||||
+ break;
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
|
||||
- if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
|
||||
- next= buf_size;
|
||||
- }else{
|
||||
- next= vc1_find_frame_end(&vpc->pc, buf, buf_size);
|
||||
+ vpc->pc.frame_start_found = pic_found;
|
||||
+ vpc->unesc_index = unesc_index;
|
||||
+ vpc->search_state = search_state;
|
||||
|
||||
+ if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
|
||||
+ next = buf_size;
|
||||
+ } else {
|
||||
if (ff_combine_frame(&vpc->pc, next, &buf, &buf_size) < 0) {
|
||||
+ vpc->bytes_to_skip = 0;
|
||||
*poutbuf = NULL;
|
||||
*poutbuf_size = 0;
|
||||
return buf_size;
|
||||
}
|
||||
}
|
||||
|
||||
- vc1_extract_headers(s, avctx, buf, buf_size);
|
||||
+ vpc->v.first_pic_header_flag = 1;
|
||||
+
|
||||
+ /* If we return with a valid pointer to a combined frame buffer
|
||||
+ * then on the next call we'll have been unhelpfully rewound
|
||||
+ * by up to 4 bytes (depending upon whether the start code
|
||||
+ * overlapped the input buffer, and if so by how much). We don't
|
||||
+ * want this: it will either cause spurious second detections of
|
||||
+ * the start code we've already seen, or cause extra bytes to be
|
||||
+ * inserted at the start of the unescaped buffer. */
|
||||
+ vpc->bytes_to_skip = 4;
|
||||
+ if (next < 0)
|
||||
+ vpc->bytes_to_skip += next;
|
||||
|
||||
*poutbuf = buf;
|
||||
*poutbuf_size = buf_size;
|
||||
@@ -199,6 +270,11 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
|
||||
{
|
||||
VC1ParseContext *vpc = s->priv_data;
|
||||
vpc->v.s.slice_context_count = 1;
|
||||
+ vpc->v.first_pic_header_flag = 1;
|
||||
+ vpc->prev_start_code = 0;
|
||||
+ vpc->bytes_to_skip = 0;
|
||||
+ vpc->unesc_index = 0;
|
||||
+ vpc->search_state = NO_MATCH;
|
||||
return ff_vc1_init_common(&vpc->v);
|
||||
}
|
||||
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,285 @@
|
||||
From 98428a8cf593587b403076bb54b46cc70ed17ff2 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Mon, 10 Mar 2014 14:42:05 +0000
|
||||
Subject: [PATCH 4/6] truehd: add hand-scheduled ARM asm version of
|
||||
ff_mlp_rematrix_channel.
|
||||
|
||||
Profiling results for overall audio decode and the rematrix_channels function
|
||||
in particular are as follows:
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Confidence Change
|
||||
6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3%
|
||||
6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant)
|
||||
8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant)
|
||||
8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant)
|
||||
6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9%
|
||||
6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3%
|
||||
8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9%
|
||||
8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3%
|
||||
|
||||
The assembly version has also been tested with a fuzz tester to ensure that
|
||||
any combinations of inputs not exercised by my available test streams still
|
||||
generate mathematically identical results to the C version.
|
||||
---
|
||||
libavcodec/arm/mlpdsp_arm.S | 222 +++++++++++++++++++++++++++++++++++++++
|
||||
libavcodec/arm/mlpdsp_init_arm.c | 12 +++
|
||||
2 files changed, 234 insertions(+)
|
||||
|
||||
diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
|
||||
index 615819d..9b51d0c 100644
|
||||
--- a/libavcodec/arm/mlpdsp_arm.S
|
||||
+++ b/libavcodec/arm/mlpdsp_arm.S
|
||||
@@ -431,3 +431,225 @@ endfunc
|
||||
.unreq ST3
|
||||
.unreq I
|
||||
.unreq PSAMP
|
||||
+
|
||||
+/********************************************************************/
|
||||
+
|
||||
+PSA .req a1 // samples
|
||||
+PCO .req a2 // coeffs
|
||||
+PBL .req a3 // bypassed_lsbs
|
||||
+INDEX .req a4
|
||||
+CO0 .req v1
|
||||
+CO1 .req v2
|
||||
+CO2 .req v3
|
||||
+CO3 .req v4
|
||||
+SA0 .req v5
|
||||
+SA1 .req v6
|
||||
+SA2 .req sl
|
||||
+SA3 .req fp
|
||||
+AC0 .req ip
|
||||
+AC1 .req lr
|
||||
+NOISE .req SA0
|
||||
+LSB .req SA1
|
||||
+DCH .req SA2 // dest_ch
|
||||
+MASK .req SA3
|
||||
+
|
||||
+ // INDEX is used as follows:
|
||||
+ // bits 0..6 index2 (values up to 17, but wider so that we can
|
||||
+ // add to index field without needing to mask)
|
||||
+ // bits 7..14 i (values up to 160)
|
||||
+ // bit 15 underflow detect for i
|
||||
+ // bits 25..31 (if access_unit_size_pow2 == 128) \ index
|
||||
+ // bits 26..31 (if access_unit_size_pow2 == 64) /
|
||||
+
|
||||
+.macro implement_rematrix shift, index_mask, mask_minus1, maxchan
|
||||
+ .if \maxchan == 1
|
||||
+ // We can just leave the coefficients in registers in this case
|
||||
+ ldrd CO0, CO1, [PCO]
|
||||
+ .endif
|
||||
+1:
|
||||
+ .if \maxchan == 1
|
||||
+ ldrd SA0, SA1, [PSA]
|
||||
+ smull AC0, AC1, CO0, SA0
|
||||
+ .elseif \maxchan == 5
|
||||
+ ldr CO0, [PCO, #0]
|
||||
+ ldr SA0, [PSA, #0]
|
||||
+ ldr CO1, [PCO, #4]
|
||||
+ ldr SA1, [PSA, #4]
|
||||
+ ldrd CO2, CO3, [PCO, #8]
|
||||
+ smull AC0, AC1, CO0, SA0
|
||||
+ ldrd SA2, SA3, [PSA, #8]
|
||||
+ smlal AC0, AC1, CO1, SA1
|
||||
+ ldrd CO0, CO1, [PCO, #16]
|
||||
+ smlal AC0, AC1, CO2, SA2
|
||||
+ ldrd SA0, SA1, [PSA, #16]
|
||||
+ smlal AC0, AC1, CO3, SA3
|
||||
+ smlal AC0, AC1, CO0, SA0
|
||||
+ .else // \maxchan == 7
|
||||
+ ldr CO2, [PCO, #0]
|
||||
+ ldr SA2, [PSA, #0]
|
||||
+ ldr CO3, [PCO, #4]
|
||||
+ ldr SA3, [PSA, #4]
|
||||
+ ldrd CO0, CO1, [PCO, #8]
|
||||
+ smull AC0, AC1, CO2, SA2
|
||||
+ ldrd SA0, SA1, [PSA, #8]
|
||||
+ smlal AC0, AC1, CO3, SA3
|
||||
+ ldrd CO2, CO3, [PCO, #16]
|
||||
+ smlal AC0, AC1, CO0, SA0
|
||||
+ ldrd SA2, SA3, [PSA, #16]
|
||||
+ smlal AC0, AC1, CO1, SA1
|
||||
+ ldrd CO0, CO1, [PCO, #24]
|
||||
+ smlal AC0, AC1, CO2, SA2
|
||||
+ ldrd SA0, SA1, [PSA, #24]
|
||||
+ smlal AC0, AC1, CO3, SA3
|
||||
+ smlal AC0, AC1, CO0, SA0
|
||||
+ .endif
|
||||
+ ldm sp, {NOISE, DCH, MASK}
|
||||
+ smlal AC0, AC1, CO1, SA1
|
||||
+ .if \shift != 0
|
||||
+ .if \index_mask == 63
|
||||
+ add NOISE, NOISE, INDEX, lsr #32-6
|
||||
+ ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
+ ldrsb NOISE, [NOISE]
|
||||
+ add INDEX, INDEX, INDEX, lsl #32-6
|
||||
+ .else // \index_mask == 127
|
||||
+ add NOISE, NOISE, INDEX, lsr #32-7
|
||||
+ ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
+ ldrsb NOISE, [NOISE]
|
||||
+ add INDEX, INDEX, INDEX, lsl #32-7
|
||||
+ .endif
|
||||
+ sub INDEX, INDEX, #1<<7
|
||||
+ adds AC0, AC0, NOISE, lsl #\shift + 7
|
||||
+ adc AC1, AC1, NOISE, asr #31
|
||||
+ .else
|
||||
+ ldrb LSB, [PBL], #MAX_CHANNELS
|
||||
+ sub INDEX, INDEX, #1<<7
|
||||
+ .endif
|
||||
+ add PSA, PSA, #MAX_CHANNELS*4
|
||||
+ mov AC0, AC0, lsr #14
|
||||
+ orr AC0, AC0, AC1, lsl #18
|
||||
+ .if !\mask_minus1
|
||||
+ and AC0, AC0, MASK
|
||||
+ .endif
|
||||
+ add AC0, AC0, LSB
|
||||
+ tst INDEX, #1<<15
|
||||
+ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA
|
||||
+ beq 1b
|
||||
+ b 98f
|
||||
+.endm
|
||||
+
|
||||
+.macro switch_on_maxchan shift, index_mask, mask_minus1
|
||||
+ cmp v4, #5
|
||||
+ blo 51f
|
||||
+ beq 50f
|
||||
+ implement_rematrix \shift, \index_mask, \mask_minus1, 7
|
||||
+50: implement_rematrix \shift, \index_mask, \mask_minus1, 5
|
||||
+51: implement_rematrix \shift, \index_mask, \mask_minus1, 1
|
||||
+.endm
|
||||
+
|
||||
+.macro switch_on_mask shift, index_mask
|
||||
+ cmp sl, #-1
|
||||
+ bne 40f
|
||||
+ switch_on_maxchan \shift, \index_mask, 1
|
||||
+40: switch_on_maxchan \shift, \index_mask, 0
|
||||
+.endm
|
||||
+
|
||||
+.macro switch_on_au_size shift
|
||||
+ .if \shift == 0
|
||||
+ switch_on_mask \shift, undefined
|
||||
+ .else
|
||||
+ teq v6, #64
|
||||
+ bne 30f
|
||||
+ orr INDEX, INDEX, v1, lsl #32-6
|
||||
+ switch_on_mask \shift, 63
|
||||
+30: orr INDEX, INDEX, v1, lsl #32-7
|
||||
+ switch_on_mask \shift, 127
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
+ * const int32_t *coeffs,
|
||||
+ * const uint8_t *bypassed_lsbs,
|
||||
+ * const int8_t *noise_buffer,
|
||||
+ * int index,
|
||||
+ * unsigned int dest_ch,
|
||||
+ * uint16_t blockpos,
|
||||
+ * unsigned int maxchan,
|
||||
+ * int matrix_noise_shift,
|
||||
+ * int access_unit_size_pow2,
|
||||
+ * int32_t mask);
|
||||
+ */
|
||||
+function ff_mlp_rematrix_channel_arm, export=1
|
||||
+ push {v1-fp,lr}
|
||||
+ add v1, sp, #9*4 // point at arguments on stack
|
||||
+ ldm v1, {v1-sl}
|
||||
+ teq v4, #1
|
||||
+ itt ne
|
||||
+ teqne v4, #5
|
||||
+ teqne v4, #7
|
||||
+ bne 99f
|
||||
+ teq v6, #64
|
||||
+ it ne
|
||||
+ teqne v6, #128
|
||||
+ bne 99f
|
||||
+ sub v2, v2, #MAX_CHANNELS
|
||||
+ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned
|
||||
+ movs INDEX, v3, lsl #7
|
||||
+ beq 98f // just in case, do nothing if blockpos = 0
|
||||
+ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
|
||||
+ adc lr, v1, v1 // calculate index2 (C was set by preceding subs)
|
||||
+ orr INDEX, INDEX, lr
|
||||
+ // Switch on matrix_noise_shift: values 0 and 1 are
|
||||
+ // disproportionately common so do those in a form the branch
|
||||
+ // predictor can accelerate. Values can only go up to 15.
|
||||
+ cmp v5, #1
|
||||
+ beq 11f
|
||||
+ blo 10f
|
||||
+A ldr pc, [pc, v5, lsl #2]
|
||||
+T tbh [pc, v5, lsl #1]
|
||||
+0:
|
||||
+A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
|
||||
+T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
|
||||
+T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
|
||||
+T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
|
||||
+10: switch_on_au_size 0
|
||||
+11: switch_on_au_size 1
|
||||
+12: switch_on_au_size 2
|
||||
+13: switch_on_au_size 3
|
||||
+14: switch_on_au_size 4
|
||||
+15: switch_on_au_size 5
|
||||
+16: switch_on_au_size 6
|
||||
+17: switch_on_au_size 7
|
||||
+18: switch_on_au_size 8
|
||||
+19: switch_on_au_size 9
|
||||
+20: switch_on_au_size 10
|
||||
+21: switch_on_au_size 11
|
||||
+22: switch_on_au_size 12
|
||||
+23: switch_on_au_size 13
|
||||
+24: switch_on_au_size 14
|
||||
+25: switch_on_au_size 15
|
||||
+
|
||||
+98: add sp, sp, #3*4
|
||||
+ pop {v1-fp,pc}
|
||||
+99: // Can't handle these parameters, drop back to C
|
||||
+ pop {v1-fp,lr}
|
||||
+ b X(ff_mlp_rematrix_channel)
|
||||
+endfunc
|
||||
+
|
||||
+ .unreq PSA
|
||||
+ .unreq PCO
|
||||
+ .unreq PBL
|
||||
+ .unreq INDEX
|
||||
+ .unreq CO0
|
||||
+ .unreq CO1
|
||||
+ .unreq CO2
|
||||
+ .unreq CO3
|
||||
+ .unreq SA0
|
||||
+ .unreq SA1
|
||||
+ .unreq SA2
|
||||
+ .unreq SA3
|
||||
+ .unreq AC0
|
||||
+ .unreq AC1
|
||||
+ .unreq NOISE
|
||||
+ .unreq LSB
|
||||
+ .unreq DCH
|
||||
+ .unreq MASK
|
||||
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
index 9a14815..1bb2276 100644
|
||||
--- a/libavcodec/arm/mlpdsp_init_arm.c
|
||||
+++ b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
|
||||
int firorder, int iirorder,
|
||||
unsigned int filter_shift, int32_t mask,
|
||||
int blocksize, int32_t *sample_buffer);
|
||||
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
+ const int32_t *coeffs,
|
||||
+ const uint8_t *bypassed_lsbs,
|
||||
+ const int8_t *noise_buffer,
|
||||
+ int index,
|
||||
+ unsigned int dest_ch,
|
||||
+ uint16_t blockpos,
|
||||
+ unsigned int maxchan,
|
||||
+ int matrix_noise_shift,
|
||||
+ int access_unit_size_pow2,
|
||||
+ int32_t mask);
|
||||
|
||||
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
|
||||
{
|
||||
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
|
||||
+ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
|
||||
}
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,197 @@
|
||||
From 5bfcb7a691eb63c56f1485b60f399d79ff943799 Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Wed, 12 Mar 2014 18:18:39 +0000
|
||||
Subject: [PATCH 5/6] truehd: break out part of output_data into
|
||||
platform-specific callback.
|
||||
|
||||
Verified with profiling that this doesn't have a measurable effect upon
|
||||
overall performance.
|
||||
---
|
||||
libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
|
||||
libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
|
||||
libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
|
||||
3 files changed, 83 insertions(+), 17 deletions(-)
|
||||
|
||||
diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
|
||||
index 01ded5c..061dabc 100644
|
||||
--- a/libavcodec/mlpdec.c
|
||||
+++ b/libavcodec/mlpdec.c
|
||||
@@ -363,6 +363,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
|
||||
m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
|
||||
else
|
||||
m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
|
||||
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
|
||||
+ m->substream[m->max_decoded_substream].output_shift,
|
||||
+ m->substream[m->max_decoded_substream].max_matrix_channel,
|
||||
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
|
||||
|
||||
m->params_valid = 1;
|
||||
for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
|
||||
@@ -612,6 +616,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
|
||||
if (substr == m->max_decoded_substream) {
|
||||
m->avctx->channels = s->max_matrix_channel + 1;
|
||||
m->avctx->channel_layout = s->ch_layout;
|
||||
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
|
||||
+ s->output_shift,
|
||||
+ s->max_matrix_channel,
|
||||
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
|
||||
|
||||
if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
|
||||
if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
|
||||
@@ -857,9 +865,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
|
||||
return ret;
|
||||
|
||||
if (s->param_presence_flags & PARAM_OUTSHIFT)
|
||||
- if (get_bits1(gbp))
|
||||
+ if (get_bits1(gbp)) {
|
||||
for (ch = 0; ch <= s->max_matrix_channel; ch++)
|
||||
s->output_shift[ch] = get_sbits(gbp, 4);
|
||||
+ if (substr == m->max_decoded_substream)
|
||||
+ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
|
||||
+ s->output_shift,
|
||||
+ s->max_matrix_channel,
|
||||
+ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
|
||||
+ }
|
||||
|
||||
if (s->param_presence_flags & PARAM_QUANTSTEP)
|
||||
if (get_bits1(gbp))
|
||||
@@ -1058,9 +1072,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
|
||||
{
|
||||
AVCodecContext *avctx = m->avctx;
|
||||
SubStream *s = &m->substream[substr];
|
||||
- unsigned int i, out_ch = 0;
|
||||
- int32_t *data_32;
|
||||
- int16_t *data_16;
|
||||
int ret;
|
||||
int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
|
||||
|
||||
@@ -1078,19 +1089,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
|
||||
frame->nb_samples = s->blockpos;
|
||||
if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
|
||||
return ret;
|
||||
- data_32 = (int32_t *)frame->data[0];
|
||||
- data_16 = (int16_t *)frame->data[0];
|
||||
-
|
||||
- for (i = 0; i < s->blockpos; i++) {
|
||||
- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
|
||||
- int mat_ch = s->ch_assign[out_ch];
|
||||
- int32_t sample = m->sample_buffer[i][mat_ch]
|
||||
- << s->output_shift[mat_ch];
|
||||
- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
|
||||
- if (is32) *data_32++ = sample << 8;
|
||||
- else *data_16++ = sample >> 8;
|
||||
- }
|
||||
- }
|
||||
+ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
|
||||
+ s->blockpos,
|
||||
+ m->sample_buffer,
|
||||
+ frame->data[0],
|
||||
+ s->ch_assign,
|
||||
+ s->output_shift,
|
||||
+ s->max_matrix_channel,
|
||||
+ is32);
|
||||
|
||||
/* Update matrix encoding side data */
|
||||
if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
|
||||
diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
|
||||
index 7a359b0..3ae8c37 100644
|
||||
--- a/libavcodec/mlpdsp.c
|
||||
+++ b/libavcodec/mlpdsp.c
|
||||
@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples,
|
||||
}
|
||||
}
|
||||
|
||||
+static int32_t (*mlp_select_pack_output(uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
|
||||
+{
|
||||
+ return ff_mlp_pack_output;
|
||||
+}
|
||||
+
|
||||
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
|
||||
+ uint16_t blockpos,
|
||||
+ int32_t (*sample_buffer)[MAX_CHANNELS],
|
||||
+ void *data,
|
||||
+ uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32)
|
||||
+{
|
||||
+ unsigned int i, out_ch = 0;
|
||||
+ int32_t *data_32 = data;
|
||||
+ int16_t *data_16 = data;
|
||||
+
|
||||
+ for (i = 0; i < blockpos; i++) {
|
||||
+ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
|
||||
+ int mat_ch = ch_assign[out_ch];
|
||||
+ int32_t sample = sample_buffer[i][mat_ch]
|
||||
+ << output_shift[mat_ch];
|
||||
+ lossless_check_data ^= (sample & 0xffffff) << mat_ch;
|
||||
+ if (is32)
|
||||
+ *data_32++ = sample << 8;
|
||||
+ else
|
||||
+ *data_16++ = sample >> 8;
|
||||
+ }
|
||||
+ }
|
||||
+ return lossless_check_data;
|
||||
+}
|
||||
+
|
||||
av_cold void ff_mlpdsp_init(MLPDSPContext *c)
|
||||
{
|
||||
c->mlp_filter_channel = mlp_filter_channel;
|
||||
c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
|
||||
+ c->mlp_select_pack_output = mlp_select_pack_output;
|
||||
+ c->mlp_pack_output = ff_mlp_pack_output;
|
||||
if (ARCH_ARM)
|
||||
ff_mlpdsp_init_arm(c);
|
||||
if (ARCH_X86)
|
||||
diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
|
||||
index f98e9be..a0edeb7 100644
|
||||
--- a/libavcodec/mlpdsp.h
|
||||
+++ b/libavcodec/mlpdsp.h
|
||||
@@ -23,6 +23,7 @@
|
||||
#define AVCODEC_MLPDSP_H
|
||||
|
||||
#include <stdint.h>
|
||||
+#include "mlp.h"
|
||||
|
||||
void ff_mlp_rematrix_channel(int32_t *samples,
|
||||
const int32_t *coeffs,
|
||||
@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
|
||||
int access_unit_size_pow2,
|
||||
int32_t mask);
|
||||
|
||||
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
|
||||
+ uint16_t blockpos,
|
||||
+ int32_t (*sample_buffer)[MAX_CHANNELS],
|
||||
+ void *data,
|
||||
+ uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32);
|
||||
+
|
||||
typedef struct MLPDSPContext {
|
||||
void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
|
||||
int firorder, int iirorder,
|
||||
@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
|
||||
int matrix_noise_shift,
|
||||
int access_unit_size_pow2,
|
||||
int32_t mask);
|
||||
+ int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
|
||||
+ int32_t (*mlp_pack_output)(int32_t lossless_check_data,
|
||||
+ uint16_t blockpos,
|
||||
+ int32_t (*sample_buffer)[MAX_CHANNELS],
|
||||
+ void *data,
|
||||
+ uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32);
|
||||
} MLPDSPContext;
|
||||
|
||||
void ff_mlpdsp_init(MLPDSPContext *c);
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,689 @@
|
||||
From c647209386bd811cc1c33b4fc8ec17a00f8c8ded Mon Sep 17 00:00:00 2001
|
||||
From: Ben Avison <bavison@riscosopen.org>
|
||||
Date: Thu, 13 Mar 2014 00:21:55 +0000
|
||||
Subject: [PATCH 6/6] truehd: add hand-scheduled ARM asm version of
|
||||
ff_mlp_pack_output.
|
||||
|
||||
Profiling results for overall decode and the output_data function in
|
||||
particular are as follows:
|
||||
|
||||
Before After
|
||||
Mean StdDev Mean StdDev Confidence Change
|
||||
6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant)
|
||||
6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5%
|
||||
8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant)
|
||||
8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7%
|
||||
6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1%
|
||||
6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9%
|
||||
8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6%
|
||||
8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3%
|
||||
|
||||
The assembly version has also been tested with a fuzz tester to ensure that
|
||||
any combinations of inputs not exercised by my available test streams still
|
||||
generate mathematically identical results to the C version.
|
||||
---
|
||||
libavcodec/arm/Makefile | 1 +
|
||||
libavcodec/arm/mlpdsp_armv6.S | 530 +++++++++++++++++++++++++++++++++++++++
|
||||
libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++
|
||||
3 files changed, 627 insertions(+)
|
||||
create mode 100644 libavcodec/arm/mlpdsp_armv6.S
|
||||
|
||||
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
|
||||
index ba673b1..7b2f923 100644
|
||||
--- a/libavcodec/arm/Makefile
|
||||
+++ b/libavcodec/arm/Makefile
|
||||
@@ -52,6 +52,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \
|
||||
arm/hpeldsp_armv6.o
|
||||
+ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o
|
||||
ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \
|
||||
arm/vp8dsp_init_armv6.o \
|
||||
diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
|
||||
new file mode 100644
|
||||
index 0000000..05a2c85
|
||||
--- /dev/null
|
||||
+++ b/libavcodec/arm/mlpdsp_armv6.S
|
||||
@@ -0,0 +1,530 @@
|
||||
+/*
|
||||
+ * Copyright (c) 2014 RISC OS Open Ltd
|
||||
+ * Author: Ben Avison <bavison@riscosopen.org>
|
||||
+ *
|
||||
+ * This file is part of FFmpeg.
|
||||
+ *
|
||||
+ * FFmpeg is free software; you can redistribute it and/or
|
||||
+ * modify it under the terms of the GNU Lesser General Public
|
||||
+ * License as published by the Free Software Foundation; either
|
||||
+ * version 2.1 of the License, or (at your option) any later version.
|
||||
+ *
|
||||
+ * FFmpeg is distributed in the hope that it will be useful,
|
||||
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ * Lesser General Public License for more details.
|
||||
+ *
|
||||
+ * You should have received a copy of the GNU Lesser General Public
|
||||
+ * License along with FFmpeg; if not, write to the Free Software
|
||||
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
+ */
|
||||
+
|
||||
+#include "libavutil/arm/asm.S"
|
||||
+
|
||||
+.macro loadregoffsh2 group, index, base, offgroup, offindex
|
||||
+ .altmacro
|
||||
+ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
|
||||
+ .noaltmacro
|
||||
+.endm
|
||||
+
|
||||
+.macro loadregoffsh2_ group, index, base, offgroup, offindex
|
||||
+ ldr \group\index, [\base, \offgroup\offindex, lsl #2]
|
||||
+.endm
|
||||
+
|
||||
+.macro eorlslreg check, data, group, index
|
||||
+ .altmacro
|
||||
+ eorlslreg_ \check, \data, \group, %(\index)
|
||||
+ .noaltmacro
|
||||
+.endm
|
||||
+
|
||||
+.macro eorlslreg_ check, data, group, index
|
||||
+ eor \check, \check, \data, lsl \group\index
|
||||
+.endm
|
||||
+
|
||||
+.macro decr_modulo var, by, modulus
|
||||
+ .set \var, \var - \by
|
||||
+ .if \var == 0
|
||||
+ .set \var, \modulus
|
||||
+ .endif
|
||||
+.endm
|
||||
+
|
||||
+ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0
|
||||
+ .if \size == 2
|
||||
+ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4
|
||||
+ .else // size == 4
|
||||
+ .if IDX1 > 4 || \channels==8
|
||||
+ ldm IN!, {\r0, \r1, \r2, \r3}
|
||||
+ .else
|
||||
+ ldm IN, {\r0, \r1, \r2, \r3}
|
||||
+ .if !\pointer_dead
|
||||
+ add IN, IN, #(4 + 8 - \channels) * 4
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ decr_modulo IDX1, \size, \channels
|
||||
+ .endm
|
||||
+
|
||||
+ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0
|
||||
+ .if \size == 2
|
||||
+ .if IDX1 > 2
|
||||
+ ldm IN!, {\r2, \r3}
|
||||
+ .else
|
||||
+//A .ifc \r2, ip
|
||||
+//A .if \pointer_dead
|
||||
+//A ldm IN, {\r2, \r3}
|
||||
+//A .else
|
||||
+//A ldr \r2, [IN], #4
|
||||
+//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4
|
||||
+//A .endif
|
||||
+//A .else
|
||||
+ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4
|
||||
+//A .endif
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ decr_modulo IDX1, \size, \channels
|
||||
+ .endm
|
||||
+
|
||||
+.macro implement_pack inorder, channels, shift
|
||||
+.if \inorder
|
||||
+.ifc \shift, mixed
|
||||
+
|
||||
+CHECK .req a1
|
||||
+COUNT .req a2
|
||||
+IN .req a3
|
||||
+OUT .req a4
|
||||
+DAT0 .req v1
|
||||
+DAT1 .req v2
|
||||
+DAT2 .req v3
|
||||
+DAT3 .req v4
|
||||
+SHIFT0 .req v5
|
||||
+SHIFT1 .req v6
|
||||
+SHIFT2 .req sl
|
||||
+SHIFT3 .req fp
|
||||
+SHIFT4 .req ip
|
||||
+SHIFT5 .req lr
|
||||
+
|
||||
+ .macro output4words
|
||||
+ .set SIZE_GROUP1, IDX1
|
||||
+ .if SIZE_GROUP1 > 4
|
||||
+ .set SIZE_GROUP1, 4
|
||||
+ .endif
|
||||
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
+ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
|
||||
+ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
|
||||
+ .if \channels == 2
|
||||
+ lsl DAT0, SHIFT0
|
||||
+ lsl DAT1, SHIFT1
|
||||
+ lsl DAT2, SHIFT0
|
||||
+ lsl DAT3, SHIFT1
|
||||
+ .elseif \channels == 6
|
||||
+ .if IDX2 == 6
|
||||
+ lsl DAT0, SHIFT0
|
||||
+ lsl DAT1, SHIFT1
|
||||
+ lsl DAT2, SHIFT2
|
||||
+ lsl DAT3, SHIFT3
|
||||
+ .elseif IDX2 == 2
|
||||
+ lsl DAT0, SHIFT4
|
||||
+ lsl DAT1, SHIFT5
|
||||
+ lsl DAT2, SHIFT0
|
||||
+ lsl DAT3, SHIFT1
|
||||
+ .else // IDX2 == 4
|
||||
+ lsl DAT0, SHIFT2
|
||||
+ lsl DAT1, SHIFT3
|
||||
+ lsl DAT2, SHIFT4
|
||||
+ lsl DAT3, SHIFT5
|
||||
+ .endif
|
||||
+ .elseif \channels == 8
|
||||
+ .if IDX2 == 8
|
||||
+ uxtb SHIFT0, SHIFT4, ror #0
|
||||
+ uxtb SHIFT1, SHIFT4, ror #8
|
||||
+ uxtb SHIFT2, SHIFT4, ror #16
|
||||
+ uxtb SHIFT3, SHIFT4, ror #24
|
||||
+ .else
|
||||
+ uxtb SHIFT0, SHIFT5, ror #0
|
||||
+ uxtb SHIFT1, SHIFT5, ror #8
|
||||
+ uxtb SHIFT2, SHIFT5, ror #16
|
||||
+ uxtb SHIFT3, SHIFT5, ror #24
|
||||
+ .endif
|
||||
+ lsl DAT0, SHIFT0
|
||||
+ lsl DAT1, SHIFT1
|
||||
+ lsl DAT2, SHIFT2
|
||||
+ lsl DAT3, SHIFT3
|
||||
+ .endif
|
||||
+ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
|
||||
+ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
|
||||
+ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ stm OUT!, {DAT0 - DAT3}
|
||||
+ .endm
|
||||
+
|
||||
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
|
||||
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
+
|
||||
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
|
||||
+ .if SAMPLES_PER_LOOP > 1
|
||||
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
|
||||
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
+ .endif
|
||||
+ teq COUNT, #0
|
||||
+ it eq
|
||||
+ bxeq lr
|
||||
+ push {v1-v6,sl,fp,lr}
|
||||
+ ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack
|
||||
+ ldr SHIFT1, =0x08080808
|
||||
+ ldr SHIFT4, [SHIFT0]
|
||||
+ .if \channels == 2
|
||||
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
|
||||
+ uxtb SHIFT0, SHIFT4, ror #0
|
||||
+ uxtb SHIFT1, SHIFT4, ror #8
|
||||
+ .else
|
||||
+ ldr SHIFT5, [SHIFT0, #4]
|
||||
+ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
|
||||
+ uadd8 SHIFT5, SHIFT5, SHIFT1
|
||||
+ .if \channels == 6
|
||||
+ uxtb SHIFT0, SHIFT4, ror #0
|
||||
+ uxtb SHIFT1, SHIFT4, ror #8
|
||||
+ uxtb SHIFT2, SHIFT4, ror #16
|
||||
+ uxtb SHIFT3, SHIFT4, ror #24
|
||||
+ uxtb SHIFT4, SHIFT5, ror #0
|
||||
+ uxtb SHIFT5, SHIFT5, ror #8
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .set IDX1, \channels
|
||||
+ .set IDX2, \channels
|
||||
+0:
|
||||
+ .rept WORDS_PER_LOOP / 4
|
||||
+ output4words
|
||||
+ .endr
|
||||
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
+ bne 0b
|
||||
+ pop {v1-v6,sl,fp,pc}
|
||||
+ .ltorg
|
||||
+endfunc
|
||||
+ .purgem output4words
|
||||
+
|
||||
+ .unreq CHECK
|
||||
+ .unreq COUNT
|
||||
+ .unreq IN
|
||||
+ .unreq OUT
|
||||
+ .unreq DAT0
|
||||
+ .unreq DAT1
|
||||
+ .unreq DAT2
|
||||
+ .unreq DAT3
|
||||
+ .unreq SHIFT0
|
||||
+ .unreq SHIFT1
|
||||
+ .unreq SHIFT2
|
||||
+ .unreq SHIFT3
|
||||
+ .unreq SHIFT4
|
||||
+ .unreq SHIFT5
|
||||
+
|
||||
+.else // not mixed
|
||||
+
|
||||
+CHECK .req a1
|
||||
+COUNT .req a2
|
||||
+IN .req a3
|
||||
+OUT .req a4
|
||||
+DAT0 .req v1
|
||||
+DAT1 .req v2
|
||||
+DAT2 .req v3
|
||||
+DAT3 .req v4
|
||||
+DAT4 .req v5
|
||||
+DAT5 .req v6
|
||||
+DAT6 .req sl // use these rather than the otherwise unused
|
||||
+DAT7 .req fp // ip and lr so that we can load them using LDRD
|
||||
+
|
||||
+ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
|
||||
+ .if \head
|
||||
+ .set SIZE_GROUP1, IDX1
|
||||
+ .if SIZE_GROUP1 > 4
|
||||
+ .set SIZE_GROUP1, 4
|
||||
+ .endif
|
||||
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
+ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
|
||||
+ .endif
|
||||
+ .if \tail
|
||||
+ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
|
||||
+ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ .endif
|
||||
+ .if \head
|
||||
+ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
|
||||
+ .endif
|
||||
+ .if \tail
|
||||
+ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
|
||||
+ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ stm OUT!, {\r4, \r5, \r6, \r7}
|
||||
+ .endif
|
||||
+ .if \head
|
||||
+ lsl \r0, #8 + \shift
|
||||
+ lsl \r1, #8 + \shift
|
||||
+ lsl \r2, #8 + \shift
|
||||
+ lsl \r3, #8 + \shift
|
||||
+ .endif
|
||||
+ .endm
|
||||
+
|
||||
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8)
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
|
||||
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
+
|
||||
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
|
||||
+ .if SAMPLES_PER_LOOP > 1
|
||||
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
|
||||
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
+ .endif
|
||||
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
+ it lo
|
||||
+ bxlo lr
|
||||
+ push {v1-v6,sl,fp,lr}
|
||||
+ .set IDX1, \channels
|
||||
+ .set IDX2, \channels
|
||||
+ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
+0: beq 1f
|
||||
+ .rept WORDS_PER_LOOP / 8
|
||||
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
|
||||
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
+ .endr
|
||||
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
+ bne 0b
|
||||
+1:
|
||||
+ .rept WORDS_PER_LOOP / 8 - 1
|
||||
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
|
||||
+ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
+ .endr
|
||||
+ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
|
||||
+ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
|
||||
+ pop {v1-v6,sl,fp,pc}
|
||||
+endfunc
|
||||
+ .purgem output4words
|
||||
+
|
||||
+ .unreq CHECK
|
||||
+ .unreq COUNT
|
||||
+ .unreq IN
|
||||
+ .unreq OUT
|
||||
+ .unreq DAT0
|
||||
+ .unreq DAT1
|
||||
+ .unreq DAT2
|
||||
+ .unreq DAT3
|
||||
+ .unreq DAT4
|
||||
+ .unreq DAT5
|
||||
+ .unreq DAT6
|
||||
+ .unreq DAT7
|
||||
+
|
||||
+.endif // mixed
|
||||
+.else // not inorder
|
||||
+.ifc \shift, mixed
|
||||
+
|
||||
+// This case not currently handled
|
||||
+
|
||||
+.else // not mixed
|
||||
+
|
||||
+#if !CONFIG_THUMB
|
||||
+
|
||||
+CHECK .req a1
|
||||
+COUNT .req a2
|
||||
+IN .req a3
|
||||
+OUT .req a4
|
||||
+DAT0 .req v1
|
||||
+DAT1 .req v2
|
||||
+DAT2 .req v3
|
||||
+DAT3 .req v4
|
||||
+CHAN0 .req v5
|
||||
+CHAN1 .req v6
|
||||
+CHAN2 .req sl
|
||||
+CHAN3 .req fp
|
||||
+CHAN4 .req ip
|
||||
+CHAN5 .req lr
|
||||
+
|
||||
+ .macro output4words
|
||||
+ .if \channels == 8
|
||||
+ .if IDX1 == 8
|
||||
+ uxtb CHAN0, CHAN4, ror #0
|
||||
+ uxtb CHAN1, CHAN4, ror #8
|
||||
+ uxtb CHAN2, CHAN4, ror #16
|
||||
+ uxtb CHAN3, CHAN4, ror #24
|
||||
+ .else
|
||||
+ uxtb CHAN0, CHAN5, ror #0
|
||||
+ uxtb CHAN1, CHAN5, ror #8
|
||||
+ uxtb CHAN2, CHAN5, ror #16
|
||||
+ uxtb CHAN3, CHAN5, ror #24
|
||||
+ .endif
|
||||
+ ldr DAT0, [IN, CHAN0, lsl #2]
|
||||
+ ldr DAT1, [IN, CHAN1, lsl #2]
|
||||
+ ldr DAT2, [IN, CHAN2, lsl #2]
|
||||
+ ldr DAT3, [IN, CHAN3, lsl #2]
|
||||
+ .if IDX1 == 4
|
||||
+ add IN, IN, #8*4
|
||||
+ .endif
|
||||
+ decr_modulo IDX1, 4, \channels
|
||||
+ .else
|
||||
+ .set SIZE_GROUP1, IDX1
|
||||
+ .if SIZE_GROUP1 > 4
|
||||
+ .set SIZE_GROUP1, 4
|
||||
+ .endif
|
||||
+ .set SIZE_GROUP2, 4 - SIZE_GROUP1
|
||||
+ .if SIZE_GROUP1 == 2
|
||||
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
|
||||
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
|
||||
+ add IN, IN, #8*4
|
||||
+ .else // SIZE_GROUP1 == 4
|
||||
+ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
|
||||
+ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
|
||||
+ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
|
||||
+ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
|
||||
+ .if IDX1 == 4
|
||||
+ add IN, IN, #8*4
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ decr_modulo IDX1, SIZE_GROUP1, \channels
|
||||
+ .if SIZE_GROUP2 == 2
|
||||
+ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
|
||||
+ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
|
||||
+ .if IDX1 == 2
|
||||
+ add IN, IN, #8*4
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ decr_modulo IDX1, SIZE_GROUP2, \channels
|
||||
+ .endif
|
||||
+ .if \channels == 8 // in this case we can corrupt CHAN0-3
|
||||
+ rsb CHAN0, CHAN0, #8
|
||||
+ rsb CHAN1, CHAN1, #8
|
||||
+ rsb CHAN2, CHAN2, #8
|
||||
+ rsb CHAN3, CHAN3, #8
|
||||
+ lsl DAT0, #8 + \shift
|
||||
+ lsl DAT1, #8 + \shift
|
||||
+ lsl DAT2, #8 + \shift
|
||||
+ lsl DAT3, #8 + \shift
|
||||
+ eor CHECK, CHECK, DAT0, lsr CHAN0
|
||||
+ eor CHECK, CHECK, DAT1, lsr CHAN1
|
||||
+ eor CHECK, CHECK, DAT2, lsr CHAN2
|
||||
+ eor CHECK, CHECK, DAT3, lsr CHAN3
|
||||
+ .else
|
||||
+ .if \shift != 0
|
||||
+ lsl DAT0, #\shift
|
||||
+ lsl DAT1, #\shift
|
||||
+ lsl DAT2, #\shift
|
||||
+ lsl DAT3, #\shift
|
||||
+ .endif
|
||||
+ bic DAT0, DAT0, #0xff000000
|
||||
+ bic DAT1, DAT1, #0xff000000
|
||||
+ bic DAT2, DAT2, #0xff000000
|
||||
+ bic DAT3, DAT3, #0xff000000
|
||||
+ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
|
||||
+ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
|
||||
+ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
|
||||
+ decr_modulo IDX2, 2, \channels
|
||||
+ lsl DAT0, #8
|
||||
+ lsl DAT1, #8
|
||||
+ lsl DAT2, #8
|
||||
+ lsl DAT3, #8
|
||||
+ .endif
|
||||
+ stm OUT!, {DAT0 - DAT3}
|
||||
+ .endm
|
||||
+
|
||||
+ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4)
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .if (WORDS_PER_LOOP % 2) == 0
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
|
||||
+ .endif
|
||||
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
|
||||
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
|
||||
+
|
||||
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
|
||||
+ .if SAMPLES_PER_LOOP > 1
|
||||
+ tst COUNT, #SAMPLES_PER_LOOP - 1 // COUNT always seems to be a multiple of SAMPLES_PER_LOOP in practice
|
||||
+ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not
|
||||
+ .endif
|
||||
+ teq COUNT, #0
|
||||
+ it eq
|
||||
+ bxeq lr
|
||||
+ push {v1-v6,sl,fp,lr}
|
||||
+ ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack
|
||||
+ ldr CHAN4, [CHAN0]
|
||||
+ .if \channels == 2
|
||||
+ uxtb CHAN0, CHAN4, ror #0
|
||||
+ uxtb CHAN1, CHAN4, ror #8
|
||||
+ .else
|
||||
+ ldr CHAN5, [CHAN0, #4]
|
||||
+ .if \channels == 6
|
||||
+ uxtb CHAN0, CHAN4, ror #0
|
||||
+ uxtb CHAN1, CHAN4, ror #8
|
||||
+ uxtb CHAN2, CHAN4, ror #16
|
||||
+ uxtb CHAN3, CHAN4, ror #24
|
||||
+ uxtb CHAN4, CHAN5, ror #0
|
||||
+ uxtb CHAN5, CHAN5, ror #8
|
||||
+ .endif
|
||||
+ .endif
|
||||
+ .set IDX1, \channels
|
||||
+ .set IDX2, \channels
|
||||
+0:
|
||||
+ .rept WORDS_PER_LOOP / 4
|
||||
+ output4words
|
||||
+ .endr
|
||||
+ subs COUNT, COUNT, #SAMPLES_PER_LOOP
|
||||
+ bne 0b
|
||||
+ pop {v1-v6,sl,fp,pc}
|
||||
+ .ltorg
|
||||
+endfunc
|
||||
+ .purgem output4words
|
||||
+
|
||||
+ .unreq CHECK
|
||||
+ .unreq COUNT
|
||||
+ .unreq IN
|
||||
+ .unreq OUT
|
||||
+ .unreq DAT0
|
||||
+ .unreq DAT1
|
||||
+ .unreq DAT2
|
||||
+ .unreq DAT3
|
||||
+ .unreq CHAN0
|
||||
+ .unreq CHAN1
|
||||
+ .unreq CHAN2
|
||||
+ .unreq CHAN3
|
||||
+ .unreq CHAN4
|
||||
+ .unreq CHAN5
|
||||
+
|
||||
+#endif // !CONFIG_THUMB
|
||||
+
|
||||
+.endif // mixed
|
||||
+.endif // inorder
|
||||
+.endm // implement_pack
|
||||
+
|
||||
+.macro pack_channels inorder, channels
|
||||
+ implement_pack \inorder, \channels, 0
|
||||
+ implement_pack \inorder, \channels, 1
|
||||
+ implement_pack \inorder, \channels, 2
|
||||
+ implement_pack \inorder, \channels, 3
|
||||
+ implement_pack \inorder, \channels, 4
|
||||
+ implement_pack \inorder, \channels, 5
|
||||
+ implement_pack \inorder, \channels, mixed
|
||||
+.endm
|
||||
+
|
||||
+.macro pack_order inorder
|
||||
+ pack_channels \inorder, 2
|
||||
+ pack_channels \inorder, 6
|
||||
+ pack_channels \inorder, 8
|
||||
+.endm
|
||||
+
|
||||
+ pack_order 0
|
||||
+ pack_order 1
|
||||
diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
index 1bb2276..10ec316 100644
|
||||
--- a/libavcodec/arm/mlpdsp_init_arm.c
|
||||
+++ b/libavcodec/arm/mlpdsp_init_arm.c
|
||||
@@ -41,8 +41,104 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
|
||||
int access_unit_size_pow2,
|
||||
int32_t mask);
|
||||
|
||||
+#define DECLARE_PACK(order,channels,shift) \
|
||||
+ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
|
||||
+#define ENUMERATE_PACK(order,channels,shift) \
|
||||
+ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
|
||||
+#define PACK_CHANNELS(macro,order,channels) \
|
||||
+ macro(order,channels,0) \
|
||||
+ macro(order,channels,1) \
|
||||
+ macro(order,channels,2) \
|
||||
+ macro(order,channels,3) \
|
||||
+ macro(order,channels,4) \
|
||||
+ macro(order,channels,5) \
|
||||
+ macro(order,channels,mixed)
|
||||
+#define PACK_ORDER(macro,order) \
|
||||
+ PACK_CHANNELS(macro,order,2) \
|
||||
+ PACK_CHANNELS(macro,order,6) \
|
||||
+ PACK_CHANNELS(macro,order,8)
|
||||
+#define PACK_ALL(macro) \
|
||||
+ PACK_ORDER(macro,outof) \
|
||||
+ PACK_ORDER(macro,in)
|
||||
+PACK_ALL(DECLARE_PACK)
|
||||
+
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
|
||||
+#if CONFIG_THUMB
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
|
||||
+#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
|
||||
+#endif
|
||||
+
|
||||
+static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
|
||||
+ int8_t *output_shift,
|
||||
+ uint8_t max_matrix_channel,
|
||||
+ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
|
||||
+{
|
||||
+ int ch_index;
|
||||
+ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0];
|
||||
+ int inorder = 1;
|
||||
+ static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
|
||||
+ PACK_ALL(ENUMERATE_PACK)
|
||||
+ };
|
||||
+ int i;
|
||||
+
|
||||
+ if (!is32) // don't support 16-bit output (it's not used by TrueHD)
|
||||
+ return ff_mlp_pack_output;
|
||||
+
|
||||
+ switch (max_matrix_channel) {
|
||||
+ case 1:
|
||||
+ ch_index = 0;
|
||||
+ break;
|
||||
+ case 5:
|
||||
+ ch_index = 1;
|
||||
+ break;
|
||||
+ case 7:
|
||||
+ ch_index = 2;
|
||||
+ break;
|
||||
+ default:
|
||||
+ return ff_mlp_pack_output;
|
||||
+ }
|
||||
+
|
||||
+ for (i = 0; i <= max_matrix_channel; i++) {
|
||||
+ if (shift != 6 && output_shift[i] != shift)
|
||||
+ shift = 6; // indicate mixed shifts
|
||||
+ if (ch_assign[i] != i)
|
||||
+ inorder = 0;
|
||||
+ }
|
||||
+#if CONFIG_THUMB
|
||||
+ if (!inorder)
|
||||
+ return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
|
||||
+#else
|
||||
+ if (shift == 6 && !inorder)
|
||||
+ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
|
||||
+#endif
|
||||
+
|
||||
+ return routine[(inorder*3+ch_index)*7+shift];
|
||||
+}
|
||||
+
|
||||
av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
|
||||
{
|
||||
+ int cpu_flags = av_get_cpu_flags();
|
||||
+
|
||||
c->mlp_filter_channel = ff_mlp_filter_channel_arm;
|
||||
c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
|
||||
+ if (cpu_flags & AV_CPU_FLAG_ARMV6)
|
||||
+ c->mlp_select_pack_output = mlp_select_pack_output_armv6;
|
||||
}
|
||||
--
|
||||
1.9.1
|
@ -0,0 +1,47 @@
|
||||
commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5
|
||||
Author: popcornmix <popcornmix@gmail.com>
|
||||
Date: Tue Mar 25 19:43:07 2014 +0000
|
||||
|
||||
[ffmpeg] Speed up wtv index creation
|
||||
|
||||
The index creation is O(N^2) with number of entries (typically thousands).
|
||||
On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
|
||||
|
||||
By replacing with an O(N) loop, this takes virtually zero time
|
||||
|
||||
diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c
|
||||
index e423370..70898bd 100644
|
||||
--- a/libavformat/wtvdec.c
|
||||
+++ b/libavformat/wtvdec.c
|
||||
@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s)
|
||||
pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16);
|
||||
if (pb) {
|
||||
int i;
|
||||
+ AVIndexEntry *e = wtv->index_entries;
|
||||
+ AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1;
|
||||
+ uint64_t last_position = 0;
|
||||
while (1) {
|
||||
uint64_t frame_nb = avio_rl64(pb);
|
||||
uint64_t position = avio_rl64(pb);
|
||||
+ while (e <= e_end && frame_nb > e->size) { /* check the bound first: e may already be one past the last entry */
|
||||
+ e->pos = last_position;
|
||||
+ e++;
|
||||
+ }
|
||||
if (url_feof(pb))
|
||||
break;
|
||||
- for (i = wtv->nb_index_entries - 1; i >= 0; i--) {
|
||||
- AVIndexEntry *e = wtv->index_entries + i;
|
||||
- if (frame_nb > e->size)
|
||||
- break;
|
||||
- if (position > e->pos)
|
||||
- e->pos = position;
|
||||
- }
|
||||
+ last_position = position;
|
||||
}
|
||||
+ e_end->pos = last_position;
|
||||
wtvfile_close(pb);
|
||||
- st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp;
|
||||
+ st->duration = e_end->timestamp;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user