diff --git a/projects/RPi/patches/ffmpeg/0001-h264-Move-search-code-search-functions-into-separate.patch b/projects/RPi/patches/ffmpeg/0001-h264-Move-search-code-search-functions-into-separate.patch new file mode 100644 index 0000000000..62e473d94e --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0001-h264-Move-search-code-search-functions-into-separate.patch @@ -0,0 +1,752 @@ +From 8cdb3bf2837a3fb4fff3c6586316f81ae5f7b6cd Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 16 Apr 2014 01:51:31 +0100 +Subject: [PATCH 1/3] h264: Move search code search functions into separate + source files. + +This permits re-use with parsers for codecs which use similar start codes. + +Signed-off-by: Michael Niedermayer +--- + libavcodec/Makefile | 2 +- + libavcodec/arm/Makefile | 2 +- + libavcodec/arm/h264dsp_armv6.S | 253 -------------------------------------- + libavcodec/arm/h264dsp_init_arm.c | 4 +- + libavcodec/arm/startcode_armv6.S | 253 ++++++++++++++++++++++++++++++++++++++ + libavcodec/h264dsp.c | 31 +---- + libavcodec/startcode.c | 57 +++++++++ + libavcodec/startcode.h | 35 ++++++ + 8 files changed, 351 insertions(+), 286 deletions(-) + delete mode 100644 libavcodec/arm/h264dsp_armv6.S + create mode 100644 libavcodec/arm/startcode_armv6.S + create mode 100644 libavcodec/startcode.c + create mode 100644 libavcodec/startcode.h + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index b56ecd1..19caf11 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -49,7 +49,7 @@ OBJS-$(CONFIG_FFT) += avfft.o fft_fixed.o fft_float.o \ + OBJS-$(CONFIG_GOLOMB) += golomb.o + OBJS-$(CONFIG_H263DSP) += h263dsp.o + OBJS-$(CONFIG_H264CHROMA) += h264chroma.o +-OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o ++OBJS-$(CONFIG_H264DSP) += h264dsp.o h264idct.o startcode.o + OBJS-$(CONFIG_H264PRED) += h264pred.o + OBJS-$(CONFIG_H264QPEL) += h264qpel.o + OBJS-$(CONFIG_HPELDSP) += hpeldsp.o +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index a8446b2..b6410b2 
100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -47,7 +47,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL) += arm/dsputil_init_armv6.o \ + arm/simple_idct_armv6.o \ + + ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o +-ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o ++ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o + ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ + arm/hpeldsp_armv6.o + ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o +diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S +deleted file mode 100644 +index 2758262..0000000 +--- a/libavcodec/arm/h264dsp_armv6.S ++++ /dev/null +@@ -1,253 +0,0 @@ +-/* +- * Copyright (c) 2013 RISC OS Open Ltd +- * Author: Ben Avison +- * +- * This file is part of FFmpeg. +- * +- * FFmpeg is free software; you can redistribute it and/or +- * modify it under the terms of the GNU Lesser General Public +- * License as published by the Free Software Foundation; either +- * version 2.1 of the License, or (at your option) any later version. +- * +- * FFmpeg is distributed in the hope that it will be useful, +- * but WITHOUT ANY WARRANTY; without even the implied warranty of +- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- * Lesser General Public License for more details. 
+- * +- * You should have received a copy of the GNU Lesser General Public +- * License along with FFmpeg; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +- +-#include "libavutil/arm/asm.S" +- +-RESULT .req a1 +-BUF .req a1 +-SIZE .req a2 +-PATTERN .req a3 +-PTR .req a4 +-DAT0 .req v1 +-DAT1 .req v2 +-DAT2 .req v3 +-DAT3 .req v4 +-TMP0 .req v5 +-TMP1 .req v6 +-TMP2 .req ip +-TMP3 .req lr +- +-#define PRELOAD_DISTANCE 4 +- +-.macro innerloop4 +- ldr DAT0, [PTR], #4 +- subs SIZE, SIZE, #4 @ C flag survives rest of macro +- sub TMP0, DAT0, PATTERN, lsr #14 +- bic TMP0, TMP0, DAT0 +- ands TMP0, TMP0, PATTERN +-.endm +- +-.macro innerloop16 decrement, do_preload +- ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} +- .ifnc "\do_preload","" +- pld [PTR, #PRELOAD_DISTANCE*32] +- .endif +- .ifnc "\decrement","" +- subs SIZE, SIZE, #\decrement @ C flag survives rest of macro +- .endif +- sub TMP0, DAT0, PATTERN, lsr #14 +- sub TMP1, DAT1, PATTERN, lsr #14 +- bic TMP0, TMP0, DAT0 +- bic TMP1, TMP1, DAT1 +- sub TMP2, DAT2, PATTERN, lsr #14 +- sub TMP3, DAT3, PATTERN, lsr #14 +- ands TMP0, TMP0, PATTERN +- bic TMP2, TMP2, DAT2 +- it eq +- andseq TMP1, TMP1, PATTERN +- bic TMP3, TMP3, DAT3 +- itt eq +- andseq TMP2, TMP2, PATTERN +- andseq TMP3, TMP3, PATTERN +-.endm +- +-/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */ +-function ff_h264_find_start_code_candidate_armv6, export=1 +- push {v1-v6,lr} +- mov PTR, BUF +- @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go +- @ before using code that does preloads +- cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 +- blo 60f +- +- @ Get to word-alignment, 1 byte at a time +- tst PTR, #3 +- beq 2f +-1: ldrb DAT0, [PTR], #1 +- sub SIZE, SIZE, #1 +- teq DAT0, #0 +- beq 90f +- tst PTR, #3 +- bne 1b +-2: @ Get to 4-word alignment, 1 word at a time +- ldr PATTERN, =0x80008000 +- setend be +- tst PTR, #12 +- beq 4f +-3: innerloop4 +- bne 
91f +- tst PTR, #12 +- bne 3b +-4: @ Get to cacheline (8-word) alignment +- tst PTR, #16 +- beq 5f +- innerloop16 16 +- bne 93f +-5: @ Check complete cachelines, with preloading +- @ We need to stop when there are still (PRELOAD_DISTANCE+1) +- @ complete cachelines to go +- sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 +-6: innerloop16 , do_preload +- bne 93f +- innerloop16 32 +- bne 93f +- bcs 6b +- @ Preload trailing part-cacheline, if any +- tst SIZE, #31 +- beq 7f +- pld [PTR, #(PRELOAD_DISTANCE+1)*32] +- @ Check remaining data without doing any more preloads. First +- @ do in chunks of 4 words: +-7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 +- bmi 9f +-8: innerloop16 16 +- bne 93f +- bcs 8b +- @ Then in words: +-9: adds SIZE, SIZE, #16 - 4 +- bmi 11f +-10: innerloop4 +- bne 91f +- bcs 10b +-11: setend le +- @ Check second byte of final halfword +- ldrb DAT0, [PTR, #-1] +- teq DAT0, #0 +- beq 90f +- @ Check any remaining bytes +- tst SIZE, #3 +- beq 13f +-12: ldrb DAT0, [PTR], #1 +- sub SIZE, SIZE, #1 +- teq DAT0, #0 +- beq 90f +- tst SIZE, #3 +- bne 12b +- @ No candidate found +-13: sub RESULT, PTR, BUF +- b 99f +- +-60: @ Small buffer - simply check by looping over bytes +- subs SIZE, SIZE, #1 +- bcc 99f +-61: ldrb DAT0, [PTR], #1 +- subs SIZE, SIZE, #1 +- teq DAT0, #0 +- beq 90f +- bcs 61b +- @ No candidate found +- sub RESULT, PTR, BUF +- b 99f +- +-90: @ Found a candidate at the preceding byte +- sub RESULT, PTR, BUF +- sub RESULT, RESULT, #1 +- b 99f +- +-91: @ Found a candidate somewhere in the preceding 4 bytes +- sub RESULT, PTR, BUF +- sub RESULT, RESULT, #4 +- sub TMP0, DAT0, #0x20000 +- bics TMP0, TMP0, DAT0 +- itt pl +- ldrbpl DAT0, [PTR, #-3] +- addpl RESULT, RESULT, #2 +- bpl 92f +- teq RESULT, #0 +- beq 98f @ don't look back a byte if found at first byte in buffer +- ldrb DAT0, [PTR, #-5] +-92: teq DAT0, #0 +- it eq +- subeq RESULT, RESULT, #1 +- b 98f +- +-93: @ Found a candidate somewhere in the preceding 16 bytes +- sub RESULT, PTR, BUF +- 
sub RESULT, RESULT, #16 +- teq TMP0, #0 +- beq 95f @ not in first 4 bytes +- sub TMP0, DAT0, #0x20000 +- bics TMP0, TMP0, DAT0 +- itt pl +- ldrbpl DAT0, [PTR, #-15] +- addpl RESULT, RESULT, #2 +- bpl 94f +- teq RESULT, #0 +- beq 98f @ don't look back a byte if found at first byte in buffer +- ldrb DAT0, [PTR, #-17] +-94: teq DAT0, #0 +- it eq +- subeq RESULT, RESULT, #1 +- b 98f +-95: add RESULT, RESULT, #4 +- teq TMP1, #0 +- beq 96f @ not in next 4 bytes +- sub TMP1, DAT1, #0x20000 +- bics TMP1, TMP1, DAT1 +- itee mi +- ldrbmi DAT0, [PTR, #-13] +- ldrbpl DAT0, [PTR, #-11] +- addpl RESULT, RESULT, #2 +- teq DAT0, #0 +- it eq +- subeq RESULT, RESULT, #1 +- b 98f +-96: add RESULT, RESULT, #4 +- teq TMP2, #0 +- beq 97f @ not in next 4 bytes +- sub TMP2, DAT2, #0x20000 +- bics TMP2, TMP2, DAT2 +- itee mi +- ldrbmi DAT0, [PTR, #-9] +- ldrbpl DAT0, [PTR, #-7] +- addpl RESULT, RESULT, #2 +- teq DAT0, #0 +- it eq +- subeq RESULT, RESULT, #1 +- b 98f +-97: add RESULT, RESULT, #4 +- sub TMP3, DAT3, #0x20000 +- bics TMP3, TMP3, DAT3 +- itee mi +- ldrbmi DAT0, [PTR, #-5] +- ldrbpl DAT0, [PTR, #-3] +- addpl RESULT, RESULT, #2 +- teq DAT0, #0 +- it eq +- subeq RESULT, RESULT, #1 +- @ drop through to 98f +-98: setend le +-99: pop {v1-v6,pc} +-endfunc +- +- .unreq RESULT +- .unreq BUF +- .unreq SIZE +- .unreq PATTERN +- .unreq PTR +- .unreq DAT0 +- .unreq DAT1 +- .unreq DAT2 +- .unreq DAT3 +- .unreq TMP0 +- .unreq TMP1 +- .unreq TMP2 +- .unreq TMP3 +diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c +index a0418fd..eb6c514 100644 +--- a/libavcodec/arm/h264dsp_init_arm.c ++++ b/libavcodec/arm/h264dsp_init_arm.c +@@ -24,7 +24,7 @@ + #include "libavutil/arm/cpu.h" + #include "libavcodec/h264dsp.h" + +-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size); ++int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size); + + void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha, + int beta, int8_t *tc0); +@@ 
-109,7 +109,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth, + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) +- c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6; ++ c->h264_find_start_code_candidate = ff_startcode_find_candidate_armv6; + if (have_neon(cpu_flags)) + h264dsp_init_neon(c, bit_depth, chroma_format_idc); + } +diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S +new file mode 100644 +index 0000000..a46f009 +--- /dev/null ++++ b/libavcodec/arm/startcode_armv6.S +@@ -0,0 +1,253 @@ ++/* ++ * Copyright (c) 2013 RISC OS Open Ltd ++ * Author: Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++ ++RESULT .req a1 ++BUF .req a1 ++SIZE .req a2 ++PATTERN .req a3 ++PTR .req a4 ++DAT0 .req v1 ++DAT1 .req v2 ++DAT2 .req v3 ++DAT3 .req v4 ++TMP0 .req v5 ++TMP1 .req v6 ++TMP2 .req ip ++TMP3 .req lr ++ ++#define PRELOAD_DISTANCE 4 ++ ++.macro innerloop4 ++ ldr DAT0, [PTR], #4 ++ subs SIZE, SIZE, #4 @ C flag survives rest of macro ++ sub TMP0, DAT0, PATTERN, lsr #14 ++ bic TMP0, TMP0, DAT0 ++ ands TMP0, TMP0, PATTERN ++.endm ++ ++.macro innerloop16 decrement, do_preload ++ ldmia PTR!, {DAT0,DAT1,DAT2,DAT3} ++ .ifnc "\do_preload","" ++ pld [PTR, #PRELOAD_DISTANCE*32] ++ .endif ++ .ifnc "\decrement","" ++ subs SIZE, SIZE, #\decrement @ C flag survives rest of macro ++ .endif ++ sub TMP0, DAT0, PATTERN, lsr #14 ++ sub TMP1, DAT1, PATTERN, lsr #14 ++ bic TMP0, TMP0, DAT0 ++ bic TMP1, TMP1, DAT1 ++ sub TMP2, DAT2, PATTERN, lsr #14 ++ sub TMP3, DAT3, PATTERN, lsr #14 ++ ands TMP0, TMP0, PATTERN ++ bic TMP2, TMP2, DAT2 ++ it eq ++ andseq TMP1, TMP1, PATTERN ++ bic TMP3, TMP3, DAT3 ++ itt eq ++ andseq TMP2, TMP2, PATTERN ++ andseq TMP3, TMP3, PATTERN ++.endm ++ ++/* int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size) */ ++function ff_startcode_find_candidate_armv6, export=1 ++ push {v1-v6,lr} ++ mov PTR, BUF ++ @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go ++ @ before using code that does preloads ++ cmp SIZE, #(PRELOAD_DISTANCE+3)*32 - 1 ++ blo 60f ++ ++ @ Get to word-alignment, 1 byte at a time ++ tst PTR, #3 ++ beq 2f ++1: ldrb DAT0, [PTR], #1 ++ sub SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ tst PTR, #3 ++ bne 1b ++2: @ Get to 4-word alignment, 1 word at a time ++ ldr PATTERN, =0x80008000 ++ setend be ++ tst PTR, #12 ++ beq 4f ++3: innerloop4 ++ bne 91f ++ tst 
PTR, #12 ++ bne 3b ++4: @ Get to cacheline (8-word) alignment ++ tst PTR, #16 ++ beq 5f ++ innerloop16 16 ++ bne 93f ++5: @ Check complete cachelines, with preloading ++ @ We need to stop when there are still (PRELOAD_DISTANCE+1) ++ @ complete cachelines to go ++ sub SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 ++6: innerloop16 , do_preload ++ bne 93f ++ innerloop16 32 ++ bne 93f ++ bcs 6b ++ @ Preload trailing part-cacheline, if any ++ tst SIZE, #31 ++ beq 7f ++ pld [PTR, #(PRELOAD_DISTANCE+1)*32] ++ @ Check remaining data without doing any more preloads. First ++ @ do in chunks of 4 words: ++7: adds SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16 ++ bmi 9f ++8: innerloop16 16 ++ bne 93f ++ bcs 8b ++ @ Then in words: ++9: adds SIZE, SIZE, #16 - 4 ++ bmi 11f ++10: innerloop4 ++ bne 91f ++ bcs 10b ++11: setend le ++ @ Check second byte of final halfword ++ ldrb DAT0, [PTR, #-1] ++ teq DAT0, #0 ++ beq 90f ++ @ Check any remaining bytes ++ tst SIZE, #3 ++ beq 13f ++12: ldrb DAT0, [PTR], #1 ++ sub SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ tst SIZE, #3 ++ bne 12b ++ @ No candidate found ++13: sub RESULT, PTR, BUF ++ b 99f ++ ++60: @ Small buffer - simply check by looping over bytes ++ subs SIZE, SIZE, #1 ++ bcc 99f ++61: ldrb DAT0, [PTR], #1 ++ subs SIZE, SIZE, #1 ++ teq DAT0, #0 ++ beq 90f ++ bcs 61b ++ @ No candidate found ++ sub RESULT, PTR, BUF ++ b 99f ++ ++90: @ Found a candidate at the preceding byte ++ sub RESULT, PTR, BUF ++ sub RESULT, RESULT, #1 ++ b 99f ++ ++91: @ Found a candidate somewhere in the preceding 4 bytes ++ sub RESULT, PTR, BUF ++ sub RESULT, RESULT, #4 ++ sub TMP0, DAT0, #0x20000 ++ bics TMP0, TMP0, DAT0 ++ itt pl ++ ldrbpl DAT0, [PTR, #-3] ++ addpl RESULT, RESULT, #2 ++ bpl 92f ++ teq RESULT, #0 ++ beq 98f @ don't look back a byte if found at first byte in buffer ++ ldrb DAT0, [PTR, #-5] ++92: teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++ ++93: @ Found a candidate somewhere in the preceding 16 bytes ++ sub RESULT, PTR, BUF ++ sub RESULT, 
RESULT, #16 ++ teq TMP0, #0 ++ beq 95f @ not in first 4 bytes ++ sub TMP0, DAT0, #0x20000 ++ bics TMP0, TMP0, DAT0 ++ itt pl ++ ldrbpl DAT0, [PTR, #-15] ++ addpl RESULT, RESULT, #2 ++ bpl 94f ++ teq RESULT, #0 ++ beq 98f @ don't look back a byte if found at first byte in buffer ++ ldrb DAT0, [PTR, #-17] ++94: teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++95: add RESULT, RESULT, #4 ++ teq TMP1, #0 ++ beq 96f @ not in next 4 bytes ++ sub TMP1, DAT1, #0x20000 ++ bics TMP1, TMP1, DAT1 ++ itee mi ++ ldrbmi DAT0, [PTR, #-13] ++ ldrbpl DAT0, [PTR, #-11] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++96: add RESULT, RESULT, #4 ++ teq TMP2, #0 ++ beq 97f @ not in next 4 bytes ++ sub TMP2, DAT2, #0x20000 ++ bics TMP2, TMP2, DAT2 ++ itee mi ++ ldrbmi DAT0, [PTR, #-9] ++ ldrbpl DAT0, [PTR, #-7] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ b 98f ++97: add RESULT, RESULT, #4 ++ sub TMP3, DAT3, #0x20000 ++ bics TMP3, TMP3, DAT3 ++ itee mi ++ ldrbmi DAT0, [PTR, #-5] ++ ldrbpl DAT0, [PTR, #-3] ++ addpl RESULT, RESULT, #2 ++ teq DAT0, #0 ++ it eq ++ subeq RESULT, RESULT, #1 ++ @ drop through to 98f ++98: setend le ++99: pop {v1-v6,pc} ++endfunc ++ ++ .unreq RESULT ++ .unreq BUF ++ .unreq SIZE ++ .unreq PATTERN ++ .unreq PTR ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq TMP0 ++ .unreq TMP1 ++ .unreq TMP2 ++ .unreq TMP3 +diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c +index a2a4aba..a4da776 100644 +--- a/libavcodec/h264dsp.c ++++ b/libavcodec/h264dsp.c +@@ -33,6 +33,7 @@ + #include "avcodec.h" + #include "h264dsp.h" + #include "h264idct.h" ++#include "startcode.h" + #include "libavutil/common.h" + + #define BIT_DEPTH 8 +@@ -63,34 +64,6 @@ + #include "h264addpx_template.c" + #undef BIT_DEPTH + +-static int h264_find_start_code_candidate_c(const uint8_t *buf, int size) +-{ +- int i = 0; +-#if HAVE_FAST_UNALIGNED +- /* we check i < size instead of i + 3 / 7 
because it is +- * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE +- * bytes at the end. +- */ +-# if HAVE_FAST_64BIT +- while (i < size && +- !((~*(const uint64_t *)(buf + i) & +- (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & +- 0x8080808080808080ULL)) +- i += 8; +-# else +- while (i < size && +- !((~*(const uint32_t *)(buf + i) & +- (*(const uint32_t *)(buf + i) - 0x01010101U)) & +- 0x80808080U)) +- i += 4; +-# endif +-#endif +- for (; i < size; i++) +- if (!buf[i]) +- break; +- return i; +-} +- + av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc) + { +@@ -178,7 +151,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, + H264_DSP(8); + break; + } +- c->h264_find_start_code_candidate = h264_find_start_code_candidate_c; ++ c->h264_find_start_code_candidate = ff_startcode_find_candidate_c; + + if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc); + if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc); +diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c +new file mode 100644 +index 0000000..5df7695 +--- /dev/null ++++ b/libavcodec/startcode.c +@@ -0,0 +1,57 @@ ++/* ++ * Copyright (c) 2003-2010 Michael Niedermayer ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * Accelerated start code search function for start codes common to ++ * MPEG-1/2/4 video, VC-1, H.264/5 ++ * @author Michael Niedermayer ++ */ ++ ++#include "startcode.h" ++#include "config.h" ++ ++int ff_startcode_find_candidate_c(const uint8_t *buf, int size) ++{ ++ int i = 0; ++#if HAVE_FAST_UNALIGNED ++ /* we check i < size instead of i + 3 / 7 because it is ++ * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE ++ * bytes at the end. ++ */ ++# if HAVE_FAST_64BIT ++ while (i < size && ++ !((~*(const uint64_t *)(buf + i) & ++ (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) & ++ 0x8080808080808080ULL)) ++ i += 8; ++# else ++ while (i < size && ++ !((~*(const uint32_t *)(buf + i) & ++ (*(const uint32_t *)(buf + i) - 0x01010101U)) & ++ 0x80808080U)) ++ i += 4; ++# endif ++#endif ++ for (; i < size; i++) ++ if (!buf[i]) ++ break; ++ return i; ++} +diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h +new file mode 100644 +index 0000000..cc55d5f +--- /dev/null ++++ b/libavcodec/startcode.h +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2003-2010 Michael Niedermayer ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * Accelerated start code search function for start codes common to ++ * MPEG-1/2/4 video, VC-1, H.264/5 ++ * @author Michael Niedermayer ++ */ ++ ++#ifndef AVCODEC_STARTCODE_H ++#define AVCODEC_STARTCODE_H ++ ++#include <stdint.h> ++ ++int ff_startcode_find_candidate_c(const uint8_t *buf, int size); ++ ++#endif /* AVCODEC_STARTCODE_H */ +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0001-truehd-tune-VLC-decoding-for-ARM.patch b/projects/RPi/patches/ffmpeg/0001-truehd-tune-VLC-decoding-for-ARM.patch new file mode 100644 index 0000000000..29508437e5 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0001-truehd-tune-VLC-decoding-for-ARM.patch @@ -0,0 +1,65 @@ +From 425d69b993d25489e4830766507d9d8f6c819802 Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 19 Mar 2014 17:26:19 +0000 +Subject: [PATCH 1/6] truehd: tune VLC decoding for ARM. + +Profiling on a Raspberry Pi revealed the best performance to correspond +with VLC_BITS = 5. 
Results for overall audio decode and the get_vlc2 function +in particular are as follows: + + Before After + Mean StdDev Mean StdDev Confidence Change +6:2 total 348.8 20.1 339.6 15.1 88.8% +2.7% (insignificant) +6:2 function 38.1 8.1 26.4 4.1 100.0% +44.5% +8:2 total 339.1 15.4 324.5 15.5 99.4% +4.5% +8:2 function 33.8 7.0 27.3 5.6 99.7% +23.6% +6:6 total 604.6 20.8 572.8 20.6 100.0% +5.6% +6:6 function 95.8 8.4 68.9 8.2 100.0% +39.1% +8:8 total 766.4 17.6 741.5 21.2 100.0% +3.4% +8:8 function 106.0 11.4 86.1 9.9 100.0% +23.1% + +Signed-off-by: Michael Niedermayer +--- + libavcodec/mlpdec.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c +index 93ed552..cbd9000 100644 +--- a/libavcodec/mlpdec.c ++++ b/libavcodec/mlpdec.c +@@ -37,9 +37,16 @@ + #include "mlp_parser.h" + #include "mlpdsp.h" + #include "mlp.h" ++#include "config.h" + + /** number of bits used for VLC lookup - longest Huffman code is 9 */ ++#if ARCH_ARM == 1 ++#define VLC_BITS 5 ++#define VLC_STATIC_SIZE 64 ++#else + #define VLC_BITS 9 ++#define VLC_STATIC_SIZE 512 ++#endif + + typedef struct SubStream { + /// Set if a valid restart header has been read. Otherwise the substream cannot be decoded. 
+@@ -193,13 +200,13 @@ static av_cold void init_static(void) + if (!huff_vlc[0].bits) { + INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18, + &ff_mlp_huffman_tables[0][0][1], 2, 1, +- &ff_mlp_huffman_tables[0][0][0], 2, 1, 512); ++ &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE); + INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16, + &ff_mlp_huffman_tables[1][0][1], 2, 1, +- &ff_mlp_huffman_tables[1][0][0], 2, 1, 512); ++ &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE); + INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15, + &ff_mlp_huffman_tables[2][0][1], 2, 1, +- &ff_mlp_huffman_tables[2][0][0], 2, 1, 512); ++ &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE); + } + + ff_mlp_init_crc(); +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0002-truehd-add-hand-scheduled-ARM-asm-version-of-mlp_fil.patch b/projects/RPi/patches/ffmpeg/0002-truehd-add-hand-scheduled-ARM-asm-version-of-mlp_fil.patch new file mode 100644 index 0000000000..4aea35f9fd --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0002-truehd-add-hand-scheduled-ARM-asm-version-of-mlp_fil.patch @@ -0,0 +1,557 @@ +From bfe3d8c8e4e046163dc314aa16207413e377283f Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Mon, 3 Mar 2014 19:44:23 +0000 +Subject: [PATCH 2/6] truehd: add hand-scheduled ARM asm version of + mlp_filter_channel. 
+ +Profiling results for overall audio decode and the mlp_filter_channel(_arm) +function in particular are as follows: + + Before After + Mean StdDev Mean StdDev Confidence Change +6:2 total 380.4 22.0 370.8 17.0 87.4% +2.6% (insignificant) +6:2 function 60.7 7.2 36.6 8.1 100.0% +65.8% +8:2 total 357.0 17.5 343.2 19.0 97.8% +4.0% (insignificant) +8:2 function 60.3 8.8 37.3 3.8 100.0% +61.8% +6:6 total 717.2 23.2 658.4 15.7 100.0% +8.9% +6:6 function 140.4 12.9 81.5 9.2 100.0% +72.4% +8:8 total 981.9 16.2 896.2 24.5 100.0% +9.6% +8:8 function 193.4 15.0 103.3 11.5 100.0% +87.2% + +Experiments with adding preload instructions to this function yielded no +useful benefit, so these have not been included. + +The assembly version has also been tested with a fuzz tester to ensure that +any combinations of inputs not exercised by my available test streams still +generate mathematically identical results to the C version. +--- + libavcodec/arm/Makefile | 2 + + libavcodec/arm/mlpdsp_arm.S | 433 +++++++++++++++++++++++++++++++++++++++ + libavcodec/arm/mlpdsp_init_arm.c | 36 ++++ + libavcodec/mlpdsp.c | 2 + + libavcodec/mlpdsp.h | 1 + + 5 files changed, 474 insertions(+) + create mode 100644 libavcodec/arm/mlpdsp_arm.S + create mode 100644 libavcodec/arm/mlpdsp_init_arm.c + +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index a8446b2..ba673b1 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -22,6 +22,8 @@ OBJS-$(CONFIG_H264PRED) += arm/h264pred_init_arm.o + OBJS-$(CONFIG_H264QPEL) += arm/h264qpel_init_arm.o + OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_arm.o \ + arm/hpeldsp_arm.o ++OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o \ ++ arm/mlpdsp_arm.o + OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o + OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o + OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o +diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S +new file mode 100644 +index 0000000..615819d +--- 
/dev/null ++++ b/libavcodec/arm/mlpdsp_arm.S +@@ -0,0 +1,433 @@ ++/* ++ * Copyright (c) 2014 RISC OS Open Ltd ++ * Author: Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++ ++#define MAX_CHANNELS 8 ++#define MAX_FIR_ORDER 8 ++#define MAX_IIR_ORDER 4 ++#define MAX_RATEFACTOR 4 ++#define MAX_BLOCKSIZE (40 * MAX_RATEFACTOR) ++ ++PST .req a1 ++PCO .req a2 ++AC0 .req a3 ++AC1 .req a4 ++CO0 .req v1 ++CO1 .req v2 ++CO2 .req v3 ++CO3 .req v4 ++ST0 .req v5 ++ST1 .req v6 ++ST2 .req sl ++ST3 .req fp ++I .req ip ++PSAMP .req lr ++ ++ ++// Some macros that do loads/multiplies where the register number is determined ++// from an assembly-time expression. Boy is GNU assembler's syntax ugly... 
++ ++.macro load group, index, base, offset ++ .altmacro ++ load_ \group, %(\index), \base, \offset ++ .noaltmacro ++.endm ++ ++.macro load_ group, index, base, offset ++ ldr \group\index, [\base, #\offset] ++.endm ++ ++.macro loadd group, index, base, offset ++ .altmacro ++ loadd_ \group, %(\index), %(\index+1), \base, \offset ++ .noaltmacro ++.endm ++ ++.macro loadd_ group, index0, index1, base, offset ++A .if offset >= 256 ++A ldr \group\index0, [\base, #\offset] ++A ldr \group\index1, [\base, #(\offset) + 4] ++A .else ++ ldrd \group\index0, \group\index1, [\base, #\offset] ++A .endif ++.endm ++ ++.macro multiply index, accumulate, long ++ .altmacro ++ multiply_ %(\index), \accumulate, \long ++ .noaltmacro ++.endm ++ ++.macro multiply_ index, accumulate, long ++ .if \long ++ .if \accumulate ++ smlal AC0, AC1, CO\index, ST\index ++ .else ++ smull AC0, AC1, CO\index, ST\index ++ .endif ++ .else ++ .if \accumulate ++ mla AC0, CO\index, ST\index, AC0 ++ .else ++ mul AC0, CO\index, ST\index ++ .endif ++ .endif ++.endm ++ ++// A macro to update the load register number and load offsets ++ ++.macro inc howmany ++ .set LOAD_REG, (LOAD_REG + \howmany) & 3 ++ .set OFFSET_CO, OFFSET_CO + 4 * \howmany ++ .set OFFSET_ST, OFFSET_ST + 4 * \howmany ++ .if FIR_REMAIN > 0 ++ .set FIR_REMAIN, FIR_REMAIN - \howmany ++ .if FIR_REMAIN == 0 ++ .set OFFSET_CO, 4 * MAX_FIR_ORDER ++ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) ++ .endif ++ .elseif IIR_REMAIN > 0 ++ .set IIR_REMAIN, IIR_REMAIN - \howmany ++ .endif ++.endm ++ ++// Macro to implement the inner loop for one specific combination of parameters ++ ++.macro implement_filter mask_minus1, shift_0, shift_8, iir_taps, fir_taps ++ .set TOTAL_TAPS, \iir_taps + \fir_taps ++ ++ // Deal with register allocation... 
++ .set DEFINED_SHIFT, 0 ++ .set DEFINED_MASK, 0 ++ .set SHUFFLE_SHIFT, 0 ++ .set SHUFFLE_MASK, 0 ++ .set SPILL_SHIFT, 0 ++ .set SPILL_MASK, 0 ++ .if TOTAL_TAPS == 0 ++ // Little register pressure in this case - just keep MASK where it was ++ .if !\mask_minus1 ++ MASK .req ST1 ++ .set DEFINED_MASK, 1 ++ .endif ++ .else ++ .if \shift_0 ++ .if !\mask_minus1 ++ // AC1 is unused with shift 0 ++ MASK .req AC1 ++ .set DEFINED_MASK, 1 ++ .set SHUFFLE_MASK, 1 ++ .endif ++ .elseif \shift_8 ++ .if !\mask_minus1 ++ .if TOTAL_TAPS <= 4 ++ // All coefficients are preloaded (so pointer not needed) ++ MASK .req PCO ++ .set DEFINED_MASK, 1 ++ .set SHUFFLE_MASK, 1 ++ .else ++ .set SPILL_MASK, 1 ++ .endif ++ .endif ++ .else // shift not 0 or 8 ++ .if TOTAL_TAPS <= 3 ++ // All coefficients are preloaded, and at least one CO register is unused ++ .if \fir_taps & 1 ++ SHIFT .req CO0 ++ .set DEFINED_SHIFT, 1 ++ .set SHUFFLE_SHIFT, 1 ++ .else ++ SHIFT .req CO3 ++ .set DEFINED_SHIFT, 1 ++ .set SHUFFLE_SHIFT, 1 ++ .endif ++ .if !\mask_minus1 ++ MASK .req PCO ++ .set DEFINED_MASK, 1 ++ .set SHUFFLE_MASK, 1 ++ .endif ++ .elseif TOTAL_TAPS == 4 ++ // All coefficients are preloaded ++ SHIFT .req PCO ++ .set DEFINED_SHIFT, 1 ++ .set SHUFFLE_SHIFT, 1 ++ .if !\mask_minus1 ++ .set SPILL_MASK, 1 ++ .endif ++ .else ++ .set SPILL_SHIFT, 1 ++ .if !\mask_minus1 ++ .set SPILL_MASK, 1 ++ .endif ++ .endif ++ .endif ++ .endif ++ .if SPILL_SHIFT ++ SHIFT .req ST0 ++ .set DEFINED_SHIFT, 1 ++ .endif ++ .if SPILL_MASK ++ MASK .req ST1 ++ .set DEFINED_MASK, 1 ++ .endif ++ ++ // Preload coefficients if possible ++ .if TOTAL_TAPS <= 4 ++ .set OFFSET_CO, 0 ++ .if \fir_taps & 1 ++ .set LOAD_REG, 1 ++ .else ++ .set LOAD_REG, 0 ++ .endif ++ .rept \fir_taps ++ load CO, LOAD_REG, PCO, OFFSET_CO ++ .set LOAD_REG, (LOAD_REG + 1) & 3 ++ .set OFFSET_CO, OFFSET_CO + 4 ++ .endr ++ .set OFFSET_CO, 4 * MAX_FIR_ORDER ++ .rept \iir_taps ++ load CO, LOAD_REG, PCO, OFFSET_CO ++ .set LOAD_REG, (LOAD_REG + 1) & 3 ++ .set OFFSET_CO, 
OFFSET_CO + 4 ++ .endr ++ .endif ++ ++ // Move mask/shift to final positions if necessary ++ // Need to do this after preloading, because in some cases we ++ // reuse the coefficient pointer register ++ .if SHUFFLE_SHIFT ++ mov SHIFT, ST0 ++ .endif ++ .if SHUFFLE_MASK ++ mov MASK, ST1 ++ .endif ++ ++ // Begin loop ++01: ++ .if TOTAL_TAPS == 0 ++ // Things simplify a lot in this case ++ // In fact this could be pipelined further if it's worth it... ++ ldr ST0, [PSAMP] ++ subs I, I, #1 ++ .if !\mask_minus1 ++ and ST0, ST0, MASK ++ .endif ++ str ST0, [PST, #-4]! ++ str ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] ++ str ST0, [PSAMP], #4 * MAX_CHANNELS ++ bne 01b ++ .else ++ .if \fir_taps & 1 ++ .set LOAD_REG, 1 ++ .else ++ .set LOAD_REG, 0 ++ .endif ++ .set LOAD_BANK, 0 ++ .set FIR_REMAIN, \fir_taps ++ .set IIR_REMAIN, \iir_taps ++ .if FIR_REMAIN == 0 // only IIR terms ++ .set OFFSET_CO, 4 * MAX_FIR_ORDER ++ .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER) ++ .else ++ .set OFFSET_CO, 0 ++ .set OFFSET_ST, 0 ++ .endif ++ .set MUL_REG, LOAD_REG ++ .set COUNTER, 0 ++ .rept TOTAL_TAPS + 2 ++ // Do load(s) ++ .if FIR_REMAIN != 0 || IIR_REMAIN != 0 ++ .if COUNTER == 0 ++ .if TOTAL_TAPS > 4 ++ load CO, LOAD_REG, PCO, OFFSET_CO ++ .endif ++ load ST, LOAD_REG, PST, OFFSET_ST ++ inc 1 ++ .elseif COUNTER == 1 && (\fir_taps & 1) == 0 ++ .if TOTAL_TAPS > 4 ++ load CO, LOAD_REG, PCO, OFFSET_CO ++ .endif ++ load ST, LOAD_REG, PST, OFFSET_ST ++ inc 1 ++ .elseif LOAD_BANK == 0 ++ .if TOTAL_TAPS > 4 ++ .if FIR_REMAIN == 0 && IIR_REMAIN == 1 ++ load CO, LOAD_REG, PCO, OFFSET_CO ++ .else ++ loadd CO, LOAD_REG, PCO, OFFSET_CO ++ .endif ++ .endif ++ .set LOAD_BANK, 1 ++ .else ++ .if FIR_REMAIN == 0 && IIR_REMAIN == 1 ++ load ST, LOAD_REG, PST, OFFSET_ST ++ inc 1 ++ .else ++ loadd ST, LOAD_REG, PST, OFFSET_ST ++ inc 2 ++ .endif ++ .set LOAD_BANK, 0 ++ .endif ++ .endif ++ ++ // Do interleaved multiplies, slightly delayed ++ .if COUNTER >= 2 ++ multiply MUL_REG, COUNTER > 2, !\shift_0 
++ .set MUL_REG, (MUL_REG + 1) & 3 ++ .endif ++ .set COUNTER, COUNTER + 1 ++ .endr ++ ++ // Post-process the result of the multiplies ++ .if SPILL_SHIFT ++ ldr SHIFT, [sp, #9*4 + 0*4] ++ .endif ++ .if SPILL_MASK ++ ldr MASK, [sp, #9*4 + 1*4] ++ .endif ++ ldr ST2, [PSAMP] ++ subs I, I, #1 ++ .if \shift_8 ++ mov AC0, AC0, lsr #8 ++ orr AC0, AC0, AC1, lsl #24 ++ .elseif !\shift_0 ++ rsb ST3, SHIFT, #32 ++ mov AC0, AC0, lsr SHIFT ++A orr AC0, AC0, AC1, lsl ST3 ++T mov AC1, AC1, lsl ST3 ++T orr AC0, AC0, AC1 ++ .endif ++ .if \mask_minus1 ++ add ST3, ST2, AC0 ++ .else ++ add ST2, ST2, AC0 ++ and ST3, ST2, MASK ++ sub ST2, ST3, AC0 ++ .endif ++ str ST3, [PST, #-4]! ++ str ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] ++ str ST3, [PSAMP], #4 * MAX_CHANNELS ++ bne 01b ++ .endif ++ b 99f ++ ++ .if DEFINED_SHIFT ++ .unreq SHIFT ++ .endif ++ .if DEFINED_MASK ++ .unreq MASK ++ .endif ++.endm ++ ++.macro switch_on_fir_taps mask_minus1, shift_0, shift_8, iir_taps ++A ldr pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps) ++T tbh [pc, a3, lsl #1] ++0: ++A .word 0, 70f, 71f, 72f, 73f, 74f ++T .hword (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2 ++ .if \iir_taps <= 3 ++A .word 75f ++T .hword (75f - 0b) / 2 ++ .if \iir_taps <= 2 ++A .word 76f ++T .hword (76f - 0b) / 2 ++ .if \iir_taps <= 1 ++A .word 77f ++T .hword (77f - 0b) / 2 ++ .if \iir_taps == 0 ++A .word 78f ++T .hword (78f - 0b) / 2 ++ .endif ++ .endif ++ .endif ++ .endif ++70: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 0 ++71: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 1 ++72: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 2 ++73: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 3 ++74: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 4 ++ .if \iir_taps <= 3 ++75: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 5 ++ .if \iir_taps <= 2 ++76: implement_filter \mask_minus1, \shift_0, \shift_8, 
\iir_taps, 6 ++ .if \iir_taps <= 1 ++77: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 7 ++ .if \iir_taps == 0 ++78: implement_filter \mask_minus1, \shift_0, \shift_8, \iir_taps, 8 ++ .endif ++ .endif ++ .endif ++ .endif ++.endm ++ ++.macro switch_on_iir_taps mask_minus1, shift_0, shift_8 ++A ldr pc, [pc, a4, LSL #2] // iirorder is in range 0-4 ++T tbh [pc, a4, lsl #1] ++0: ++A .word 0, 60f, 61f, 62f, 63f, 64f ++T .hword (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2 ++60: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 0 ++61: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 1 ++62: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 2 ++63: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 3 ++64: switch_on_fir_taps \mask_minus1, \shift_0, \shift_8, 4 ++.endm ++ ++/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, ++ * int firorder, int iirorder, ++ * unsigned int filter_shift, int32_t mask, ++ * int blocksize, int32_t *sample_buffer); ++ */ ++function ff_mlp_filter_channel_arm, export=1 ++ push {v1-fp,lr} ++ add v1, sp, #9*4 // point at arguments on stack ++ ldm v1, {ST0,ST1,I,PSAMP} ++ cmp ST1, #-1 ++ bne 30f ++ movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 ++ bne 20f ++ bcs 10f ++ switch_on_iir_taps 1, 1, 0 ++10: switch_on_iir_taps 1, 0, 1 ++20: switch_on_iir_taps 1, 0, 0 ++30: movs ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8 ++ bne 50f ++ bcs 40f ++ switch_on_iir_taps 0, 1, 0 ++40: switch_on_iir_taps 0, 0, 1 ++50: switch_on_iir_taps 0, 0, 0 ++99: pop {v1-fp,pc} ++endfunc ++ ++ .unreq PST ++ .unreq PCO ++ .unreq AC0 ++ .unreq AC1 ++ .unreq CO0 ++ .unreq CO1 ++ .unreq CO2 ++ .unreq CO3 ++ .unreq ST0 ++ .unreq ST1 ++ .unreq ST2 ++ .unreq ST3 ++ .unreq I ++ .unreq PSAMP +diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c +new file mode 100644 +index 0000000..9a14815 +--- /dev/null ++++
b/libavcodec/arm/mlpdsp_init_arm.c +@@ -0,0 +1,36 @@ ++/* ++ * Copyright (c) 2014 RISC OS Open Ltd ++ * Author: Ben Avison ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> ++ ++#include "libavutil/arm/cpu.h" ++#include "libavutil/attributes.h" ++#include "libavcodec/mlpdsp.h" ++ ++void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, ++ int firorder, int iirorder, ++ unsigned int filter_shift, int32_t mask, ++ int blocksize, int32_t *sample_buffer); ++ ++av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c) ++{ ++ c->mlp_filter_channel = ff_mlp_filter_channel_arm; ++} +diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c +index b413e86..4b403b8 100644 +--- a/libavcodec/mlpdsp.c ++++ b/libavcodec/mlpdsp.c +@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff, + av_cold void ff_mlpdsp_init(MLPDSPContext *c) + { + c->mlp_filter_channel = mlp_filter_channel; ++ if (ARCH_ARM) ++ ff_mlpdsp_init_arm(c); + if (ARCH_X86) + ff_mlpdsp_init_x86(c); + } +diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h +index 84a8aa3..129bcfe 100644 +--- a/libavcodec/mlpdsp.h ++++ b/libavcodec/mlpdsp.h +@@ -32,6 +32,7 @@ typedef struct MLPDSPContext { + } MLPDSPContext; + + void
ff_mlpdsp_init(MLPDSPContext *c); ++void ff_mlpdsp_init_arm(MLPDSPContext *c); + void ff_mlpdsp_init_x86(MLPDSPContext *c); + + #endif /* AVCODEC_MLPDSP_H */ +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0002-vc-1-Add-platform-specific-start-code-search-routine.patch b/projects/RPi/patches/ffmpeg/0002-vc-1-Add-platform-specific-start-code-search-routine.patch new file mode 100644 index 0000000000..e84ace6065 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0002-vc-1-Add-platform-specific-start-code-search-routine.patch @@ -0,0 +1,143 @@ +From a60747132a1a6652ac0d18f3f110a20ea637ac30 Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 16 Apr 2014 01:51:32 +0100 +Subject: [PATCH 2/3] vc-1: Add platform-specific start code search routine to + VC1DSPContext. + +Initialise VC1DSPContext for parser as well as for decoder. +Note, the VC-1 code doesn't actually use the function pointer yet. + +Signed-off-by: Michael Niedermayer +--- + libavcodec/Makefile | 6 +++--- + libavcodec/arm/Makefile | 2 ++ + libavcodec/arm/vc1dsp_init_arm.c | 4 ++++ + libavcodec/vc1.c | 2 ++ + libavcodec/vc1dec.c | 1 - + libavcodec/vc1dsp.c | 3 +++ + libavcodec/vc1dsp.h | 8 ++++++++ + 7 files changed, 22 insertions(+), 4 deletions(-) + +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 19caf11..120f85a 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -458,7 +458,7 @@ OBJS-$(CONFIG_VB_DECODER) += vb.o + OBJS-$(CONFIG_VBLE_DECODER) += vble.o + OBJS-$(CONFIG_VC1_DECODER) += vc1dec.o vc1.o vc1data.o vc1dsp.o \ + msmpeg4dec.o msmpeg4.o msmpeg4data.o \ +- wmv2dsp.o ++ wmv2dsp.o startcode.o + OBJS-$(CONFIG_VCR1_DECODER) += vcr1.o + OBJS-$(CONFIG_VMDAUDIO_DECODER) += vmdav.o + OBJS-$(CONFIG_VMDVIDEO_DECODER) += vmdav.o +@@ -783,9 +783,9 @@ OBJS-$(CONFIG_PNM_PARSER) += pnm_parser.o pnm.o + OBJS-$(CONFIG_RV30_PARSER) += rv34_parser.o + OBJS-$(CONFIG_RV40_PARSER) += rv34_parser.o + OBJS-$(CONFIG_TAK_PARSER) += tak_parser.o tak.o +-OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o 
vc1.o vc1data.o \ ++OBJS-$(CONFIG_VC1_PARSER) += vc1_parser.o vc1.o vc1data.o vc1dsp.o \ + msmpeg4.o msmpeg4data.o mpeg4video.o \ +- h263.o ++ h263.o startcode.o + OBJS-$(CONFIG_VORBIS_PARSER) += vorbis_parser.o xiph.o + OBJS-$(CONFIG_VP3_PARSER) += vp3_parser.o + OBJS-$(CONFIG_VP8_PARSER) += vp8_parser.o +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index b6410b2..fa2b18e 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -51,6 +51,8 @@ ARMV6-OBJS-$(CONFIG_H264DSP) += arm/startcode_armv6.o + ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ + arm/hpeldsp_armv6.o + ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o ++ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o ++ARMV6-OBJS-$(CONFIG_VC1_PARSER) += arm/startcode_armv6.o + ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \ + arm/vp8dsp_init_armv6.o \ + arm/vp8dsp_armv6.o +diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c +index 47d4126..4a84848 100644 +--- a/libavcodec/arm/vc1dsp_init_arm.c ++++ b/libavcodec/arm/vc1dsp_init_arm.c +@@ -23,10 +23,14 @@ + #include "libavcodec/vc1dsp.h" + #include "vc1dsp.h" + ++int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size); ++ + av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp) + { + int cpu_flags = av_get_cpu_flags(); + ++ if (have_armv6(cpu_flags)) ++ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_armv6; + if (have_neon(cpu_flags)) + ff_vc1dsp_init_neon(dsp); + } +diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c +index 49d4885..cb941dd 100644 +--- a/libavcodec/vc1.c ++++ b/libavcodec/vc1.c +@@ -1706,5 +1706,7 @@ av_cold int ff_vc1_init_common(VC1Context *v) + v->pq = -1; + v->mvrange = 0; /* 7.1.1.18, p80 */ + ++ ff_vc1dsp_init(&v->vc1dsp); ++ + return 0; + } +diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c +index 30fee47..67cda42 100644 +--- a/libavcodec/vc1dec.c ++++ b/libavcodec/vc1dec.c +@@ -5631,7 +5631,6 @@ static 
av_cold int vc1_decode_init(AVCodecContext *avctx) + ff_vc1_decode_end(avctx); + + ff_h264chroma_init(&v->h264chroma, 8); +- ff_vc1dsp_init(&v->vc1dsp); + + if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) { + int count = 0; +diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c +index ec9c17b..09a9006 100644 +--- a/libavcodec/vc1dsp.c ++++ b/libavcodec/vc1dsp.c +@@ -30,6 +30,7 @@ + #include "h264chroma.h" + #include "rnd_avg.h" + #include "vc1dsp.h" ++#include "startcode.h" + + /* Apply overlap transform to horizontal edge */ + static void vc1_v_overlap_c(uint8_t *src, int stride) +@@ -947,6 +948,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp) + dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c; + #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */ + ++ dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_c; ++ + if (ARCH_AARCH64) + ff_vc1dsp_init_aarch64(dsp); + if (ARCH_ARM) +diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h +index 990fbc3..6a90eed 100644 +--- a/libavcodec/vc1dsp.h ++++ b/libavcodec/vc1dsp.h +@@ -74,6 +74,14 @@ typedef struct VC1DSPContext { + void (*sprite_v_double_twoscale)(uint8_t *dst, const uint8_t *src1a, const uint8_t *src1b, int offset1, + const uint8_t *src2a, const uint8_t *src2b, int offset2, + int alpha, int width); ++ ++ /** ++ * Search buf from the start for up to size bytes. Return the index ++ * of a zero byte, or >= size if not found. Ideally, use lookahead ++ * to filter out any zero bytes that are known to not be followed by ++ * one or more further zero bytes and a one byte. 
++ */ ++ int (*vc1_find_start_code_candidate)(const uint8_t *buf, int size); + } VC1DSPContext; + + void ff_vc1dsp_init(VC1DSPContext* c); +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0003-truehd-break-out-part-of-rematrix_channels-into-plat.patch b/projects/RPi/patches/ffmpeg/0003-truehd-break-out-part-of-rematrix_channels-into-plat.patch new file mode 100644 index 0000000000..9c06f8fe4e --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0003-truehd-break-out-part-of-rematrix_channels-into-plat.patch @@ -0,0 +1,158 @@ +From bb74fc44081fb6d7923ce1b7ed3e3e6514695f3e Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 5 Mar 2014 21:01:28 +0000 +Subject: [PATCH 3/6] truehd: break out part of rematrix_channels into + platform-specific callback. + +Verified with profiling that this doesn't have a measurable effect upon +overall performance. +--- + libavcodec/mlpdec.c | 37 ++++++++++++------------------------- + libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++ + libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++ + 3 files changed, 68 insertions(+), 25 deletions(-) + +diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c +index cbd9000..01ded5c 100644 +--- a/libavcodec/mlpdec.c ++++ b/libavcodec/mlpdec.c +@@ -1024,7 +1024,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr) + static void rematrix_channels(MLPDecodeContext *m, unsigned int substr) + { + SubStream *s = &m->substream[substr]; +- unsigned int mat, src_ch, i; ++ unsigned int mat; + unsigned int maxchan; + + maxchan = s->max_matrix_channel; +@@ -1036,31 +1036,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr) + } + + for (mat = 0; mat < s->num_primitive_matrices; mat++) { +- int matrix_noise_shift = s->matrix_noise_shift[mat]; + unsigned int dest_ch = s->matrix_out_ch[mat]; +- int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]); +- int32_t *coeffs = s->matrix_coeff[mat]; +- int index = s->num_primitive_matrices - mat; +- int index2 = 2 * index 
+ 1; +- +- /* TODO: DSPContext? */ +- +- for (i = 0; i < s->blockpos; i++) { +- int32_t bypassed_lsb = m->bypassed_lsbs[i][mat]; +- int32_t *samples = m->sample_buffer[i]; +- int64_t accum = 0; +- +- for (src_ch = 0; src_ch <= maxchan; src_ch++) +- accum += (int64_t) samples[src_ch] * coeffs[src_ch]; +- +- if (matrix_noise_shift) { +- index &= m->access_unit_size_pow2 - 1; +- accum += m->noise_buffer[index] << (matrix_noise_shift + 7); +- index += index2; +- } +- +- samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb; +- } ++ m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0], ++ s->matrix_coeff[mat], ++ &m->bypassed_lsbs[0][mat], ++ m->noise_buffer, ++ s->num_primitive_matrices - mat, ++ dest_ch, ++ s->blockpos, ++ maxchan, ++ s->matrix_noise_shift[mat], ++ m->access_unit_size_pow2, ++ MSB_MASK(s->quant_step_size[dest_ch])); + } + } + +diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c +index 4b403b8..7a359b0 100644 +--- a/libavcodec/mlpdsp.c ++++ b/libavcodec/mlpdsp.c +@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff, + } + } + ++void ff_mlp_rematrix_channel(int32_t *samples, ++ const int32_t *coeffs, ++ const uint8_t *bypassed_lsbs, ++ const int8_t *noise_buffer, ++ int index, ++ unsigned int dest_ch, ++ uint16_t blockpos, ++ unsigned int maxchan, ++ int matrix_noise_shift, ++ int access_unit_size_pow2, ++ int32_t mask) ++{ ++ unsigned int src_ch, i; ++ int index2 = 2 * index + 1; ++ for (i = 0; i < blockpos; i++) { ++ int64_t accum = 0; ++ ++ for (src_ch = 0; src_ch <= maxchan; src_ch++) ++ accum += (int64_t) samples[src_ch] * coeffs[src_ch]; ++ ++ if (matrix_noise_shift) { ++ index &= access_unit_size_pow2 - 1; ++ accum += noise_buffer[index] << (matrix_noise_shift + 7); ++ index += index2; ++ } ++ ++ samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs; ++ bypassed_lsbs += MAX_CHANNELS; ++ samples += MAX_CHANNELS; ++ } ++} ++ + av_cold void ff_mlpdsp_init(MLPDSPContext *c) + { + c->mlp_filter_channel = 
mlp_filter_channel; ++ c->mlp_rematrix_channel = ff_mlp_rematrix_channel; + if (ARCH_ARM) + ff_mlpdsp_init_arm(c); + if (ARCH_X86) +diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h +index 129bcfe..f98e9be 100644 +--- a/libavcodec/mlpdsp.h ++++ b/libavcodec/mlpdsp.h +@@ -24,11 +24,34 @@ + + #include <stdint.h> + ++void ff_mlp_rematrix_channel(int32_t *samples, ++ const int32_t *coeffs, ++ const uint8_t *bypassed_lsbs, ++ const int8_t *noise_buffer, ++ int index, ++ unsigned int dest_ch, ++ uint16_t blockpos, ++ unsigned int maxchan, ++ int matrix_noise_shift, ++ int access_unit_size_pow2, ++ int32_t mask); ++ + typedef struct MLPDSPContext { + void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff, + int firorder, int iirorder, + unsigned int filter_shift, int32_t mask, + int blocksize, int32_t *sample_buffer); ++ void (*mlp_rematrix_channel)(int32_t *samples, ++ const int32_t *coeffs, ++ const uint8_t *bypassed_lsbs, ++ const int8_t *noise_buffer, ++ int index, ++ unsigned int dest_ch, ++ uint16_t blockpos, ++ unsigned int maxchan, ++ int matrix_noise_shift, ++ int access_unit_size_pow2, ++ int32_t mask); + } MLPDSPContext; + + void ff_mlpdsp_init(MLPDSPContext *c); +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0003-vc-1-Optimise-parser-with-special-attention-to-ARM.patch b/projects/RPi/patches/ffmpeg/0003-vc-1-Optimise-parser-with-special-attention-to-ARM.patch new file mode 100644 index 0000000000..1f0cf40951 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0003-vc-1-Optimise-parser-with-special-attention-to-ARM.patch @@ -0,0 +1,401 @@ +From c39df43eae03768427243668c040de8437c4f79c Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 23 Apr 2014 01:41:04 +0100 +Subject: [PATCH 3/3] vc-1: Optimise parser (with special attention to ARM) + +The previous implementation of the parser made four passes over each input +buffer (reduced to two if the container format already guaranteed the input +buffer corresponded to frames, such as with MKV).
But these buffers are +often 200K in size, certainly enough to flush the data out of L1 cache, and +for many CPUs, all the way out to main memory. The passes were: + +1) locate frame boundaries (not needed for MKV etc) +2) copy the data into a contiguous block (not needed for MKV etc) +3) locate the start codes within each frame +4) unescape the data between start codes + +After this, the unescaped data was parsed to extract certain header fields, +but because the unescape operation was so large, this was usually also +effectively operating on uncached memory. Most of the unescaped data was +simply thrown away and never processed further. Only step 2 - because it +used memcpy - was using prefetch, making things even worse. + +This patch reorganises these steps so that, aside from the copying, the +operations are performed in parallel, maximising cache utilisation. No more +than the worst-case number of bytes needed for header parsing is unescaped. +Most of the data is, in practice, only read in order to search for a start +code, for which optimised implementations already existed in the H264 codec +(notably the ARM version uses prefetch, so we end up doing both remaining +passes at maximum speed). For MKV files, we know when we've found the last +start code of interest in a given frame, so we are able to avoid doing even +that one remaining pass for most of the buffer. + +In some use-cases (such as the Raspberry Pi) video decode is handled by the +GPU, but the entire elementary stream is still fed through the parser to +pick out certain elements of the header which are necessary to manage the +decode process. As you might expect, in these cases, the performance of the +parser is significant. + +To measure parser performance, I used the same VC-1 elementary stream in +either an MPEG-2 transport stream or a MKV file, and fed it through ffmpeg +with -c:v copy -c:a copy -f null. 
These are the gperftools counts for +those streams, both filtered to only include vc1_parse() and its callees, +and unfiltered (to include the whole binary). Lower numbers are better: + + Before After +File Filtered Mean StdDev Mean StdDev Confidence Change +M2TS No 861.7 8.2 650.5 8.1 100.0% +32.5% +MKV No 868.9 7.4 731.7 9.0 100.0% +18.8% +M2TS Yes 250.0 11.2 27.2 3.4 100.0% +817.9% +MKV Yes 149.0 12.8 1.7 0.8 100.0% +8526.3% + +Yes, that last case shows vc1_parse() running 86 times faster! The M2TS +case does show a larger absolute improvement though, since it was worse +to begin with. + +This patch has been tested with the FATE suite (albeit on x86 for speed). + +Signed-off-by: Michael Niedermayer +--- + libavcodec/vc1_parser.c | 284 ++++++++++++++++++++++++++++++------------------ + 1 file changed, 180 insertions(+), 104 deletions(-) + +diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c +index cc29ce1..4ed14bc 100644 +--- a/libavcodec/vc1_parser.c ++++ b/libavcodec/vc1_parser.c +@@ -30,122 +30,88 @@ + #include "vc1.h" + #include "get_bits.h" + ++/** The maximum number of bytes of a sequence, entry point or ++ * frame header whose values we pay any attention to */ ++#define UNESCAPED_THRESHOLD 37 ++ ++/** The maximum number of bytes of a sequence, entry point or ++ * frame header which must be valid memory (because they are ++ * used to update the bitstream cache in skip_bits() calls) ++ */ ++#define UNESCAPED_LIMIT 144 ++ ++typedef enum { ++ NO_MATCH, ++ ONE_ZERO, ++ TWO_ZEROS, ++ ONE ++} VC1ParseSearchState; ++ + typedef struct { + ParseContext pc; + VC1Context v; ++ uint8_t prev_start_code; ++ size_t bytes_to_skip; ++ uint8_t unesc_buffer[UNESCAPED_LIMIT]; ++ size_t unesc_index; ++ VC1ParseSearchState search_state; + } VC1ParseContext; + +-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx, +- const uint8_t *buf, int buf_size) ++static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx, ++ const 
uint8_t *buf, int buf_size) + { ++ /* Parse the header we just finished unescaping */ + VC1ParseContext *vpc = s->priv_data; + GetBitContext gb; +- const uint8_t *start, *end, *next; +- uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE); +- ++ int ret; + vpc->v.s.avctx = avctx; + vpc->v.parse_only = 1; +- vpc->v.first_pic_header_flag = 1; +- next = buf; +- s->repeat_pict = 0; +- +- for(start = buf, end = buf + buf_size; next < end; start = next){ +- int buf2_size, size; +- int ret; +- +- next = find_next_marker(start + 4, end); +- size = next - start - 4; +- buf2_size = vc1_unescape_buffer(start + 4, size, buf2); +- init_get_bits(&gb, buf2, buf2_size * 8); +- if(size <= 0) continue; +- switch(AV_RB32(start)){ +- case VC1_CODE_SEQHDR: +- ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb); +- break; +- case VC1_CODE_ENTRYPOINT: +- ff_vc1_decode_entry_point(avctx, &vpc->v, &gb); +- break; +- case VC1_CODE_FRAME: +- if(vpc->v.profile < PROFILE_ADVANCED) +- ret = ff_vc1_parse_frame_header (&vpc->v, &gb); +- else +- ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb); +- +- if (ret < 0) +- break; +- +- /* keep AV_PICTURE_TYPE_BI internal to VC1 */ +- if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI) +- s->pict_type = AV_PICTURE_TYPE_B; +- else +- s->pict_type = vpc->v.s.pict_type; +- +- if (avctx->ticks_per_frame > 1){ +- // process pulldown flags +- s->repeat_pict = 1; +- // Pulldown flags are only valid when 'broadcast' has been set. +- // So ticks_per_frame will be 2 +- if (vpc->v.rff){ +- // repeat field +- s->repeat_pict = 2; +- }else if (vpc->v.rptfrm){ +- // repeat frames +- s->repeat_pict = vpc->v.rptfrm * 2 + 1; +- } +- } +- +- if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf) +- s->field_order = vpc->v.tff ? 
AV_FIELD_TT : AV_FIELD_BB; +- else +- s->field_order = AV_FIELD_PROGRESSIVE; ++ init_get_bits(&gb, buf, buf_size * 8); ++ switch (vpc->prev_start_code) { ++ case VC1_CODE_SEQHDR & 0xFF: ++ ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb); ++ break; ++ case VC1_CODE_ENTRYPOINT & 0xFF: ++ ff_vc1_decode_entry_point(avctx, &vpc->v, &gb); ++ break; ++ case VC1_CODE_FRAME & 0xFF: ++ if(vpc->v.profile < PROFILE_ADVANCED) ++ ret = ff_vc1_parse_frame_header (&vpc->v, &gb); ++ else ++ ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb); + ++ if (ret < 0) + break; +- } +- } + +- av_free(buf2); +-} ++ /* keep AV_PICTURE_TYPE_BI internal to VC1 */ ++ if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI) ++ s->pict_type = AV_PICTURE_TYPE_B; ++ else ++ s->pict_type = vpc->v.s.pict_type; + +-/** +- * Find the end of the current frame in the bitstream. +- * @return the position of the first byte of the next frame, or -1 +- */ +-static int vc1_find_frame_end(ParseContext *pc, const uint8_t *buf, +- int buf_size) { +- int pic_found, i; +- uint32_t state; +- +- pic_found= pc->frame_start_found; +- state= pc->state; +- +- i=0; +- if(!pic_found){ +- for(i=0; iticks_per_frame > 1){ ++ // process pulldown flags ++ s->repeat_pict = 1; ++ // Pulldown flags are only valid when 'broadcast' has been set. ++ // So ticks_per_frame will be 2 ++ if (vpc->v.rff){ ++ // repeat field ++ s->repeat_pict = 2; ++ }else if (vpc->v.rptfrm){ ++ // repeat frames ++ s->repeat_pict = vpc->v.rptfrm * 2 + 1; + } ++ }else{ ++ s->repeat_pict = 0; + } +- } + +- if(pic_found){ +- /* EOF considered as end of frame */ +- if (buf_size == 0) +- return 0; +- for(; iframe_start_found=0; +- pc->state=-1; +- return i-3; +- } +- } ++ if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf) ++ s->field_order = vpc->v.tff ? 
AV_FIELD_TT : AV_FIELD_BB; ++ else ++ s->field_order = AV_FIELD_PROGRESSIVE; ++ ++ break; + } +- pc->frame_start_found= pic_found; +- pc->state= state; +- return END_NOT_FOUND; + } + + static int vc1_parse(AVCodecParserContext *s, +@@ -153,22 +119,127 @@ static int vc1_parse(AVCodecParserContext *s, + const uint8_t **poutbuf, int *poutbuf_size, + const uint8_t *buf, int buf_size) + { ++ /* Here we do the searching for frame boundaries and headers at ++ * the same time. Only a minimal amount at the start of each ++ * header is unescaped. */ + VC1ParseContext *vpc = s->priv_data; +- int next; ++ int pic_found = vpc->pc.frame_start_found; ++ uint8_t *unesc_buffer = vpc->unesc_buffer; ++ size_t unesc_index = vpc->unesc_index; ++ VC1ParseSearchState search_state = vpc->search_state; ++ int next = END_NOT_FOUND; ++ int i = vpc->bytes_to_skip; ++ ++ if (pic_found && buf_size == 0) { ++ /* EOF considered as end of frame */ ++ memset(unesc_buffer + unesc_index, 0, UNESCAPED_THRESHOLD - unesc_index); ++ vc1_extract_header(s, avctx, unesc_buffer, unesc_index); ++ next = 0; ++ } ++ while (i < buf_size) { ++ int start_code_found = 0; ++ uint8_t b; ++ while (i < buf_size && unesc_index < UNESCAPED_THRESHOLD) { ++ b = buf[i++]; ++ unesc_buffer[unesc_index++] = b; ++ if (search_state <= ONE_ZERO) ++ search_state = b ? 
NO_MATCH : search_state + 1; ++ else if (search_state == TWO_ZEROS) { ++ if (b == 1) ++ search_state = ONE; ++ else if (b > 1) { ++ if (b == 3) ++ unesc_index--; // swallow emulation prevention byte ++ search_state = NO_MATCH; ++ } ++ } ++ else { // search_state == ONE ++ // Header unescaping terminates early due to detection of next start code ++ search_state = NO_MATCH; ++ start_code_found = 1; ++ break; ++ } ++ } ++ if ((s->flags & PARSER_FLAG_COMPLETE_FRAMES) && ++ unesc_index >= UNESCAPED_THRESHOLD && ++ vpc->prev_start_code == (VC1_CODE_FRAME & 0xFF)) ++ { ++ // No need to keep scanning the rest of the buffer for ++ // start codes if we know it contains a complete frame and ++ // we've already unescaped all we need of the frame header ++ vc1_extract_header(s, avctx, unesc_buffer, unesc_index); ++ break; ++ } ++ if (unesc_index >= UNESCAPED_THRESHOLD && !start_code_found) { ++ while (i < buf_size) { ++ if (search_state == NO_MATCH) { ++ i += vpc->v.vc1dsp.vc1_find_start_code_candidate(buf + i, buf_size - i); ++ if (i < buf_size) { ++ search_state = ONE_ZERO; ++ } ++ i++; ++ } else { ++ b = buf[i++]; ++ if (search_state == ONE_ZERO) ++ search_state = b ? NO_MATCH : TWO_ZEROS; ++ else if (search_state == TWO_ZEROS) { ++ if (b >= 1) ++ search_state = b == 1 ? 
ONE : NO_MATCH; ++ } ++ else { // search_state == ONE ++ search_state = NO_MATCH; ++ start_code_found = 1; ++ break; ++ } ++ } ++ } ++ } ++ if (start_code_found) { ++ vc1_extract_header(s, avctx, unesc_buffer, unesc_index); ++ ++ vpc->prev_start_code = b; ++ unesc_index = 0; ++ ++ if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) { ++ if (!pic_found && (b == (VC1_CODE_FRAME & 0xFF) || b == (VC1_CODE_FIELD & 0xFF))) { ++ pic_found = 1; ++ } ++ else if (pic_found && b != (VC1_CODE_FIELD & 0xFF) && b != (VC1_CODE_SLICE & 0xFF)) { ++ next = i - 4; ++ pic_found = b == (VC1_CODE_FRAME & 0xFF); ++ break; ++ } ++ } ++ } ++ } + +- if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){ +- next= buf_size; +- }else{ +- next= vc1_find_frame_end(&vpc->pc, buf, buf_size); ++ vpc->pc.frame_start_found = pic_found; ++ vpc->unesc_index = unesc_index; ++ vpc->search_state = search_state; + ++ if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) { ++ next = buf_size; ++ } else { + if (ff_combine_frame(&vpc->pc, next, &buf, &buf_size) < 0) { ++ vpc->bytes_to_skip = 0; + *poutbuf = NULL; + *poutbuf_size = 0; + return buf_size; + } + } + +- vc1_extract_headers(s, avctx, buf, buf_size); ++ vpc->v.first_pic_header_flag = 1; ++ ++ /* If we return with a valid pointer to a combined frame buffer ++ * then on the next call we'll have been unhelpfully rewound ++ * by up to 4 bytes (depending upon whether the start code ++ * overlapped the input buffer, and if so by how much). We don't ++ * want this: it will either cause spurious second detections of ++ * the start code we've already seen, or cause extra bytes to be ++ * inserted at the start of the unescaped buffer.
*/ ++ vpc->bytes_to_skip = 4; ++ if (next < 0) ++ vpc->bytes_to_skip += next; + + *poutbuf = buf; + *poutbuf_size = buf_size; +@@ -199,6 +270,11 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s) + { + VC1ParseContext *vpc = s->priv_data; + vpc->v.s.slice_context_count = 1; ++ vpc->v.first_pic_header_flag = 1; ++ vpc->prev_start_code = 0; ++ vpc->bytes_to_skip = 0; ++ vpc->unesc_index = 0; ++ vpc->search_state = NO_MATCH; + return ff_vc1_init_common(&vpc->v); + } + +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0004-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch b/projects/RPi/patches/ffmpeg/0004-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch new file mode 100644 index 0000000000..575622e346 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0004-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch @@ -0,0 +1,285 @@ +From 98428a8cf593587b403076bb54b46cc70ed17ff2 Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Mon, 10 Mar 2014 14:42:05 +0000 +Subject: [PATCH 4/6] truehd: add hand-scheduled ARM asm version of + ff_mlp_rematrix_channel. + +Profiling results for overall audio decode and the rematrix_channels function +in particular are as follows: + + Before After + Mean StdDev Mean StdDev Confidence Change +6:2 total 370.8 17.0 348.8 20.1 99.9% +6.3% +6:2 function 46.4 8.4 45.8 6.6 18.0% +1.2% (insignificant) +8:2 total 343.2 19.0 339.1 15.4 54.7% +1.2% (insignificant) +8:2 function 38.9 3.9 40.2 6.9 52.4% -3.2% (insignificant) +6:6 total 658.4 15.7 604.6 20.8 100.0% +8.9% +6:6 function 109.0 8.7 59.5 5.4 100.0% +83.3% +8:8 total 896.2 24.5 766.4 17.6 100.0% +16.9% +8:8 function 223.4 12.8 93.8 5.0 100.0% +138.3% + +The assembly version has also been tested with a fuzz tester to ensure that +any combinations of inputs not exercised by my available test streams still +generate mathematically identical results to the C version. 
+--- + libavcodec/arm/mlpdsp_arm.S | 222 +++++++++++++++++++++++++++++++++++++++ + libavcodec/arm/mlpdsp_init_arm.c | 12 +++ + 2 files changed, 234 insertions(+) + +diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S +index 615819d..9b51d0c 100644 +--- a/libavcodec/arm/mlpdsp_arm.S ++++ b/libavcodec/arm/mlpdsp_arm.S +@@ -431,3 +431,225 @@ endfunc + .unreq ST3 + .unreq I + .unreq PSAMP ++ ++/********************************************************************/ ++ ++PSA .req a1 // samples ++PCO .req a2 // coeffs ++PBL .req a3 // bypassed_lsbs ++INDEX .req a4 ++CO0 .req v1 ++CO1 .req v2 ++CO2 .req v3 ++CO3 .req v4 ++SA0 .req v5 ++SA1 .req v6 ++SA2 .req sl ++SA3 .req fp ++AC0 .req ip ++AC1 .req lr ++NOISE .req SA0 ++LSB .req SA1 ++DCH .req SA2 // dest_ch ++MASK .req SA3 ++ ++ // INDEX is used as follows: ++ // bits 0..6 index2 (values up to 17, but wider so that we can ++ // add to index field without needing to mask) ++ // bits 7..14 i (values up to 160) ++ // bit 15 underflow detect for i ++ // bits 25..31 (if access_unit_size_pow2 == 128) \ index ++ // bits 26..31 (if access_unit_size_pow2 == 64) / ++ ++.macro implement_rematrix shift, index_mask, mask_minus1, maxchan ++ .if \maxchan == 1 ++ // We can just leave the coefficients in registers in this case ++ ldrd CO0, CO1, [PCO] ++ .endif ++1: ++ .if \maxchan == 1 ++ ldrd SA0, SA1, [PSA] ++ smull AC0, AC1, CO0, SA0 ++ .elseif \maxchan == 5 ++ ldr CO0, [PCO, #0] ++ ldr SA0, [PSA, #0] ++ ldr CO1, [PCO, #4] ++ ldr SA1, [PSA, #4] ++ ldrd CO2, CO3, [PCO, #8] ++ smull AC0, AC1, CO0, SA0 ++ ldrd SA2, SA3, [PSA, #8] ++ smlal AC0, AC1, CO1, SA1 ++ ldrd CO0, CO1, [PCO, #16] ++ smlal AC0, AC1, CO2, SA2 ++ ldrd SA0, SA1, [PSA, #16] ++ smlal AC0, AC1, CO3, SA3 ++ smlal AC0, AC1, CO0, SA0 ++ .else // \maxchan == 7 ++ ldr CO2, [PCO, #0] ++ ldr SA2, [PSA, #0] ++ ldr CO3, [PCO, #4] ++ ldr SA3, [PSA, #4] ++ ldrd CO0, CO1, [PCO, #8] ++ smull AC0, AC1, CO2, SA2 ++ ldrd SA0, SA1, [PSA, #8] ++ smlal AC0, AC1, CO3, SA3 
++ ldrd CO2, CO3, [PCO, #16] ++ smlal AC0, AC1, CO0, SA0 ++ ldrd SA2, SA3, [PSA, #16] ++ smlal AC0, AC1, CO1, SA1 ++ ldrd CO0, CO1, [PCO, #24] ++ smlal AC0, AC1, CO2, SA2 ++ ldrd SA0, SA1, [PSA, #24] ++ smlal AC0, AC1, CO3, SA3 ++ smlal AC0, AC1, CO0, SA0 ++ .endif ++ ldm sp, {NOISE, DCH, MASK} ++ smlal AC0, AC1, CO1, SA1 ++ .if \shift != 0 ++ .if \index_mask == 63 ++ add NOISE, NOISE, INDEX, lsr #32-6 ++ ldrb LSB, [PBL], #MAX_CHANNELS ++ ldrsb NOISE, [NOISE] ++ add INDEX, INDEX, INDEX, lsl #32-6 ++ .else // \index_mask == 127 ++ add NOISE, NOISE, INDEX, lsr #32-7 ++ ldrb LSB, [PBL], #MAX_CHANNELS ++ ldrsb NOISE, [NOISE] ++ add INDEX, INDEX, INDEX, lsl #32-7 ++ .endif ++ sub INDEX, INDEX, #1<<7 ++ adds AC0, AC0, NOISE, lsl #\shift + 7 ++ adc AC1, AC1, NOISE, asr #31 ++ .else ++ ldrb LSB, [PBL], #MAX_CHANNELS ++ sub INDEX, INDEX, #1<<7 ++ .endif ++ add PSA, PSA, #MAX_CHANNELS*4 ++ mov AC0, AC0, lsr #14 ++ orr AC0, AC0, AC1, lsl #18 ++ .if !\mask_minus1 ++ and AC0, AC0, MASK ++ .endif ++ add AC0, AC0, LSB ++ tst INDEX, #1<<15 ++ str AC0, [PSA, DCH, lsl #2] // DCH is precompensated for the early increment of PSA ++ beq 1b ++ b 98f ++.endm ++ ++.macro switch_on_maxchan shift, index_mask, mask_minus1 ++ cmp v4, #5 ++ blo 51f ++ beq 50f ++ implement_rematrix \shift, \index_mask, \mask_minus1, 7 ++50: implement_rematrix \shift, \index_mask, \mask_minus1, 5 ++51: implement_rematrix \shift, \index_mask, \mask_minus1, 1 ++.endm ++ ++.macro switch_on_mask shift, index_mask ++ cmp sl, #-1 ++ bne 40f ++ switch_on_maxchan \shift, \index_mask, 1 ++40: switch_on_maxchan \shift, \index_mask, 0 ++.endm ++ ++.macro switch_on_au_size shift ++ .if \shift == 0 ++ switch_on_mask \shift, undefined ++ .else ++ teq v6, #64 ++ bne 30f ++ orr INDEX, INDEX, v1, lsl #32-6 ++ switch_on_mask \shift, 63 ++30: orr INDEX, INDEX, v1, lsl #32-7 ++ switch_on_mask \shift, 127 ++ .endif ++.endm ++ ++/* void ff_mlp_rematrix_channel_arm(int32_t *samples, ++ * const int32_t *coeffs, ++ * const uint8_t 
*bypassed_lsbs, ++ * const int8_t *noise_buffer, ++ * int index, ++ * unsigned int dest_ch, ++ * uint16_t blockpos, ++ * unsigned int maxchan, ++ * int matrix_noise_shift, ++ * int access_unit_size_pow2, ++ * int32_t mask); ++ */ ++function ff_mlp_rematrix_channel_arm, export=1 ++ push {v1-fp,lr} ++ add v1, sp, #9*4 // point at arguments on stack ++ ldm v1, {v1-sl} ++ teq v4, #1 ++ itt ne ++ teqne v4, #5 ++ teqne v4, #7 ++ bne 99f ++ teq v6, #64 ++ it ne ++ teqne v6, #128 ++ bne 99f ++ sub v2, v2, #MAX_CHANNELS ++ push {a4,v2,sl} // initialise NOISE,DCH,MASK; make sp dword-aligned ++ movs INDEX, v3, lsl #7 ++ beq 98f // just in case, do nothing if blockpos = 0 ++ subs INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time ++ adc lr, v1, v1 // calculate index2 (C was set by preceding subs) ++ orr INDEX, INDEX, lr ++ // Switch on matrix_noise_shift: values 0 and 1 are ++ // disproportionately common so do those in a form the branch ++ // predictor can accelerate. Values can only go up to 15. 
++ cmp v5, #1 ++ beq 11f ++ blo 10f ++A ldr pc, [pc, v5, lsl #2] ++T tbh [pc, v5, lsl #1] ++0: ++A .word 0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f ++T .hword 0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2 ++T .hword (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2 ++T .hword (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2 ++10: switch_on_au_size 0 ++11: switch_on_au_size 1 ++12: switch_on_au_size 2 ++13: switch_on_au_size 3 ++14: switch_on_au_size 4 ++15: switch_on_au_size 5 ++16: switch_on_au_size 6 ++17: switch_on_au_size 7 ++18: switch_on_au_size 8 ++19: switch_on_au_size 9 ++20: switch_on_au_size 10 ++21: switch_on_au_size 11 ++22: switch_on_au_size 12 ++23: switch_on_au_size 13 ++24: switch_on_au_size 14 ++25: switch_on_au_size 15 ++ ++98: add sp, sp, #3*4 ++ pop {v1-fp,pc} ++99: // Can't handle these parameters, drop back to C ++ pop {v1-fp,lr} ++ b X(ff_mlp_rematrix_channel) ++endfunc ++ ++ .unreq PSA ++ .unreq PCO ++ .unreq PBL ++ .unreq INDEX ++ .unreq CO0 ++ .unreq CO1 ++ .unreq CO2 ++ .unreq CO3 ++ .unreq SA0 ++ .unreq SA1 ++ .unreq SA2 ++ .unreq SA3 ++ .unreq AC0 ++ .unreq AC1 ++ .unreq NOISE ++ .unreq LSB ++ .unreq DCH ++ .unreq MASK +diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c +index 9a14815..1bb2276 100644 +--- a/libavcodec/arm/mlpdsp_init_arm.c ++++ b/libavcodec/arm/mlpdsp_init_arm.c +@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff, + int firorder, int iirorder, + unsigned int filter_shift, int32_t mask, + int blocksize, int32_t *sample_buffer); ++void ff_mlp_rematrix_channel_arm(int32_t *samples, ++ const int32_t *coeffs, ++ const uint8_t *bypassed_lsbs, ++ const int8_t *noise_buffer, ++ int index, ++ unsigned int dest_ch, ++ uint16_t blockpos, ++ unsigned int maxchan, ++ int matrix_noise_shift, ++ int access_unit_size_pow2, ++ int32_t mask); + + av_cold 
void ff_mlpdsp_init_arm(MLPDSPContext *c) + { + c->mlp_filter_channel = ff_mlp_filter_channel_arm; ++ c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm; + } +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0005-truehd-break-out-part-of-output_data-into-platform-s.patch b/projects/RPi/patches/ffmpeg/0005-truehd-break-out-part-of-output_data-into-platform-s.patch new file mode 100644 index 0000000000..c5880e909a --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0005-truehd-break-out-part-of-output_data-into-platform-s.patch @@ -0,0 +1,197 @@ +From 5bfcb7a691eb63c56f1485b60f399d79ff943799 Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Wed, 12 Mar 2014 18:18:39 +0000 +Subject: [PATCH 5/6] truehd: break out part of output_data into + platform-specific callback. + +Verified with profiling that this doesn't have a measurable effect upon +overall performance. +--- + libavcodec/mlpdec.c | 40 +++++++++++++++++++++++----------------- + libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++ + libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++ + 3 files changed, 83 insertions(+), 17 deletions(-) + +diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c +index 01ded5c..061dabc 100644 +--- a/libavcodec/mlpdec.c ++++ b/libavcodec/mlpdec.c +@@ -363,6 +363,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb) + m->avctx->sample_fmt = AV_SAMPLE_FMT_S32; + else + m->avctx->sample_fmt = AV_SAMPLE_FMT_S16; ++ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign, ++ m->substream[m->max_decoded_substream].output_shift, ++ m->substream[m->max_decoded_substream].max_matrix_channel, ++ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32); + + m->params_valid = 1; + for (substr = 0; substr < MAX_SUBSTREAMS; substr++) +@@ -612,6 +616,10 @@ FF_ENABLE_DEPRECATION_WARNINGS + if (substr == m->max_decoded_substream) { + m->avctx->channels = s->max_matrix_channel + 1; + m->avctx->channel_layout = s->ch_layout; ++ 
m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign, ++ s->output_shift, ++ s->max_matrix_channel, ++ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32); + + if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) { + if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) || +@@ -857,9 +865,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp, + return ret; + + if (s->param_presence_flags & PARAM_OUTSHIFT) +- if (get_bits1(gbp)) ++ if (get_bits1(gbp)) { + for (ch = 0; ch <= s->max_matrix_channel; ch++) + s->output_shift[ch] = get_sbits(gbp, 4); ++ if (substr == m->max_decoded_substream) ++ m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign, ++ s->output_shift, ++ s->max_matrix_channel, ++ m->avctx->sample_fmt == AV_SAMPLE_FMT_S32); ++ } + + if (s->param_presence_flags & PARAM_QUANTSTEP) + if (get_bits1(gbp)) +@@ -1058,9 +1072,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr, + { + AVCodecContext *avctx = m->avctx; + SubStream *s = &m->substream[substr]; +- unsigned int i, out_ch = 0; +- int32_t *data_32; +- int16_t *data_16; + int ret; + int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32); + +@@ -1078,19 +1089,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr, + frame->nb_samples = s->blockpos; + if ((ret = ff_get_buffer(avctx, frame, 0)) < 0) + return ret; +- data_32 = (int32_t *)frame->data[0]; +- data_16 = (int16_t *)frame->data[0]; +- +- for (i = 0; i < s->blockpos; i++) { +- for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) { +- int mat_ch = s->ch_assign[out_ch]; +- int32_t sample = m->sample_buffer[i][mat_ch] +- << s->output_shift[mat_ch]; +- s->lossless_check_data ^= (sample & 0xffffff) << mat_ch; +- if (is32) *data_32++ = sample << 8; +- else *data_16++ = sample >> 8; +- } +- } ++ s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data, ++ s->blockpos, ++ m->sample_buffer, ++ frame->data[0], ++ s->ch_assign, ++ 
s->output_shift, ++ s->max_matrix_channel, ++ is32); + + /* Update matrix encoding side data */ + if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0) +diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c +index 7a359b0..3ae8c37 100644 +--- a/libavcodec/mlpdsp.c ++++ b/libavcodec/mlpdsp.c +@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples, + } + } + ++static int32_t (*mlp_select_pack_output(uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) ++{ ++ return ff_mlp_pack_output; ++} ++ ++int32_t ff_mlp_pack_output(int32_t lossless_check_data, ++ uint16_t blockpos, ++ int32_t (*sample_buffer)[MAX_CHANNELS], ++ void *data, ++ uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32) ++{ ++ unsigned int i, out_ch = 0; ++ int32_t *data_32 = data; ++ int16_t *data_16 = data; ++ ++ for (i = 0; i < blockpos; i++) { ++ for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) { ++ int mat_ch = ch_assign[out_ch]; ++ int32_t sample = sample_buffer[i][mat_ch] ++ << output_shift[mat_ch]; ++ lossless_check_data ^= (sample & 0xffffff) << mat_ch; ++ if (is32) ++ *data_32++ = sample << 8; ++ else ++ *data_16++ = sample >> 8; ++ } ++ } ++ return lossless_check_data; ++} ++ + av_cold void ff_mlpdsp_init(MLPDSPContext *c) + { + c->mlp_filter_channel = mlp_filter_channel; + c->mlp_rematrix_channel = ff_mlp_rematrix_channel; ++ c->mlp_select_pack_output = mlp_select_pack_output; ++ c->mlp_pack_output = ff_mlp_pack_output; + if (ARCH_ARM) + ff_mlpdsp_init_arm(c); + if (ARCH_X86) +diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h +index f98e9be..a0edeb7 100644 +--- a/libavcodec/mlpdsp.h ++++ b/libavcodec/mlpdsp.h +@@ -23,6 +23,7 @@ + #define AVCODEC_MLPDSP_H + + #include <stdint.h> ++#include "mlp.h" + + void ff_mlp_rematrix_channel(int32_t *samples, + const int32_t *coeffs, +@@ -36,6 +37,15 @@ void
ff_mlp_rematrix_channel(int32_t *samples, + int access_unit_size_pow2, + int32_t mask); + ++int32_t ff_mlp_pack_output(int32_t lossless_check_data, ++ uint16_t blockpos, ++ int32_t (*sample_buffer)[MAX_CHANNELS], ++ void *data, ++ uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32); ++ + typedef struct MLPDSPContext { + void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff, + int firorder, int iirorder, +@@ -52,6 +62,18 @@ typedef struct MLPDSPContext { + int matrix_noise_shift, + int access_unit_size_pow2, + int32_t mask); ++ int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int); ++ int32_t (*mlp_pack_output)(int32_t lossless_check_data, ++ uint16_t blockpos, ++ int32_t (*sample_buffer)[MAX_CHANNELS], ++ void *data, ++ uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32); + } MLPDSPContext; + + void ff_mlpdsp_init(MLPDSPContext *c); +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/0006-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch b/projects/RPi/patches/ffmpeg/0006-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch new file mode 100644 index 0000000000..93add62da5 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/0006-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch @@ -0,0 +1,689 @@ +From c647209386bd811cc1c33b4fc8ec17a00f8c8ded Mon Sep 17 00:00:00 2001 +From: Ben Avison +Date: Thu, 13 Mar 2014 00:21:55 +0000 +Subject: [PATCH 6/6] truehd: add hand-scheduled ARM asm version of + ff_mlp_pack_output. 
+ +Profiling results for overall decode and the output_data function in +particular are as follows: + + Before After + Mean StdDev Mean StdDev Confidence Change +6:2 total 339.6 15.1 329.3 16.0 95.8% +3.1% (insignificant) +6:2 function 24.6 6.0 9.9 3.1 100.0% +148.5% +8:2 total 324.5 15.5 323.6 14.3 15.2% +0.3% (insignificant) +8:2 function 20.4 3.9 9.9 3.4 100.0% +104.7% +6:6 total 572.8 20.6 539.9 24.2 100.0% +6.1% +6:6 function 54.5 5.6 16.0 3.8 100.0% +240.9% +8:8 total 741.5 21.2 702.5 18.5 100.0% +5.6% +8:8 function 63.9 7.6 18.4 4.8 100.0% +247.3% + +The assembly version has also been tested with a fuzz tester to ensure that +any combinations of inputs not exercised by my available test streams still +generate mathematically identical results to the C version. +--- + libavcodec/arm/Makefile | 1 + + libavcodec/arm/mlpdsp_armv6.S | 530 +++++++++++++++++++++++++++++++++++++++ + libavcodec/arm/mlpdsp_init_arm.c | 96 +++++++ + 3 files changed, 627 insertions(+) + create mode 100644 libavcodec/arm/mlpdsp_armv6.S + +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index ba673b1..7b2f923 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -52,6 +52,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_armv6.o + ARMV6-OBJS-$(CONFIG_H264DSP) += arm/h264dsp_armv6.o + ARMV6-OBJS-$(CONFIG_HPELDSP) += arm/hpeldsp_init_armv6.o \ + arm/hpeldsp_armv6.o ++ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o + ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o + ARMV6-OBJS-$(CONFIG_VP8_DECODER) += arm/vp8_armv6.o \ + arm/vp8dsp_init_armv6.o \ +diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S +new file mode 100644 +index 0000000..05a2c85 +--- /dev/null ++++ b/libavcodec/arm/mlpdsp_armv6.S +@@ -0,0 +1,530 @@ ++/* ++ * Copyright (c) 2014 RISC OS Open Ltd ++ * Author: Ben Avison ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++ ++.macro loadregoffsh2 group, index, base, offgroup, offindex ++ .altmacro ++ loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex) ++ .noaltmacro ++.endm ++ ++.macro loadregoffsh2_ group, index, base, offgroup, offindex ++ ldr \group\index, [\base, \offgroup\offindex, lsl #2] ++.endm ++ ++.macro eorlslreg check, data, group, index ++ .altmacro ++ eorlslreg_ \check, \data, \group, %(\index) ++ .noaltmacro ++.endm ++ ++.macro eorlslreg_ check, data, group, index ++ eor \check, \check, \data, lsl \group\index ++.endm ++ ++.macro decr_modulo var, by, modulus ++ .set \var, \var - \by ++ .if \var == 0 ++ .set \var, \modulus ++ .endif ++.endm ++ ++ .macro load_group1 size, channels, r0, r1, r2, r3, pointer_dead=0 ++ .if \size == 2 ++ ldrd \r0, \r1, [IN], #(\size + 8 - \channels) * 4 ++ .else // size == 4 ++ .if IDX1 > 4 || \channels==8 ++ ldm IN!, {\r0, \r1, \r2, \r3} ++ .else ++ ldm IN, {\r0, \r1, \r2, \r3} ++ .if !\pointer_dead ++ add IN, IN, #(4 + 8 - \channels) * 4 ++ .endif ++ .endif ++ .endif ++ decr_modulo IDX1, \size, \channels ++ .endm ++ ++ .macro load_group2 size, channels, r0, r1, r2, r3, pointer_dead=0 ++ .if \size == 2 ++ .if IDX1 > 2 ++ ldm IN!, {\r2, \r3} ++ .else 
++//A .ifc \r2, ip ++//A .if \pointer_dead ++//A ldm IN, {\r2, \r3} ++//A .else ++//A ldr \r2, [IN], #4 ++//A ldr \r3, [IN], #(\size - 1 + 8 - \channels) * 4 ++//A .endif ++//A .else ++ ldrd \r2, \r3, [IN], #(\size + 8 - \channels) * 4 ++//A .endif ++ .endif ++ .endif ++ decr_modulo IDX1, \size, \channels ++ .endm ++ ++.macro implement_pack inorder, channels, shift ++.if \inorder ++.ifc \shift, mixed ++ ++CHECK .req a1 ++COUNT .req a2 ++IN .req a3 ++OUT .req a4 ++DAT0 .req v1 ++DAT1 .req v2 ++DAT2 .req v3 ++DAT3 .req v4 ++SHIFT0 .req v5 ++SHIFT1 .req v6 ++SHIFT2 .req sl ++SHIFT3 .req fp ++SHIFT4 .req ip ++SHIFT5 .req lr ++ ++ .macro output4words ++ .set SIZE_GROUP1, IDX1 ++ .if SIZE_GROUP1 > 4 ++ .set SIZE_GROUP1, 4 ++ .endif ++ .set SIZE_GROUP2, 4 - SIZE_GROUP1 ++ load_group1 SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3 ++ load_group2 SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3 ++ .if \channels == 2 ++ lsl DAT0, SHIFT0 ++ lsl DAT1, SHIFT1 ++ lsl DAT2, SHIFT0 ++ lsl DAT3, SHIFT1 ++ .elseif \channels == 6 ++ .if IDX2 == 6 ++ lsl DAT0, SHIFT0 ++ lsl DAT1, SHIFT1 ++ lsl DAT2, SHIFT2 ++ lsl DAT3, SHIFT3 ++ .elseif IDX2 == 2 ++ lsl DAT0, SHIFT4 ++ lsl DAT1, SHIFT5 ++ lsl DAT2, SHIFT0 ++ lsl DAT3, SHIFT1 ++ .else // IDX2 == 4 ++ lsl DAT0, SHIFT2 ++ lsl DAT1, SHIFT3 ++ lsl DAT2, SHIFT4 ++ lsl DAT3, SHIFT5 ++ .endif ++ .elseif \channels == 8 ++ .if IDX2 == 8 ++ uxtb SHIFT0, SHIFT4, ror #0 ++ uxtb SHIFT1, SHIFT4, ror #8 ++ uxtb SHIFT2, SHIFT4, ror #16 ++ uxtb SHIFT3, SHIFT4, ror #24 ++ .else ++ uxtb SHIFT0, SHIFT5, ror #0 ++ uxtb SHIFT1, SHIFT5, ror #8 ++ uxtb SHIFT2, SHIFT5, ror #16 ++ uxtb SHIFT3, SHIFT5, ror #24 ++ .endif ++ lsl DAT0, SHIFT0 ++ lsl DAT1, SHIFT1 ++ lsl DAT2, SHIFT2 ++ lsl DAT3, SHIFT3 ++ .endif ++ eor CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2) ++ eor CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2) ++ decr_modulo IDX2, 2, \channels ++ eor CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2) ++ eor CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2) ++ 
decr_modulo IDX2, 2, \channels ++ stm OUT!, {DAT0 - DAT3} ++ .endm ++ ++ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4) ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4 ++ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels ++ ++function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1 ++ .if SAMPLES_PER_LOOP > 1 ++ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice ++ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not ++ .endif ++ teq COUNT, #0 ++ it eq ++ bxeq lr ++ push {v1-v6,sl,fp,lr} ++ ldr SHIFT0, [sp, #(9+1)*4] // get output_shift from stack ++ ldr SHIFT1, =0x08080808 ++ ldr SHIFT4, [SHIFT0] ++ .if \channels == 2 ++ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8 ++ uxtb SHIFT0, SHIFT4, ror #0 ++ uxtb SHIFT1, SHIFT4, ror #8 ++ .else ++ ldr SHIFT5, [SHIFT0, #4] ++ uadd8 SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8 ++ uadd8 SHIFT5, SHIFT5, SHIFT1 ++ .if \channels == 6 ++ uxtb SHIFT0, SHIFT4, ror #0 ++ uxtb SHIFT1, SHIFT4, ror #8 ++ uxtb SHIFT2, SHIFT4, ror #16 ++ uxtb SHIFT3, SHIFT4, ror #24 ++ uxtb SHIFT4, SHIFT5, ror #0 ++ uxtb SHIFT5, SHIFT5, ror #8 ++ .endif ++ .endif ++ .set IDX1, \channels ++ .set IDX2, \channels ++0: ++ .rept WORDS_PER_LOOP / 4 ++ output4words ++ .endr ++ subs COUNT, COUNT, #SAMPLES_PER_LOOP ++ bne 0b ++ pop {v1-v6,sl,fp,pc} ++ .ltorg ++endfunc ++ .purgem output4words ++ ++ .unreq CHECK ++ .unreq COUNT ++ .unreq IN ++ .unreq OUT ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq SHIFT0 ++ .unreq SHIFT1 ++ .unreq SHIFT2 ++ .unreq SHIFT3 ++ .unreq SHIFT4 ++ .unreq SHIFT5 ++ ++.else // not mixed ++ ++CHECK .req a1 ++COUNT .req a2 ++IN .req a3 ++OUT .req a4 ++DAT0 .req v1 ++DAT1 .req v2 ++DAT2 .req v3 ++DAT3 .req v4 ++DAT4 .req v5 ++DAT5 .req v6 ++DAT6 .req sl // 
use these rather than the otherwise unused ++DAT7 .req fp // ip and lr so that we can load them using LDRD ++ ++ .macro output4words tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0 ++ .if \head ++ .set SIZE_GROUP1, IDX1 ++ .if SIZE_GROUP1 > 4 ++ .set SIZE_GROUP1, 4 ++ .endif ++ .set SIZE_GROUP2, 4 - SIZE_GROUP1 ++ load_group1 SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead ++ .endif ++ .if \tail ++ eor CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2) ++ eor CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2) ++ decr_modulo IDX2, 2, \channels ++ .endif ++ .if \head ++ load_group2 SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead ++ .endif ++ .if \tail ++ eor CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2) ++ eor CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2) ++ decr_modulo IDX2, 2, \channels ++ stm OUT!, {\r4, \r5, \r6, \r7} ++ .endif ++ .if \head ++ lsl \r0, #8 + \shift ++ lsl \r1, #8 + \shift ++ lsl \r2, #8 + \shift ++ lsl \r3, #8 + \shift ++ .endif ++ .endm ++ ++ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 8) ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8 ++ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels ++ ++function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1 ++ .if SAMPLES_PER_LOOP > 1 ++ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice ++ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not ++ .endif ++ subs COUNT, COUNT, #SAMPLES_PER_LOOP ++ it lo ++ bxlo lr ++ push {v1-v6,sl,fp,lr} ++ .set IDX1, \channels ++ .set IDX2, \channels ++ output4words 0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 ++0: beq 1f ++ .rept WORDS_PER_LOOP / 8 ++ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1,
DAT2, DAT3 ++ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 ++ .endr ++ subs COUNT, COUNT, #SAMPLES_PER_LOOP ++ bne 0b ++1: ++ .rept WORDS_PER_LOOP / 8 - 1 ++ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3 ++ output4words 1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 ++ .endr ++ output4words 1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1 ++ output4words 1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7 ++ pop {v1-v6,sl,fp,pc} ++endfunc ++ .purgem output4words ++ ++ .unreq CHECK ++ .unreq COUNT ++ .unreq IN ++ .unreq OUT ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq DAT4 ++ .unreq DAT5 ++ .unreq DAT6 ++ .unreq DAT7 ++ ++.endif // mixed ++.else // not inorder ++.ifc \shift, mixed ++ ++// This case not currently handled ++ ++.else // not mixed ++ ++#if !CONFIG_THUMB ++ ++CHECK .req a1 ++COUNT .req a2 ++IN .req a3 ++OUT .req a4 ++DAT0 .req v1 ++DAT1 .req v2 ++DAT2 .req v3 ++DAT3 .req v4 ++CHAN0 .req v5 ++CHAN1 .req v6 ++CHAN2 .req sl ++CHAN3 .req fp ++CHAN4 .req ip ++CHAN5 .req lr ++ ++ .macro output4words ++ .if \channels == 8 ++ .if IDX1 == 8 ++ uxtb CHAN0, CHAN4, ror #0 ++ uxtb CHAN1, CHAN4, ror #8 ++ uxtb CHAN2, CHAN4, ror #16 ++ uxtb CHAN3, CHAN4, ror #24 ++ .else ++ uxtb CHAN0, CHAN5, ror #0 ++ uxtb CHAN1, CHAN5, ror #8 ++ uxtb CHAN2, CHAN5, ror #16 ++ uxtb CHAN3, CHAN5, ror #24 ++ .endif ++ ldr DAT0, [IN, CHAN0, lsl #2] ++ ldr DAT1, [IN, CHAN1, lsl #2] ++ ldr DAT2, [IN, CHAN2, lsl #2] ++ ldr DAT3, [IN, CHAN3, lsl #2] ++ .if IDX1 == 4 ++ add IN, IN, #8*4 ++ .endif ++ decr_modulo IDX1, 4, \channels ++ .else ++ .set SIZE_GROUP1, IDX1 ++ .if SIZE_GROUP1 > 4 ++ .set SIZE_GROUP1, 4 ++ .endif ++ .set SIZE_GROUP2, 4 - SIZE_GROUP1 ++ .if SIZE_GROUP1 == 2 ++ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1) ++ loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1) ++ add IN, IN, #8*4 ++ .else // SIZE_GROUP1 == 4 ++ loadregoffsh2 DAT, 0, IN, CHAN, 0 + (\channels - IDX1) ++ 
loadregoffsh2 DAT, 1, IN, CHAN, 1 + (\channels - IDX1) ++ loadregoffsh2 DAT, 2, IN, CHAN, 2 + (\channels - IDX1) ++ loadregoffsh2 DAT, 3, IN, CHAN, 3 + (\channels - IDX1) ++ .if IDX1 == 4 ++ add IN, IN, #8*4 ++ .endif ++ .endif ++ decr_modulo IDX1, SIZE_GROUP1, \channels ++ .if SIZE_GROUP2 == 2 ++ loadregoffsh2 DAT, 2, IN, CHAN, 0 + (\channels - IDX1) ++ loadregoffsh2 DAT, 3, IN, CHAN, 1 + (\channels - IDX1) ++ .if IDX1 == 2 ++ add IN, IN, #8*4 ++ .endif ++ .endif ++ decr_modulo IDX1, SIZE_GROUP2, \channels ++ .endif ++ .if \channels == 8 // in this case we can corrupt CHAN0-3 ++ rsb CHAN0, CHAN0, #8 ++ rsb CHAN1, CHAN1, #8 ++ rsb CHAN2, CHAN2, #8 ++ rsb CHAN3, CHAN3, #8 ++ lsl DAT0, #8 + \shift ++ lsl DAT1, #8 + \shift ++ lsl DAT2, #8 + \shift ++ lsl DAT3, #8 + \shift ++ eor CHECK, CHECK, DAT0, lsr CHAN0 ++ eor CHECK, CHECK, DAT1, lsr CHAN1 ++ eor CHECK, CHECK, DAT2, lsr CHAN2 ++ eor CHECK, CHECK, DAT3, lsr CHAN3 ++ .else ++ .if \shift != 0 ++ lsl DAT0, #\shift ++ lsl DAT1, #\shift ++ lsl DAT2, #\shift ++ lsl DAT3, #\shift ++ .endif ++ bic DAT0, DAT0, #0xff000000 ++ bic DAT1, DAT1, #0xff000000 ++ bic DAT2, DAT2, #0xff000000 ++ bic DAT3, DAT3, #0xff000000 ++ eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2) ++ eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2) ++ decr_modulo IDX2, 2, \channels ++ eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2) ++ eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2) ++ decr_modulo IDX2, 2, \channels ++ lsl DAT0, #8 ++ lsl DAT1, #8 ++ lsl DAT2, #8 ++ lsl DAT3, #8 ++ .endif ++ stm OUT!, {DAT0 - DAT3} ++ .endm ++ ++ .set WORDS_PER_LOOP, \channels // calculate LCM (channels, 4) ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .if (WORDS_PER_LOOP % 2) == 0 ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2 ++ .endif ++ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4 ++ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels ++ ++function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1 
++ .if SAMPLES_PER_LOOP > 1 ++ tst COUNT, #SAMPLES_PER_LOOP - 1 // always seems to be in practice ++ bne X(ff_mlp_pack_output) // but just in case, branch to C implementation if not ++ .endif ++ teq COUNT, #0 ++ it eq ++ bxeq lr ++ push {v1-v6,sl,fp,lr} ++ ldr CHAN0, [sp, #(9+0)*4] // get ch_assign from stack ++ ldr CHAN4, [CHAN0] ++ .if \channels == 2 ++ uxtb CHAN0, CHAN4, ror #0 ++ uxtb CHAN1, CHAN4, ror #8 ++ .else ++ ldr CHAN5, [CHAN0, #4] ++ .if \channels == 6 ++ uxtb CHAN0, CHAN4, ror #0 ++ uxtb CHAN1, CHAN4, ror #8 ++ uxtb CHAN2, CHAN4, ror #16 ++ uxtb CHAN3, CHAN4, ror #24 ++ uxtb CHAN4, CHAN5, ror #0 ++ uxtb CHAN5, CHAN5, ror #8 ++ .endif ++ .endif ++ .set IDX1, \channels ++ .set IDX2, \channels ++0: ++ .rept WORDS_PER_LOOP / 4 ++ output4words ++ .endr ++ subs COUNT, COUNT, #SAMPLES_PER_LOOP ++ bne 0b ++ pop {v1-v6,sl,fp,pc} ++ .ltorg ++endfunc ++ .purgem output4words ++ ++ .unreq CHECK ++ .unreq COUNT ++ .unreq IN ++ .unreq OUT ++ .unreq DAT0 ++ .unreq DAT1 ++ .unreq DAT2 ++ .unreq DAT3 ++ .unreq CHAN0 ++ .unreq CHAN1 ++ .unreq CHAN2 ++ .unreq CHAN3 ++ .unreq CHAN4 ++ .unreq CHAN5 ++ ++#endif // !CONFIG_THUMB ++ ++.endif // mixed ++.endif // inorder ++.endm // implement_pack ++ ++.macro pack_channels inorder, channels ++ implement_pack \inorder, \channels, 0 ++ implement_pack \inorder, \channels, 1 ++ implement_pack \inorder, \channels, 2 ++ implement_pack \inorder, \channels, 3 ++ implement_pack \inorder, \channels, 4 ++ implement_pack \inorder, \channels, 5 ++ implement_pack \inorder, \channels, mixed ++.endm ++ ++.macro pack_order inorder ++ pack_channels \inorder, 2 ++ pack_channels \inorder, 6 ++ pack_channels \inorder, 8 ++.endm ++ ++ pack_order 0 ++ pack_order 1 +diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c +index 1bb2276..10ec316 100644 +--- a/libavcodec/arm/mlpdsp_init_arm.c ++++ b/libavcodec/arm/mlpdsp_init_arm.c +@@ -41,8 +41,104 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples, + int 
access_unit_size_pow2, + int32_t mask); + ++#define DECLARE_PACK(order,channels,shift) \ ++ int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int); ++#define ENUMERATE_PACK(order,channels,shift) \ ++ ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6, ++#define PACK_CHANNELS(macro,order,channels) \ ++ macro(order,channels,0) \ ++ macro(order,channels,1) \ ++ macro(order,channels,2) \ ++ macro(order,channels,3) \ ++ macro(order,channels,4) \ ++ macro(order,channels,5) \ ++ macro(order,channels,mixed) ++#define PACK_ORDER(macro,order) \ ++ PACK_CHANNELS(macro,order,2) \ ++ PACK_CHANNELS(macro,order,6) \ ++ PACK_CHANNELS(macro,order,8) ++#define PACK_ALL(macro) \ ++ PACK_ORDER(macro,outof) \ ++ PACK_ORDER(macro,in) ++PACK_ALL(DECLARE_PACK) ++ ++#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0 ++#if CONFIG_THUMB ++#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0 ++#define 
ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0 ++#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0 ++#endif ++ ++static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign, ++ int8_t *output_shift, ++ uint8_t max_matrix_channel, ++ int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) ++{ ++ int ch_index; ++ int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0]; ++ int inorder = 1; ++ static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = { ++ PACK_ALL(ENUMERATE_PACK) ++ }; ++ int i; ++ ++ if (!is32) // don't support 16-bit output (it's not used by TrueHD) ++ return ff_mlp_pack_output; ++ ++ switch (max_matrix_channel) { ++ case 1: ++ ch_index = 0; ++ break; ++ case 5: ++ ch_index = 1; ++ break; ++ case 7: ++ ch_index = 2; ++ break; ++ default: ++ return ff_mlp_pack_output; ++ } ++ ++ for (i = 0; i <= max_matrix_channel; i++) { ++ if (shift != 6 && output_shift[i] != shift) ++ shift = 6; // indicate mixed shifts ++ if (ch_assign[i] != i) ++ inorder = 0; ++ } ++#if CONFIG_THUMB ++ if (!inorder) ++ return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode ++#else ++ if (shift == 6 && !inorder) ++ return ff_mlp_pack_output; // can't currently handle both an order array and a shift array ++#endif ++ ++ return routine[(inorder*3+ch_index)*7+shift]; ++} ++ + av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c) + { ++ int cpu_flags = av_get_cpu_flags(); ++ + c->mlp_filter_channel = ff_mlp_filter_channel_arm; + c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm; ++ if (cpu_flags & AV_CPU_FLAG_ARMV6) ++ c->mlp_select_pack_output = mlp_select_pack_output_armv6; + } +-- +1.9.1 diff --git a/projects/RPi/patches/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch b/projects/RPi/patches/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch new file mode 100644 index 
0000000000..962feb2759 --- /dev/null +++ b/projects/RPi/patches/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch @@ -0,0 +1,47 @@ +commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5 +Author: popcornmix +Date: Tue Mar 25 19:43:07 2014 +0000 + + [ffmpeg] Speed up wtv index creation + + The index creation is O(N^2) with number of entries (typically thousands). + On a Pi this can take more than 60 seconds to execute for a recording of a few hours. + + By replacing with an O(N) loop, this takes virtually zero time + +diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c +index e423370..70898bd 100644 +--- a/libavformat/wtvdec.c ++++ b/libavformat/wtvdec.c +@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s) + pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16); + if (pb) { + int i; ++ AVIndexEntry *e = wtv->index_entries; ++ AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1; ++ uint64_t last_position = 0; + while (1) { + uint64_t frame_nb = avio_rl64(pb); + uint64_t position = avio_rl64(pb); ++ while (e <= e_end && frame_nb > e->size) { /* test the bound before dereferencing e */ ++ e->pos = last_position; ++ e++; ++ } + if (url_feof(pb)) + break; +- for (i = wtv->nb_index_entries - 1; i >= 0; i--) { +- AVIndexEntry *e = wtv->index_entries + i; +- if (frame_nb > e->size) +- break; +- if (position > e->pos) +- e->pos = position; +- } ++ last_position = position; + } ++ e_end->pos = last_position; + wtvfile_close(pb); +- st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp; ++ st->duration = e_end->timestamp; + } + } + }