projects/RPi/patches/ffmpeg: add RPi specific patches

Signed-off-by: Stephan Raue <stephan@openelec.tv>
2025-07-24 11:16:51 +00:00 · 2014-05-01 21:37:30 +02:00 · 2014-05-01 21:37:30 +02:00 · 5084896ac1
commit 5084896ac1
parent cc971f66ea
10 changed files with 3294 additions and 0 deletions
--- a/projects/RPi/patches/ffmpeg/0001-h264-Move-search-code-search-functions-into-separate.patch
+++ b/projects/RPi/patches/ffmpeg/0001-h264-Move-search-code-search-functions-into-separate.patch
@ -0,0 +1,752 @@
+From 8cdb3bf2837a3fb4fff3c6586316f81ae5f7b6cd Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 16 Apr 2014 01:51:31 +0100
+Subject: [PATCH 1/3] h264: Move start code search functions into separate
+ source files.
+
+This permits re-use with parsers for codecs which use similar start codes.
+
+Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
+---
+ libavcodec/Makefile               |   2 +-
+ libavcodec/arm/Makefile           |   2 +-
+ libavcodec/arm/h264dsp_armv6.S    | 253 --------------------------------------
+ libavcodec/arm/h264dsp_init_arm.c |   4 +-
+ libavcodec/arm/startcode_armv6.S  | 253 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/h264dsp.c              |  31 +----
+ libavcodec/startcode.c            |  57 +++++++++
+ libavcodec/startcode.h            |  35 ++++++
+ 8 files changed, 351 insertions(+), 286 deletions(-)
+ delete mode 100644 libavcodec/arm/h264dsp_armv6.S
+ create mode 100644 libavcodec/arm/startcode_armv6.S
+ create mode 100644 libavcodec/startcode.c
+ create mode 100644 libavcodec/startcode.h
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index b56ecd1..19caf11 100644
+--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
+@@ -49,7 +49,7 @@ OBJS-$(CONFIG_FFT)                     += avfft.o fft_fixed.o fft_float.o \
+ OBJS-$(CONFIG_GOLOMB)                  += golomb.o
+ OBJS-$(CONFIG_H263DSP)                 += h263dsp.o
+ OBJS-$(CONFIG_H264CHROMA)              += h264chroma.o
+-OBJS-$(CONFIG_H264DSP)                 += h264dsp.o h264idct.o
+OBJS-$(CONFIG_H264DSP)                 += h264dsp.o h264idct.o startcode.o
+ OBJS-$(CONFIG_H264PRED)                += h264pred.o
+ OBJS-$(CONFIG_H264QPEL)                += h264qpel.o
+ OBJS-$(CONFIG_HPELDSP)                 += hpeldsp.o
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index a8446b2..b6410b2 100644
+--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
+@@ -47,7 +47,7 @@ ARMV6-OBJS-$(CONFIG_DSPUTIL)           += arm/dsputil_init_armv6.o      \
+                                           arm/simple_idct_armv6.o       \
+
+ ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
+-ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
+ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/startcode_armv6.o
+ ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
+                                           arm/hpeldsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
+diff --git a/libavcodec/arm/h264dsp_armv6.S b/libavcodec/arm/h264dsp_armv6.S
+deleted file mode 100644
+index 2758262..0000000
+--- a/libavcodec/arm/h264dsp_armv6.S
+++ /dev/null
+@@ -1,253 +0,0 @@
+-/*
+- * Copyright (c) 2013 RISC OS Open Ltd
+- * Author: Ben Avison <bavison@riscosopen.org>
+- *
+- * This file is part of FFmpeg.
+- *
+- * FFmpeg is free software; you can redistribute it and/or
+- * modify it under the terms of the GNU Lesser General Public
+- * License as published by the Free Software Foundation; either
+- * version 2.1 of the License, or (at your option) any later version.
+- *
+- * FFmpeg is distributed in the hope that it will be useful,
+- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+- * Lesser General Public License for more details.
+- *
+- * You should have received a copy of the GNU Lesser General Public
+- * License along with FFmpeg; if not, write to the Free Software
+- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+- */
+-
+-#include "libavutil/arm/asm.S"
+-
+-RESULT  .req    a1
+-BUF     .req    a1
+-SIZE    .req    a2
+-PATTERN .req    a3
+-PTR     .req    a4
+-DAT0    .req    v1
+-DAT1    .req    v2
+-DAT2    .req    v3
+-DAT3    .req    v4
+-TMP0    .req    v5
+-TMP1    .req    v6
+-TMP2    .req    ip
+-TMP3    .req    lr
+-
+-#define PRELOAD_DISTANCE 4
+-
+-.macro innerloop4
+-        ldr     DAT0, [PTR], #4
+-        subs    SIZE, SIZE, #4 @ C flag survives rest of macro
+-        sub     TMP0, DAT0, PATTERN, lsr #14
+-        bic     TMP0, TMP0, DAT0
+-        ands    TMP0, TMP0, PATTERN
+-.endm
+-
+-.macro innerloop16  decrement, do_preload
+-        ldmia   PTR!, {DAT0,DAT1,DAT2,DAT3}
+- .ifnc "\do_preload",""
+-        pld     [PTR, #PRELOAD_DISTANCE*32]
+- .endif
+- .ifnc "\decrement",""
+-        subs    SIZE, SIZE, #\decrement @ C flag survives rest of macro
+- .endif
+-        sub     TMP0, DAT0, PATTERN, lsr #14
+-        sub     TMP1, DAT1, PATTERN, lsr #14
+-        bic     TMP0, TMP0, DAT0
+-        bic     TMP1, TMP1, DAT1
+-        sub     TMP2, DAT2, PATTERN, lsr #14
+-        sub     TMP3, DAT3, PATTERN, lsr #14
+-        ands    TMP0, TMP0, PATTERN
+-        bic     TMP2, TMP2, DAT2
+-        it      eq
+-        andseq  TMP1, TMP1, PATTERN
+-        bic     TMP3, TMP3, DAT3
+-        itt     eq
+-        andseq  TMP2, TMP2, PATTERN
+-        andseq  TMP3, TMP3, PATTERN
+-.endm
+-
+-/* int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size) */
+-function ff_h264_find_start_code_candidate_armv6, export=1
+-        push    {v1-v6,lr}
+-        mov     PTR, BUF
+-        @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+-        @ before using code that does preloads
+-        cmp     SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+-        blo     60f
+-
+-        @ Get to word-alignment, 1 byte at a time
+-        tst     PTR, #3
+-        beq     2f
+-1:      ldrb    DAT0, [PTR], #1
+-        sub     SIZE, SIZE, #1
+-        teq     DAT0, #0
+-        beq     90f
+-        tst     PTR, #3
+-        bne     1b
+-2:      @ Get to 4-word alignment, 1 word at a time
+-        ldr     PATTERN, =0x80008000
+-        setend  be
+-        tst     PTR, #12
+-        beq     4f
+-3:      innerloop4
+-        bne     91f
+-        tst     PTR, #12
+-        bne     3b
+-4:      @ Get to cacheline (8-word) alignment
+-        tst     PTR, #16
+-        beq     5f
+-        innerloop16  16
+-        bne     93f
+-5:      @ Check complete cachelines, with preloading
+-        @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+-        @ complete cachelines to go
+-        sub     SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+-6:      innerloop16  , do_preload
+-        bne     93f
+-        innerloop16  32
+-        bne     93f
+-        bcs     6b
+-        @ Preload trailing part-cacheline, if any
+-        tst     SIZE, #31
+-        beq     7f
+-        pld     [PTR, #(PRELOAD_DISTANCE+1)*32]
+-        @ Check remaining data without doing any more preloads. First
+-        @ do in chunks of 4 words:
+-7:      adds    SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+-        bmi     9f
+-8:      innerloop16  16
+-        bne     93f
+-        bcs     8b
+-        @ Then in words:
+-9:      adds    SIZE, SIZE, #16 - 4
+-        bmi     11f
+-10:     innerloop4
+-        bne     91f
+-        bcs     10b
+-11:     setend  le
+-        @ Check second byte of final halfword
+-        ldrb    DAT0, [PTR, #-1]
+-        teq     DAT0, #0
+-        beq     90f
+-        @ Check any remaining bytes
+-        tst     SIZE, #3
+-        beq     13f
+-12:     ldrb    DAT0, [PTR], #1
+-        sub     SIZE, SIZE, #1
+-        teq     DAT0, #0
+-        beq     90f
+-        tst     SIZE, #3
+-        bne     12b
+-        @ No candidate found
+-13:     sub     RESULT, PTR, BUF
+-        b       99f
+-
+-60:     @ Small buffer - simply check by looping over bytes
+-        subs    SIZE, SIZE, #1
+-        bcc     99f
+-61:     ldrb    DAT0, [PTR], #1
+-        subs    SIZE, SIZE, #1
+-        teq     DAT0, #0
+-        beq     90f
+-        bcs     61b
+-        @ No candidate found
+-        sub     RESULT, PTR, BUF
+-        b       99f
+-
+-90:     @ Found a candidate at the preceding byte
+-        sub     RESULT, PTR, BUF
+-        sub     RESULT, RESULT, #1
+-        b       99f
+-
+-91:     @ Found a candidate somewhere in the preceding 4 bytes
+-        sub     RESULT, PTR, BUF
+-        sub     RESULT, RESULT, #4
+-        sub     TMP0, DAT0, #0x20000
+-        bics    TMP0, TMP0, DAT0
+-        itt     pl
+-        ldrbpl  DAT0, [PTR, #-3]
+-        addpl   RESULT, RESULT, #2
+-        bpl     92f
+-        teq     RESULT, #0
+-        beq     98f @ don't look back a byte if found at first byte in buffer
+-        ldrb    DAT0, [PTR, #-5]
+-92:     teq     DAT0, #0
+-        it      eq
+-        subeq   RESULT, RESULT, #1
+-        b       98f
+-
+-93:     @ Found a candidate somewhere in the preceding 16 bytes
+-        sub     RESULT, PTR, BUF
+-        sub     RESULT, RESULT, #16
+-        teq     TMP0, #0
+-        beq     95f @ not in first 4 bytes
+-        sub     TMP0, DAT0, #0x20000
+-        bics    TMP0, TMP0, DAT0
+-        itt     pl
+-        ldrbpl  DAT0, [PTR, #-15]
+-        addpl   RESULT, RESULT, #2
+-        bpl     94f
+-        teq     RESULT, #0
+-        beq     98f @ don't look back a byte if found at first byte in buffer
+-        ldrb    DAT0, [PTR, #-17]
+-94:     teq     DAT0, #0
+-        it      eq
+-        subeq   RESULT, RESULT, #1
+-        b       98f
+-95:     add     RESULT, RESULT, #4
+-        teq     TMP1, #0
+-        beq     96f @ not in next 4 bytes
+-        sub     TMP1, DAT1, #0x20000
+-        bics    TMP1, TMP1, DAT1
+-        itee    mi
+-        ldrbmi  DAT0, [PTR, #-13]
+-        ldrbpl  DAT0, [PTR, #-11]
+-        addpl   RESULT, RESULT, #2
+-        teq     DAT0, #0
+-        it      eq
+-        subeq   RESULT, RESULT, #1
+-        b       98f
+-96:     add     RESULT, RESULT, #4
+-        teq     TMP2, #0
+-        beq     97f @ not in next 4 bytes
+-        sub     TMP2, DAT2, #0x20000
+-        bics    TMP2, TMP2, DAT2
+-        itee    mi
+-        ldrbmi  DAT0, [PTR, #-9]
+-        ldrbpl  DAT0, [PTR, #-7]
+-        addpl   RESULT, RESULT, #2
+-        teq     DAT0, #0
+-        it      eq
+-        subeq   RESULT, RESULT, #1
+-        b       98f
+-97:     add     RESULT, RESULT, #4
+-        sub     TMP3, DAT3, #0x20000
+-        bics    TMP3, TMP3, DAT3
+-        itee    mi
+-        ldrbmi  DAT0, [PTR, #-5]
+-        ldrbpl  DAT0, [PTR, #-3]
+-        addpl   RESULT, RESULT, #2
+-        teq     DAT0, #0
+-        it      eq
+-        subeq   RESULT, RESULT, #1
+-        @ drop through to 98f
+-98:     setend  le
+-99:     pop     {v1-v6,pc}
+-endfunc
+-
+-        .unreq  RESULT
+-        .unreq  BUF
+-        .unreq  SIZE
+-        .unreq  PATTERN
+-        .unreq  PTR
+-        .unreq  DAT0
+-        .unreq  DAT1
+-        .unreq  DAT2
+-        .unreq  DAT3
+-        .unreq  TMP0
+-        .unreq  TMP1
+-        .unreq  TMP2
+-        .unreq  TMP3
+diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
+index a0418fd..eb6c514 100644
+--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
+@@ -24,7 +24,7 @@
+ #include "libavutil/arm/cpu.h"
+ #include "libavcodec/h264dsp.h"
+
+-int ff_h264_find_start_code_candidate_armv6(const uint8_t *buf, int size);
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
+
+ void ff_h264_v_loop_filter_luma_neon(uint8_t *pix, int stride, int alpha,
+                                      int beta, int8_t *tc0);
+@@ -109,7 +109,7 @@ av_cold void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
+     int cpu_flags = av_get_cpu_flags();
+
+     if (have_armv6(cpu_flags))
+-        c->h264_find_start_code_candidate = ff_h264_find_start_code_candidate_armv6;
+        c->h264_find_start_code_candidate = ff_startcode_find_candidate_armv6;
+     if (have_neon(cpu_flags))
+         h264dsp_init_neon(c, bit_depth, chroma_format_idc);
+ }
+diff --git a/libavcodec/arm/startcode_armv6.S b/libavcodec/arm/startcode_armv6.S
+new file mode 100644
+index 0000000..a46f009
+--- /dev/null
+++ b/libavcodec/arm/startcode_armv6.S
+@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+RESULT  .req    a1 @ return value (same register as BUF)
+BUF     .req    a1 @ first argument: start of the buffer
+SIZE    .req    a2 @ second argument: byte count, used as a running counter
+PATTERN .req    a3 @ loaded with 0x80008000 before the word loops run
+PTR     .req    a4 @ current read position
+DAT0    .req    v1 @ DAT0-DAT3: data loaded from the buffer
+DAT1    .req    v2
+DAT2    .req    v3
+DAT3    .req    v4
+TMP0    .req    v5 @ TMP0-TMP3: test results for DAT0-DAT3
+TMP1    .req    v6
+TMP2    .req    ip
+TMP3    .req    lr @ lr is usable as scratch because it is pushed on entry
+
+#define PRELOAD_DISTANCE 4
+
+.macro innerloop4
+        ldr     DAT0, [PTR], #4 @ load the next word and advance PTR past it
+        subs    SIZE, SIZE, #4 @ C flag survives rest of macro
+        sub     TMP0, DAT0, PATTERN, lsr #14 @ with PATTERN 0x80008000 this subtracts 0x00020002
+        bic     TMP0, TMP0, DAT0 @ discard bits that were already set in the data
+        ands    TMP0, TMP0, PATTERN @ ne => this word holds a candidate (callers branch on ne)
+.endm
+
+.macro innerloop16  decrement, do_preload
+        ldmia   PTR!, {DAT0,DAT1,DAT2,DAT3} @ load four words and advance PTR by 16
+ .ifnc "\do_preload",""
+        pld     [PTR, #PRELOAD_DISTANCE*32] @ emitted only when a second macro argument is given
+ .endif
+ .ifnc "\decrement",""
+        subs    SIZE, SIZE, #\decrement @ C flag survives rest of macro
+ .endif
+        sub     TMP0, DAT0, PATTERN, lsr #14 @ same per-word test as innerloop4, interleaved over four words
+        sub     TMP1, DAT1, PATTERN, lsr #14
+        bic     TMP0, TMP0, DAT0
+        bic     TMP1, TMP1, DAT1
+        sub     TMP2, DAT2, PATTERN, lsr #14
+        sub     TMP3, DAT3, PATTERN, lsr #14
+        ands    TMP0, TMP0, PATTERN @ Z stays set only while no candidate has been seen
+        bic     TMP2, TMP2, DAT2
+        it      eq
+        andseq  TMP1, TMP1, PATTERN @ a later word is masked only if every earlier word was clear
+        bic     TMP3, TMP3, DAT3
+        itt     eq
+        andseq  TMP2, TMP2, PATTERN
+        andseq  TMP3, TMP3, PATTERN @ ne on exit => a candidate lies in these 16 bytes
+.endm
+
+/* int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size): returns the offset of the first start code candidate, or the offset where scanning stopped if there is none */
+function ff_startcode_find_candidate_armv6, export=1
+        push    {v1-v6,lr} @ balanced by the pop at 99, which also returns
+        mov     PTR, BUF @ a1 is both BUF and RESULT, so walk the data with PTR
+        @ Ensure there are at least (PRELOAD_DISTANCE+2) complete cachelines to go
+        @ before using code that does preloads
+        cmp     SIZE, #(PRELOAD_DISTANCE+3)*32 - 1
+        blo     60f @ too short for the preloading path: use the byte loop
+
+        @ Get to word-alignment, 1 byte at a time
+        tst     PTR, #3
+        beq     2f
+1:      ldrb    DAT0, [PTR], #1
+        sub     SIZE, SIZE, #1
+        teq     DAT0, #0
+        beq     90f
+        tst     PTR, #3
+        bne     1b
+2:      @ Get to 4-word alignment, 1 word at a time
+        ldr     PATTERN, =0x80008000
+        setend  be @ data loads are big-endian from here until setend le at 11 or 98
+        tst     PTR, #12
+        beq     4f
+3:      innerloop4
+        bne     91f
+        tst     PTR, #12
+        bne     3b
+4:      @ Get to cacheline (8-word) alignment
+        tst     PTR, #16
+        beq     5f
+        innerloop16  16
+        bne     93f
+5:      @ Check complete cachelines, with preloading
+        @ We need to stop when there are still (PRELOAD_DISTANCE+1)
+        @ complete cachelines to go
+        sub     SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32
+6:      innerloop16  , do_preload
+        bne     93f
+        innerloop16  32
+        bne     93f
+        bcs     6b @ C comes from the subs inside the second innerloop16
+        @ Preload trailing part-cacheline, if any
+        tst     SIZE, #31
+        beq     7f
+        pld     [PTR, #(PRELOAD_DISTANCE+1)*32]
+        @ Check remaining data without doing any more preloads. First
+        @ do in chunks of 4 words:
+7:      adds    SIZE, SIZE, #(PRELOAD_DISTANCE+2)*32 - 16
+        bmi     9f
+8:      innerloop16  16
+        bne     93f
+        bcs     8b
+        @ Then in words:
+9:      adds    SIZE, SIZE, #16 - 4
+        bmi     11f
+10:     innerloop4
+        bne     91f
+        bcs     10b
+11:     setend  le @ back to little-endian before the byte-wise tail
+        @ Check second byte of final halfword
+        ldrb    DAT0, [PTR, #-1]
+        teq     DAT0, #0
+        beq     90f
+        @ Check any remaining bytes
+        tst     SIZE, #3
+        beq     13f
+12:     ldrb    DAT0, [PTR], #1
+        sub     SIZE, SIZE, #1
+        teq     DAT0, #0
+        beq     90f
+        tst     SIZE, #3
+        bne     12b
+        @ No candidate found
+13:     sub     RESULT, PTR, BUF
+        b       99f
+
+60:     @ Small buffer - simply check by looping over bytes
+        subs    SIZE, SIZE, #1
+        bcc     62f @ size was 0: return PTR - BUF, which is 0, instead of leaving buf in a1
+61:     ldrb    DAT0, [PTR], #1
+        subs    SIZE, SIZE, #1
+        teq     DAT0, #0
+        beq     90f
+        bcs     61b
+        @ No candidate found
+62:     sub     RESULT, PTR, BUF
+        b       99f
+
+90:     @ Found a candidate at the preceding byte
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #1
+        b       99f
+
+91:     @ Found a candidate somewhere in the preceding 4 bytes
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #4 @ offset of the first byte of the word just tested
+        sub     TMP0, DAT0, #0x20000 @ repeat the test on the upper halfword only
+        bics    TMP0, TMP0, DAT0 @ mi => first halfword is 0x0000 or 0x0001, pl => hit is in the second halfword
+        itt     pl
+        ldrbpl  DAT0, [PTR, #-3] @ byte just before the second halfword
+        addpl   RESULT, RESULT, #2
+        bpl     92f
+        teq     RESULT, #0
+        beq     98f @ don't look back a byte if found at first byte in buffer
+        ldrb    DAT0, [PTR, #-5] @ byte just before this word
+92:     teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1 @ a zero byte just before the hit moves the candidate back by one
+        b       98f
+
+93:     @ Found a candidate somewhere in the preceding 16 bytes
+        sub     RESULT, PTR, BUF
+        sub     RESULT, RESULT, #16 @ offset of the first of the four words just tested
+        teq     TMP0, #0
+        beq     95f @ not in first 4 bytes
+        sub     TMP0, DAT0, #0x20000 @ same halfword split as at 91, applied to DAT0
+        bics    TMP0, TMP0, DAT0
+        itt     pl
+        ldrbpl  DAT0, [PTR, #-15]
+        addpl   RESULT, RESULT, #2
+        bpl     94f
+        teq     RESULT, #0
+        beq     98f @ don't look back a byte if found at first byte in buffer
+        ldrb    DAT0, [PTR, #-17]
+94:     teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+95:     add     RESULT, RESULT, #4
+        teq     TMP1, #0
+        beq     96f @ not in next 4 bytes
+        sub     TMP1, DAT1, #0x20000
+        bics    TMP1, TMP1, DAT1
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-13] @ hit in first halfword: look at the byte before DAT1
+        ldrbpl  DAT0, [PTR, #-11] @ hit in second halfword: look at the byte before it
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+96:     add     RESULT, RESULT, #4
+        teq     TMP2, #0
+        beq     97f @ not in next 4 bytes
+        sub     TMP2, DAT2, #0x20000
+        bics    TMP2, TMP2, DAT2
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-9]
+        ldrbpl  DAT0, [PTR, #-7]
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        b       98f
+97:     add     RESULT, RESULT, #4 @ only the last word is left, so no TMP3 check is needed
+        sub     TMP3, DAT3, #0x20000
+        bics    TMP3, TMP3, DAT3
+        itee    mi
+        ldrbmi  DAT0, [PTR, #-5]
+        ldrbpl  DAT0, [PTR, #-3]
+        addpl   RESULT, RESULT, #2
+        teq     DAT0, #0
+        it      eq
+        subeq   RESULT, RESULT, #1
+        @ drop through to 98f
+98:     setend  le @ restore little-endian data loads before returning
+99:     pop     {v1-v6,pc}
+endfunc
+
+        .unreq  RESULT @ release every alias created with .req at the top of the file
+        .unreq  BUF
+        .unreq  SIZE
+        .unreq  PATTERN
+        .unreq  PTR
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  TMP0
+        .unreq  TMP1
+        .unreq  TMP2
+        .unreq  TMP3
+diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c
+index a2a4aba..a4da776 100644
+--- a/libavcodec/h264dsp.c
+++ b/libavcodec/h264dsp.c
+@@ -33,6 +33,7 @@
+ #include "avcodec.h"
+ #include "h264dsp.h"
+ #include "h264idct.h"
+#include "startcode.h"
+ #include "libavutil/common.h"
+
+ #define BIT_DEPTH 8
+@@ -63,34 +64,6 @@
+ #include "h264addpx_template.c"
+ #undef BIT_DEPTH
+
+-static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
+-{
+-    int i = 0;
+-#if HAVE_FAST_UNALIGNED
+-    /* we check i < size instead of i + 3 / 7 because it is
+-     * simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
+-     * bytes at the end.
+-     */
+-#       if HAVE_FAST_64BIT
+-    while (i < size &&
+-            !((~*(const uint64_t *)(buf + i) &
+-                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+-                    0x8080808080808080ULL))
+-        i += 8;
+-#       else
+-    while (i < size &&
+-            !((~*(const uint32_t *)(buf + i) &
+-                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+-                    0x80808080U))
+-        i += 4;
+-#       endif
+-#endif
+-    for (; i < size; i++)
+-        if (!buf[i])
+-            break;
+-    return i;
+-}
+-
+ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
+                              const int chroma_format_idc)
+ {
+@@ -178,7 +151,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
+         H264_DSP(8);
+         break;
+     }
+-    c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
+    c->h264_find_start_code_candidate = ff_startcode_find_candidate_c;
+
+     if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc);
+     if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
+diff --git a/libavcodec/startcode.c b/libavcodec/startcode.c
+new file mode 100644
+index 0000000..5df7695
+--- /dev/null
+++ b/libavcodec/startcode.c
+@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include "startcode.h"
+#include "config.h"
+
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size)
+{
+    int i = 0; /* offset of the byte being examined; also the return value */
+#if HAVE_FAST_UNALIGNED
+    /* Skip whole words that contain no zero byte. The loop tests i < size
+     * rather than i + 3 / i + 7 < size because that is simpler; the word read
+     * may therefore extend past size, which relies on there being
+     * FF_INPUT_BUFFER_PADDING_SIZE bytes at the end of the buffer. */
+#       if HAVE_FAST_64BIT
+    while (i < size &&
+            !((~*(const uint64_t *)(buf + i) &
+                    (*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
+                    0x8080808080808080ULL)) /* non-zero iff some byte of the word is 0 */
+        i += 8;
+#       else
+    while (i < size &&
+            !((~*(const uint32_t *)(buf + i) &
+                    (*(const uint32_t *)(buf + i) - 0x01010101U)) &
+                    0x80808080U)) /* non-zero iff some byte of the word is 0 */
+        i += 4;
+#       endif
+#endif
+    for (; i < size; i++) /* pin down the exact zero byte, or run out at size */
+        if (!buf[i])
+            break;
+    return i; /* offset of the first zero byte, or a value >= size if there is none */
+}
+diff --git a/libavcodec/startcode.h b/libavcodec/startcode.h
+new file mode 100644
+index 0000000..cc55d5f
+--- /dev/null
+++ b/libavcodec/startcode.h
+@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2003-2010 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Accelerated start code search function for start codes common to
+ * MPEG-1/2/4 video, VC-1, H.264/5
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVCODEC_STARTCODE_H
+#define AVCODEC_STARTCODE_H
+
+#include <stdint.h>
+
+int ff_startcode_find_candidate_c(const uint8_t *buf, int size);
+
+#endif /* AVCODEC_STARTCODE_H */
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0001-truehd-tune-VLC-decoding-for-ARM.patch
+++ b/projects/RPi/patches/ffmpeg/0001-truehd-tune-VLC-decoding-for-ARM.patch
@ -0,0 +1,65 @@
+From 425d69b993d25489e4830766507d9d8f6c819802 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 19 Mar 2014 17:26:19 +0000
+Subject: [PATCH 1/6] truehd: tune VLC decoding for ARM.
+
+Profiling on a Raspberry Pi revealed the best performance to correspond
+with VLC_BITS = 5. Results for overall audio decode and the get_vlc2 function
+in particular are as follows:
+
+              Before          After
+              Mean   StdDev   Mean   StdDev  Confidence  Change
+6:2 total     348.8  20.1     339.6  15.1    88.8%       +2.7%  (insignificant)
+6:2 function  38.1   8.1      26.4   4.1     100.0%      +44.5%
+8:2 total     339.1  15.4     324.5  15.5    99.4%       +4.5%
+8:2 function  33.8   7.0      27.3   5.6     99.7%       +23.6%
+6:6 total     604.6  20.8     572.8  20.6    100.0%      +5.6%
+6:6 function  95.8   8.4      68.9   8.2     100.0%      +39.1%
+8:8 total     766.4  17.6     741.5  21.2    100.0%      +3.4%
+8:8 function  106.0  11.4     86.1   9.9     100.0%      +23.1%
+
+Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
+---
+ libavcodec/mlpdec.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
+index 93ed552..cbd9000 100644
+--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
+@@ -37,9 +37,16 @@
+ #include "mlp_parser.h"
+ #include "mlpdsp.h"
+ #include "mlp.h"
+#include "config.h"
+
+ /** number of bits used for VLC lookup - longest Huffman code is 9 */
+#if ARCH_ARM == 1
+#define VLC_BITS            5
+#define VLC_STATIC_SIZE     64
+#else
+ #define VLC_BITS            9
+#define VLC_STATIC_SIZE     512
+#endif
+
+ typedef struct SubStream {
+     /// Set if a valid restart header has been read. Otherwise the substream cannot be decoded.
+@@ -193,13 +200,13 @@ static av_cold void init_static(void)
+     if (!huff_vlc[0].bits) {
+         INIT_VLC_STATIC(&huff_vlc[0], VLC_BITS, 18,
+                     &ff_mlp_huffman_tables[0][0][1], 2, 1,
+-                    &ff_mlp_huffman_tables[0][0][0], 2, 1, 512);
+                    &ff_mlp_huffman_tables[0][0][0], 2, 1, VLC_STATIC_SIZE);
+         INIT_VLC_STATIC(&huff_vlc[1], VLC_BITS, 16,
+                     &ff_mlp_huffman_tables[1][0][1], 2, 1,
+-                    &ff_mlp_huffman_tables[1][0][0], 2, 1, 512);
+                    &ff_mlp_huffman_tables[1][0][0], 2, 1, VLC_STATIC_SIZE);
+         INIT_VLC_STATIC(&huff_vlc[2], VLC_BITS, 15,
+                     &ff_mlp_huffman_tables[2][0][1], 2, 1,
+-                    &ff_mlp_huffman_tables[2][0][0], 2, 1, 512);
+                    &ff_mlp_huffman_tables[2][0][0], 2, 1, VLC_STATIC_SIZE);
+     }
+
+     ff_mlp_init_crc();
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0002-truehd-add-hand-scheduled-ARM-asm-version-of-mlp_fil.patch
+++ b/projects/RPi/patches/ffmpeg/0002-truehd-add-hand-scheduled-ARM-asm-version-of-mlp_fil.patch
@ -0,0 +1,557 @@
+From bfe3d8c8e4e046163dc314aa16207413e377283f Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Mon, 3 Mar 2014 19:44:23 +0000
+Subject: [PATCH 2/6] truehd: add hand-scheduled ARM asm version of
+ mlp_filter_channel.
+
+Profiling results for overall audio decode and the mlp_filter_channel(_arm)
+function in particular are as follows:
+
+              Before          After
+              Mean   StdDev   Mean   StdDev  Confidence  Change
+6:2 total     380.4  22.0     370.8  17.0    87.4%       +2.6%  (insignificant)
+6:2 function  60.7   7.2      36.6   8.1     100.0%      +65.8%
+8:2 total     357.0  17.5     343.2  19.0    97.8%       +4.0%  (insignificant)
+8:2 function  60.3   8.8      37.3   3.8     100.0%      +61.8%
+6:6 total     717.2  23.2     658.4  15.7    100.0%      +8.9%
+6:6 function  140.4  12.9     81.5   9.2     100.0%      +72.4%
+8:8 total     981.9  16.2     896.2  24.5    100.0%      +9.6%
+8:8 function  193.4  15.0     103.3  11.5    100.0%      +87.2%
+
+Experiments with adding preload instructions to this function yielded no
+useful benefit, so these have not been included.
+
+The assembly version has also been tested with a fuzz tester to ensure that
+any combinations of inputs not exercised by my available test streams still
+generate mathematically identical results to the C version.
+---
+ libavcodec/arm/Makefile          |   2 +
+ libavcodec/arm/mlpdsp_arm.S      | 433 +++++++++++++++++++++++++++++++++++++++
+ libavcodec/arm/mlpdsp_init_arm.c |  36 ++++
+ libavcodec/mlpdsp.c              |   2 +
+ libavcodec/mlpdsp.h              |   1 +
+ 5 files changed, 474 insertions(+)
+ create mode 100644 libavcodec/arm/mlpdsp_arm.S
+ create mode 100644 libavcodec/arm/mlpdsp_init_arm.c
+
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index a8446b2..ba673b1 100644
+--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
+@@ -22,6 +22,8 @@ OBJS-$(CONFIG_H264PRED)                += arm/h264pred_init_arm.o
+ OBJS-$(CONFIG_H264QPEL)                += arm/h264qpel_init_arm.o
+ OBJS-$(CONFIG_HPELDSP)                 += arm/hpeldsp_init_arm.o        \
+                                           arm/hpeldsp_arm.o
+OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o         \
+                                          arm/mlpdsp_arm.o
+ OBJS-$(CONFIG_MPEGAUDIODSP)            += arm/mpegaudiodsp_init_arm.o
+ OBJS-$(CONFIG_MPEGVIDEO)               += arm/mpegvideo_arm.o
+ OBJS-$(CONFIG_NEON_CLOBBER_TEST)       += arm/neontest.o
+diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
+new file mode 100644
+index 0000000..615819d
+--- /dev/null
+++ b/libavcodec/arm/mlpdsp_arm.S
+@@ -0,0 +1,433 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+#define MAX_CHANNELS        8
+#define MAX_FIR_ORDER       8
+#define MAX_IIR_ORDER       4
+#define MAX_RATEFACTOR      4
+#define MAX_BLOCKSIZE       (40 * MAX_RATEFACTOR)
+
+PST     .req    a1 // state pointer (first argument)
+PCO     .req    a2 // coefficient pointer (second argument)
+AC0     .req    a3 // accumulator low word; a3 holds firorder until the jump tables have used it
+AC1     .req    a4 // accumulator high word; a4 holds iirorder until the jump tables have used it
+CO0     .req    v1 // CO0-CO3: coefficient registers
+CO1     .req    v2
+CO2     .req    v3
+CO3     .req    v4
+ST0     .req    v5 // ST0-ST3: state registers; ST0 and ST1 receive filter_shift and mask on entry
+ST1     .req    v6
+ST2     .req    sl
+ST3     .req    fp
+I       .req    ip // loop counter, loaded with blocksize
+PSAMP   .req    lr // pointer into sample_buffer
+
+
+// Some macros that do loads/multiplies where the register number is determined
+// from an assembly-time expression. Boy is GNU assembler's syntax ugly...
+
+.macro load  group, index, base, offset
+       .altmacro // altmacro mode is needed for the %(expr) evaluation below
+       load_ \group, %(\index), \base, \offset
+       .noaltmacro
+.endm
+
+.macro load_ group, index, base, offset
+        ldr     \group\index, [\base, #\offset] // group and number are pasted together, e.g. CO + 2 gives CO2
+.endm
+
+.macro loadd  group, index, base, offset
+       .altmacro // evaluate both register numbers before they are pasted onto the group name
+       loadd_ \group, %(\index), %(\index+1), \base, \offset
+       .noaltmacro
+.endm
+
+.macro loadd_ group, index0, index1, base, offset
+A .if \offset >= 256 // ARM-mode ldrd only encodes an 8-bit immediate, so larger offsets use two ldr
+A       ldr     \group\index0, [\base, #\offset]
+A       ldr     \group\index1, [\base, #(\offset) + 4]
+A .else
+        ldrd    \group\index0, \group\index1, [\base, #\offset] // one doubleword load fills both registers
+A .endif
+.endm
+
+.macro multiply  index, accumulate, long
+        .altmacro // altmacro mode is needed for the %(expr) evaluation below
+        multiply_ %(\index), \accumulate, \long
+        .noaltmacro
+.endm
+
+.macro multiply_  index, accumulate, long
+ .if \long // 64-bit result in AC1:AC0
+  .if \accumulate
+        smlal   AC0, AC1, CO\index, ST\index
+  .else
+        smull   AC0, AC1, CO\index, ST\index
+  .endif
+ .else // 32-bit result in AC0 only
+  .if \accumulate
+        mla     AC0, CO\index, ST\index, AC0
+  .else
+        mul     AC0, CO\index, ST\index
+  .endif
+ .endif
+.endm
+
+// A macro to update the load register number and load offsets
+
+.macro inc  howmany
+  .set LOAD_REG, (LOAD_REG + \howmany) & 3 // register number wraps over the four CO/ST registers
+  .set OFFSET_CO, OFFSET_CO + 4 * \howmany
+  .set OFFSET_ST, OFFSET_ST + 4 * \howmany
+  .if FIR_REMAIN > 0
+    .set FIR_REMAIN, FIR_REMAIN - \howmany
+    .if FIR_REMAIN == 0 // FIR taps used up: move both offsets to where the IIR terms start
+      .set OFFSET_CO, 4 * MAX_FIR_ORDER
+      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+    .endif
+  .elseif IIR_REMAIN > 0
+    .set IIR_REMAIN, IIR_REMAIN - \howmany
+  .endif
+.endm
+
+// Macro to implement the inner loop for one specific combination of parameters
+
+.macro implement_filter  mask_minus1, shift_0, shift_8, iir_taps, fir_taps
+  .set TOTAL_TAPS, \iir_taps + \fir_taps
+
+  // Deal with register allocation...
+  .set DEFINED_SHIFT, 0
+  .set DEFINED_MASK, 0
+  .set SHUFFLE_SHIFT, 0
+  .set SHUFFLE_MASK, 0
+  .set SPILL_SHIFT, 0
+  .set SPILL_MASK, 0
+  .if TOTAL_TAPS == 0
+    // Little register pressure in this case - just keep MASK where it was
+    .if !\mask_minus1
+      MASK .req ST1
+      .set DEFINED_MASK, 1
+    .endif
+  .else
+    .if \shift_0
+      .if !\mask_minus1
+        // AC1 is unused with shift 0
+        MASK .req AC1
+        .set DEFINED_MASK, 1
+        .set SHUFFLE_MASK, 1
+      .endif
+    .elseif \shift_8
+      .if !\mask_minus1
+        .if TOTAL_TAPS <= 4
+        // All coefficients are preloaded (so pointer not needed)
+          MASK .req PCO
+          .set DEFINED_MASK, 1
+          .set SHUFFLE_MASK, 1
+        .else
+          .set SPILL_MASK, 1
+        .endif
+      .endif
+    .else // shift not 0 or 8
+      .if TOTAL_TAPS <= 3
+        // All coefficients are preloaded, and at least one CO register is unused
+        .if \fir_taps & 1
+          SHIFT .req CO0
+          .set DEFINED_SHIFT, 1
+          .set SHUFFLE_SHIFT, 1
+        .else
+          SHIFT .req CO3
+          .set DEFINED_SHIFT, 1
+          .set SHUFFLE_SHIFT, 1
+        .endif
+        .if !\mask_minus1
+          MASK .req PCO
+          .set DEFINED_MASK, 1
+          .set SHUFFLE_MASK, 1
+        .endif
+      .elseif TOTAL_TAPS == 4
+        // All coefficients are preloaded
+        SHIFT .req PCO
+        .set DEFINED_SHIFT, 1
+        .set SHUFFLE_SHIFT, 1
+        .if !\mask_minus1
+          .set SPILL_MASK, 1
+        .endif
+      .else
+        .set SPILL_SHIFT, 1
+        .if !\mask_minus1
+          .set SPILL_MASK, 1
+        .endif
+      .endif
+    .endif
+  .endif
+  .if SPILL_SHIFT
+    SHIFT .req ST0 // shares ST0, so it is reloaded from the stack on every iteration
+    .set DEFINED_SHIFT, 1
+  .endif
+  .if SPILL_MASK
+    MASK .req ST1 // shares ST1, so it is reloaded from the stack on every iteration
+    .set DEFINED_MASK, 1
+  .endif
+
+        // Preload coefficients if possible
+  .if TOTAL_TAPS <= 4
+    .set OFFSET_CO, 0
+    .if \fir_taps & 1
+      .set LOAD_REG, 1
+    .else
+      .set LOAD_REG, 0
+    .endif
+    .rept \fir_taps
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+      .set LOAD_REG, (LOAD_REG + 1) & 3
+      .set OFFSET_CO, OFFSET_CO + 4
+    .endr
+    .set OFFSET_CO, 4 * MAX_FIR_ORDER
+    .rept \iir_taps
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+      .set LOAD_REG, (LOAD_REG + 1) & 3
+      .set OFFSET_CO, OFFSET_CO + 4
+    .endr
+  .endif
+
+        // Move mask/shift to final positions if necessary
+        // Need to do this after preloading, because in some cases we
+        // reuse the coefficient pointer register
+  .if SHUFFLE_SHIFT
+        mov     SHIFT, ST0 // ST0 still holds filter_shift as loaded by the function prologue
+  .endif
+  .if SHUFFLE_MASK
+        mov     MASK, ST1 // ST1 still holds mask as loaded by the function prologue
+  .endif
+
+        // Begin loop
+01:
+  .if TOTAL_TAPS == 0
+        // Things simplify a lot in this case
+        // In fact this could be pipelined further if it's worth it...
+        ldr     ST0, [PSAMP]
+        subs    I, I, #1
+    .if !\mask_minus1
+        and     ST0, ST0, MASK
+    .endif
+        str     ST0, [PST, #-4]! // pre-decrement: PST moves down one word per sample
+        str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)] // same value at the offset used for the IIR terms
+        str     ST0, [PSAMP], #4 * MAX_CHANNELS // write back, then step PSAMP by MAX_CHANNELS words
+        bne     01b
+  .else
+    .if \fir_taps & 1
+      .set LOAD_REG, 1
+    .else
+      .set LOAD_REG, 0
+    .endif
+    .set LOAD_BANK, 0
+    .set FIR_REMAIN, \fir_taps
+    .set IIR_REMAIN, \iir_taps
+    .if FIR_REMAIN == 0 // only IIR terms
+      .set OFFSET_CO, 4 * MAX_FIR_ORDER
+      .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
+    .else
+      .set OFFSET_CO, 0
+      .set OFFSET_ST, 0
+    .endif
+    .set MUL_REG, LOAD_REG
+    .set COUNTER, 0
+    .rept TOTAL_TAPS + 2
+        // Do load(s)
+     .if FIR_REMAIN != 0 || IIR_REMAIN != 0
+      .if COUNTER == 0
+       .if TOTAL_TAPS > 4
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+       .endif
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+      .elseif COUNTER == 1 && (\fir_taps & 1) == 0
+       .if TOTAL_TAPS > 4
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+       .endif
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+      .elseif LOAD_BANK == 0
+       .if TOTAL_TAPS > 4
+        .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+        load    CO, LOAD_REG, PCO, OFFSET_CO
+        .else
+        loadd   CO, LOAD_REG, PCO, OFFSET_CO
+        .endif
+       .endif
+       .set LOAD_BANK, 1
+      .else
+       .if FIR_REMAIN == 0 && IIR_REMAIN == 1
+        load    ST, LOAD_REG, PST, OFFSET_ST
+        inc     1
+       .else
+        loadd   ST, LOAD_REG, PST, OFFSET_ST
+        inc     2
+       .endif
+       .set LOAD_BANK, 0
+      .endif
+     .endif
+
+        // Do interleaved multiplies, slightly delayed
+     .if COUNTER >= 2
+        multiply MUL_REG, COUNTER > 2, !\shift_0
+      .set MUL_REG, (MUL_REG + 1) & 3
+     .endif
+     .set COUNTER, COUNTER + 1
+    .endr
+
+        // Post-process the result of the multiplies
+    .if SPILL_SHIFT
+        ldr     SHIFT, [sp, #9*4 + 0*4] // filter_shift: first stacked argument, above the 9 pushed registers
+    .endif
+    .if SPILL_MASK
+        ldr     MASK, [sp, #9*4 + 1*4] // mask: second stacked argument
+    .endif
+        ldr     ST2, [PSAMP]
+        subs    I, I, #1
+    .if \shift_8
+        mov     AC0, AC0, lsr #8 // AC0 = low 32 bits of (AC1:AC0) >> 8
+        orr     AC0, AC0, AC1, lsl #24
+    .elseif !\shift_0
+        rsb     ST3, SHIFT, #32 // ST3 = 32 - shift, used to bring down the bits of the high word
+        mov     AC0, AC0, lsr SHIFT
+A       orr     AC0, AC0, AC1, lsl ST3
+T       mov     AC1, AC1, lsl ST3
+T       orr     AC0, AC0, AC1
+    .endif
+    .if \mask_minus1
+        add     ST3, ST2, AC0
+    .else
+        add     ST2, ST2, AC0
+        and     ST3, ST2, MASK // ST3 = masked result
+        sub     ST2, ST3, AC0 // ST2 = masked result minus the filter output
+    .endif
+        str     ST3, [PST, #-4]!
+        str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]
+        str     ST3, [PSAMP], #4 * MAX_CHANNELS
+        bne     01b
+  .endif
+        b       99f // shared epilogue label at the end of the function
+
+  .if DEFINED_SHIFT
+    .unreq SHIFT
+  .endif
+  .if DEFINED_MASK
+    .unreq MASK
+  .endif
+.endm
+
+.macro switch_on_fir_taps  mask_minus1, shift_0, shift_8, iir_taps
+A       ldr     pc, [pc, a3, LSL #2] // firorder is in range 0-(8-iir_taps)
+T       tbh     [pc, a3, lsl #1]
+0:
+A       .word   0, 70f, 71f, 72f, 73f, 74f // first word is padding: ARM-mode pc reads 8 bytes past the ldr
+T       .hword  (70f - 0b) / 2, (71f - 0b) / 2, (72f - 0b) / 2, (73f - 0b) / 2, (74f - 0b) / 2
+ .if \iir_taps <= 3 // entries for more FIR taps are only emitted while fir + iir stays within 8
+A       .word   75f
+T       .hword  (75f - 0b) / 2
+  .if \iir_taps <= 2
+A       .word   76f
+T       .hword  (76f - 0b) / 2
+   .if \iir_taps <= 1
+A       .word   77f
+T       .hword  (77f - 0b) / 2
+    .if \iir_taps == 0
+A       .word   78f
+T       .hword  (78f - 0b) / 2
+    .endif
+   .endif
+  .endif
+ .endif
+70:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 0
+71:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
+72:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
+73:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
+74:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
+ .if \iir_taps <= 3 // must mirror the table entries emitted above
+75:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
+  .if \iir_taps <= 2
+76:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
+   .if \iir_taps <= 1
+77:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
+    .if \iir_taps == 0
+78:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
+    .endif
+   .endif
+  .endif
+ .endif
+.endm
+
+.macro switch_on_iir_taps  mask_minus1, shift_0, shift_8
+A       ldr     pc, [pc, a4, LSL #2] // iirorder is in range 0-4
+T       tbh    [pc, a4, lsl #1]
+0:
+A       .word   0, 60f, 61f, 62f, 63f, 64f // first word is padding: ARM-mode pc reads 8 bytes past the ldr
+T       .hword  (60f - 0b) / 2, (61f - 0b) / 2, (62f - 0b) / 2, (63f - 0b) / 2, (64f - 0b) / 2
+60:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 0
+61:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 1
+62:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 2
+63:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 3
+64:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 4
+.endm
+
+/* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+ *                                int firorder, int iirorder,
+ *                                unsigned int filter_shift, int32_t mask,
+ *                                int blocksize, int32_t *sample_buffer);
+ */
+function ff_mlp_filter_channel_arm, export=1
+        push    {v1-fp,lr} // nine registers saved
+        add     v1, sp, #9*4 // point at arguments on stack
+        ldm     v1, {ST0,ST1,I,PSAMP} // filter_shift, mask, blocksize, sample_buffer
+        cmp     ST1, #-1 // a mask of all ones selects the variants that skip masking
+        bne     30f
+        movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+        bne     20f // some of the low three bits set: general shift
+        bcs     10f // bit 3 was shifted out into carry: shift is 8
+        switch_on_iir_taps 1, 1, 0
+10:     switch_on_iir_taps 1, 0, 1
+20:     switch_on_iir_taps 1, 0, 0
+30:     movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
+        bne     50f // some of the low three bits set: general shift
+        bcs     40f // bit 3 was shifted out into carry: shift is 8
+        switch_on_iir_taps 0, 1, 0
+40:     switch_on_iir_taps 0, 0, 1
+50:     switch_on_iir_taps 0, 0, 0
+99:     pop     {v1-fp,pc} // common exit reached from every filter variant
+endfunc
+
+        .unreq  PST // drop every register alias defined at the top of this file
+        .unreq  PCO
+        .unreq  AC0
+        .unreq  AC1
+        .unreq  CO0
+        .unreq  CO1
+        .unreq  CO2
+        .unreq  CO3
+        .unreq  ST0
+        .unreq  ST1
+        .unreq  ST2
+        .unreq  ST3
+        .unreq  I
+        .unreq  PSAMP
+diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
+new file mode 100644
+index 0000000..9a14815
+--- /dev/null
+++ b/libavcodec/arm/mlpdsp_init_arm.c
+@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/arm/cpu.h"
+#include "libavutil/attributes.h"
+#include "libavcodec/mlpdsp.h"
+
+void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+                               int firorder, int iirorder,
+                               unsigned int filter_shift, int32_t mask,
+                               int blocksize, int32_t *sample_buffer);
+/* Point the filter callback at the ARM assembly routine. NOTE(review): no av_get_cpu_flags() check is made here although cpu.h is included and the asm uses ldrd; confirm all ARM targets can run it. */
+av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+{
+    c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+}
+diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
+index b413e86..4b403b8 100644
+--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
+@@ -60,6 +60,8 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
+ av_cold void ff_mlpdsp_init(MLPDSPContext *c)
+ {
+     c->mlp_filter_channel = mlp_filter_channel;
+    if (ARCH_ARM)
+        ff_mlpdsp_init_arm(c);
+     if (ARCH_X86)
+         ff_mlpdsp_init_x86(c);
+ }
+diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
+index 84a8aa3..129bcfe 100644
+--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
+@@ -32,6 +32,7 @@ typedef struct MLPDSPContext {
+ } MLPDSPContext;
+
+ void ff_mlpdsp_init(MLPDSPContext *c);
+void ff_mlpdsp_init_arm(MLPDSPContext *c);
+ void ff_mlpdsp_init_x86(MLPDSPContext *c);
+
+ #endif /* AVCODEC_MLPDSP_H */
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0002-vc-1-Add-platform-specific-start-code-search-routine.patch
+++ b/projects/RPi/patches/ffmpeg/0002-vc-1-Add-platform-specific-start-code-search-routine.patch
@ -0,0 +1,143 @@
+From a60747132a1a6652ac0d18f3f110a20ea637ac30 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 16 Apr 2014 01:51:32 +0100
+Subject: [PATCH 2/3] vc-1: Add platform-specific start code search routine to
+ VC1DSPContext.
+
+Initialise VC1DSPContext for parser as well as for decoder.
+Note, the VC-1 code doesn't actually use the function pointer yet.
+
+Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
+---
+ libavcodec/Makefile              | 6 +++---
+ libavcodec/arm/Makefile          | 2 ++
+ libavcodec/arm/vc1dsp_init_arm.c | 4 ++++
+ libavcodec/vc1.c                 | 2 ++
+ libavcodec/vc1dec.c              | 1 -
+ libavcodec/vc1dsp.c              | 3 +++
+ libavcodec/vc1dsp.h              | 8 ++++++++
+ 7 files changed, 22 insertions(+), 4 deletions(-)
+
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 19caf11..120f85a 100644
+--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
+@@ -458,7 +458,7 @@ OBJS-$(CONFIG_VB_DECODER)              += vb.o
+ OBJS-$(CONFIG_VBLE_DECODER)            += vble.o
+ OBJS-$(CONFIG_VC1_DECODER)             += vc1dec.o vc1.o vc1data.o vc1dsp.o \
+                                           msmpeg4dec.o msmpeg4.o msmpeg4data.o \
+-                                          wmv2dsp.o
+                                          wmv2dsp.o startcode.o
+ OBJS-$(CONFIG_VCR1_DECODER)            += vcr1.o
+ OBJS-$(CONFIG_VMDAUDIO_DECODER)        += vmdav.o
+ OBJS-$(CONFIG_VMDVIDEO_DECODER)        += vmdav.o
+@@ -783,9 +783,9 @@ OBJS-$(CONFIG_PNM_PARSER)              += pnm_parser.o pnm.o
+ OBJS-$(CONFIG_RV30_PARSER)             += rv34_parser.o
+ OBJS-$(CONFIG_RV40_PARSER)             += rv34_parser.o
+ OBJS-$(CONFIG_TAK_PARSER)              += tak_parser.o tak.o
+-OBJS-$(CONFIG_VC1_PARSER)              += vc1_parser.o vc1.o vc1data.o \
+OBJS-$(CONFIG_VC1_PARSER)              += vc1_parser.o vc1.o vc1data.o vc1dsp.o \
+                                           msmpeg4.o msmpeg4data.o mpeg4video.o \
+-                                          h263.o
+                                          h263.o startcode.o
+ OBJS-$(CONFIG_VORBIS_PARSER)           += vorbis_parser.o xiph.o
+ OBJS-$(CONFIG_VP3_PARSER)              += vp3_parser.o
+ OBJS-$(CONFIG_VP8_PARSER)              += vp8_parser.o
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index b6410b2..fa2b18e 100644
+--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
+@@ -51,6 +51,8 @@ ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/startcode_armv6.o
+ ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
+                                           arm/hpeldsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
+ARMV6-OBJS-$(CONFIG_VC1_DECODER)       += arm/startcode_armv6.o
+ARMV6-OBJS-$(CONFIG_VC1_PARSER)        += arm/startcode_armv6.o
+ ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
+                                           arm/vp8dsp_init_armv6.o       \
+                                           arm/vp8dsp_armv6.o
+diff --git a/libavcodec/arm/vc1dsp_init_arm.c b/libavcodec/arm/vc1dsp_init_arm.c
+index 47d4126..4a84848 100644
+--- a/libavcodec/arm/vc1dsp_init_arm.c
+++ b/libavcodec/arm/vc1dsp_init_arm.c
+@@ -23,10 +23,14 @@
+ #include "libavcodec/vc1dsp.h"
+ #include "vc1dsp.h"
+
+int ff_startcode_find_candidate_armv6(const uint8_t *buf, int size);
+
+ av_cold void ff_vc1dsp_init_arm(VC1DSPContext *dsp)
+ {
+     int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv6(cpu_flags))
+        dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_armv6;
+     if (have_neon(cpu_flags))
+         ff_vc1dsp_init_neon(dsp);
+ }
+diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
+index 49d4885..cb941dd 100644
+--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
+@@ -1706,5 +1706,7 @@ av_cold int ff_vc1_init_common(VC1Context *v)
+     v->pq      = -1;
+     v->mvrange = 0; /* 7.1.1.18, p80 */
+
+    ff_vc1dsp_init(&v->vc1dsp);
+
+     return 0;
+ }
+diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
+index 30fee47..67cda42 100644
+--- a/libavcodec/vc1dec.c
+++ b/libavcodec/vc1dec.c
+@@ -5631,7 +5631,6 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
+     ff_vc1_decode_end(avctx);
+
+     ff_h264chroma_init(&v->h264chroma, 8);
+-    ff_vc1dsp_init(&v->vc1dsp);
+
+     if (avctx->codec_id == AV_CODEC_ID_WMV3 || avctx->codec_id == AV_CODEC_ID_WMV3IMAGE) {
+         int count = 0;
+diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
+index ec9c17b..09a9006 100644
+--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
+@@ -30,6 +30,7 @@
+ #include "h264chroma.h"
+ #include "rnd_avg.h"
+ #include "vc1dsp.h"
+#include "startcode.h"
+
+ /* Apply overlap transform to horizontal edge */
+ static void vc1_v_overlap_c(uint8_t *src, int stride)
+@@ -947,6 +948,8 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
+     dsp->sprite_v_double_twoscale = sprite_v_double_twoscale_c;
+ #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+
+    dsp->vc1_find_start_code_candidate = ff_startcode_find_candidate_c;
+
+     if (ARCH_AARCH64)
+         ff_vc1dsp_init_aarch64(dsp);
+     if (ARCH_ARM)
+diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
+index 990fbc3..6a90eed 100644
+--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
+@@ -74,6 +74,14 @@ typedef struct VC1DSPContext {
+     void (*sprite_v_double_twoscale)(uint8_t *dst, const uint8_t *src1a, const uint8_t *src1b, int offset1,
+                                                    const uint8_t *src2a, const uint8_t *src2b, int offset2,
+                                      int alpha, int width);
+
+    /**
+     * Search buf from the start for up to size bytes. Return the index
+     * of a zero byte, or >= size if not found. Ideally, use lookahead
+     * to filter out any zero bytes that are known to not be followed by
+     * one or more further zero bytes and a one byte.
+     */
+    int (*vc1_find_start_code_candidate)(const uint8_t *buf, int size);
+ } VC1DSPContext;
+
+ void ff_vc1dsp_init(VC1DSPContext* c);
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0003-truehd-break-out-part-of-rematrix_channels-into-plat.patch
+++ b/projects/RPi/patches/ffmpeg/0003-truehd-break-out-part-of-rematrix_channels-into-plat.patch
@ -0,0 +1,158 @@
+From bb74fc44081fb6d7923ce1b7ed3e3e6514695f3e Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 5 Mar 2014 21:01:28 +0000
+Subject: [PATCH 3/6] truehd: break out part of rematrix_channels into
+ platform-specific callback.
+
+Verified with profiling that this doesn't have a measurable effect upon
+overall performance.
+---
+ libavcodec/mlpdec.c | 37 ++++++++++++-------------------------
+ libavcodec/mlpdsp.c | 33 +++++++++++++++++++++++++++++++++
+ libavcodec/mlpdsp.h | 23 +++++++++++++++++++++++
+ 3 files changed, 68 insertions(+), 25 deletions(-)
+
+diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
+index cbd9000..01ded5c 100644
+--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
+@@ -1024,7 +1024,7 @@ static void fill_noise_buffer(MLPDecodeContext *m, unsigned int substr)
+ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
+ {
+     SubStream *s = &m->substream[substr];
+-    unsigned int mat, src_ch, i;
+    unsigned int mat;
+     unsigned int maxchan;
+
+     maxchan = s->max_matrix_channel;
+@@ -1036,31 +1036,18 @@ static void rematrix_channels(MLPDecodeContext *m, unsigned int substr)
+     }
+
+     for (mat = 0; mat < s->num_primitive_matrices; mat++) {
+-        int matrix_noise_shift = s->matrix_noise_shift[mat];
+         unsigned int dest_ch = s->matrix_out_ch[mat];
+-        int32_t mask = MSB_MASK(s->quant_step_size[dest_ch]);
+-        int32_t *coeffs = s->matrix_coeff[mat];
+-        int index  = s->num_primitive_matrices - mat;
+-        int index2 = 2 * index + 1;
+-
+-        /* TODO: DSPContext? */
+-
+-        for (i = 0; i < s->blockpos; i++) {
+-            int32_t bypassed_lsb = m->bypassed_lsbs[i][mat];
+-            int32_t *samples = m->sample_buffer[i];
+-            int64_t accum = 0;
+-
+-            for (src_ch = 0; src_ch <= maxchan; src_ch++)
+-                accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+-
+-            if (matrix_noise_shift) {
+-                index &= m->access_unit_size_pow2 - 1;
+-                accum += m->noise_buffer[index] << (matrix_noise_shift + 7);
+-                index += index2;
+-            }
+-
+-            samples[dest_ch] = ((accum >> 14) & mask) + bypassed_lsb;
+-        }
+        m->dsp.mlp_rematrix_channel(&m->sample_buffer[0][0],
+                                    s->matrix_coeff[mat],
+                                    &m->bypassed_lsbs[0][mat],
+                                    m->noise_buffer,
+                                    s->num_primitive_matrices - mat,
+                                    dest_ch,
+                                    s->blockpos,
+                                    maxchan,
+                                    s->matrix_noise_shift[mat],
+                                    m->access_unit_size_pow2,
+                                    MSB_MASK(s->quant_step_size[dest_ch]));
+     }
+ }
+
+diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
+index 4b403b8..7a359b0 100644
+--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
+@@ -57,9 +57,42 @@ static void mlp_filter_channel(int32_t *state, const int32_t *coeff,
+     }
+ }
+
+void ff_mlp_rematrix_channel(int32_t *samples, /* sample rows, MAX_CHANNELS entries apart; dest_ch is rewritten in each row */
+                            const int32_t *coeffs, /* one coefficient per source channel 0..maxchan */
+                            const uint8_t *bypassed_lsbs, /* value added back per sample; rows MAX_CHANNELS apart */
+                            const int8_t *noise_buffer, /* only read when matrix_noise_shift is non-zero */
+                            int index, /* initial noise index; it then advances by 2 * index + 1 per sample */
+                            unsigned int dest_ch, /* channel overwritten with the result */
+                            uint16_t blockpos, /* number of sample rows to process */
+                            unsigned int maxchan, /* highest source channel, inclusive */
+                            int matrix_noise_shift, /* 0 disables the noise term */
+                            int access_unit_size_pow2, /* wrap length for the noise index; presumably a power of two - confirm */
+                            int32_t mask) /* applied to the accumulator after the shift by 14 */
+{
+    unsigned int src_ch, i;
+    int index2 = 2 * index + 1;
+    for (i = 0; i < blockpos; i++) {
+        int64_t accum = 0;
+        /* 64-bit dot product of this sample row with the coefficients */
+        for (src_ch = 0; src_ch <= maxchan; src_ch++)
+            accum += (int64_t) samples[src_ch] * coeffs[src_ch];
+        /* noise entries are signed, so scale by multiplying: left-shifting a negative value is undefined */
+        if (matrix_noise_shift) {
+            index &= access_unit_size_pow2 - 1;
+            accum += noise_buffer[index] * (1 << (matrix_noise_shift + 7));
+            index += index2;
+        }
+        /* shift down by 14, apply the mask, then add the bypassed LSB value */
+        samples[dest_ch] = ((accum >> 14) & mask) + *bypassed_lsbs;
+        bypassed_lsbs += MAX_CHANNELS;
+        samples += MAX_CHANNELS;
+    }
+}
+
+ av_cold void ff_mlpdsp_init(MLPDSPContext *c)
+ {
+     c->mlp_filter_channel = mlp_filter_channel;
+    c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+     if (ARCH_ARM)
+         ff_mlpdsp_init_arm(c);
+     if (ARCH_X86)
+diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
+index 129bcfe..f98e9be 100644
+--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
+@@ -24,11 +24,34 @@
+
+ #include <stdint.h>
+
+void ff_mlp_rematrix_channel(int32_t *samples,
+                             const int32_t *coeffs,
+                             const uint8_t *bypassed_lsbs,
+                             const int8_t *noise_buffer,
+                             int index,
+                             unsigned int dest_ch,
+                             uint16_t blockpos,
+                             unsigned int maxchan,
+                             int matrix_noise_shift,
+                             int access_unit_size_pow2,
+                             int32_t mask);
+
+ typedef struct MLPDSPContext {
+     void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
+                                int firorder, int iirorder,
+                                unsigned int filter_shift, int32_t mask,
+                                int blocksize, int32_t *sample_buffer);
+    void (*mlp_rematrix_channel)(int32_t *samples,
+                                 const int32_t *coeffs,
+                                 const uint8_t *bypassed_lsbs,
+                                 const int8_t *noise_buffer,
+                                 int index,
+                                 unsigned int dest_ch,
+                                 uint16_t blockpos,
+                                 unsigned int maxchan,
+                                 int matrix_noise_shift,
+                                 int access_unit_size_pow2,
+                                 int32_t mask);
+ } MLPDSPContext;
+
+ void ff_mlpdsp_init(MLPDSPContext *c);
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0003-vc-1-Optimise-parser-with-special-attention-to-ARM.patch
+++ b/projects/RPi/patches/ffmpeg/0003-vc-1-Optimise-parser-with-special-attention-to-ARM.patch
@ -0,0 +1,401 @@
+From c39df43eae03768427243668c040de8437c4f79c Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 23 Apr 2014 01:41:04 +0100
+Subject: [PATCH 3/3] vc-1: Optimise parser (with special attention to ARM)
+
+The previous implementation of the parser made four passes over each input
+buffer (reduced to two if the container format already guaranteed the input
+buffer corresponded to frames, such as with MKV). But these buffers are
+often 200K in size, certainly enough to flush the data out of L1 cache, and
+for many CPUs, all the way out to main memory. The passes were:
+
+1) locate frame boundaries (not needed for MKV etc)
+2) copy the data into a contiguous block (not needed for MKV etc)
+3) locate the start codes within each frame
+4) unescape the data between start codes
+
+After this, the unescaped data was parsed to extract certain header fields,
+but because the unescape operation was so large, this was usually also
+effectively operating on uncached memory. Most of the unescaped data was
+simply thrown away and never processed further. Only step 2 - because it
+used memcpy - was using prefetch, making things even worse.
+
+This patch reorganises these steps so that, aside from the copying, the
+operations are performed in parallel, maximising cache utilisation. No more
+than the worst-case number of bytes needed for header parsing is unescaped.
+Most of the data is, in practice, only read in order to search for a start
+code, for which optimised implementations already existed in the H264 codec
+(notably the ARM version uses prefetch, so we end up doing both remaining
+passes at maximum speed). For MKV files, we know when we've found the last
+start code of interest in a given frame, so we are able to avoid doing even
+that one remaining pass for most of the buffer.
+
+In some use-cases (such as the Raspberry Pi) video decode is handled by the
+GPU, but the entire elementary stream is still fed through the parser to
+pick out certain elements of the header which are necessary to manage the
+decode process. As you might expect, in these cases, the performance of the
+parser is significant.
+
+To measure parser performance, I used the same VC-1 elementary stream in
+either an MPEG-2 transport stream or an MKV file, and fed it through ffmpeg
+with -c:v copy -c:a copy -f null. These are the gperftools counts for
+those streams, both filtered to only include vc1_parse() and its callees,
+and unfiltered (to include the whole binary). Lower numbers are better:
+
+                Before          After
+File  Filtered  Mean   StdDev   Mean   StdDev  Confidence  Change
+M2TS  No        861.7  8.2      650.5  8.1     100.0%      +32.5%
+MKV   No        868.9  7.4      731.7  9.0     100.0%      +18.8%
+M2TS  Yes       250.0  11.2     27.2   3.4     100.0%      +817.9%
+MKV   Yes       149.0  12.8     1.7    0.8     100.0%      +8526.3%
+
+Yes, that last case shows vc1_parse() running 86 times faster! The M2TS
+case does show a larger absolute improvement though, since it was worse
+to begin with.
+
+This patch has been tested with the FATE suite (albeit on x86 for speed).
+
+Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
+---
+ libavcodec/vc1_parser.c | 284 ++++++++++++++++++++++++++++++------------------
+ 1 file changed, 180 insertions(+), 104 deletions(-)
+
+diff --git a/libavcodec/vc1_parser.c b/libavcodec/vc1_parser.c
+index cc29ce1..4ed14bc 100644
+--- a/libavcodec/vc1_parser.c
+++ b/libavcodec/vc1_parser.c
+@@ -30,122 +30,88 @@
+ #include "vc1.h"
+ #include "get_bits.h"
+
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header whose values we pay any attention to */
+#define UNESCAPED_THRESHOLD 37
+
+/** The maximum number of bytes of a sequence, entry point or
+ *  frame header which must be valid memory (because they are
+ *  used to update the bitstream cache in skip_bits() calls)
+ */
+#define UNESCAPED_LIMIT 144
+
+typedef enum {
+    NO_MATCH,
+    ONE_ZERO,
+    TWO_ZEROS,
+    ONE
+} VC1ParseSearchState;
+
+ typedef struct {
+     ParseContext pc;
+     VC1Context v;
+    uint8_t prev_start_code;
+    size_t bytes_to_skip;
+    uint8_t unesc_buffer[UNESCAPED_LIMIT];
+    size_t unesc_index;
+    VC1ParseSearchState search_state;
+ } VC1ParseContext;
+
+-static void vc1_extract_headers(AVCodecParserContext *s, AVCodecContext *avctx,
+-                                const uint8_t *buf, int buf_size)
+static void vc1_extract_header(AVCodecParserContext *s, AVCodecContext *avctx,
+                               const uint8_t *buf, int buf_size)
+ {
+    /* Parse the header we just finished unescaping */
+     VC1ParseContext *vpc = s->priv_data;
+     GetBitContext gb;
+-    const uint8_t *start, *end, *next;
+-    uint8_t *buf2 = av_mallocz(buf_size + FF_INPUT_BUFFER_PADDING_SIZE);
+-
+    int ret;
+     vpc->v.s.avctx = avctx;
+     vpc->v.parse_only = 1;
+-    vpc->v.first_pic_header_flag = 1;
+-    next = buf;
+-    s->repeat_pict = 0;
+-
+-    for(start = buf, end = buf + buf_size; next < end; start = next){
+-        int buf2_size, size;
+-        int ret;
+-
+-        next = find_next_marker(start + 4, end);
+-        size = next - start - 4;
+-        buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
+-        init_get_bits(&gb, buf2, buf2_size * 8);
+-        if(size <= 0) continue;
+-        switch(AV_RB32(start)){
+-        case VC1_CODE_SEQHDR:
+-            ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
+-            break;
+-        case VC1_CODE_ENTRYPOINT:
+-            ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
+-            break;
+-        case VC1_CODE_FRAME:
+-            if(vpc->v.profile < PROFILE_ADVANCED)
+-                ret = ff_vc1_parse_frame_header    (&vpc->v, &gb);
+-            else
+-                ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+-
+-            if (ret < 0)
+-                break;
+-
+-            /* keep AV_PICTURE_TYPE_BI internal to VC1 */
+-            if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
+-                s->pict_type = AV_PICTURE_TYPE_B;
+-            else
+-                s->pict_type = vpc->v.s.pict_type;
+-
+-            if (avctx->ticks_per_frame > 1){
+-                // process pulldown flags
+-                s->repeat_pict = 1;
+-                // Pulldown flags are only valid when 'broadcast' has been set.
+-                // So ticks_per_frame will be 2
+-                if (vpc->v.rff){
+-                    // repeat field
+-                    s->repeat_pict = 2;
+-                }else if (vpc->v.rptfrm){
+-                    // repeat frames
+-                    s->repeat_pict = vpc->v.rptfrm * 2 + 1;
+-                }
+-            }
+-
+-            if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
+-                s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
+-            else
+-                s->field_order = AV_FIELD_PROGRESSIVE;
+    init_get_bits(&gb, buf, buf_size * 8);
+    switch (vpc->prev_start_code) {
+    case VC1_CODE_SEQHDR & 0xFF:
+        ff_vc1_decode_sequence_header(avctx, &vpc->v, &gb);
+        break;
+    case VC1_CODE_ENTRYPOINT & 0xFF:
+        ff_vc1_decode_entry_point(avctx, &vpc->v, &gb);
+        break;
+    case VC1_CODE_FRAME & 0xFF:
+        if(vpc->v.profile < PROFILE_ADVANCED)
+            ret = ff_vc1_parse_frame_header    (&vpc->v, &gb);
+        else
+            ret = ff_vc1_parse_frame_header_adv(&vpc->v, &gb);
+
+        if (ret < 0)
+             break;
+-        }
+-    }
+
+-    av_free(buf2);
+-}
+        /* keep AV_PICTURE_TYPE_BI internal to VC1 */
+        if (vpc->v.s.pict_type == AV_PICTURE_TYPE_BI)
+            s->pict_type = AV_PICTURE_TYPE_B;
+        else
+            s->pict_type = vpc->v.s.pict_type;
+
+-/**
+- * Find the end of the current frame in the bitstream.
+- * @return the position of the first byte of the next frame, or -1
+- */
+-static int vc1_find_frame_end(ParseContext *pc, const uint8_t *buf,
+-                               int buf_size) {
+-    int pic_found, i;
+-    uint32_t state;
+-
+-    pic_found= pc->frame_start_found;
+-    state= pc->state;
+-
+-    i=0;
+-    if(!pic_found){
+-        for(i=0; i<buf_size; i++){
+-            state= (state<<8) | buf[i];
+-            if(state == VC1_CODE_FRAME || state == VC1_CODE_FIELD){
+-                i++;
+-                pic_found=1;
+-                break;
+        if (avctx->ticks_per_frame > 1){
+            // process pulldown flags
+            s->repeat_pict = 1;
+            // Pulldown flags are only valid when 'broadcast' has been set.
+            // So ticks_per_frame will be 2
+            if (vpc->v.rff){
+                // repeat field
+                s->repeat_pict = 2;
+            }else if (vpc->v.rptfrm){
+                // repeat frames
+                s->repeat_pict = vpc->v.rptfrm * 2 + 1;
+             }
+        }else{
+            s->repeat_pict = 0;
+         }
+-    }
+
+-    if(pic_found){
+-        /* EOF considered as end of frame */
+-        if (buf_size == 0)
+-            return 0;
+-        for(; i<buf_size; i++){
+-            state= (state<<8) | buf[i];
+-            if(IS_MARKER(state) && state != VC1_CODE_FIELD && state != VC1_CODE_SLICE){
+-                pc->frame_start_found=0;
+-                pc->state=-1;
+-                return i-3;
+-            }
+-        }
+        if (vpc->v.broadcast && vpc->v.interlace && !vpc->v.psf)
+            s->field_order = vpc->v.tff ? AV_FIELD_TT : AV_FIELD_BB;
+        else
+            s->field_order = AV_FIELD_PROGRESSIVE;
+
+        break;
+     }
+-    pc->frame_start_found= pic_found;
+-    pc->state= state;
+-    return END_NOT_FOUND;
+ }
+
+ static int vc1_parse(AVCodecParserContext *s,
+@@ -153,22 +119,127 @@ static int vc1_parse(AVCodecParserContext *s,
+                            const uint8_t **poutbuf, int *poutbuf_size,
+                            const uint8_t *buf, int buf_size)
+ {
+    /* Here we do the searching for frame boundaries and headers at
+     * the same time. Only a minimal amount at the start of each
+     * header is unescaped. */
+     VC1ParseContext *vpc = s->priv_data;
+-    int next;
+    int pic_found = vpc->pc.frame_start_found;
+    uint8_t *unesc_buffer = vpc->unesc_buffer;
+    size_t unesc_index = vpc->unesc_index;
+    VC1ParseSearchState search_state = vpc->search_state;
+    int next = END_NOT_FOUND;
+    int i = vpc->bytes_to_skip;
+
+    if (pic_found && buf_size == 0) {
+        /* EOF considered as end of frame */
+        memset(unesc_buffer + unesc_index, 0, UNESCAPED_THRESHOLD - unesc_index);
+        vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+        next = 0;
+    }
+    while (i < buf_size) {
+        int start_code_found = 0;
+        uint8_t b;
+        while (i < buf_size && unesc_index < UNESCAPED_THRESHOLD) {
+            b = buf[i++];
+            unesc_buffer[unesc_index++] = b;
+            if (search_state <= ONE_ZERO)
+                search_state = b ? NO_MATCH : search_state + 1;
+            else if (search_state == TWO_ZEROS) {
+                if (b == 1)
+                    search_state = ONE;
+                else if (b > 1) {
+                    if (b == 3)
+                        unesc_index--; // swallow emulation prevention byte
+                    search_state = NO_MATCH;
+                }
+            }
+            else { // search_state == ONE
+                // Header unescaping terminates early due to detection of next start code
+                search_state = NO_MATCH;
+                start_code_found = 1;
+                break;
+            }
+        }
+        if ((s->flags & PARSER_FLAG_COMPLETE_FRAMES) &&
+                unesc_index >= UNESCAPED_THRESHOLD &&
+                vpc->prev_start_code == (VC1_CODE_FRAME & 0xFF))
+        {
+            // No need to keep scanning the rest of the buffer for
+            // start codes if we know it contains a complete frame and
+            // we've already unescaped all we need of the frame header
+            vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+            break;
+        }
+        if (unesc_index >= UNESCAPED_THRESHOLD && !start_code_found) {
+            while (i < buf_size) {
+                if (search_state == NO_MATCH) {
+                    i += vpc->v.vc1dsp.vc1_find_start_code_candidate(buf + i, buf_size - i);
+                    if (i < buf_size) {
+                        search_state = ONE_ZERO;
+                    }
+                    i++;
+                } else {
+                    b = buf[i++];
+                    if (search_state == ONE_ZERO)
+                        search_state = b ? NO_MATCH : TWO_ZEROS;
+                    else if (search_state == TWO_ZEROS) {
+                        if (b >= 1)
+                            search_state = b == 1 ? ONE : NO_MATCH;
+                    }
+                    else { // search_state == ONE
+                        search_state = NO_MATCH;
+                        start_code_found = 1;
+                        break;
+                    }
+                }
+            }
+        }
+        if (start_code_found) {
+            vc1_extract_header(s, avctx, unesc_buffer, unesc_index);
+
+            vpc->prev_start_code = b;
+            unesc_index = 0;
+
+            if (!(s->flags & PARSER_FLAG_COMPLETE_FRAMES)) {
+                if (!pic_found && (b == (VC1_CODE_FRAME & 0xFF) || b == (VC1_CODE_FIELD & 0xFF))) {
+                    pic_found = 1;
+                }
+                else if (pic_found && b != (VC1_CODE_FIELD & 0xFF) && b != (VC1_CODE_SLICE & 0xFF)) {
+                    next = i - 4;
+                    pic_found = b == (VC1_CODE_FRAME & 0xFF);
+                    break;
+                }
+            }
+        }
+    }
+
+-    if(s->flags & PARSER_FLAG_COMPLETE_FRAMES){
+-        next= buf_size;
+-    }else{
+-        next= vc1_find_frame_end(&vpc->pc, buf, buf_size);
+    vpc->pc.frame_start_found = pic_found;
+    vpc->unesc_index = unesc_index;
+    vpc->search_state = search_state;
+
+    if (s->flags & PARSER_FLAG_COMPLETE_FRAMES) {
+        next = buf_size;
+    } else {
+         if (ff_combine_frame(&vpc->pc, next, &buf, &buf_size) < 0) {
+            vpc->bytes_to_skip = 0;
+             *poutbuf = NULL;
+             *poutbuf_size = 0;
+             return buf_size;
+         }
+     }
+
+-    vc1_extract_headers(s, avctx, buf, buf_size);
+    vpc->v.first_pic_header_flag = 1;
+
+    /* If we return with a valid pointer to a combined frame buffer
+     * then on the next call we'll have been unhelpfully rewound
+     * by up to 4 bytes (depending upon whether the start code
+     * overlapped the input buffer, and if so by how much). We don't
+     * want this: it will either cause spurious second detections of
+     * the start code we've already seen, or cause extra bytes to be
+     * inserted at the start of the unescaped buffer. */
+    vpc->bytes_to_skip = 4;
+    if (next < 0)
+        vpc->bytes_to_skip += next;
+
+     *poutbuf = buf;
+     *poutbuf_size = buf_size;
+@@ -199,6 +270,11 @@ static av_cold int vc1_parse_init(AVCodecParserContext *s)
+ {
+     VC1ParseContext *vpc = s->priv_data;
+     vpc->v.s.slice_context_count = 1;
+    vpc->v.first_pic_header_flag = 1;
+    vpc->prev_start_code = 0;
+    vpc->bytes_to_skip = 0;
+    vpc->unesc_index = 0;
+    vpc->search_state = NO_MATCH;
+     return ff_vc1_init_common(&vpc->v);
+ }
+
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0004-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch
+++ b/projects/RPi/patches/ffmpeg/0004-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch
@ -0,0 +1,285 @@
+From 98428a8cf593587b403076bb54b46cc70ed17ff2 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Mon, 10 Mar 2014 14:42:05 +0000
+Subject: [PATCH 4/6] truehd: add hand-scheduled ARM asm version of
+ ff_mlp_rematrix_channel.
+
+Profiling results for overall audio decode and the rematrix_channels function
+in particular are as follows:
+
+              Before          After
+              Mean   StdDev   Mean   StdDev  Confidence  Change
+6:2 total     370.8  17.0     348.8  20.1    99.9%       +6.3%
+6:2 function  46.4   8.4      45.8   6.6     18.0%       +1.2%  (insignificant)
+8:2 total     343.2  19.0     339.1  15.4    54.7%       +1.2%  (insignificant)
+8:2 function  38.9   3.9      40.2   6.9     52.4%       -3.2%  (insignificant)
+6:6 total     658.4  15.7     604.6  20.8    100.0%      +8.9%
+6:6 function  109.0  8.7      59.5   5.4     100.0%      +83.3%
+8:8 total     896.2  24.5     766.4  17.6    100.0%      +16.9%
+8:8 function  223.4  12.8     93.8   5.0     100.0%      +138.3%
+
+The assembly version has also been tested with a fuzz tester to ensure that
+any combinations of inputs not exercised by my available test streams still
+generate mathematically identical results to the C version.
+---
+ libavcodec/arm/mlpdsp_arm.S      | 222 +++++++++++++++++++++++++++++++++++++++
+ libavcodec/arm/mlpdsp_init_arm.c |  12 +++
+ 2 files changed, 234 insertions(+)
+
+diff --git a/libavcodec/arm/mlpdsp_arm.S b/libavcodec/arm/mlpdsp_arm.S
+index 615819d..9b51d0c 100644
+--- a/libavcodec/arm/mlpdsp_arm.S
+++ b/libavcodec/arm/mlpdsp_arm.S
+@@ -431,3 +431,225 @@ endfunc
+         .unreq  ST3
+         .unreq  I
+         .unreq  PSAMP
+
+/********************************************************************/
+
+PSA     .req    a1 // samples
+PCO     .req    a2 // coeffs
+PBL     .req    a3 // bypassed_lsbs
+INDEX   .req    a4
+CO0     .req    v1
+CO1     .req    v2
+CO2     .req    v3
+CO3     .req    v4
+SA0     .req    v5
+SA1     .req    v6
+SA2     .req    sl
+SA3     .req    fp
+AC0     .req    ip
+AC1     .req    lr
+NOISE   .req    SA0
+LSB     .req    SA1
+DCH     .req    SA2 // dest_ch
+MASK    .req    SA3
+
+    // INDEX is used as follows:
+    // bits 0..6   index2 (values up to 17, but wider so that we can
+    //               add to index field without needing to mask)
+    // bits 7..14  i (values up to 160)
+    // bit 15      underflow detect for i
+    // bits 25..31 (if access_unit_size_pow2 == 128)  \ index
+    // bits 26..31 (if access_unit_size_pow2 == 64)   /
+
+.macro implement_rematrix  shift, index_mask, mask_minus1, maxchan
+    .if \maxchan == 1
+        // We can just leave the coefficients in registers in this case
+        ldrd    CO0, CO1, [PCO]
+    .endif
+1:
+    .if \maxchan == 1
+        ldrd    SA0, SA1, [PSA]
+        smull   AC0, AC1, CO0, SA0
+    .elseif \maxchan == 5
+        ldr     CO0, [PCO, #0]
+        ldr     SA0, [PSA, #0]
+        ldr     CO1, [PCO, #4]
+        ldr     SA1, [PSA, #4]
+        ldrd    CO2, CO3, [PCO, #8]
+        smull   AC0, AC1, CO0, SA0
+        ldrd    SA2, SA3, [PSA, #8]
+        smlal   AC0, AC1, CO1, SA1
+        ldrd    CO0, CO1, [PCO, #16]
+        smlal   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #16]
+        smlal   AC0, AC1, CO3, SA3
+        smlal   AC0, AC1, CO0, SA0
+    .else // \maxchan == 7
+        ldr     CO2, [PCO, #0]
+        ldr     SA2, [PSA, #0]
+        ldr     CO3, [PCO, #4]
+        ldr     SA3, [PSA, #4]
+        ldrd    CO0, CO1, [PCO, #8]
+        smull   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #8]
+        smlal   AC0, AC1, CO3, SA3
+        ldrd    CO2, CO3, [PCO, #16]
+        smlal   AC0, AC1, CO0, SA0
+        ldrd    SA2, SA3, [PSA, #16]
+        smlal   AC0, AC1, CO1, SA1
+        ldrd    CO0, CO1, [PCO, #24]
+        smlal   AC0, AC1, CO2, SA2
+        ldrd    SA0, SA1, [PSA, #24]
+        smlal   AC0, AC1, CO3, SA3
+        smlal   AC0, AC1, CO0, SA0
+    .endif
+        ldm     sp, {NOISE, DCH, MASK}
+        smlal   AC0, AC1, CO1, SA1
+    .if \shift != 0
+      .if \index_mask == 63
+        add     NOISE, NOISE, INDEX, lsr #32-6
+        ldrb    LSB, [PBL], #MAX_CHANNELS
+        ldrsb   NOISE, [NOISE]
+        add     INDEX, INDEX, INDEX, lsl #32-6
+      .else // \index_mask == 127
+        add     NOISE, NOISE, INDEX, lsr #32-7
+        ldrb    LSB, [PBL], #MAX_CHANNELS
+        ldrsb   NOISE, [NOISE]
+        add     INDEX, INDEX, INDEX, lsl #32-7
+      .endif
+        sub     INDEX, INDEX, #1<<7
+        adds    AC0, AC0, NOISE, lsl #\shift + 7
+        adc     AC1, AC1, NOISE, asr #31
+    .else
+        ldrb    LSB, [PBL], #MAX_CHANNELS
+        sub     INDEX, INDEX, #1<<7
+    .endif
+        add     PSA, PSA, #MAX_CHANNELS*4
+        mov     AC0, AC0, lsr #14
+        orr     AC0, AC0, AC1, lsl #18
+    .if !\mask_minus1
+        and     AC0, AC0, MASK
+    .endif
+        add     AC0, AC0, LSB
+        tst     INDEX, #1<<15
+        str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
+        beq     1b
+        b       98f
+.endm
+
+.macro switch_on_maxchan  shift, index_mask, mask_minus1
+        cmp     v4, #5
+        blo     51f
+        beq     50f
+        implement_rematrix  \shift, \index_mask, \mask_minus1, 7
+50:     implement_rematrix  \shift, \index_mask, \mask_minus1, 5
+51:     implement_rematrix  \shift, \index_mask, \mask_minus1, 1
+.endm
+
+.macro switch_on_mask  shift, index_mask
+        cmp     sl, #-1
+        bne     40f
+        switch_on_maxchan  \shift, \index_mask, 1
+40:     switch_on_maxchan  \shift, \index_mask, 0
+.endm
+
+.macro switch_on_au_size  shift
+  .if \shift == 0
+        switch_on_mask  \shift, undefined
+  .else
+        teq     v6, #64
+        bne     30f
+        orr     INDEX, INDEX, v1, lsl #32-6
+        switch_on_mask  \shift, 63
+30:     orr     INDEX, INDEX, v1, lsl #32-7
+        switch_on_mask  \shift, 127
+  .endif
+.endm
+
+/* void ff_mlp_rematrix_channel_arm(int32_t *samples,
+ *                                  const int32_t *coeffs,
+ *                                  const uint8_t *bypassed_lsbs,
+ *                                  const int8_t *noise_buffer,
+ *                                  int index,
+ *                                  unsigned int dest_ch,
+ *                                  uint16_t blockpos,
+ *                                  unsigned int maxchan,
+ *                                  int matrix_noise_shift,
+ *                                  int access_unit_size_pow2,
+ *                                  int32_t mask);
+ */
+function ff_mlp_rematrix_channel_arm, export=1
+        push    {v1-fp,lr}
+        add     v1, sp, #9*4 // point at arguments on stack
+        ldm     v1, {v1-sl}
+        teq     v4, #1
+        itt     ne
+        teqne   v4, #5
+        teqne   v4, #7
+        bne     99f
+        teq     v6, #64
+        it      ne
+        teqne   v6, #128
+        bne     99f
+        sub     v2, v2, #MAX_CHANNELS
+        push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
+        movs    INDEX, v3, lsl #7
+        beq     98f                 // just in case, do nothing if blockpos = 0
+        subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
+        adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
+        orr     INDEX, INDEX, lr
+        // Switch on matrix_noise_shift: values 0 and 1 are
+        // disproportionately common so do those in a form the branch
+        // predictor can accelerate. Values can only go up to 15.
+        cmp     v5, #1
+        beq     11f
+        blo     10f
+A       ldr     pc, [pc, v5, lsl #2]
+T       tbh     [pc, v5, lsl #1]
+0:
+A       .word   0, 0, 0, 12f, 13f, 14f, 15f, 16f, 17f, 18f, 19f, 20f, 21f, 22f, 23f, 24f, 25f
+T       .hword  0, 0, (12f - 0b) / 2, (13f - 0b) / 2, (14f - 0b) / 2, (15f - 0b) / 2
+T       .hword  (16f - 0b) / 2, (17f - 0b) / 2, (18f - 0b) / 2, (19f - 0b) / 2
+T       .hword  (20f - 0b) / 2, (21f - 0b) / 2, (22f - 0b) / 2, (23f - 0b) / 2, (24f - 0b) / 2, (25f - 0b) / 2
+10:     switch_on_au_size  0
+11:     switch_on_au_size  1
+12:     switch_on_au_size  2
+13:     switch_on_au_size  3
+14:     switch_on_au_size  4
+15:     switch_on_au_size  5
+16:     switch_on_au_size  6
+17:     switch_on_au_size  7
+18:     switch_on_au_size  8
+19:     switch_on_au_size  9
+20:     switch_on_au_size  10
+21:     switch_on_au_size  11
+22:     switch_on_au_size  12
+23:     switch_on_au_size  13
+24:     switch_on_au_size  14
+25:     switch_on_au_size  15
+
+98:     add     sp, sp, #3*4
+        pop     {v1-fp,pc}
+99:     // Can't handle these parameters, drop back to C
+        pop     {v1-fp,lr}
+        b       X(ff_mlp_rematrix_channel)
+endfunc
+
+        .unreq  PSA
+        .unreq  PCO
+        .unreq  PBL
+        .unreq  INDEX
+        .unreq  CO0
+        .unreq  CO1
+        .unreq  CO2
+        .unreq  CO3
+        .unreq  SA0
+        .unreq  SA1
+        .unreq  SA2
+        .unreq  SA3
+        .unreq  AC0
+        .unreq  AC1
+        .unreq  NOISE
+        .unreq  LSB
+        .unreq  DCH
+        .unreq  MASK
+diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
+index 9a14815..1bb2276 100644
+--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
+@@ -29,8 +29,20 @@ void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
+                                int firorder, int iirorder,
+                                unsigned int filter_shift, int32_t mask,
+                                int blocksize, int32_t *sample_buffer);
+void ff_mlp_rematrix_channel_arm(int32_t *samples,
+                                 const int32_t *coeffs,
+                                 const uint8_t *bypassed_lsbs,
+                                 const int8_t *noise_buffer,
+                                 int index,
+                                 unsigned int dest_ch,
+                                 uint16_t blockpos,
+                                 unsigned int maxchan,
+                                 int matrix_noise_shift,
+                                 int access_unit_size_pow2,
+                                 int32_t mask);
+
+ av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+ {
+     c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+    c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+ }
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0005-truehd-break-out-part-of-output_data-into-platform-s.patch
+++ b/projects/RPi/patches/ffmpeg/0005-truehd-break-out-part-of-output_data-into-platform-s.patch
@ -0,0 +1,197 @@
+From 5bfcb7a691eb63c56f1485b60f399d79ff943799 Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Wed, 12 Mar 2014 18:18:39 +0000
+Subject: [PATCH 5/6] truehd: break out part of output_data into
+ platform-specific callback.
+
+Verified with profiling that this doesn't have a measurable effect upon
+overall performance.
+---
+ libavcodec/mlpdec.c | 40 +++++++++++++++++++++++-----------------
+ libavcodec/mlpdsp.c | 38 ++++++++++++++++++++++++++++++++++++++
+ libavcodec/mlpdsp.h | 22 ++++++++++++++++++++++
+ 3 files changed, 83 insertions(+), 17 deletions(-)
+
+diff --git a/libavcodec/mlpdec.c b/libavcodec/mlpdec.c
+index 01ded5c..061dabc 100644
+--- a/libavcodec/mlpdec.c
+++ b/libavcodec/mlpdec.c
+@@ -363,6 +363,10 @@ static int read_major_sync(MLPDecodeContext *m, GetBitContext *gb)
+         m->avctx->sample_fmt = AV_SAMPLE_FMT_S32;
+     else
+         m->avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+    m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(m->substream[m->max_decoded_substream].ch_assign,
+                                                           m->substream[m->max_decoded_substream].output_shift,
+                                                           m->substream[m->max_decoded_substream].max_matrix_channel,
+                                                           m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+     m->params_valid = 1;
+     for (substr = 0; substr < MAX_SUBSTREAMS; substr++)
+@@ -612,6 +616,10 @@ FF_ENABLE_DEPRECATION_WARNINGS
+     if (substr == m->max_decoded_substream) {
+         m->avctx->channels       = s->max_matrix_channel + 1;
+         m->avctx->channel_layout = s->ch_layout;
+        m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+                                                               s->output_shift,
+                                                               s->max_matrix_channel,
+                                                               m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+         if (m->avctx->codec_id == AV_CODEC_ID_MLP && m->needs_reordering) {
+             if (m->avctx->channel_layout == (AV_CH_LAYOUT_QUAD|AV_CH_LOW_FREQUENCY) ||
+@@ -857,9 +865,15 @@ static int read_decoding_params(MLPDecodeContext *m, GetBitContext *gbp,
+                 return ret;
+
+     if (s->param_presence_flags & PARAM_OUTSHIFT)
+-        if (get_bits1(gbp))
+        if (get_bits1(gbp)) {
+             for (ch = 0; ch <= s->max_matrix_channel; ch++)
+                 s->output_shift[ch] = get_sbits(gbp, 4);
+            if (substr == m->max_decoded_substream)
+                m->dsp.mlp_pack_output = m->dsp.mlp_select_pack_output(s->ch_assign,
+                                                                       s->output_shift,
+                                                                       s->max_matrix_channel,
+                                                                       m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+        }
+
+     if (s->param_presence_flags & PARAM_QUANTSTEP)
+         if (get_bits1(gbp))
+@@ -1058,9 +1072,6 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
+ {
+     AVCodecContext *avctx = m->avctx;
+     SubStream *s = &m->substream[substr];
+-    unsigned int i, out_ch = 0;
+-    int32_t *data_32;
+-    int16_t *data_16;
+     int ret;
+     int is32 = (m->avctx->sample_fmt == AV_SAMPLE_FMT_S32);
+
+@@ -1078,19 +1089,14 @@ static int output_data(MLPDecodeContext *m, unsigned int substr,
+     frame->nb_samples = s->blockpos;
+     if ((ret = ff_get_buffer(avctx, frame, 0)) < 0)
+         return ret;
+-    data_32 = (int32_t *)frame->data[0];
+-    data_16 = (int16_t *)frame->data[0];
+-
+-    for (i = 0; i < s->blockpos; i++) {
+-        for (out_ch = 0; out_ch <= s->max_matrix_channel; out_ch++) {
+-            int mat_ch = s->ch_assign[out_ch];
+-            int32_t sample = m->sample_buffer[i][mat_ch]
+-                          << s->output_shift[mat_ch];
+-            s->lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+-            if (is32) *data_32++ = sample << 8;
+-            else      *data_16++ = sample >> 8;
+-        }
+-    }
+    s->lossless_check_data = m->dsp.mlp_pack_output(s->lossless_check_data,
+                                                    s->blockpos,
+                                                    m->sample_buffer,
+                                                    frame->data[0],
+                                                    s->ch_assign,
+                                                    s->output_shift,
+                                                    s->max_matrix_channel,
+                                                    is32);
+
+     /* Update matrix encoding side data */
+     if ((ret = ff_side_data_update_matrix_encoding(frame, s->matrix_encoding)) < 0)
+diff --git a/libavcodec/mlpdsp.c b/libavcodec/mlpdsp.c
+index 7a359b0..3ae8c37 100644
+--- a/libavcodec/mlpdsp.c
+++ b/libavcodec/mlpdsp.c
+@@ -89,10 +89,48 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+     }
+ }
+
+static int32_t (*mlp_select_pack_output(uint8_t *ch_assign,
+                                        int8_t *output_shift,
+                                        uint8_t max_matrix_channel,
+                                        int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{
+    return ff_mlp_pack_output; /* arguments unused: the generic C packer is always chosen */
+}
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+                           uint16_t blockpos,
+                           int32_t (*sample_buffer)[MAX_CHANNELS],
+                           void *data,
+                           uint8_t *ch_assign,
+                           int8_t *output_shift,
+                           uint8_t max_matrix_channel,
+                           int is32)
+{
+    unsigned int i, out_ch = 0;
+    int32_t *data_32 = data;
+    int16_t *data_16 = data;
+    /* Write blockpos interleaved frames to data; return the updated check word. */
+    for (i = 0; i < blockpos; i++) {
+        for (out_ch = 0; out_ch <= max_matrix_channel; out_ch++) {
+            int mat_ch = ch_assign[out_ch];
+            int32_t sample = sample_buffer[i][mat_ch] *
+                             (1U << output_shift[mat_ch]); /* unsigned: << of negative is UB */
+            lossless_check_data ^= (sample & 0xffffff) << mat_ch;
+            if (is32)
+                *data_32++ = sample * 256U; /* same bits as sample << 8, without UB */
+            else
+                *data_16++ = sample >> 8;
+        }
+    }
+    return lossless_check_data;
+}
+
+ av_cold void ff_mlpdsp_init(MLPDSPContext *c)
+ {
+     c->mlp_filter_channel = mlp_filter_channel;
+     c->mlp_rematrix_channel = ff_mlp_rematrix_channel;
+    c->mlp_select_pack_output = mlp_select_pack_output;
+    c->mlp_pack_output = ff_mlp_pack_output;
+     if (ARCH_ARM)
+         ff_mlpdsp_init_arm(c);
+     if (ARCH_X86)
+diff --git a/libavcodec/mlpdsp.h b/libavcodec/mlpdsp.h
+index f98e9be..a0edeb7 100644
+--- a/libavcodec/mlpdsp.h
+++ b/libavcodec/mlpdsp.h
+@@ -23,6 +23,7 @@
+ #define AVCODEC_MLPDSP_H
+
+ #include <stdint.h>
+#include "mlp.h"
+
+ void ff_mlp_rematrix_channel(int32_t *samples,
+                              const int32_t *coeffs,
+@@ -36,6 +37,15 @@ void ff_mlp_rematrix_channel(int32_t *samples,
+                              int access_unit_size_pow2,
+                              int32_t mask);
+
+int32_t ff_mlp_pack_output(int32_t lossless_check_data,
+                           uint16_t blockpos,
+                           int32_t (*sample_buffer)[MAX_CHANNELS],
+                           void *data,
+                           uint8_t *ch_assign,
+                           int8_t *output_shift,
+                           uint8_t max_matrix_channel,
+                           int is32);
+
+ typedef struct MLPDSPContext {
+     void (*mlp_filter_channel)(int32_t *state, const int32_t *coeff,
+                                int firorder, int iirorder,
+@@ -52,6 +62,18 @@ typedef struct MLPDSPContext {
+                                  int matrix_noise_shift,
+                                  int access_unit_size_pow2,
+                                  int32_t mask);
+    int32_t (*(*mlp_select_pack_output)(uint8_t *ch_assign,
+                                        int8_t *output_shift,
+                                        uint8_t max_matrix_channel,
+                                        int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+    int32_t (*mlp_pack_output)(int32_t lossless_check_data,
+                               uint16_t blockpos,
+                               int32_t (*sample_buffer)[MAX_CHANNELS],
+                               void *data,
+                               uint8_t *ch_assign,
+                               int8_t *output_shift,
+                               uint8_t max_matrix_channel,
+                               int is32);
+ } MLPDSPContext;
+
+ void ff_mlpdsp_init(MLPDSPContext *c);
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/0006-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch
+++ b/projects/RPi/patches/ffmpeg/0006-truehd-add-hand-scheduled-ARM-asm-version-of-ff_mlp_.patch
@ -0,0 +1,689 @@
+From c647209386bd811cc1c33b4fc8ec17a00f8c8ded Mon Sep 17 00:00:00 2001
+From: Ben Avison <bavison@riscosopen.org>
+Date: Thu, 13 Mar 2014 00:21:55 +0000
+Subject: [PATCH 6/6] truehd: add hand-scheduled ARM asm version of
+ ff_mlp_pack_output.
+
+Profiling results for overall decode and the output_data function in
+particular are as follows:
+
+              Before          After
+              Mean   StdDev   Mean   StdDev  Confidence  Change
+6:2 total     339.6  15.1     329.3  16.0    95.8%       +3.1%  (insignificant)
+6:2 function  24.6   6.0      9.9    3.1     100.0%      +148.5%
+8:2 total     324.5  15.5     323.6  14.3    15.2%       +0.3%  (insignificant)
+8:2 function  20.4   3.9      9.9    3.4     100.0%      +104.7%
+6:6 total     572.8  20.6     539.9  24.2    100.0%      +6.1%
+6:6 function  54.5   5.6      16.0   3.8     100.0%      +240.9%
+8:8 total     741.5  21.2     702.5  18.5    100.0%      +5.6%
+8:8 function  63.9   7.6      18.4   4.8     100.0%      +247.3%
+
+The assembly version has also been tested with a fuzz tester to ensure that
+any combinations of inputs not exercised by my available test streams still
+generate mathematically identical results to the C version.
+---
+ libavcodec/arm/Makefile          |   1 +
+ libavcodec/arm/mlpdsp_armv6.S    | 530 +++++++++++++++++++++++++++++++++++++++
+ libavcodec/arm/mlpdsp_init_arm.c |  96 +++++++
+ 3 files changed, 627 insertions(+)
+ create mode 100644 libavcodec/arm/mlpdsp_armv6.S
+
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index ba673b1..7b2f923 100644
+--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
+@@ -52,6 +52,7 @@ ARMV6-OBJS-$(CONFIG_AC3DSP)            += arm/ac3dsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_H264DSP)           += arm/h264dsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_HPELDSP)           += arm/hpeldsp_init_armv6.o      \
+                                           arm/hpeldsp_armv6.o
+ARMV6-OBJS-$(CONFIG_MLP_DECODER)       += arm/mlpdsp_armv6.o
+ ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP)      += arm/mpegaudiodsp_fixed_armv6.o
+ ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
+                                           arm/vp8dsp_init_armv6.o       \
+diff --git a/libavcodec/arm/mlpdsp_armv6.S b/libavcodec/arm/mlpdsp_armv6.S
+new file mode 100644
+index 0000000..05a2c85
+--- /dev/null
+++ b/libavcodec/arm/mlpdsp_armv6.S
+@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2014 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+.macro loadregoffsh2  group, index, base, offgroup, offindex
+       .altmacro
+       loadregoffsh2_ \group, %(\index), \base, \offgroup, %(\offindex)
+       .noaltmacro
+.endm
+
+.macro loadregoffsh2_ group, index, base, offgroup, offindex
+        ldr     \group\index, [\base, \offgroup\offindex, lsl #2]
+.endm
+
+.macro eorlslreg  check, data, group, index
+        .altmacro
+        eorlslreg_ \check, \data, \group, %(\index)
+        .noaltmacro
+.endm
+
+.macro eorlslreg_ check, data, group, index
+        eor     \check, \check, \data, lsl \group\index
+.endm
+
+.macro decr_modulo var, by, modulus  // assembly-time symbol update: subtract by, and reset to modulus when the result is 0
+ .set \var, \var - \by
+ .if \var == 0
+  .set \var, \modulus
+ .endif
+.endm
+
+ .macro load_group1  size, channels, r0, r1, r2, r3, pointer_dead=0
+  .if \size == 2
+        ldrd    \r0, \r1, [IN], #(\size + 8 - \channels) * 4
+  .else // size == 4
+   .if IDX1 > 4 || \channels==8
+        ldm     IN!, {\r0, \r1, \r2, \r3}
+   .else
+        ldm     IN, {\r0, \r1, \r2, \r3}
+    .if !\pointer_dead
+        add     IN, IN, #(4 + 8 - \channels) * 4
+     .endif
+   .endif
+  .endif
+        decr_modulo IDX1, \size, \channels
+ .endm
+
+ .macro load_group2  size, channels, r0, r1, r2, r3, pointer_dead=0
+  .if \size == 2
+   .if IDX1 > 2
+        ldm     IN!, {\r2, \r3}
+   .else
+//A   .ifc \r2, ip
+//A    .if \pointer_dead
+//A       ldm     IN, {\r2, \r3}
+//A    .else
+//A       ldr     \r2, [IN], #4
+//A       ldr     \r3, [IN], #(\size - 1 + 8 - \channels) * 4
+//A    .endif
+//A   .else
+        ldrd    \r2, \r3, [IN], #(\size + 8 - \channels) * 4
+//A   .endif
+   .endif
+  .endif
+        decr_modulo IDX1, \size, \channels
+ .endm
+
+.macro implement_pack  inorder, channels, shift
+.if \inorder
+.ifc \shift, mixed
+
+CHECK   .req    a1
+COUNT   .req    a2
+IN      .req    a3
+OUT     .req    a4
+DAT0    .req    v1
+DAT1    .req    v2
+DAT2    .req    v3
+DAT3    .req    v4
+SHIFT0  .req    v5
+SHIFT1  .req    v6
+SHIFT2  .req    sl
+SHIFT3  .req    fp
+SHIFT4  .req    ip
+SHIFT5  .req    lr
+
+ .macro output4words
+  .set SIZE_GROUP1, IDX1
+  .if SIZE_GROUP1 > 4
+   .set SIZE_GROUP1, 4
+  .endif
+  .set SIZE_GROUP2, 4 - SIZE_GROUP1
+        load_group1  SIZE_GROUP1, \channels, DAT0, DAT1, DAT2, DAT3
+        load_group2  SIZE_GROUP2, \channels, DAT0, DAT1, DAT2, DAT3
+   .if \channels == 2
+        lsl     DAT0, SHIFT0
+        lsl     DAT1, SHIFT1
+        lsl     DAT2, SHIFT0
+        lsl     DAT3, SHIFT1
+   .elseif \channels == 6
+    .if IDX2 == 6
+        lsl     DAT0, SHIFT0
+        lsl     DAT1, SHIFT1
+        lsl     DAT2, SHIFT2
+        lsl     DAT3, SHIFT3
+    .elseif IDX2 == 2
+        lsl     DAT0, SHIFT4
+        lsl     DAT1, SHIFT5
+        lsl     DAT2, SHIFT0
+        lsl     DAT3, SHIFT1
+    .else // IDX2 == 4
+        lsl     DAT0, SHIFT2
+        lsl     DAT1, SHIFT3
+        lsl     DAT2, SHIFT4
+        lsl     DAT3, SHIFT5
+    .endif
+   .elseif \channels == 8
+    .if IDX2 == 8
+        uxtb    SHIFT0, SHIFT4, ror #0
+        uxtb    SHIFT1, SHIFT4, ror #8
+        uxtb    SHIFT2, SHIFT4, ror #16
+        uxtb    SHIFT3, SHIFT4, ror #24
+    .else
+        uxtb    SHIFT0, SHIFT5, ror #0
+        uxtb    SHIFT1, SHIFT5, ror #8
+        uxtb    SHIFT2, SHIFT5, ror #16
+        uxtb    SHIFT3, SHIFT5, ror #24
+    .endif
+        lsl     DAT0, SHIFT0
+        lsl     DAT1, SHIFT1
+        lsl     DAT2, SHIFT2
+        lsl     DAT3, SHIFT3
+   .endif
+        eor     CHECK, CHECK, DAT0, lsr #8 - (\channels - IDX2)
+        eor     CHECK, CHECK, DAT1, lsr #7 - (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+        eor     CHECK, CHECK, DAT2, lsr #8 - (\channels - IDX2)
+        eor     CHECK, CHECK, DAT3, lsr #7 - (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+        stm     OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels  // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_mixedshift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+        tst     COUNT, #SAMPLES_PER_LOOP - 1  // always seems to be in practice
+        bne     X(ff_mlp_pack_output)         // but just in case, branch to C implementation if not
+ .endif
+        teq     COUNT, #0
+        it      eq
+        bxeq    lr
+        push    {v1-v6,sl,fp,lr}
+        ldr     SHIFT0, [sp, #(9+1)*4]  // get output_shift from stack
+        ldr     SHIFT1, =0x08080808
+        ldr     SHIFT4, [SHIFT0]
+ .if \channels == 2
+        uadd8   SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+        uxtb    SHIFT0, SHIFT4, ror #0
+        uxtb    SHIFT1, SHIFT4, ror #8
+ .else
+        ldr     SHIFT5, [SHIFT0, #4]
+        uadd8   SHIFT4, SHIFT4, SHIFT1 // increase all shifts by 8
+        uadd8   SHIFT5, SHIFT5, SHIFT1
+  .if \channels == 6
+        uxtb    SHIFT0, SHIFT4, ror #0
+        uxtb    SHIFT1, SHIFT4, ror #8
+        uxtb    SHIFT2, SHIFT4, ror #16
+        uxtb    SHIFT3, SHIFT4, ror #24
+        uxtb    SHIFT4, SHIFT5, ror #0
+        uxtb    SHIFT5, SHIFT5, ror #8
+  .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+        output4words
+ .endr
+        subs    COUNT, COUNT, #SAMPLES_PER_LOOP
+        bne     0b
+        pop     {v1-v6,sl,fp,pc}
+        .ltorg
+endfunc
+ .purgem output4words
+
+        .unreq  CHECK
+        .unreq  COUNT
+        .unreq  IN
+        .unreq  OUT
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  SHIFT0
+        .unreq  SHIFT1
+        .unreq  SHIFT2
+        .unreq  SHIFT3
+        .unreq  SHIFT4
+        .unreq  SHIFT5
+
+.else // not mixed
+
+CHECK   .req    a1
+COUNT   .req    a2
+IN      .req    a3
+OUT     .req    a4
+DAT0    .req    v1
+DAT1    .req    v2
+DAT2    .req    v3
+DAT3    .req    v4
+DAT4    .req    v5
+DAT5    .req    v6
+DAT6    .req    sl // use these rather than the otherwise unused
+DAT7    .req    fp // ip and lr so that we can load them using LDRD
+
+ .macro output4words  tail, head, r0, r1, r2, r3, r4, r5, r6, r7, pointer_dead=0
+  .if \head
+   .set SIZE_GROUP1, IDX1
+   .if SIZE_GROUP1 > 4
+    .set SIZE_GROUP1, 4
+   .endif
+   .set SIZE_GROUP2, 4 - SIZE_GROUP1
+        load_group1  SIZE_GROUP1, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+  .endif
+  .if \tail
+        eor     CHECK, CHECK, \r4, lsr #8 - (\channels - IDX2)
+        eor     CHECK, CHECK, \r5, lsr #7 - (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+  .endif
+  .if \head
+        load_group2  SIZE_GROUP2, \channels, \r0, \r1, \r2, \r3, \pointer_dead
+  .endif
+  .if \tail
+        eor     CHECK, CHECK, \r6, lsr #8 - (\channels - IDX2)
+        eor     CHECK, CHECK, \r7, lsr #7 - (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+        stm     OUT!, {\r4, \r5, \r6, \r7}
+  .endif
+  .if \head
+        lsl     \r0, #8 + \shift
+        lsl     \r1, #8 + \shift
+        lsl     \r2, #8 + \shift
+        lsl     \r3, #8 + \shift
+  .endif
+ .endm
+
+ .set WORDS_PER_LOOP, \channels  // calculate LCM (channels, 8)
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 8
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_inorder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+        tst     COUNT, #SAMPLES_PER_LOOP - 1  // always seems to be in practice
+        bne     X(ff_mlp_pack_output)         // but just in case, branch to C implementation if not
+ .endif
+        subs    COUNT, COUNT, #SAMPLES_PER_LOOP
+        it      lo
+        bxlo    lr
+        push    {v1-v6,sl,fp,lr}
+ .set IDX1, \channels
+ .set IDX2, \channels
+        output4words  0, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+0:      beq     1f
+ .rept WORDS_PER_LOOP / 8
+        output4words  1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+        output4words  1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+        subs    COUNT, COUNT, #SAMPLES_PER_LOOP
+        bne     0b
+1:
+ .rept WORDS_PER_LOOP / 8 - 1
+        output4words  1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3
+        output4words  1, 1, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+ .endr
+        output4words  1, 1, DAT4, DAT5, DAT6, DAT7, DAT0, DAT1, DAT2, DAT3, pointer_dead=1
+        output4words  1, 0, DAT0, DAT1, DAT2, DAT3, DAT4, DAT5, DAT6, DAT7
+        pop     {v1-v6,sl,fp,pc}
+endfunc
+ .purgem output4words
+
+        .unreq  CHECK
+        .unreq  COUNT
+        .unreq  IN
+        .unreq  OUT
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  DAT4
+        .unreq  DAT5
+        .unreq  DAT6
+        .unreq  DAT7
+
+.endif // mixed
+.else // not inorder
+.ifc \shift, mixed
+
+// This case not currently handled
+
+.else // not mixed
+
+#if !CONFIG_THUMB
+
+CHECK   .req    a1
+COUNT   .req    a2
+IN      .req    a3
+OUT     .req    a4
+DAT0    .req    v1
+DAT1    .req    v2
+DAT2    .req    v3
+DAT3    .req    v4
+CHAN0   .req    v5
+CHAN1   .req    v6
+CHAN2   .req    sl
+CHAN3   .req    fp
+CHAN4   .req    ip
+CHAN5   .req    lr
+
+ .macro output4words
+  .if \channels == 8
+   .if IDX1 == 8
+        uxtb    CHAN0, CHAN4, ror #0
+        uxtb    CHAN1, CHAN4, ror #8
+        uxtb    CHAN2, CHAN4, ror #16
+        uxtb    CHAN3, CHAN4, ror #24
+   .else
+        uxtb    CHAN0, CHAN5, ror #0
+        uxtb    CHAN1, CHAN5, ror #8
+        uxtb    CHAN2, CHAN5, ror #16
+        uxtb    CHAN3, CHAN5, ror #24
+   .endif
+        ldr     DAT0, [IN, CHAN0, lsl #2]
+        ldr     DAT1, [IN, CHAN1, lsl #2]
+        ldr     DAT2, [IN, CHAN2, lsl #2]
+        ldr     DAT3, [IN, CHAN3, lsl #2]
+   .if IDX1 == 4
+        add     IN, IN, #8*4
+   .endif
+        decr_modulo IDX1, 4, \channels
+  .else
+   .set SIZE_GROUP1, IDX1
+   .if SIZE_GROUP1 > 4
+    .set SIZE_GROUP1, 4
+   .endif
+   .set SIZE_GROUP2, 4 - SIZE_GROUP1
+   .if SIZE_GROUP1 == 2
+        loadregoffsh2  DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+        loadregoffsh2  DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+        add     IN, IN, #8*4
+   .else // SIZE_GROUP1 == 4
+        loadregoffsh2  DAT, 0, IN, CHAN, 0 + (\channels - IDX1)
+        loadregoffsh2  DAT, 1, IN, CHAN, 1 + (\channels - IDX1)
+        loadregoffsh2  DAT, 2, IN, CHAN, 2 + (\channels - IDX1)
+        loadregoffsh2  DAT, 3, IN, CHAN, 3 + (\channels - IDX1)
+    .if IDX1 == 4
+        add     IN, IN, #8*4
+    .endif
+   .endif
+        decr_modulo IDX1, SIZE_GROUP1, \channels
+   .if SIZE_GROUP2 == 2
+        loadregoffsh2  DAT, 2, IN, CHAN, 0 + (\channels - IDX1)
+        loadregoffsh2  DAT, 3, IN, CHAN, 1 + (\channels - IDX1)
+    .if IDX1 == 2
+        add     IN, IN, #8*4
+    .endif
+   .endif
+        decr_modulo IDX1, SIZE_GROUP2, \channels
+  .endif
+  .if \channels == 8 // in this case we can corrupt CHAN0-3
+        rsb     CHAN0, CHAN0, #8
+        rsb     CHAN1, CHAN1, #8
+        rsb     CHAN2, CHAN2, #8
+        rsb     CHAN3, CHAN3, #8
+        lsl     DAT0, #8 + \shift
+        lsl     DAT1, #8 + \shift
+        lsl     DAT2, #8 + \shift
+        lsl     DAT3, #8 + \shift
+        eor     CHECK, CHECK, DAT0, lsr CHAN0
+        eor     CHECK, CHECK, DAT1, lsr CHAN1
+        eor     CHECK, CHECK, DAT2, lsr CHAN2
+        eor     CHECK, CHECK, DAT3, lsr CHAN3
+  .else
+   .if \shift != 0
+        lsl     DAT0, #\shift
+        lsl     DAT1, #\shift
+        lsl     DAT2, #\shift
+        lsl     DAT3, #\shift
+   .endif
+        bic     DAT0, DAT0, #0xff000000
+        bic     DAT1, DAT1, #0xff000000
+        bic     DAT2, DAT2, #0xff000000
+        bic     DAT3, DAT3, #0xff000000
+        eorlslreg CHECK, DAT0, CHAN, 0 + (\channels - IDX2)
+        eorlslreg CHECK, DAT1, CHAN, 1 + (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+        eorlslreg CHECK, DAT2, CHAN, 0 + (\channels - IDX2)
+        eorlslreg CHECK, DAT3, CHAN, 1 + (\channels - IDX2)
+   decr_modulo IDX2, 2, \channels
+        lsl     DAT0, #8
+        lsl     DAT1, #8
+        lsl     DAT2, #8
+        lsl     DAT3, #8
+  .endif
+        stm     OUT!, {DAT0 - DAT3}
+ .endm
+
+ .set WORDS_PER_LOOP, \channels  // calculate LCM (channels, 4)
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .if (WORDS_PER_LOOP % 2) == 0
+  .set WORDS_PER_LOOP, WORDS_PER_LOOP / 2
+ .endif
+ .set WORDS_PER_LOOP, WORDS_PER_LOOP * 4
+ .set SAMPLES_PER_LOOP, WORDS_PER_LOOP / \channels
+
+function ff_mlp_pack_output_outoforder_\channels\()ch_\shift\()shift_armv6, export=1
+ .if SAMPLES_PER_LOOP > 1
+        tst     COUNT, #SAMPLES_PER_LOOP - 1  // always seems to be in practice
+        bne     X(ff_mlp_pack_output)         // but just in case, branch to C implementation if not
+ .endif
+        teq     COUNT, #0
+        it      eq
+        bxeq    lr
+        push    {v1-v6,sl,fp,lr}
+        ldr     CHAN0, [sp, #(9+0)*4]  // get ch_assign from stack
+        ldr     CHAN4, [CHAN0]
+ .if \channels == 2
+        uxtb    CHAN0, CHAN4, ror #0
+        uxtb    CHAN1, CHAN4, ror #8
+ .else
+        ldr     CHAN5, [CHAN0, #4]
+  .if \channels == 6
+        uxtb    CHAN0, CHAN4, ror #0
+        uxtb    CHAN1, CHAN4, ror #8
+        uxtb    CHAN2, CHAN4, ror #16
+        uxtb    CHAN3, CHAN4, ror #24
+        uxtb    CHAN4, CHAN5, ror #0
+        uxtb    CHAN5, CHAN5, ror #8
+  .endif
+ .endif
+ .set IDX1, \channels
+ .set IDX2, \channels
+0:
+ .rept WORDS_PER_LOOP / 4
+        output4words
+ .endr
+        subs    COUNT, COUNT, #SAMPLES_PER_LOOP
+        bne     0b
+        pop     {v1-v6,sl,fp,pc}
+        .ltorg
+endfunc
+ .purgem output4words
+
+        .unreq  CHECK
+        .unreq  COUNT
+        .unreq  IN
+        .unreq  OUT
+        .unreq  DAT0
+        .unreq  DAT1
+        .unreq  DAT2
+        .unreq  DAT3
+        .unreq  CHAN0
+        .unreq  CHAN1
+        .unreq  CHAN2
+        .unreq  CHAN3
+        .unreq  CHAN4
+        .unreq  CHAN5
+
+#endif // !CONFIG_THUMB
+
+.endif // mixed
+.endif // inorder
+.endm // implement_pack
+
+.macro pack_channels  inorder, channels
+        implement_pack  \inorder, \channels, 0
+        implement_pack  \inorder, \channels, 1
+        implement_pack  \inorder, \channels, 2
+        implement_pack  \inorder, \channels, 3
+        implement_pack  \inorder, \channels, 4
+        implement_pack  \inorder, \channels, 5
+        implement_pack  \inorder, \channels, mixed
+.endm
+
+.macro pack_order  inorder
+        pack_channels  \inorder, 2
+        pack_channels  \inorder, 6
+        pack_channels  \inorder, 8
+.endm
+
+        pack_order  0
+        pack_order  1
+diff --git a/libavcodec/arm/mlpdsp_init_arm.c b/libavcodec/arm/mlpdsp_init_arm.c
+index 1bb2276..10ec316 100644
+--- a/libavcodec/arm/mlpdsp_init_arm.c
+++ b/libavcodec/arm/mlpdsp_init_arm.c
+@@ -41,8 +41,104 @@ void ff_mlp_rematrix_channel_arm(int32_t *samples,
+                                  int access_unit_size_pow2,
+                                  int32_t mask);
+
+#define DECLARE_PACK(order,channels,shift) \
+    int32_t ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int);
+#define ENUMERATE_PACK(order,channels,shift) \
+    ff_mlp_pack_output_##order##order_##channels##ch_##shift##shift_armv6,
+#define PACK_CHANNELS(macro,order,channels) \
+        macro(order,channels,0) \
+        macro(order,channels,1) \
+        macro(order,channels,2) \
+        macro(order,channels,3) \
+        macro(order,channels,4) \
+        macro(order,channels,5) \
+        macro(order,channels,mixed)
+#define PACK_ORDER(macro,order) \
+        PACK_CHANNELS(macro,order,2) \
+        PACK_CHANNELS(macro,order,6) \
+        PACK_CHANNELS(macro,order,8)
+#define PACK_ALL(macro) \
+        PACK_ORDER(macro,outof) \
+        PACK_ORDER(macro,in)
+PACK_ALL(DECLARE_PACK)
+
+#define ff_mlp_pack_output_outoforder_2ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_mixedshift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_mixedshift_armv6 0
+#if CONFIG_THUMB
+#define ff_mlp_pack_output_outoforder_2ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_2ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_6ch_5shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_0shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_1shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_2shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_3shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_4shift_armv6 0
+#define ff_mlp_pack_output_outoforder_8ch_5shift_armv6 0
+#endif
+
+static int32_t (*mlp_select_pack_output_armv6(uint8_t *ch_assign,
+                                              int8_t *output_shift,
+                                              uint8_t max_matrix_channel,
+                                              int is32))(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int)
+{ /* Returns the ARMv6 pack routine matching the channel count, channel order and shift pattern, or the C ff_mlp_pack_output when no variant fits. */
+    int ch_index;
+    int shift = output_shift[0] < 0 || output_shift[0] > 5 ? 6 : output_shift[0]; // 0..5 = candidate uniform shift, 6 = use the "mixed" variant
+    int inorder = 1;
+    static int32_t (*const routine[2*3*7])(int32_t, uint16_t, int32_t (*)[], void *, uint8_t*, int8_t *, uint8_t, int) = {
+            PACK_ALL(ENUMERATE_PACK) // laid out as [order: outof, in][channels: 2, 6, 8][shift: 0..5, mixed]
+    };
+    int i;
+
+    if (!is32) // don't support 16-bit output (it's not used by TrueHD)
+        return ff_mlp_pack_output;
+
+    switch (max_matrix_channel) {
+    case 1: // 2 channels
+        ch_index = 0;
+        break;
+    case 5: // 6 channels
+        ch_index = 1;
+        break;
+    case 7: // 8 channels
+        ch_index = 2;
+        break;
+    default:
+        return ff_mlp_pack_output;
+    }
+
+    for (i = 0; i <= max_matrix_channel; i++) {
+        if (shift != 6 && output_shift[i] != shift)
+            shift = 6; // indicate mixed shifts
+        if (ch_assign[i] != i)
+            inorder = 0;
+    }
+#if CONFIG_THUMB
+    if (!inorder)
+        return ff_mlp_pack_output; // can't currently handle an order array except in ARM mode
+#else
+    if (shift == 6 && !inorder)
+        return ff_mlp_pack_output; // can't currently handle both an order array and a shift array
+#endif
+
+    return routine[(inorder*3+ch_index)*7+shift]; // index follows the [order][channels][shift] layout of the table
+}
+
+ av_cold void ff_mlpdsp_init_arm(MLPDSPContext *c)
+ {
+    int cpu_flags = av_get_cpu_flags();
+
+     c->mlp_filter_channel = ff_mlp_filter_channel_arm;
+     c->mlp_rematrix_channel = ff_mlp_rematrix_channel_arm;
+    if (cpu_flags & AV_CPU_FLAG_ARMV6)
+        c->mlp_select_pack_output = mlp_select_pack_output_armv6;
+ }
+--
+1.9.1
--- a/projects/RPi/patches/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
+++ b/projects/RPi/patches/ffmpeg/ffmpeg_Speed_up_wtv_index_creation.patch
@ -0,0 +1,47 @@
+commit 0e7427498cb1131671f6fe9d054245ae7e5a36f5
+Author: popcornmix <popcornmix@gmail.com>
+Date:   Tue Mar 25 19:43:07 2014 +0000
+
+    [ffmpeg] Speed up wtv index creation
+
+    The index creation is O(N^2) with number of entries (typically thousands).
+    On a Pi this can take more than 60 seconds to execute for a recording of a few hours.
+
+    By replacing with an O(N) loop, this takes virtually zero time
+
+diff --git a/libavformat/wtvdec.c b/libavformat/wtvdec.c
+index e423370..70898bd 100644
+--- a/libavformat/wtvdec.c
+++ b/libavformat/wtvdec.c
+@@ -980,21 +980,23 @@ static int read_header(AVFormatContext *s)
+                 pb = wtvfile_open(s, root, root_size, ff_timeline_table_0_entries_Events_le16);
+                 if (pb) {
+                     int i;
+                    AVIndexEntry *e = wtv->index_entries;
+                    AVIndexEntry *e_end = wtv->index_entries + wtv->nb_index_entries - 1;
+                    uint64_t last_position = 0;
+                     while (1) {
+                         uint64_t frame_nb = avio_rl64(pb);
+                         uint64_t position = avio_rl64(pb);
+                        while (e <= e_end && frame_nb > e->size) { /* test the bound first: e may already be one past e_end */
+                           e->pos = last_position;
+                           e++;
+                        }
+                         if (url_feof(pb))
+                             break;
+-                        for (i = wtv->nb_index_entries - 1; i >= 0; i--) {
+-                            AVIndexEntry *e = wtv->index_entries + i;
+-                            if (frame_nb > e->size)
+-                                break;
+-                            if (position > e->pos)
+-                                e->pos = position;
+-                        }
+                        last_position = position;
+                     }
+                    e_end->pos = last_position;
+                     wtvfile_close(pb);
+-                    st->duration = wtv->index_entries[wtv->nb_index_entries - 1].timestamp;
+                    st->duration = e_end->timestamp;
+                 }
+             }
+         }