From a9feeb705f37f1aef091119934ff28078c34f1f0 Mon Sep 17 00:00:00 2001
From: MilhouseVH <milhouseVH.github@nmacleod.com>
Date: Thu, 1 Feb 2018 23:50:49 +0000
Subject: [PATCH] ffmpeg: update to ffmpeg-f96fd5c (3.4.1-Leia-Alpha-1)

---
 packages/multimedia/ffmpeg/package.mk         |    4 +-
 ...l-unsupported-GMC-with-more-than-one.patch |    6 +-
 ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 4526 +++++++----------
 ...g-99.1004-added_upstream_mvc_patches.patch |   24 +-
 4 files changed, 1867 insertions(+), 2693 deletions(-)

diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index c203364dc1..af3edf59da 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -18,8 +18,8 @@
 
 PKG_NAME="ffmpeg"
 # Current branch is: release/3.4-kodi
-PKG_VERSION="d413620"
-PKG_SHA256="c02de2197f8b70544f018e83f48c1bed2a1b47e1a1aa34ef59d9167fb0d2090a"
+PKG_VERSION="f96fd5c"
+PKG_SHA256="35ccc07c72b203101030a35b4bb11779365adb7bbf143ef1d68a1f87c781e38b"
 PKG_ARCH="any"
 PKG_LICENSE="LGPLv2.1+"
 PKG_SITE="https://ffmpeg.org"
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
index c3c09d6325..6721c8d3be 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
@@ -1,4 +1,4 @@
-From 214a8ccc1489db28ce6cec2739365d7eebbdb0f9 Mon Sep 17 00:00:00 2001
+From d8bdcc8791c501921ee8961f3b0de0bd47668ebf Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
 Date: Fri, 5 Jun 2015 22:48:33 +0100
 Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
@@ -10,10 +10,10 @@ Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
  2 files changed, 5 insertions(+)
 
 diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index c207d3a784..08aa8112b1 100644
+index c26b6d607c..6c4b011b5c 100644
 --- a/libavcodec/avcodec.h
 +++ b/libavcodec/avcodec.h
-@@ -2967,6 +2967,7 @@ typedef struct AVCodecContext {
+@@ -2965,6 +2965,7 @@ typedef struct AVCodecContext {
  #define FF_BUG_MS               8192 ///< Work around various bugs in Microsoft's broken decoders.
  #define FF_BUG_TRUNCATED       16384
  #define FF_BUG_IEDGE           32768
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
index 5104bfd261..b3fb4b36ac 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
@@ -19,7 +19,7 @@ index dabb51762d..0b1f739d22 100644
  /ffplay
  /ffprobe
 diff --git a/configure b/configure
-index 18d80ee87a..d519af9074 100755
+index 18d80ee87a..9e621d09c1 100755
 --- a/configure
 +++ b/configure
 @@ -313,6 +313,7 @@ External library support:
@@ -38,15 +38,32 @@ index 18d80ee87a..d519af9074 100755
      runtime_cpudetect
      safe_bitstream_reader
      shared
-@@ -2500,6 +2502,8 @@ hap_decoder_select="snappy texturedsp"
+@@ -2198,6 +2200,7 @@ CONFIG_EXTRA="
+     rtpdec
+     rtpenc_chain
+     rv34dsp
++    sand
+     sinewin
+     snappy
+     srtp
+@@ -2500,6 +2503,8 @@ hap_decoder_select="snappy texturedsp"
  hap_encoder_deps="libsnappy"
  hap_encoder_select="texturedspenc"
  hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
 +hevc_rpi_decoder_deps="rpi"
-+hevc_rpi_decoder_select="hevc_decoder"
++hevc_rpi_decoder_select="hevc_decoder sand"
  huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
  huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
  iac_decoder_select="imc_decoder"
+@@ -3269,6 +3274,8 @@ tinterlace_filter_deps="gpl"
+ tinterlace_merge_test_deps="tinterlace_filter"
+ tinterlace_pad_test_deps="tinterlace_filter"
+ tonemap_filter_deps="const_nan"
++unsand_filter_deps="rpi"
++unsand_filter_select="sand"
+ uspp_filter_deps="gpl avcodec"
+ vaguedenoiser_filter_deps="gpl"
+ vidstabdetect_filter_deps="libvidstab"
 diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
 index 3ee31473dc..312864d737 100644
 --- a/fftools/ffmpeg.c
@@ -451,7 +468,7 @@ index 100fa76e46..93a1b8edaf 100644
  
  /* Add all the streams from the given input file to the global
 diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index c4ec09b1c4..3b94d47e9a 100644
+index c4ec09b1c4..f2abbb06b3 100644
 --- a/libavcodec/Makefile
 +++ b/libavcodec/Makefile
 @@ -4,6 +4,7 @@ DESC = FFmpeg codec library
@@ -494,10 +511,10 @@ index c4ec09b1c4..3b94d47e9a 100644
 +
 +ifneq ("$(wildcard $(QASM_PY))","")
 +$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,rpi_hevc_shader $< > $@
++	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
 +
 +$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
-+	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_shader,rpi_hevc_shader $< > $@
++	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
 +endif
 +
 +ifneq ("$(wildcard $(VASMVIDCORE))","")
@@ -528,7 +545,7 @@ index 4f34312e67..5361a22141 100644
      REGISTER_DECODER(HEVC_QSV,          hevc_qsv);
      REGISTER_DECODER(HEVC_RKMPP,        hevc_rkmpp);
 diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index 1eeac5449e..64aca64e52 100644
+index 1eeac5449e..022ab7ab3d 100644
 --- a/libavcodec/arm/Makefile
 +++ b/libavcodec/arm/Makefile
 @@ -40,6 +40,7 @@ OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
@@ -539,7 +556,7 @@ index 1eeac5449e..64aca64e52 100644
  OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
  OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
  OBJS-$(CONFIG_VORBIS_DECODER)          += arm/vorbisdsp_init_arm.o
-@@ -134,9 +135,18 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
+@@ -134,9 +135,16 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
  NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
  NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
  NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
@@ -550,20 +567,18 @@ index 1eeac5449e..64aca64e52 100644
 +NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER)   += arm/rpi_hevcdsp_init_neon.o    \
 +                                          arm/rpi_hevc_misc_neon.o       \
 +                                          arm/rpi_hevcdsp_deblock_neon.o \
-+                                          arm/rpi_hevcdsp_epel_neon.o    \
 +                                          arm/rpi_hevcdsp_idct_neon.o    \
 +                                          arm/rpi_hevcdsp_res16_neon.o   \
-+                                          arm/rpi_hevcdsp_qpel_neon.o    \
 +                                          arm/rpi_hevcdsp_sao_neon.o     \
 +                                          arm/rpi_hevcdsp_cres_neon.o
  NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
  NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                            arm/rv40dsp_neon.o
 diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-index fdbf86b45e..0a3980a1ef 100644
+index fdbf86b45e..a60bc899bd 100644
 --- a/libavcodec/arm/cabac.h
 +++ b/libavcodec/arm/cabac.h
-@@ -26,13 +26,34 @@
+@@ -26,83 +26,173 @@
  #include "libavutil/internal.h"
  #include "libavcodec/cabac.h"
  
@@ -590,85 +605,143 @@ index fdbf86b45e..0a3980a1ef 100644
 +
  #define get_cabac_inline get_cabac_inline_arm
  static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-                                                  uint8_t *const state)
+-                                                 uint8_t *const state)
++                                                 uint8_t *state)
  {
-     int bit;
-+#if 0
-     void *reg_b, *reg_c, *tmp;
+-    int bit;
+-    void *reg_b, *reg_c, *tmp;
 -
-     __asm__ volatile(
-         "ldrb       %[bit]        , [%[state]]                  \n\t"
-         "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
-@@ -100,9 +121,141 @@ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-           [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
-         : "memory", "cc"
-         );
-+#else
-+   // *** Not thumb compatible yet
-+   unsigned int reg_b, tmp;
-+    __asm__ (
-+        "ldrb       %[bit]        , [%[state]]                  \n\t"
-+        "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
-+        "and        %[tmp]        , %[range]    , #0xC0         \n\t"
-+        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
-+        "ldrb       %[tmp]        , [%[r_b]     , %[tmp], lsl #1] \n\t"
-+// %bit = *state
-+// %range = range
-+// %tmp = RangeLPS
-+        "sub        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+        "cmp        %[low]        , %[range]    , lsl #17       \n\t"
-+        "ittt       ge                                          \n\t"
-+        "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+        "mvnge      %[bit]        , %[bit]                      \n\t"
-+        "movge      %[range]      , %[tmp]                      \n\t"
-+
-+        "clz        %[tmp]        , %[range]                    \n\t"
-+        "sub        %[tmp]        , #23                         \n\t"
-+
-+        "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-+        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-+        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+        "strb       %[r_b]        , [%[state]]                  \n\t"
-+        "lsls       %[tmp]        , %[low]      , #16           \n\t"
-+
-+        "bne        2f                                          \n\t"
-+        LOAD_16BITS_BEHI
-+        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
-+        "movw       %[r_b]        , #0xFFFF                     \n\t"
-+        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+
-+        "rbit       %[r_b]        , %[low]                      \n\t"
-+        "clz        %[r_b]        , %[r_b]                      \n\t"
-+        "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+#if CONFIG_THUMB
-+        "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
-+        "add        %[low]        , %[low]      , %[tmp]        \n\t"
-+#else
-+        "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
-+#endif
-+        "2:                                                     \n\t"
-+        :    [bit]"=&r"(bit),
-+             [low]"+&r"(c->low),
-+           [range]"+&r"(c->range),
-+             [r_b]"=&r"(reg_b),
-+             [ptr]"+&r"(c->bytestream),
-+             [tmp]"=&r"(tmp)
-+          :  [state]"r"(state),
-+            [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+              [byte]"M"(offsetof(CABACContext, bytestream)),
-+#if !UNCHECKED_BITSTREAM_READER
-+                 [c]"r"(c),
-+               [end]"M"(offsetof(CABACContext, bytestream_end)),
-+#endif
-+           [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+        : "memory", "cc"
-+        );
-+#endif
- 
-     return bit & 1;
- }
+-    __asm__ volatile(
+-        "ldrb       %[bit]        , [%[state]]                  \n\t"
+-        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
+-        "mov        %[tmp]        , %[range]                    \n\t"
+-        "and        %[range]      , %[range]    , #0xC0         \n\t"
+-        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+-        "ldrb       %[range]      , [%[r_b], %[range], lsl #1]  \n\t"
+-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
+-        "sub        %[r_c]        , %[tmp]      , %[range]      \n\t"
+-        "lsl        %[tmp]        , %[r_c]      , #17           \n\t"
+-        "cmp        %[tmp]        , %[low]                      \n\t"
+-        "it         gt                                          \n\t"
+-        "movgt      %[range]      , %[r_c]                      \n\t"
+-        "itt        cc                                          \n\t"
+-        "mvncc      %[bit]        , %[bit]                      \n\t"
+-        "subcc      %[low]        , %[low]      , %[tmp]        \n\t"
+-        "add        %[r_c]        , %[tables]   , %[mlps_off]   \n\t"
+-        "ldrb       %[tmp]        , [%[r_b], %[range]]          \n\t"
+-        "ldrb       %[r_b]        , [%[r_c], %[bit]]            \n\t"
+-        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+-        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+-        "uxth       %[r_c]        , %[low]                      \n\t"
+-        "strb       %[r_b]        , [%[state]]                  \n\t"
+-        "tst        %[r_c]        , %[r_c]                      \n\t"
+-        "bne        2f                                          \n\t"
+-        "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
++    const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
++    int bit, ptr, low, tmp1, tmp2;
++    __asm__ volatile (
++        "ldr     %[bit], [%[c], %[range_off]]             \n\t"
++        "ldrb    %[ptr], [%[state]]                       \n\t"
++        "sub     %[tmp1], %[mlps_tables], %[lps_off]      \n\t"
++        "and     %[tmp2], %[bit], #0xc0                   \n\t"
++        "add     %[tmp1], %[tmp1], %[ptr]                 \n\t"
++        "ldr     %[low], [%[c], %[low_off]]               \n\t"
++        "ldrb    %[tmp2], [%[tmp1], %[tmp2], lsl #1]      \n\t"
++        "sub     %[bit], %[bit], %[tmp2]                  \n\t"
++        "mov     %[tmp1], %[bit]                          \n\t"
++        "cmp     %[low], %[bit], lsl #17                  \n\t"
++        "movge   %[tmp1], %[tmp2]                         \n\t"
++        "mvnge   %[ptr], %[ptr]                           \n\t"
++        "clz     %[tmp2], %[tmp1]                         \n\t"
++        "subge   %[low], %[low], %[bit], lsl #17          \n\t"
++        "sub     %[tmp2], %[tmp2], #23                    \n\t"
++        "and     %[bit], %[ptr], #1                       \n\t"
++        "ldrb    %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
++        "lsl     %[low], %[low], %[tmp2]                  \n\t"
++        "lsls    %[ptr], %[low], #16                      \n\t"
++        "bne     1f                                       \n\t"
++        "ldr     %[ptr], [%[c], %[ptr_off]]               \n\t"
++        "lsl     %[tmp2], %[tmp1], %[tmp2]                \n\t"
+ #if UNCHECKED_BITSTREAM_READER
+-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+-        "add        %[r_c]        , %[r_c]      , #2            \n\t"
+-        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "rbit    %[state], %[low]                         \n\t"
++        "ldrh    %[tmp1], [%[ptr]], #2                    \n\t"
+ #else
+-        "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
+-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+-        "cmp        %[r_c]        , %[r_b]                      \n\t"
+-        "itt        lt                                          \n\t"
+-        "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
+-        "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
++        "ldr     %[tmp1], [%[c], %[end_off]]              \n\t"
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "rbit    %[state], %[low]                         \n\t"
++        "cmp     %[tmp1], %[ptr]                          \n\t"
++        "ldrcsh  %[tmp1], [%[ptr]], #2                    \n\t"
+ #endif
+-        "sub        %[r_c]        , %[low]      , #1            \n\t"
+-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
+-        "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
+-        "rev        %[tmp]        , %[tmp]                      \n\t"
+-        "lsr        %[r_c]        , %[r_c]      , #15           \n\t"
+-        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+-        "ldrb       %[r_c]        , [%[r_b], %[r_c]]            \n\t"
+-        "movw       %[r_b]        , #0xFFFF                     \n\t"
+-        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+-        "rsb        %[r_c]        , %[r_c]      , #7            \n\t"
+-        "lsl        %[tmp]        , %[tmp]      , %[r_c]        \n\t"
+-        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+-        "2:                                                     \n\t"
+-        :    [bit]"=&r"(bit),
+-             [low]"+&r"(c->low),
+-           [range]"+&r"(c->range),
+-             [r_b]"=&r"(reg_b),
+-             [r_c]"=&r"(reg_c),
+-             [tmp]"=&r"(tmp)
+-        :        [c]"r"(c),
+-             [state]"r"(state),
+-            [tables]"r"(ff_h264_cabac_tables),
+-              [byte]"M"(offsetof(CABACContext, bytestream)),
++        "clz     %[state], %[state]                       \n\t"
++        "movw    %[mlps_tables], #0xffff                  \n\t"
++        "sub     %[state], %[state], #16                  \n\t"
++        "str     %[tmp2], [%[c], %[range_off]]            \n\t"
++        "rev     %[tmp1], %[tmp1]                         \n\t"
++        "str     %[ptr], [%[c], %[ptr_off]]               \n\t"
++        "lsr     %[tmp1], %[tmp1], #15                    \n\t"
++        "sub     %[tmp1], %[tmp1], %[mlps_tables]         \n\t"
++        "add     %[low], %[low], %[tmp1], lsl %[state]    \n\t"
++        "str     %[low], [%[c], %[low_off]]               \n\t"
++        "b       2f                                       \n\t"
++        "1:                                               \n\t"
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "lsl     %[tmp1], %[tmp1], %[tmp2]                \n\t"
++        "str     %[low], [%[c], %[low_off]]               \n\t"
++        "str     %[tmp1], [%[c], %[range_off]]            \n\t"
++        "2:                                               \n\t"
++    :  // Outputs
++             [state]"+r"(state),
++       [mlps_tables]"+r"(mlps_tables),
++               [bit]"=&r"(bit),
++               [ptr]"=&r"(ptr),
++               [low]"=&r"(low),
++              [tmp1]"=&r"(tmp1),
++              [tmp2]"=&r"(tmp2)
++    :  // Inputs
++               [c]"r"(c),
++         [low_off]"J"(offsetof(CABACContext, low)),
++       [range_off]"J"(offsetof(CABACContext, range)),
++         [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++         [end_off]"J"(offsetof(CABACContext, bytestream_end)),
++         [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++    :  // Clobbers
++       "cc", "memory"
++    );
++    return bit;
++}
 +
 +#define get_cabac_bypass get_cabac_bypass_arm
 +static inline int get_cabac_bypass_arm(CABACContext * const c)
@@ -689,21 +762,27 @@ index fdbf86b45e..0a3980a1ef 100644
 +        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
 +        "1:                                                     \n\t"
 +        : // Outputs
-+              [rv]"+&r"(rv),
-+             [low]"+&r"(c->low),
-+             [tmp]"=&r"(tmp),
-+             [ptr]"+&r"(c->bytestream)
++              [rv]"+r"(rv),
++             [low]"+r"(c->low),
++             [tmp]"=r"(tmp),
++             [ptr]"+r"(c->bytestream)
 +        : // Inputs
 +#if !UNCHECKED_BITSTREAM_READER
 +                 [c]"r"(c),
-+               [end]"M"(offsetof(CABACContext, bytestream_end)),
+                [end]"M"(offsetof(CABACContext, bytestream_end)),
+-          [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
+-           [lps_off]"I"(H264_LPS_RANGE_OFFSET),
+-          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
+-        : "memory", "cc"
+-        );
 +#endif
 +             [range]"r"(c->range)
 +        : "cc"
 +    );
 +    return rv;
 +}
-+
+ 
+-    return bit & 1;
 +
 +#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
 +static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
@@ -723,10 +802,10 @@ index fdbf86b45e..0a3980a1ef 100644
 +        "sub        %[low]        , %[low]      , %[tmp]        \n\t"
 +        "1:                                                     \n\t"
 +        : // Outputs
-+              [rv]"+&r"(rv),
-+             [low]"+&r"(c->low),
-+             [tmp]"=&r"(tmp),
-+             [ptr]"+&r"(c->bytestream)
++              [rv]"+r"(rv),
++             [low]"+r"(c->low),
++             [tmp]"=r"(tmp),
++             [ptr]"+r"(c->bytestream)
 +        : // Inputs
 +#if !UNCHECKED_BITSTREAM_READER
 +                 [c]"r"(c),
@@ -736,17 +815,17 @@ index fdbf86b45e..0a3980a1ef 100644
 +        : "cc"
 +    );
 +    return rv;
-+}
+ }
 +
  #endif /* HAVE_ARMV6T2_INLINE */
  
  #endif /* AVCODEC_ARM_CABAC_H */
 diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h
 new file mode 100644
-index 0000000000..31d3c59205
+index 0000000000..10b2c6f850
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevc_cabac.h
-@@ -0,0 +1,491 @@
+@@ -0,0 +1,477 @@
 +/*
 + * This file is part of FFmpeg.
 + *
@@ -810,19 +889,18 @@ index 0000000000..31d3c59205
 +    const unsigned int last_coeff_abs_level_remaining,
 +    const unsigned int c_rice_param)
 +{
-+    int t;
++    int t = last_coeff_abs_level_remaining << 1;
 +    __asm__ (
-+    "lsl   %[t], %[coeff], #1               \n\t"
 +    "lsrs  %[t], %[t], %[shift]             \n\t"
++
 +    "it    eq                               \n\t"
 +    "subeq %[stat], %[stat], #1             \n\t"
 +    "cmp   %[t], #6                         \n\t"
 +    "adc   %[stat], %[stat], #0             \n\t"
 +    "usat  %[stat], #8, %[stat]             \n\t"
-+    : [stat]"+&r"(*stat_coeff),
-+         [t]"=&r"(t)
-+    :  [coeff]"r"(last_coeff_abs_level_remaining),
-+       [shift]"r"(c_rice_param)
++    : [stat]"+r"(*stat_coeff),
++         [t]"+r"(t)
++    :  [shift]"r"(c_rice_param)
 +    : "cc"
 +    );
 +}
@@ -850,10 +928,10 @@ index 0000000000..31d3c59205
 +         "ite        eq                                          \n\t"
 +         "usateq     %[st]         , #2          , %[i]          \n\t"
 +         "movne      %[st]         , #0                          \n\t"
-+
-+         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
 +         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
 +         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
++
++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
 +         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
 +         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"
 +         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
@@ -861,20 +939,18 @@ index 0000000000..31d3c59205
 +         "cmp        %[low]        , %[range], lsl #17           \n\t"
 +         "ittt       ge                                          \n\t"
 +         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
-+         "mvnge      %[bit]        , %[bit]                      \n\t"
 +         "movge      %[range]      , %[tmp]                      \n\t"
-+
-+         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
-+         "and        %[bit]        , %[bit]      , #1            \n\t"
-+         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
++         "mvnge      %[bit]        , %[bit]                      \n\t"
 +
 +         "clz        %[tmp]        , %[range]                    \n\t"
 +         "sub        %[tmp]        , #23                         \n\t"
-+
++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
++         "and        %[bit]        , %[bit]      , #1            \n\t"
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
 +         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"
 +         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
 +
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
 +// There is a small speed gain from combining both conditions, using a single
 +// branch and then working out what that meant later
 +         "lsls       %[tmp]        , %[low]      , #16           \n\t"
@@ -888,29 +964,28 @@ index 0000000000..31d3c59205
 +
 +// Do reload
 +         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
++         "rbit       %[bit]        , %[low]                      \n\t"
 +         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "clz        %[bit]        , %[bit]                      \n\t"
 +         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "sub        %[bit]        , %[bit]      , #16           \n\t"
++         "cmp        %[n]          , %[i]                        \n\t"
 +         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
 +
-+         "rbit       %[r_b]        , %[low]                      \n\t"
-+         "clz        %[r_b]        , %[r_b]                      \n\t"
-+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+
 +#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
 +         "add        %[low]        , %[low]      , %[tmp]        \n\t"
 +#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
++         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
 +#endif
 +
-+         "cmp        %[n]          , %[i]                        \n\t"
 +         "bne        1b                                          \n\t"
 +         "2:                                                     \n\t"
 +         :    [bit]"=&r"(bit),
-+              [low]"+&r"(c->low),
-+            [range]"+&r"(c->range),
++              [low]"+r"(c->low),
++            [range]"+r"(c->range),
 +              [r_b]"=&r"(reg_b),
-+             [bptr]"+&r"(c->bytestream),
++             [bptr]"+r"(c->bytestream),
 +                [i]"=&r"(i),
 +              [tmp]"=&r"(tmp),
 +               [st]"=&r"(st),
@@ -918,7 +993,6 @@ index 0000000000..31d3c59205
 +          :  [state0]"r"(state0),
 +                  [n]"r"(n),
 +        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+               [byte]"M"(offsetof(CABACContext, bytestream)),
 +            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
 +         : "memory", "cc"
 +    );
@@ -935,26 +1009,32 @@ index 0000000000..31d3c59205
 +{
 +    unsigned int reg_b, tmp, st, bit;
 +     __asm__ (
-+         "1:                                                     \n\t"
 +// Get bin from map
-+         "ldrb       %[st]         , [%[ctx_map], %[n]]          \n\t"
++         "ldrb       %[st]         , [%[ctx_map], %[n]]!         \n\t"
++         "1:                                                     \n\t"
 +
 +// Load state & ranges
-+         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
 +         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
 +         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
 +         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
 +         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
 +         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
 +
 +         "cmp        %[low]        , %[range], lsl #17           \n\t"
 +         "ittt       ge                                          \n\t"
-+         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
 +         "mvnge      %[bit]        , %[bit]                      \n\t"
++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
 +         "movge      %[range]      , %[tmp]                      \n\t"
 +
++// Renorm
++         "clz        %[tmp]        , %[range]                    \n\t"
 +         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"
++         "sub        %[tmp]        , #23                         \n\t"
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
 +         "tst        %[bit]        , #1                          \n\t"
++         "ldrb       %[st]         , [%[ctx_map], #-1]!          \n\t"
++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
 +// GCC asm seems to need strbne written differently for thumb and arm
 +#if CONFIG_THUMB
 +         "it         ne                                          \n\t"
@@ -963,24 +1043,17 @@ index 0000000000..31d3c59205
 +         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
 +#endif
 +
-+// Renorm
-+         "clz        %[tmp]        , %[range]                    \n\t"
-+         "sub        %[tmp]        , #23                         \n\t"
-+         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
-+         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
-+
-+         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
 +// There is a small speed gain from combining both conditions, using a single
 +// branch and then working out what that meant later
 +         "subs       %[n]          , %[n]        , #1            \n\t"
++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
 +#if CONFIG_THUMB
 +         "itt        ne                                          \n\t"
 +         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        1b                                          \n\t"
 +#else
 +         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
-+         "bne        1b                                          \n\t"
 +#endif
++         "bne        1b                                          \n\t"
 +
 +// If we have bits left then n must be 0 so give up now
 +         "lsls       %[tmp]        , %[low]      , #16           \n\t"
@@ -988,38 +1061,36 @@ index 0000000000..31d3c59205
 +
 +// Do reload
 +         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"
++         "rbit       %[bit]        , %[low]                      \n\t"
 +         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "clz        %[bit]        , %[bit]                      \n\t"
++         "cmp        %[n]          , #0                          \n\t"
 +         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "sub        %[bit]        , %[bit]      , #16           \n\t"
 +         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
 +
-+         "rbit       %[r_b]        , %[low]                      \n\t"
-+         "clz        %[r_b]        , %[r_b]                      \n\t"
-+         "sub        %[r_b]        , %[r_b]      , #16           \n\t"
-+
 +#if CONFIG_THUMB
-+         "lsl        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
++         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
 +         "add        %[low]        , %[low]      , %[tmp]        \n\t"
 +#else
-+         "add        %[low]        , %[low]      , %[tmp], lsl %[r_b] \n\t"
++         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
 +#endif
 +
 +// Check to see if we still have more to do
-+         "cmp        %[n]          , #0                          \n\t"
 +         "bne        1b                                          \n\t"
 +         "2:                                                     \n\t"
 +         :    [bit]"=&r"(bit),
-+              [low]"+&r"(c->low),
-+            [range]"+&r"(c->range),
++              [low]"+r"(c->low),
++            [range]"+r"(c->range),
 +              [r_b]"=&r"(reg_b),
-+             [bptr]"+&r"(c->bytestream),
-+              [idx]"+&r"(p),
-+                [n]"+&r"(n),
++             [bptr]"+r"(c->bytestream),
++              [idx]"+r"(p),
++                [n]"+r"(n),
 +              [tmp]"=&r"(tmp),
-+               [st]"=&r"(st)
++               [st]"=&r"(st),
++          [ctx_map]"+r"(ctx_map)
 +          :  [state0]"r"(state0),
-+            [ctx_map]"r"(ctx_map),
 +        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+               [byte]"M"(offsetof(CABACContext, bytestream)),
 +            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
 +         : "memory", "cc"
 +    );
@@ -1042,17 +1113,15 @@ index 0000000000..31d3c59205
 +#define get_cabac_by22_peek get_cabac_by22_peek_arm
 +static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
 +{
-+    uint32_t rv, tmp;
++    uint32_t rv = c->low &~ 1, tmp;
 +    __asm__ (
-+        "bic      %[rv]  , %[low], #1            \n\t"
 +        "cmp      %[inv] , #0                    \n\t"
 +        "it       ne                             \n\t"
 +        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"
 +        :  // Outputs
-+             [rv]"=&r"(rv),
++             [rv]"+r"(rv),
 +             [tmp]"=r"(tmp)
 +        :  // Inputs
-+             [low]"r"(c->low),
 +             [inv]"r"(c->range)
 +        :  // Clobbers
 +                "cc"
@@ -1060,180 +1129,176 @@ index 0000000000..31d3c59205
 +    return rv << 1;
 +}
 +
-+#if 0
-+
-+// ***** Slower than the C  :-(
 +#define get_cabac_by22_flush get_cabac_by22_flush_arm
-+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, const uint32_t val)
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
 +{
-+    uint32_t m, tmp;
-+    __asm__ (
-+    "add    %[bits], %[bits], %[n]   \n\t"
-+    "ldr    %[m], [%[ptr], %[bits], lsr #3]  \n\t"
-+
-+    "rsb    %[tmp], %[n], #32        \n\t"
-+    "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+    "mul    %[tmp], %[range], %[tmp] \n\t"
-+
-+    "rev    %[m], %[m]               \n\t"
-+
-+    "lsl    %[tmp], %[tmp], #23      \n\t"
-+    "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+    "and    %[tmp], %[bits], #7         \n\t"
-+    "lsl    %[m], %[m], %[tmp]          \n\t"
-+
-+    "orr    %[low], %[low], %[m], lsr #9      \n\t"
++    uint32_t bits, ptr, tmp1, tmp2;
++    __asm__ volatile (
++        "ldrh    %[bits], [%[cc], %[bits_off]]     \n\t"
++        "ldr     %[ptr], [%[cc], %[ptr_off]]       \n\t"
++        "rsb     %[tmp1], %[n], #32                \n\t"
++        "add     %[bits], %[bits], %[n]            \n\t"
++        "ldrh    %[tmp2], [%[cc], %[range_off]]    \n\t"
++        "lsr     %[tmp1], %[val], %[tmp1]          \n\t"
++        "ldr     %[val], [%[cc], %[low_off]]       \n\t"
++        "ldr     %[ptr], [%[ptr], %[bits], lsr #3] \n\t"
++        "mul     %[tmp1], %[tmp2], %[tmp1]         \n\t"
++        "and     %[tmp2], %[bits], #7              \n\t"
++        "strh    %[bits], [%[cc], %[bits_off]]     \n\t"
++        "rev     %[ptr], %[ptr]                    \n\t"
++        "lsl     %[tmp1], %[tmp1], #23             \n\t"
++        "rsb     %[val], %[tmp1], %[val], lsl %[n] \n\t"
++        "lsl     %[ptr], %[ptr], %[tmp2]           \n\t"
++        "orr     %[val], %[val], %[ptr], lsr #9    \n\t"
++        "str     %[val], [%[cc], %[low_off]]       \n\t"
 +        :  // Outputs
-+             [m]"=&r"(m),
-+           [tmp]"=&r"(tmp),
-+          [bits]"+&r"(c->by22.bits),
-+           [low]"+&r"(c->low)
++            [val]"+r"(val),
++           [bits]"=&r"(bits),
++            [ptr]"=&r"(ptr),
++           [tmp1]"=&r"(tmp1),
++           [tmp2]"=&r"(tmp2)
 +        :  // Inputs
-+               [n]"r"(n),
-+             [val]"r"(val),
-+             [inv]"r"(c->range),
-+           [range]"r"(c->by22.range),
-+             [ptr]"r"(c->bytestream)
++                  [cc]"r"(c),
++                   [n]"r"(n),
++            [bits_off]"J"(offsetof(CABACContext, by22.bits)),
++             [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++           [range_off]"J"(offsetof(CABACContext, by22.range)),
++             [low_off]"J"(offsetof(CABACContext, low))
 +        :  // Clobbers
++           "memory"
 +    );
 +}
 +
-+
-+// Works but slower than C
-+#define coeff_abs_level_remaining_decode_by22(c,r) coeff_abs_level_remaining_decode_by22_arm(c, r)
-+static int coeff_abs_level_remaining_decode_by22_arm(CABACContext * const c, const unsigned int c_rice_param)
++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
 +{
-+    uint32_t n, val, tmp, level;
-+
-+//    PROFILE_START();
-+
-+    __asm__ (
-+            // Peek
-+            "bic    %[val],  %[low],   #1  \n\t"
-+            "cmp    %[inv], #0          \n\t"
-+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-+            "lsl    %[val], %[val], #1  \n\t"
-+
-+            // Count bits (n = prefix)
-+            "mvn    %[n], %[val] \n\t"
-+            "clz    %[n], %[n]   \n\t"
-+
-+            "lsl    %[level], %[val], %[n] \n\t"
-+            "subs   %[tmp], %[n], #3 \n\t"
-+            "blo    2f \n\t"
-+
-+            // prefix >= 3
-+            // < tmp = prefix - 3
-+            // > tmp = prefix + rice - 3
-+            "add    %[tmp], %[tmp], %[rice] \n\t"
-+            // > n = prefix * 2 + rice - 3
-+            "add    %[n], %[tmp], %[n] \n\t"
-+            "cmp    %[n], #21 \n\t"
-+            "bhi    3f \n\t"
-+
-+            "orr    %[level], %[level], #0x80000000 \n\t"
-+            "rsb    %[tmp], %[tmp], #31 \n\t"
-+            "lsr    %[level], %[level], %[tmp] \n\t"
-+
-+            "mov    %[tmp], #2 \n\t"
-+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-+            "b      1f \n\t"
-+
-+            // > 22 bits used in total - need reload
-+            "3:  \n\t"
-+
-+            // Stash prefix + rice - 3 in level (only spare reg)
-+            "mov    %[level], %[tmp] \n\t"
-+            // Restore n to flush value (prefix)
-+            "sub    %[n], %[n], %[tmp] \n\t"
-+
-+            // Flush + reload
-+
-+//          "rsb    %[tmp], %[n], #32        \n\t"
-+//          "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+//          "mul    %[tmp], %[range], %[tmp] \n\t"
-+
-+            // As it happens we know that all the bits we are flushing are 1
-+            // so we can cheat slightly
-+            "rsb    %[tmp], %[range], %[range], lsl %[n] \n\t"
-+            "lsl    %[tmp], %[tmp], #23      \n\t"
-+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+            "add    %[bits], %[bits], %[n]   \n\t"
-+            "ldr    %[n], [%[ptr], %[bits], lsr #3]  \n\t"
-+            "rev    %[n], %[n]               \n\t"
-+            "and    %[tmp], %[bits], #7         \n\t"
-+            "lsl    %[n], %[n], %[tmp]          \n\t"
-+
-+            "orr    %[low], %[low], %[n], lsr #9      \n\t"
-+
-+            // (reload)
-+
-+            "bic    %[val],  %[low],   #1  \n\t"
-+            "cmp    %[inv], #0          \n\t"
-+            "umullne  %[tmp], %[val], %[inv], %[val] \n\t"
-+            "lsl    %[val], %[val], #1  \n\t"
-+
-+            // Build value
-+
-+            "mov    %[n], %[level] \n\t"
-+
-+            "orr     %[tmp], %[val], #0x80000000 \n\t"
-+            "rsb     %[level], %[level], #31 \n\t"
-+            "lsr     %[level], %[tmp], %[level] \n\t"
-+
-+            "mov    %[tmp], #2 \n\t"
-+            "add    %[level], %[level], %[tmp], lsl %[rice] \n\t"
-+            "b      1f \n\t"
-+
-+            // prefix < 3
-+            "2:  \n\t"
-+            "rsb    %[tmp], %[rice], #31 \n\t"
-+            "lsr    %[level], %[level], %[tmp] \n\t"
-+            "orr    %[level], %[level], %[n], lsl %[rice] \n\t"
-+            "add    %[n], %[n], %[rice] \n\t"
-+
-+            "1:  \n\t"
-+            // Flush
-+            "add    %[n], %[n], #1 \n\t"
-+
-+            "rsb    %[tmp], %[n], #32        \n\t"
-+            "lsr    %[tmp], %[val], %[tmp]   \n\t"
-+
-+            "add    %[bits], %[bits], %[n]   \n\t"
-+            "ldr    %[val], [%[ptr], %[bits], lsr #3]  \n\t"
-+
-+            "mul    %[tmp], %[range], %[tmp] \n\t"
-+            "lsl    %[tmp], %[tmp], #23      \n\t"
-+            "rsb    %[low], %[tmp], %[low], lsl %[n] \n\t"
-+
-+            "rev    %[val], %[val]               \n\t"
-+            "and    %[tmp], %[bits], #7         \n\t"
-+            "lsl    %[val], %[val], %[tmp]          \n\t"
-+
-+            "orr    %[low], %[low], %[val], lsr #9      \n\t"
++    uint32_t last_coeff_abs_level_remaining;
++    uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
++    __asm__ volatile (
++        "ldr     %[remain], [%[cc], %[low_off]]               \n\t"
++        "ldr     %[prefix], [%[cc], %[range_off]]             \n\t"
++        "bic     %[remain], %[remain], #1                     \n\t"
++        "ldrh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
++        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
++        "cmp     %[prefix], #0                                \n\t"
++        "umullne %[prefix], %[remain], %[prefix], %[remain]   \n\t"
++        "ldrh    %[range], [%[cc], %[by22_range_off]]         \n\t"
++        "lsl     %[remain], %[remain], #1                     \n\t"
++        "mvn     %[prefix], %[remain]                         \n\t"
++        "clz     %[prefix], %[prefix]                         \n\t"
++        "rsbs    %[n1], %[prefix], #2                         \n\t"
++        "bcc     1f                                           \n\t"
++        "adc     %[n1], %[rice], %[prefix]                    \n\t"
++        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
++        "rsb     %[n2], %[n1], #32                            \n\t"
++        "and     %[tmp1], %[tmp2], #7                         \n\t"
++        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
++        "lsr     %[tmp2], %[tmp2], #3                         \n\t"
++        "lsr     %[n2], %[remain], %[n2]                      \n\t"
++        "mul     %[n2], %[range], %[n2]                       \n\t"
++        "ldr     %[range], [%[cc], %[low_off]]                \n\t"
++        "ldr     %[ptr], [%[ptr], %[tmp2]]                    \n\t"
++        "rsb     %[tmp2], %[rice], #31                        \n\t"
++        "lsl     %[remain], %[remain], %[prefix]              \n\t"
++        "lsl     %[n2], %[n2], #23                            \n\t"
++        "rsb     %[range], %[n2], %[range], lsl %[n1]         \n\t"
++        "rev     %[ptr], %[ptr]                               \n\t"
++        "lsl     %[n2], %[prefix], %[rice]                    \n\t"
++        "add     %[remain], %[n2], %[remain], lsr %[tmp2]     \n\t"
++        "b       3f                                           \n\t"
++        "1:                                                   \n\t"
++        "add     %[n2], %[rice], %[prefix], lsl #1            \n\t"
++        "cmp     %[n2], %[peek_bits_plus_2]                   \n\t"
++        "bhi     2f                                           \n\t"
++        "sub     %[n1], %[n2], #2                             \n\t"
++        "add     %[tmp2], %[tmp2], %[n1]                      \n\t"
++        "rsb     %[n2], %[n1], #32                            \n\t"
++        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t"
++        "lsr     %[tmp1], %[tmp2], #3                         \n\t"
++        "lsr     %[n2], %[remain], %[n2]                      \n\t"
++        "mul     %[n2], %[range], %[n2]                       \n\t"
++        "rsb     %[range], %[rice], #34                       \n\t"
++        "ldr     %[ptr], [%[ptr], %[tmp1]]                    \n\t"
++        "and     %[tmp1], %[tmp2], #7                         \n\t"
++        "lsl     %[remain], %[remain], %[prefix]              \n\t"
++        "ldr     %[tmp2], [%[cc], %[low_off]]                 \n\t"
++        "rsb     %[prefix], %[prefix], %[range]               \n\t"
++        "orr     %[remain], %[remain], #0x80000000            \n\t"
++        "rev     %[ptr], %[ptr]                               \n\t"
++        "lsl     %[n2], %[n2], #23                            \n\t"
++        "mov     %[range], #2                                 \n\t"
++        "rsb     %[tmp2], %[n2], %[tmp2], lsl %[n1]           \n\t"
++        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
++        "lsl     %[rice], %[range], %[rice]                   \n\t"
++        "orr     %[range], %[tmp2], %[ptr], lsr #9            \n\t"
++        "add     %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
++        "b       4f                                           \n\t"
++        "2:                                                   \n\t"
++        "add     %[n1], %[tmp2], %[prefix]                    \n\t"
++        "ldr     %[tmp2], [%[ptr], %[n1], lsr #3]             \n\t"
++        "rsb     %[tmp1], %[prefix], #32                      \n\t"
++        "push    {%[rice]}                                    \n\t"
++        "and     %[rice], %[n1], #7                           \n\t"
++        "lsr     %[tmp1], %[remain], %[tmp1]                  \n\t"
++        "ldr     %[ptr], [%[cc], %[low_off]]                  \n\t"
++        "mul     %[remain], %[range], %[tmp1]                 \n\t"
++        "rev     %[tmp2], %[tmp2]                             \n\t"
++        "rsb     %[n2], %[prefix], %[n2]                      \n\t"
++        "ldr     %[tmp1], [%[cc], %[range_off]]               \n\t"
++        "lsl     %[rice], %[tmp2], %[rice]                    \n\t"
++        "sub     %[tmp2], %[n2], #2                           \n\t"
++        "lsl     %[remain], %[remain], #23                    \n\t"
++        "rsb     %[remain], %[remain], %[ptr], lsl %[prefix]  \n\t"
++        "orr     %[remain], %[remain], %[rice], lsr #9        \n\t"
++        "add     %[prefix], %[n1], %[tmp2]                    \n\t"
++        "bic     %[n1], %[remain], #1                         \n\t"
++        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t"
++        "cmp     %[tmp1], #0                                  \n\t"
++        "rsb     %[rice], %[tmp2], #32                        \n\t"
++        "umullne %[tmp1], %[n1], %[tmp1], %[n1]               \n\t"
++        "and     %[tmp1], %[prefix], #7                       \n\t"
++        "ldr     %[ptr], [%[ptr], %[prefix], lsr #3]          \n\t"
++        "lsl     %[n1], %[n1], #1                             \n\t"
++        "lsr     %[rice], %[n1], %[rice]                      \n\t"
++        "rsb     %[n2], %[n2], #34                            \n\t"
++        "mul     %[range], %[range], %[rice]                  \n\t"
++        "pop     {%[rice]}                                    \n\t"
++        "rev     %[ptr], %[ptr]                               \n\t"
++        "orr     %[n1], %[n1], #0x80000000                    \n\t"
++        "strh    %[prefix], [%[cc], %[by22_bits_off]]         \n\t"
++        "mov     %[prefix], #2                                \n\t"
++        "lsl     %[range], %[range], #23                      \n\t"
++        "rsb     %[range], %[range], %[remain], lsl %[tmp2]   \n\t"
++        "lsl     %[remain], %[prefix], %[rice]                \n\t"
++        "add     %[remain], %[remain], %[n1], lsr %[n2]       \n\t"
++        "3:                                                   \n\t"
++        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t"
++        "orr     %[range], %[range], %[ptr], lsr #9           \n\t"
++        "4:                                                   \n\t"
++        "str     %[range], [%[cc], %[low_off]]                \n\t"
 +        :  // Outputs
-+         [level]"=&r"(level),
-+             [n]"=&r"(n),
-+           [val]"=&r"(val),
-+           [tmp]"=&r"(tmp),
-+          [bits]"+&r"(c->by22.bits),
-+           [low]"+&r"(c->low)
++            [remain]"=&r"(last_coeff_abs_level_remaining),
++              [rice]"+r"(rice_param),
++            [prefix]"=&r"(prefix),
++                [n1]"=&r"(n1),
++             [range]"=&r"(range),
++                [n2]"=&r"(n2),
++               [ptr]"=&r"(ptr),
++              [tmp1]"=&r"(tmp1),
++              [tmp2]"=&r"(tmp2)
 +        :  // Inputs
-+            [rice]"r"(c_rice_param),
-+             [inv]"r"(c->range),
-+           [range]"r"(c->by22.range),
-+             [ptr]"r"(c->bytestream)
++                          [cc]"r"(c),
++            [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
++                     [low_off]"J"(offsetof(CABACContext, low)),
++                   [range_off]"J"(offsetof(CABACContext, range)),
++               [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
++              [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
++                     [ptr_off]"J"(offsetof(CABACContext, bytestream))
 +        :  // Clobbers
-+                "cc"
++           "cc", "memory"
 +    );
-+
-+//    PROFILE_ACC(residual_abs);
-+
-+    return level;
++    return last_coeff_abs_level_remaining;
 +}
-+#endif
 +
 +#endif /* HAVE_ARMV6T2_INLINE */
 +
@@ -3359,349 +3424,6 @@ index 0000000000..d691cda836
 +        m_filter_v_chroma_16 10
 +endfunc
 +
-diff --git a/libavcodec/arm/rpi_hevcdsp_epel_neon.S b/libavcodec/arm/rpi_hevcdsp_epel_neon.S
-new file mode 100644
-index 0000000000..acc6911091
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_epel_neon.S
-@@ -0,0 +1,337 @@
-+/*
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+#define MAX_PB_SIZE #64
-+
-+.macro vextin_d4
-+    vld1.8    {q10}, [r1], r2
-+    vmov      d16, d20
-+    vext.8    d17, d20, d21, #1
-+    vext.8    d18, d20, d21, #2
-+    vext.8    d19, d20, d21, #3
-+.endm
-+
-+.macro vextin_d4_8
-+    vld1.8    d16, [r1], r2
-+    vext.8    d17, d16, d16, #1
-+    vext.8    d18, d16, d16, #2
-+    vext.8    d19, d16, d16, #3
-+.endm
-+
-+.macro load_coeffs_16b coeffs
-+    ldr      \coeffs, [\coeffs]
-+    vdup.i8  d0, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d1, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d2, \coeffs
-+    lsr      \coeffs, #8
-+    vdup.i8  d3, \coeffs
-+.endm
-+
-+.macro epel_filter_16b out=q12
-+    vmull.u8 q3, d16, d0
-+    vmull.u8 q11, d19, d3
-+    vmull.u8 \out, d17, d1
-+    vmull.u8 q10, d18, d2
-+    vadd.s16 q3, q11
-+    vadd.s16 \out, q10
-+    vsub.s16 \out, q3
-+.endm
-+
-+.macro load_coeffs_32b coeffs
-+    ldr      \coeffs, [\coeffs]
-+    vmov.i64 d4, #0
-+    vmov.8   d4[0], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[2], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[4], \coeffs
-+    lsr      \coeffs, #8
-+    vmov.8   d4[6], \coeffs
-+.endm
-+
-+.macro epel_filter_32b
-+    vmull.s16 q3, d24, d4[0] //q12
-+    vmull.s16 q4, d25, d4[0]
-+    vmull.s16 q5, d30, d4[3] //q15
-+    vmull.s16 q6, d31, d4[3]
-+
-+    vmull.s16 q7, d26, d4[1] // q13
-+    vmull.s16 q8, d27, d4[1]
-+    vmull.s16 q9, d28, d4[2] // q14
-+    vmull.s16 q10, d29, d4[2]
-+    vadd.s32 q3, q5
-+    vadd.s32 q4, q6
-+    vadd.s32 q7, q9
-+    vadd.s32 q8, q10
-+    vsub.s32 q7, q3
-+    vsub.s32 q8, q4
-+    vqshrn.s32  d6, q7, #6
-+    vqshrn.s32  d7, q8, #6
-+.endm
-+
-+.macro epel_filter_32b_4
-+    vmull.s16 q3, d24, d4[0] //q12
-+    vmull.s16 q5, d30, d4[3] //q15
-+    vmull.s16 q7, d26, d4[1] // q13
-+    vmull.s16 q9, d28, d4[2] // q14
-+    vadd.s32 q3, q5
-+    vadd.s32 q7, q9
-+    vsub.s32 q7, q3
-+    vqshrn.s32  d6, q7, #6
-+.endm
-+
-+function ff_hevc_rpi_put_epel_h_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r7, [sp, #16] // mx
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+@ adr reaches if we are in thumb mode but not in arm
-+T       adr    r12, epel_coeffs
-+A       adrl   r12, epel_coeffs
-+        add    r7, r12
-+        sub       r1, #1
-+        lsl       r4, #1
-+        load_coeffs_16b r7
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      subs r3, #1
-+        pld [r1]
-+        vextin_d4
-+        epel_filter_16b
-+        vst1.16    {q12}, [r0], r4
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        cmp       r5, #4
-+        bgt       8b
-+4:      subs r3, #1
-+        pld [r1]
-+        vextin_d4_8
-+        epel_filter_16b
-+        vst1.16    d24, [r0], r4
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+2:      subs r3, #1
-+        pld [r1]
-+        vextin_d4_8
-+        epel_filter_16b
-+        vst1.32    d24[0], [r0], r4
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_epel_v_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r7, [sp, #20] // my
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+T       adr    r12, epel_coeffs
-+A       adrl   r12, epel_coeffs
-+        add    r7, r12
-+        load_coeffs_16b r7
-+        sub       r1, r2
-+        lsl       r4, #1
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+0:      pld [r1]
-+        vld1.8    {d16}, [r1], r2
-+        pld [r1]
-+        vld1.8    {d17}, [r1], r2
-+        pld [r1]
-+        vld1.8    {d18}, [r1], r2
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      pld [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.16    {q12}, [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        b         0b
-+4:      pld       [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.16    d24, [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+        b         0b
-+2:      pld [r1]
-+        vld1.8    {d19}, [r1], r2
-+        subs r3, #1
-+        epel_filter_16b
-+        vst1.32    d24[0], [r0], r4
-+        vmov d16, d17
-+        vmov d17, d18
-+        vmov d18, d19
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_epel_hv_neon_8, export=1
-+        push   {r4-r7}
-+        mov    r4, MAX_PB_SIZE
-+        ldr    r6, [sp, #16] // mx
-+        ldr    r7, [sp, #20] // my
-+        ldr    r5, [sp, #24] // width
-+        sub    r7, #1
-+        lsl    r7, #2
-+        vpush {d8-d15}
-+        adr    r12, epel_coeffs
-+        sub    r6, #1
-+        lsl    r6, #2
-+        add    r6, r12 // mx epel coeff offset
-+        add    r7, r12
-+        sub       r1, #1
-+        sub       r1, r2
-+        lsl       r4, #1
-+        load_coeffs_16b r6
-+        load_coeffs_32b r7
-+        mov   r12, r3
-+        mov   r6, r0
-+        mov   r7, r1
-+0:      pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q12
-+        pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q13
-+        pld   [r1]
-+        vextin_d4
-+        epel_filter_16b q14
-+        cmp       r5, #6
-+        bgt       8f
-+        cmp       r5, #4
-+        blt       2f
-+        b         4f
-+8:      pld     [r1]
-+        vextin_d4
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b
-+        vst1.16    {q3}, [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 8b
-+        subs    r5, #8
-+        beq  99f
-+        mov       r3, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r1, r7
-+        b         0b
-+4:      pld      [r1]
-+        vextin_d4_8
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b_4
-+        vst1.16    d6, [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 4b
-+        subs      r5, #4
-+        beq       99f
-+        mov       r3, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #4
-+        mov       r1, r7
-+        b         0b
-+2:      pld      [r1]
-+        vextin_d4_8
-+        epel_filter_16b q15
-+        subs r3, #1
-+        epel_filter_32b_4
-+        vst1.32    d6[0], [r0], r4
-+        vmov q12, q13
-+        vmov q13, q14
-+        vmov q14, q15
-+        bne 2b
-+99:     vpop {d8-d15}
-+        pop {r4-r7}
-+        bx lr
-+endfunc
-+
-+epel_coeffs:
-+       .byte 2, 58, 10, 2
-+       .byte 4, 54, 16, 2
-+       .byte 6, 46, 28, 4
-+       .byte 4, 36, 36, 4
-+       .byte 4, 28, 46, 6
-+       .byte 2, 16, 54, 4
-+       .byte 2, 10, 58, 2
 diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
 new file mode 100644
 index 0000000000..cd79460984
@@ -4127,10 +3849,10 @@ index 0000000000..109fa98c29
 +}
 diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
 new file mode 100644
-index 0000000000..472d9d75c9
+index 0000000000..764647fed9
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
-@@ -0,0 +1,652 @@
+@@ -0,0 +1,473 @@
 +/*
 + * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
 + *
@@ -4159,6 +3881,9 @@ index 0000000000..472d9d75c9
 +#include "libavcodec/avcodec.h"
 +#include "libavcodec/bit_depth_template.c"
 +
++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
++// have been removed from head as we never use them.
++
 +void ff_hevc_rpi_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_rpi_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
 +void ff_hevc_rpi_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
@@ -4361,114 +4086,6 @@ index 0000000000..472d9d75c9
 +                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
 +
 +
-+#define PUT_PIXELS(name) \
-+    void name(int16_t *dst, uint8_t *src, \
-+                                ptrdiff_t srcstride, int height, \
-+                                intptr_t mx, intptr_t my, int width)
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w2_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w4_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w6_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w8_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w12_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w16_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w24_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w32_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w48_neon_8);
-+PUT_PIXELS(ff_hevc_rpi_put_pixels_w64_neon_8);
-+#undef PUT_PIXELS
-+void ff_hevc_rpi_put_epel_h_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
-+void ff_hevc_rpi_put_epel_v_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
-+void ff_hevc_rpi_put_epel_hv_neon_8(int16_t *dst, uint8_t *src,
-+                                ptrdiff_t srcstride, int height,
-+                                intptr_t mx, intptr_t my, int width);
-+
-+static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                   int height, int width);
-+static void (*put_hevc_qpel_uw_neon[4][4])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
-+void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width);
-+void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width);
-+void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                       int16_t *src2,
-+                                       int height, intptr_t mx, intptr_t my, int width);
-+#define QPEL_FUNC(name) \
-+    void name(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, \
-+                                   int height, int width)
-+
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_v1_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_v2_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_v3_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v1_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v2_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h1v3_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v1_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v2_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h2v3_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v1_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v2_neon_8);
-+QPEL_FUNC(ff_hevc_rpi_put_qpel_h3v3_neon_8);
-+#undef QPEL_FUNC
-+
-+#define QPEL_FUNC_UW_PIX(name) \
-+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
-+                                   int height, intptr_t mx, intptr_t my, int width);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8);
-+QPEL_FUNC_UW_PIX(ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8);
-+#undef QPEL_FUNC_UW_PIX
-+
-+#define QPEL_FUNC_UW(name) \
-+    void name(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, \
-+                                   int width, int height, int16_t* src2, ptrdiff_t src2stride);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_pixels_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v1_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v2_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_v3_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v1_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v2_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h1v3_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v1_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v2_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h2v3_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v1_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v2_neon_8);
-+QPEL_FUNC_UW(ff_hevc_rpi_put_qpel_uw_h3v3_neon_8);
-+#undef QPEL_FUNC_UW
-+
-+void ff_hevc_rpi_put_qpel_neon_wrapper(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width) {
-+
-+    put_hevc_qpel_neon[my][mx](dst, MAX_PB_SIZE, src, srcstride, height, width);
-+}
-+
-+void ff_hevc_rpi_put_qpel_uni_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width) {
-+
-+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, NULL, 0);
-+}
-+
-+void ff_hevc_rpi_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+                                       int16_t *src2,
-+                                       int height, intptr_t mx, intptr_t my, int width) {
-+    put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
-+}
-+
 +void ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
 +                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
 +                                                const MvField *curr, const MvField *neigh, uint8_t *bs);
@@ -4571,7 +4188,6 @@ index 0000000000..472d9d75c9
 +av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
 +{
 +    if (bit_depth == 8) {
-+        int x;
 +        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon;
 +        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon;
 +        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon;
@@ -4636,79 +4252,6 @@ index 0000000000..472d9d75c9
 +        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_8;
 +        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_8;
 +#endif
-+        put_hevc_qpel_neon[1][0]       = ff_hevc_rpi_put_qpel_v1_neon_8;
-+        put_hevc_qpel_neon[2][0]       = ff_hevc_rpi_put_qpel_v2_neon_8;
-+        put_hevc_qpel_neon[3][0]       = ff_hevc_rpi_put_qpel_v3_neon_8;
-+        put_hevc_qpel_neon[0][1]       = ff_hevc_rpi_put_qpel_h1_neon_8;
-+        put_hevc_qpel_neon[0][2]       = ff_hevc_rpi_put_qpel_h2_neon_8;
-+        put_hevc_qpel_neon[0][3]       = ff_hevc_rpi_put_qpel_h3_neon_8;
-+        put_hevc_qpel_neon[1][1]       = ff_hevc_rpi_put_qpel_h1v1_neon_8;
-+        put_hevc_qpel_neon[1][2]       = ff_hevc_rpi_put_qpel_h2v1_neon_8;
-+        put_hevc_qpel_neon[1][3]       = ff_hevc_rpi_put_qpel_h3v1_neon_8;
-+        put_hevc_qpel_neon[2][1]       = ff_hevc_rpi_put_qpel_h1v2_neon_8;
-+        put_hevc_qpel_neon[2][2]       = ff_hevc_rpi_put_qpel_h2v2_neon_8;
-+        put_hevc_qpel_neon[2][3]       = ff_hevc_rpi_put_qpel_h3v2_neon_8;
-+        put_hevc_qpel_neon[3][1]       = ff_hevc_rpi_put_qpel_h1v3_neon_8;
-+        put_hevc_qpel_neon[3][2]       = ff_hevc_rpi_put_qpel_h2v3_neon_8;
-+        put_hevc_qpel_neon[3][3]       = ff_hevc_rpi_put_qpel_h3v3_neon_8;
-+        put_hevc_qpel_uw_neon[1][0]      = ff_hevc_rpi_put_qpel_uw_v1_neon_8;
-+        put_hevc_qpel_uw_neon[2][0]      = ff_hevc_rpi_put_qpel_uw_v2_neon_8;
-+        put_hevc_qpel_uw_neon[3][0]      = ff_hevc_rpi_put_qpel_uw_v3_neon_8;
-+        put_hevc_qpel_uw_neon[0][1]      = ff_hevc_rpi_put_qpel_uw_h1_neon_8;
-+        put_hevc_qpel_uw_neon[0][2]      = ff_hevc_rpi_put_qpel_uw_h2_neon_8;
-+        put_hevc_qpel_uw_neon[0][3]      = ff_hevc_rpi_put_qpel_uw_h3_neon_8;
-+        put_hevc_qpel_uw_neon[1][1]      = ff_hevc_rpi_put_qpel_uw_h1v1_neon_8;
-+        put_hevc_qpel_uw_neon[1][2]      = ff_hevc_rpi_put_qpel_uw_h2v1_neon_8;
-+        put_hevc_qpel_uw_neon[1][3]      = ff_hevc_rpi_put_qpel_uw_h3v1_neon_8;
-+        put_hevc_qpel_uw_neon[2][1]      = ff_hevc_rpi_put_qpel_uw_h1v2_neon_8;
-+        put_hevc_qpel_uw_neon[2][2]      = ff_hevc_rpi_put_qpel_uw_h2v2_neon_8;
-+        put_hevc_qpel_uw_neon[2][3]      = ff_hevc_rpi_put_qpel_uw_h3v2_neon_8;
-+        put_hevc_qpel_uw_neon[3][1]      = ff_hevc_rpi_put_qpel_uw_h1v3_neon_8;
-+        put_hevc_qpel_uw_neon[3][2]      = ff_hevc_rpi_put_qpel_uw_h2v3_neon_8;
-+        put_hevc_qpel_uw_neon[3][3]      = ff_hevc_rpi_put_qpel_uw_h3v3_neon_8;
-+        for (x = 0; x < 10; x++) {
-+            c->put_hevc_qpel[x][1][0]         = ff_hevc_rpi_put_qpel_neon_wrapper;
-+            c->put_hevc_qpel[x][0][1]         = ff_hevc_rpi_put_qpel_neon_wrapper;
-+            c->put_hevc_qpel[x][1][1]         = ff_hevc_rpi_put_qpel_neon_wrapper;
-+            c->put_hevc_qpel_uni[x][1][0]     = ff_hevc_rpi_put_qpel_uni_neon_wrapper;
-+            c->put_hevc_qpel_uni[x][0][1]     = ff_hevc_rpi_put_qpel_uni_neon_wrapper;
-+            c->put_hevc_qpel_uni[x][1][1]     = ff_hevc_rpi_put_qpel_uni_neon_wrapper;
-+            c->put_hevc_qpel_bi[x][1][0]      = ff_hevc_rpi_put_qpel_bi_neon_wrapper;
-+            c->put_hevc_qpel_bi[x][0][1]      = ff_hevc_rpi_put_qpel_bi_neon_wrapper;
-+            c->put_hevc_qpel_bi[x][1][1]      = ff_hevc_rpi_put_qpel_bi_neon_wrapper;
-+            c->put_hevc_epel[x][1][0]         = ff_hevc_rpi_put_epel_v_neon_8;
-+            c->put_hevc_epel[x][0][1]         = ff_hevc_rpi_put_epel_h_neon_8;
-+            c->put_hevc_epel[x][1][1]         = ff_hevc_rpi_put_epel_hv_neon_8;
-+        }
-+        c->put_hevc_epel[0][0][0]  = ff_hevc_rpi_put_pixels_w2_neon_8;
-+        c->put_hevc_epel[1][0][0]  = ff_hevc_rpi_put_pixels_w4_neon_8;
-+        c->put_hevc_epel[2][0][0]  = ff_hevc_rpi_put_pixels_w6_neon_8;
-+        c->put_hevc_epel[3][0][0]  = ff_hevc_rpi_put_pixels_w8_neon_8;
-+        c->put_hevc_epel[4][0][0]  = ff_hevc_rpi_put_pixels_w12_neon_8;
-+        c->put_hevc_epel[5][0][0]  = ff_hevc_rpi_put_pixels_w16_neon_8;
-+        c->put_hevc_epel[6][0][0]  = ff_hevc_rpi_put_pixels_w24_neon_8;
-+        c->put_hevc_epel[7][0][0]  = ff_hevc_rpi_put_pixels_w32_neon_8;
-+        c->put_hevc_epel[8][0][0]  = ff_hevc_rpi_put_pixels_w48_neon_8;
-+        c->put_hevc_epel[9][0][0]  = ff_hevc_rpi_put_pixels_w64_neon_8;
-+
-+        c->put_hevc_qpel[0][0][0]  = ff_hevc_rpi_put_pixels_w2_neon_8;
-+        c->put_hevc_qpel[1][0][0]  = ff_hevc_rpi_put_pixels_w4_neon_8;
-+        c->put_hevc_qpel[2][0][0]  = ff_hevc_rpi_put_pixels_w6_neon_8;
-+        c->put_hevc_qpel[3][0][0]  = ff_hevc_rpi_put_pixels_w8_neon_8;
-+        c->put_hevc_qpel[4][0][0]  = ff_hevc_rpi_put_pixels_w12_neon_8;
-+        c->put_hevc_qpel[5][0][0]  = ff_hevc_rpi_put_pixels_w16_neon_8;
-+        c->put_hevc_qpel[6][0][0]  = ff_hevc_rpi_put_pixels_w24_neon_8;
-+        c->put_hevc_qpel[7][0][0]  = ff_hevc_rpi_put_pixels_w32_neon_8;
-+        c->put_hevc_qpel[8][0][0]  = ff_hevc_rpi_put_pixels_w48_neon_8;
-+        c->put_hevc_qpel[9][0][0]  = ff_hevc_rpi_put_pixels_w64_neon_8;
-+
-+        c->put_hevc_qpel_uni[1][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w4_neon_8;
-+        c->put_hevc_qpel_uni[3][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w8_neon_8;
-+        c->put_hevc_qpel_uni[5][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w16_neon_8;
-+        c->put_hevc_qpel_uni[6][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w24_neon_8;
-+        c->put_hevc_qpel_uni[7][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w32_neon_8;
-+        c->put_hevc_qpel_uni[8][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w48_neon_8;
-+        c->put_hevc_qpel_uni[9][0][0]  = ff_hevc_rpi_put_qpel_uw_pixels_w64_neon_8;
 +    }
 +    else if (bit_depth == 10) {
 +        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_10;
@@ -4783,1011 +4326,6 @@ index 0000000000..472d9d75c9
 +    assert(offsetof(MvField, pred_flag) == 10);
 +    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
 +}
-diff --git a/libavcodec/arm/rpi_hevcdsp_qpel_neon.S b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S
-new file mode 100644
-index 0000000000..86a9dcc377
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_qpel_neon.S
-@@ -0,0 +1,999 @@
-+/*
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+#define MAX_PB_SIZE #64
-+
-+.macro regshuffle_d8
-+    vmov d16, d17
-+    vmov d17, d18
-+    vmov d18, d19
-+    vmov d19, d20
-+    vmov d20, d21
-+    vmov d21, d22
-+    vmov d22, d23
-+.endm
-+
-+.macro regshuffle_q8
-+    vmov q0, q1
-+    vmov q1, q2
-+    vmov q2, q3
-+    vmov q3, q4
-+    vmov q4, q5
-+    vmov q5, q6
-+    vmov q6, q7
-+.endm
-+
-+.macro vextin8
-+        pld       [r2]
-+        vld1.8    {q11}, [r2], r3
-+        vext.8    d16, d22, d23, #1
-+        vext.8    d17, d22, d23, #2
-+        vext.8    d18, d22, d23, #3
-+        vext.8    d19, d22, d23, #4
-+        vext.8    d20, d22, d23, #5
-+        vext.8    d21, d22, d23, #6
-+        vext.8    d22, d22, d23, #7
-+.endm
-+
-+.macro loadin8
-+        pld       [r2]
-+        vld1.8    {d16}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d17}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d18}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d19}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d20}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d21}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d22}, [r2], r3
-+        pld       [r2]
-+        vld1.8    {d23}, [r2], r3
-+.endm
-+
-+.macro qpel_filter_1_32b
-+        vmov.i16   d16, #58
-+        vmov.i16   d17, #10
-+        vmull.s16   q9, d6, d16   // 58 * d0
-+        vmull.s16  q10, d7, d16   // 58 * d1
-+        vmov.i16   d16, #17
-+        vmull.s16  q11, d4, d17   // 10 * c0
-+        vmull.s16  q12, d5, d17   // 10 * c1
-+        vmov.i16   d17, #5
-+        vmull.s16  q13, d8, d16   // 17 * e0
-+        vmull.s16  q14, d9, d16   // 17 * e1
-+        vmull.s16  q15, d10, d17  //  5 * f0
-+        vmull.s16   q8, d11, d17  //  5 * f1
-+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
-+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
-+        vshll.s16  q11, d2, #2    // 4 * b0
-+        vshll.s16  q12, d3, #2    // 4 * b1
-+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
-+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
-+        vsubl.s16  q13, d12, d0   // g0 - a0
-+        vsubl.s16  q14, d13, d1   // g1 - a1
-+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
-+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
-+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
-+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
-+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
-+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
-+        vqshrn.s32  d16, q9, #6
-+        vqshrn.s32  d17, q10, #6
-+.endm
-+
-+// input  q0 - q7
-+// output q8
-+.macro qpel_filter_2_32b
-+        vmov.i32   q8, #11
-+        vaddl.s16   q9, d6, d8   // d0 + e0
-+        vaddl.s16  q10, d7, d9   // d1 + e1
-+        vaddl.s16  q11, d4, d10  // c0 + f0
-+        vaddl.s16  q12, d5, d11  // c1 + f1
-+        vmul.s32   q11, q8       // 11 * (c0 + f0)
-+        vmul.s32   q12, q8       // 11 * (c1 + f1)
-+        vmov.i32   q8, #40
-+        vaddl.s16  q15, d2, d12  // b0 + g0
-+        vmul.s32    q9, q8       // 40 * (d0 + e0)
-+        vmul.s32   q10, q8       // 40 * (d1 + e1)
-+        vaddl.s16   q8, d3, d13  // b1 + g1
-+        vaddl.s16  q13, d0, d14  // a0 + h0
-+        vaddl.s16  q14, d1, d15  // a1 + h1
-+        vshl.s32   q15, #2       // 4*(b0+g0)
-+        vshl.s32    q8, #2       // 4*(b1+g1)
-+        vadd.s32   q11, q13      // 11 * (c0 + f0) + a0 + h0
-+        vadd.s32   q12, q14      // 11 * (c1 + f1) + a1 + h1
-+        vadd.s32   q9, q15       // 40 * (d0 + e0) + 4*(b0+g0)
-+        vadd.s32   q10, q8       // 40 * (d1 + e1) + 4*(b1+g1)
-+        vsub.s32   q9, q11       // 40 * (d0 + e0) + 4*(b0+g0) - (11 * (c0 + f0) + a0 + h0)
-+        vsub.s32   q10, q12      // 40 * (d1 + e1) + 4*(b1+g1) - (11 * (c1 + f1) + a1 + h1)
-+        vqshrn.s32  d16, q9, #6
-+        vqshrn.s32  d17, q10, #6
-+.endm
-+
-+.macro qpel_filter_3_32b
-+        vmov.i16   d16, #58
-+        vmov.i16   d17, #10
-+        vmull.s16   q9, d8, d16   // 58 * d0
-+        vmull.s16  q10, d9, d16   // 58 * d1
-+        vmov.i16   d16, #17
-+        vmull.s16  q11, d10, d17  // 10 * c0
-+        vmull.s16  q12, d11, d17  // 10 * c1
-+        vmov.i16   d17, #5
-+        vmull.s16  q13, d6, d16   // 17 * e0
-+        vmull.s16  q14, d7, d16   // 17 * e1
-+        vmull.s16  q15, d4, d17   //  5 * f0
-+        vmull.s16   q8, d5, d17   //  5 * f1
-+        vsub.s32    q9, q11       // 58 * d0 - 10 * c0
-+        vsub.s32   q10, q12       // 58 * d1 - 10 * c1
-+        vshll.s16  q11, d12, #2   // 4 * b0
-+        vshll.s16  q12, d13, #2   // 4 * b1
-+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0
-+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1
-+        vsubl.s16  q13, d2, d14   // g0 - a0
-+        vsubl.s16  q14, d3, d15   // g1 - a1
-+        vadd.s32    q9, q11       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0
-+        vadd.s32   q10, q12       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1
-+        vsub.s32   q13, q15       // g0 - a0 - 5 * f0
-+        vsub.s32   q14, q8        // g1 - a1 - 5 * f1
-+        vadd.s32    q9, q13       // 58 * d0 - 10 * c0 + 17 * e0 + 4 * b0 + g0 - a0 - 5 * f0
-+        vadd.s32   q10, q14       // 58 * d1 - 10 * c1 + 17 * e1 + 4 * b1 + g1 - a1 - 5 * f1
-+        vqshrn.s32  d16, q9, #6
-+        vqshrn.s32  d17, q10, #6
-+.endm
-+
-+.macro qpel_filter_1 out=q7
-+        vmov.u8    d24, #58
-+        vmov.u8    d25, #10
-+        vshll.u8   q13, d20, #4   // 16*e
-+        vshll.u8   q14, d21, #2   // 4*f
-+        vmull.u8  \out, d19, d24  // 58*d
-+        vaddw.u8   q13, q13, d20  // 17*e
-+        vmull.u8   q15, d18, d25  // 10*c
-+        vaddw.u8   q14, q14, d21  // 5*f
-+        vsubl.u8   q12, d22, d16  // g - a
-+        vadd.u16  \out, q13       // 58d + 17e
-+        vshll.u8   q13, d17, #2   // 4*b
-+        vadd.u16   q15, q14       // 10*c + 5*f
-+        vadd.s16   q13, q12       // - a + 4*b + g
-+        vsub.s16  \out, q15       // -10*c + 58*d + 17*e -5*f
-+        vadd.s16  \out, q13       // -a + 4*b -10*c + 58*d + 17*e -5*f
-+.endm
-+
-+.macro qpel_filter_2 out=q7
-+        vmov.i16   q12, #10
-+        vmov.i16   q14, #11
-+        vaddl.u8   q13, d19, d20   // d + e
-+        vaddl.u8   q15, d18, d21   // c + f
-+        vmul.u16   q13, q12        // 10 * (d+e)
-+        vmul.u16   q15, q14        // 11 * ( c + f)
-+        vaddl.u8  \out, d17, d22   // b + g
-+        vaddl.u8   q12, d16, d23   // a + h
-+        vadd.u16  \out, q13        // b + 10 * (d + e) + g
-+        vadd.s16   q12, q15
-+        vshl.u16  \out, #2         // 4 * (b + 10 * (d + e) + g)
-+        vsub.s16  \out, q12
-+.endm
-+
-+.macro qpel_filter_3 out=q7
-+        vmov.u8    d24, #58
-+        vmov.u8    d25, #10
-+        vshll.u8   q13, d19, #4     // 16*e
-+        vshll.u8   q14, d18, #2     // 4*f
-+        vmull.u8  \out, d20, d24    // 58*d
-+        vaddw.u8   q13, q13, d19    // 17*e
-+        vmull.u8   q15, d21, d25    // 10*c
-+        vaddw.u8   q14, q14, d18    // 5*f
-+        vsubl.u8   q12, d17, d23    // g - a
-+        vadd.u16  \out, q13         // 58d + 17e
-+        vshll.u8   q13, d22, #2     // 4*b
-+        vadd.u16   q15, q14         // 10*c + 5*f
-+        vadd.s16   q13, q12         // - a + 4*b + g
-+        vsub.s16  \out, q15         // -10*c + 58*d + 17*e -5*f
-+        vadd.s16  \out, q13         // -a + 4*b -10*c + 58*d + 17*e -5*f
-+.endm
-+
-+.macro  hevc_put_qpel_vX_neon_8 filter
-+        push   {r4, r5, r6, r7}
-+        ldr    r4, [sp, #16] // height
-+        ldr    r5, [sp, #20] // width
-+        vpush {d8-d15}
-+        sub       r2, r2, r3, lsl #1
-+        sub       r2, r3
-+        mov       r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+        lsl       r1, #1
-+0:      loadin8
-+        cmp       r5, #4
-+        beq       4f
-+8:      subs r4, #1
-+        \filter
-+        vst1.16    {q7}, [r0], r1
-+        regshuffle_d8
-+        vld1.8    {d23}, [r2], r3
-+        bne 8b
-+        subs  r5, #8
-+        beq       99f
-+        mov r4, r12
-+        add r6, #16
-+        mov r0, r6
-+        add r7, #8
-+        mov r2, r7
-+        b     0b
-+4:      subs r4, #1
-+        \filter
-+        vst1.16    d14, [r0], r1
-+        regshuffle_d8
-+        vld1.32    {d23[0]}, [r2], r3
-+        bne 4b
-+99:     vpop {d8-d15}
-+        pop {r4, r5, r6, r7}
-+        bx lr
-+.endm
-+
-+.macro  hevc_put_qpel_uw_vX_neon_8 filter
-+        push   {r4-r10}
-+        ldr    r5, [sp, #28] // width
-+        ldr    r4, [sp, #32] // height
-+        ldr    r8, [sp, #36] // src2
-+        ldr    r9, [sp, #40] // src2stride
-+        vpush {d8-d15}
-+        sub       r2, r2, r3, lsl #1
-+        sub       r2, r3
-+        mov       r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+        cmp       r8, #0
-+        bne       .Lbi\@
-+0:      loadin8
-+        cmp       r5, #4
-+        beq       4f
-+8:      subs r4, #1
-+        \filter
-+        vqrshrun.s16   d0, q7, #6
-+        vst1.8    d0, [r0], r1
-+        regshuffle_d8
-+        vld1.8    {d23}, [r2], r3
-+        bne 8b
-+        subs  r5, #8
-+        beq       99f
-+        mov r4, r12
-+        add r6, #8
-+        mov r0, r6
-+        add r7, #8
-+        mov r2, r7
-+        b     0b
-+4:      subs r4, #1
-+        \filter
-+        vqrshrun.s16   d0, q7, #6
-+        vst1.32    d0[0], [r0], r1
-+        regshuffle_d8
-+        vld1.32    {d23[0]}, [r2], r3
-+        bne 4b
-+        b   99f
-+.Lbi\@: lsl       r9, #1
-+        mov       r10, r8
-+0:      loadin8
-+        cmp       r5, #4
-+        beq       4f
-+8:      subs r4, #1
-+        \filter
-+        vld1.16        {q0}, [r8], r9
-+        vqadd.s16      q0, q7
-+        vqrshrun.s16   d0, q0, #7
-+        vst1.8         d0, [r0], r1
-+        regshuffle_d8
-+        vld1.8    {d23}, [r2], r3
-+        bne 8b
-+        subs  r5, #8
-+        beq       99f
-+        mov r4, r12
-+        add r6, #8
-+        mov r0, r6
-+        add r10, #16
-+        mov r8, r10
-+        add r7, #8
-+        mov r2, r7
-+        b     0b
-+4:      subs r4, #1
-+        \filter
-+        vld1.16      d0, [r8], r9
-+        vqadd.s16    d0, d14
-+        vqrshrun.s16 d0, q0, #7
-+        vst1.32      d0[0], [r0], r1
-+        regshuffle_d8
-+        vld1.32    {d23[0]}, [r2], r3
-+        bne 4b
-+99:     vpop {d8-d15}
-+        pop {r4-r10}
-+        bx lr
-+.endm
-+
-+function ff_hevc_rpi_put_qpel_v1_neon_8, export=1
-+        hevc_put_qpel_vX_neon_8 qpel_filter_1
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_v2_neon_8, export=1
-+        hevc_put_qpel_vX_neon_8 qpel_filter_2
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_v3_neon_8, export=1
-+        hevc_put_qpel_vX_neon_8 qpel_filter_3
-+endfunc
-+
-+
-+function ff_hevc_rpi_put_qpel_uw_v1_neon_8, export=1
-+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_1
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_v2_neon_8, export=1
-+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_2
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_v3_neon_8, export=1
-+        hevc_put_qpel_uw_vX_neon_8 qpel_filter_3
-+endfunc
-+
-+.macro hevc_put_qpel_hX_neon_8 filter
-+        push     {r4, r5, r6, r7}
-+        ldr    r4, [sp, #16] // height
-+        ldr    r5, [sp, #20] // width
-+
-+        vpush    {d8-d15}
-+        sub       r2, #4
-+        lsl       r1, #1
-+        mov      r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+        cmp       r5, #4
-+        beq       4f
-+8:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vst1.16   {q7}, [r0], r1
-+        bne       8b
-+        subs      r5, #8
-+        beq      99f
-+        mov       r4, r12
-+        add       r6, #16
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r2, r7
-+        cmp       r5, #4
-+        bne       8b
-+4:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vst1.16  d14, [r0], r1
-+        bne       4b
-+99:     vpop     {d8-d15}
-+        pop      {r4, r5, r6, r7}
-+        bx lr
-+.endm
-+
-+.macro hevc_put_qpel_uw_hX_neon_8 filter
-+        push     {r4-r10}
-+        ldr       r5, [sp, #28] // width
-+        ldr       r4, [sp, #32] // height
-+        ldr       r8, [sp, #36] // src2
-+        ldr       r9, [sp, #40] // src2stride
-+        vpush    {d8-d15}
-+        sub       r2, #4
-+        mov      r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+        cmp       r8, #0
-+        bne       .Lbi\@
-+        cmp       r5, #4
-+        beq       4f
-+8:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vqrshrun.s16   d0, q7, #6
-+        vst1.8    d0, [r0], r1
-+        bne       8b
-+        subs      r5, #8
-+        beq      99f
-+        mov       r4, r12
-+        add       r6, #8
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r2, r7
-+        cmp       r5, #4
-+        bne       8b
-+4:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vqrshrun.s16   d0, q7, #6
-+        vst1.32  d0[0], [r0], r1
-+        bne       4b
-+        b         99f
-+.Lbi\@:
-+        lsl       r9, #1
-+        cmp       r5, #4
-+        beq       4f
-+        mov       r10, r8
-+8:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vld1.16        {q0}, [r8], r9
-+        vqadd.s16      q0, q7
-+        vqrshrun.s16   d0, q0, #7
-+        vst1.8         d0, [r0], r1
-+        bne       8b
-+        subs      r5, #8
-+        beq      99f
-+        mov       r4, r12
-+        add       r6, #8
-+        add       r10, #16
-+        mov       r8, r10
-+        mov       r0, r6
-+        add       r7, #8
-+        mov       r2, r7
-+        cmp       r5, #4
-+        bne       8b
-+4:      subs      r4, #1
-+        vextin8
-+        \filter
-+        vld1.16      d0, [r8], r9
-+        vqadd.s16    d0, d14
-+        vqrshrun.s16 d0, q0, #7
-+        vst1.32      d0[0], [r0], r1
-+        bne       4b
-+99:     vpop     {d8-d15}
-+        pop      {r4-r10}
-+        bx lr
-+.endm
-+
-+function ff_hevc_rpi_put_qpel_h1_neon_8, export=1
-+        hevc_put_qpel_hX_neon_8 qpel_filter_1
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h2_neon_8, export=1
-+        hevc_put_qpel_hX_neon_8 qpel_filter_2
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h3_neon_8, export=1
-+        hevc_put_qpel_hX_neon_8 qpel_filter_3
-+endfunc
-+
-+
-+function ff_hevc_rpi_put_qpel_uw_h1_neon_8, export=1
-+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_1
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h2_neon_8, export=1
-+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_2
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h3_neon_8, export=1
-+        hevc_put_qpel_uw_hX_neon_8 qpel_filter_3
-+endfunc
-+
-+.macro hevc_put_qpel_hXvY_neon_8 filterh filterv
-+        push   {r4, r5, r6, r7}
-+        ldr    r4, [sp, #16] // height
-+        ldr    r5, [sp, #20] // width
-+
-+        vpush {d8-d15}
-+        sub       r2, #4
-+        sub       r2, r2, r3, lsl #1
-+        sub       r2, r3  // extra_before 3
-+        lsl       r1, #1
-+        mov       r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+0:      vextin8
-+        \filterh q0
-+        vextin8
-+        \filterh q1
-+        vextin8
-+        \filterh q2
-+        vextin8
-+        \filterh q3
-+        vextin8
-+        \filterh q4
-+        vextin8
-+        \filterh q5
-+        vextin8
-+        \filterh q6
-+        vextin8
-+        \filterh q7
-+        cmp r5, #4
-+        beq 4f
-+8:      subs  r4, #1
-+        \filterv
-+        vst1.16    {q8}, [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 8b
-+        subs  r5, #8
-+        beq 99f
-+        mov r4, r12
-+        add r6, #16
-+        mov r0, r6
-+        add r7, #8
-+        mov r2, r7
-+        b 0b
-+4:      subs  r4, #1
-+        \filterv
-+        vst1.16    d16, [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 4b
-+99:     vpop {d8-d15}
-+        pop {r4, r5, r6, r7}
-+        bx lr
-+.endm
-+
-+.macro hevc_put_qpel_uw_hXvY_neon_8 filterh filterv
-+        push     {r4-r10}
-+        ldr       r5, [sp, #28] // width
-+        ldr       r4, [sp, #32] // height
-+        ldr       r8, [sp, #36] // src2
-+        ldr       r9, [sp, #40] // src2stride
-+        vpush {d8-d15}
-+        sub       r2, #4
-+        sub       r2, r2, r3, lsl #1
-+        sub       r2, r3  // extra_before 3
-+        mov       r12, r4
-+        mov       r6, r0
-+        mov       r7, r2
-+        cmp       r8, #0
-+        bne       .Lbi\@
-+0:      vextin8
-+        \filterh q0
-+        vextin8
-+        \filterh q1
-+        vextin8
-+        \filterh q2
-+        vextin8
-+        \filterh q3
-+        vextin8
-+        \filterh q4
-+        vextin8
-+        \filterh q5
-+        vextin8
-+        \filterh q6
-+        vextin8
-+        \filterh q7
-+        cmp r5, #4
-+        beq 4f
-+8:      subs  r4, #1
-+        \filterv
-+        vqrshrun.s16   d0, q8, #6
-+        vst1.8    d0, [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 8b
-+        subs  r5, #8
-+        beq 99f
-+        mov r4, r12
-+        add r6, #8
-+        mov r0, r6
-+        add r7, #8
-+        mov r2, r7
-+        b 0b
-+4:      subs  r4, #1
-+        \filterv
-+        vqrshrun.s16   d0, q8, #6
-+        vst1.32        d0[0], [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 4b
-+        b   99f
-+.Lbi\@: lsl      r9, #1
-+        mov      r10, r8
-+0:      vextin8
-+        \filterh q0
-+        vextin8
-+        \filterh q1
-+        vextin8
-+        \filterh q2
-+        vextin8
-+        \filterh q3
-+        vextin8
-+        \filterh q4
-+        vextin8
-+        \filterh q5
-+        vextin8
-+        \filterh q6
-+        vextin8
-+        \filterh q7
-+        cmp r5, #4
-+        beq 4f
-+8:      subs  r4, #1
-+        \filterv
-+        vld1.16        {q0}, [r8], r9
-+        vqadd.s16      q0, q8
-+        vqrshrun.s16   d0, q0, #7
-+        vst1.8         d0, [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 8b
-+        subs  r5, #8
-+        beq 99f
-+        mov r4, r12
-+        add r6, #8
-+        mov r0, r6
-+        add r10, #16
-+        mov r8, r10
-+        add r7, #8
-+        mov r2, r7
-+        b 0b
-+4:      subs  r4, #1
-+        \filterv
-+        vld1.16      d0, [r8], r9
-+        vqadd.s16    d0, d16
-+        vqrshrun.s16 d0, q0, #7
-+        vst1.32      d0[0], [r0], r1
-+        regshuffle_q8
-+        vextin8
-+        \filterh q7
-+        bne 4b
-+99:     vpop {d8-d15}
-+        pop {r4-r10}
-+        bx lr
-+.endm
-+
-+
-+function ff_hevc_rpi_put_qpel_h1v1_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h2v1_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h3v1_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h1v2_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h2v2_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h3v2_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h1v3_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h2v3_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_h3v3_neon_8, export=1
-+        hevc_put_qpel_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
-+endfunc
-+
-+
-+function ff_hevc_rpi_put_qpel_uw_h1v1_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h2v1_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h3v1_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_1_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h1v2_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h2v2_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h3v2_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_2_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h1v3_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_1 qpel_filter_3_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h2v3_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_2 qpel_filter_3_32b
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_h3v3_neon_8, export=1
-+        hevc_put_qpel_uw_hXvY_neon_8 qpel_filter_3 qpel_filter_3_32b
-+endfunc
-+
-+.macro init_put_pixels
-+        pld    [r1]
-+        pld    [r1, r2]
-+        mov    r12, MAX_PB_SIZE
-+        lsl    r12, #1
-+.endm
-+
-+function ff_hevc_rpi_put_pixels_w2_neon_8, export=1
-+        init_put_pixels
-+        vmov.u8      d5, #255
-+        vshr.u64     d5, #32
-+0:      subs r3, #1
-+        vld1.32     {d0[0]}, [r1], r2
-+        pld [r1]
-+        vld1.32     d6, [r0]
-+        vshll.u8    q0, d0, #6
-+        vbit        d6, d0, d5
-+        vst1.32     d6, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w4_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #2
-+        vld1.32   {d0[0]}, [r1], r2
-+        vld1.32   {d0[1]}, [r1], r2
-+        pld       [r1]
-+        pld       [r1, r2]
-+        vshll.u8   q0, d0, #6
-+        vst1.64   {d0}, [r0], r12
-+        vst1.64   {d1}, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w6_neon_8, export=1
-+        init_put_pixels
-+        vmov.u8      q10, #255
-+        vshr.u64     d21, #32
-+0:      subs r3, #1
-+        vld1.16     {d0}, [r1], r2
-+        pld [r1]
-+        vshll.u8    q0, d0, #6
-+        vld1.8      {q12}, [r0]
-+        vbit        q12, q0, q10
-+        vst1.8      {q12}, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w8_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #2
-+        vld1.8   {d0}, [r1], r2
-+        vld1.8   {d2}, [r1], r2
-+        pld        [r1]
-+        pld        [r1, r2]
-+        vshll.u8   q0, d0, #6
-+        vshll.u8   q1, d2, #6
-+        vst1.16   {q0}, [r0], r12
-+        vst1.16   {q1}, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w12_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #2
-+        vld1.64    {d0}, [r1]
-+        add       r1, #8
-+        vld1.32   {d1[0]}, [r1], r2
-+        sub       r1, #8
-+        vld1.64    {d2}, [r1]
-+        add       r1, #8
-+        vld1.32   {d1[1]}, [r1], r2
-+        sub       r1, #8
-+        pld       [r1]
-+        pld       [r1, r2]
-+        vshll.u8  q8, d0, #6
-+        vshll.u8  q9, d1, #6
-+        vshll.u8  q10, d2, #6
-+        vmov      d22, d19
-+        vst1.64   {d16, d17, d18}, [r0], r12
-+        vst1.64   {d20, d21, d22}, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w16_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #2
-+        vld1.8   {q0}, [r1], r2
-+        vld1.8   {q1}, [r1], r2
-+        pld       [r1]
-+        pld       [r1, r2]
-+        vshll.u8  q8, d0, #6
-+        vshll.u8  q9, d1, #6
-+        vshll.u8  q10, d2, #6
-+        vshll.u8  q11, d3, #6
-+        vst1.8    {q8, q9}, [r0], r12
-+        vst1.8    {q10, q11}, [r0], r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w24_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #1
-+        vld1.8   {d0, d1, d2}, [r1], r2
-+        pld       [r1]
-+        vshll.u8  q10, d0, #6
-+        vshll.u8  q11, d1, #6
-+        vshll.u8  q12, d2, #6
-+        vstm     r0, {q10, q11, q12}
-+        add      r0, r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w32_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #1
-+        vld1.8 {q0, q1}, [r1], r2
-+        pld       [r1]
-+        vshll.u8  q8, d0, #6
-+        vshll.u8  q9, d1, #6
-+        vshll.u8  q10, d2, #6
-+        vshll.u8  q11, d3, #6
-+        vstm    r0, {q8, q9, q10, q11}
-+        add     r0, r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w48_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #1
-+        vld1.8    {q0, q1}, [r1]
-+        add r1, #32
-+        vld1.8    {q2}, [r1], r2
-+        sub r1, #32
-+        pld       [r1]
-+        vshll.u8  q8, d0, #6
-+        vshll.u8  q9, d1, #6
-+        vshll.u8  q10, d2, #6
-+        vshll.u8  q11, d3, #6
-+        vshll.u8  q12, d4, #6
-+        vshll.u8  q13, d5, #6
-+        vstm r0, {q8, q9, q10, q11, q12, q13}
-+        add  r0, r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_pixels_w64_neon_8, export=1
-+        init_put_pixels
-+0:      subs r3, #1
-+        vld1.8    {q0, q1}, [r1]
-+        add      r1, #32
-+        vld1.8    {q2, q3}, [r1], r2
-+        sub      r1, #32
-+        pld       [r1]
-+        vshll.u8  q8, d0, #6
-+        vshll.u8  q9, d1, #6
-+        vshll.u8  q10, d2, #6
-+        vshll.u8  q11, d3, #6
-+        vshll.u8  q12, d4, #6
-+        vshll.u8  q13, d5, #6
-+        vshll.u8  q14, d6, #6
-+        vshll.u8  q15, d7, #6
-+        vstm    r0, {q8, q9, q10, q11, q12, q13, q14, q15}
-+        add r0, r12
-+        bne 0b
-+        bx lr
-+endfunc
-+
-+function ff_hevc_rpi_put_qpel_uw_pixels_neon_8, export=1
-+        push   {r4-r9}
-+        ldr    r5, [sp, #24] // width
-+        ldr    r4, [sp, #28] // height
-+        ldr    r8, [sp, #32] // src2
-+        ldr    r9, [sp, #36] // src2stride
-+        vpush {d8-d15}
-+        cmp    r8, #0
-+        bne    2f
-+1:      subs r4, #1
-+        vld1.8     {d0}, [r2], r3
-+        vst1.8      d0, [r0], r1
-+        bne 1b
-+        vpop {d8-d15}
-+        pop   {r4-r9}
-+        bx lr
-+2:      subs  r4, #1
-+        vld1.8         {d0}, [r2], r3
-+        vld1.16        {q1}, [r8], r9
-+        vshll.u8       q0, d0, #6
-+        vqadd.s16      q0, q1
-+        vqrshrun.s16   d0, q0, #7
-+        vst1.8      d0, [r0], r1
-+        bne 2b
-+        vpop {d8-d15}
-+        pop   {r4-r9}
-+        bx lr
-+endfunc
-+
-+.macro put_qpel_uw_pixels width, regs, regs2, regs3, regs4
-+function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1
-+        ldr    r12, [sp] // height
-+1:      subs   r12, #4
-+        vld1.32     {\regs}  , [r2], r3
-+        vld1.32     {\regs2} , [r2], r3
-+        vld1.32     {\regs3} , [r2], r3
-+        vld1.32     {\regs4} , [r2], r3
-+        vst1.32     {\regs}  , [r0], r1
-+        vst1.32     {\regs2} , [r0], r1
-+        vst1.32     {\regs3} , [r0], r1
-+        vst1.32     {\regs4} , [r0], r1
-+        bne 1b
-+        bx lr
-+endfunc
-+.endm
-+
-+.macro put_qpel_uw_pixels_m width, regs, regs2, regs3, regs4
-+function ff_hevc_rpi_put_qpel_uw_pixels_w\width\()_neon_8, export=1
-+        push   {r4-r5}
-+        ldr    r12, [sp, #8] // height
-+1:      subs r12, #2
-+        mov      r4, r2
-+        vld1.32   {\regs} , [r2]!
-+        vld1.32   {\regs2} , [r2]
-+        add      r2, r4, r3
-+        mov      r4, r2
-+        vld1.32   {\regs3} , [r2]!
-+        vld1.32   {\regs4} , [r2]
-+        add      r2, r4, r3
-+        mov      r5, r0
-+        vst1.32   {\regs} , [r0]!
-+        vst1.32   {\regs2} , [r0]
-+        add      r0, r5, r1
-+        mov      r5, r0
-+        vst1.32   {\regs3} , [r0]!
-+        vst1.32   {\regs4} , [r0]
-+        add      r0, r5, r1
-+        bne 1b
-+        pop   {r4-r5}
-+        bx lr
-+endfunc
-+.endm
-+
-+put_qpel_uw_pixels    4, d0[0], d0[1], d1[0], d1[1]
-+put_qpel_uw_pixels    8, d0,    d1,    d2,    d3
-+put_qpel_uw_pixels_m 12, d0,    d1[0], d2,    d3[0]
-+put_qpel_uw_pixels   16, q0,    q1,    q2,    q3
-+put_qpel_uw_pixels   24, d0-d2, d3-d5, d16-d18, d19-d21
-+put_qpel_uw_pixels   32, q0-q1, q2-q3, q8-q9, q10-q11
-+put_qpel_uw_pixels_m 48, q0-q1, q2,    q8-q9, q10
-+put_qpel_uw_pixels_m 64, q0-q1, q2-q3, q8-q9, q10-q11
 diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
 new file mode 100644
 index 0000000000..7dfcc2751a
@@ -6406,12 +4944,13 @@ index 0000000000..7dfcc2751a
 +
 diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
 new file mode 100644
-index 0000000000..8c32cb23e7
+index 0000000000..b56dc8ccc5
 --- /dev/null
 +++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
-@@ -0,0 +1,1882 @@
+@@ -0,0 +1,2156 @@
 +/*
 + * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *               2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
 + *
 + * This file is part of FFmpeg.
 + *
@@ -7245,9 +5784,8 @@ index 0000000000..8c32cb23e7
 +        vadd.s8  q12, q6, q15   @ Add -128 so we can use saturating signed add
 +
 +        vtbl.8   d6,  {d27}, d6
-+        vadd.s8  q14, q7, q15   @ Add -128 so we can use saturating signed add
-+
 +        vtbl.8   d7,  {d27}, d7
++        vadd.s8  q14, q7, q15   @ Add -128 so we can use saturating signed add
 +        vzip.8   q2,  q3
 +
 +        vsub.s8  q0,  q15
@@ -7309,33 +5847,36 @@ index 0000000000..8c32cb23e7
 +
 +        vadd.s16 q0, q0, q12  // a = sign(c-a) + sign(c-b)
 +        vadd.s16 q1, q1, q13
-+        vmov.u8  q12, #2
 +        vadd.s16 q2, q2, q14
 +        vadd.s16 q3, q3, q15
 +
++        vmov.u8  q12, #2
++
 +        vmovn.s16 d0, q0
 +        vmovn.s16 d1, q1
 +        vmovn.s16 d2, q2
 +        vmovn.s16 d3, q3
 +
++        vldr     d26, [r5]
++
 +        vuzp.8   q0, q1
 +
-+        vld1.8   {d26, d27}, [r5]
++        vldr     d27, [r5, #8]
 +
 +        vadd.s8  q0, q0, q12
 +        vadd.s8  q1, q1, q12
 +
++        vmov.i64 q12, #0
++
 +        vtbl.8   d0, {d26}, d0
 +        vtbl.8   d1, {d26}, d1
 +        vtbl.8   d2, {d27}, d2
 +        vtbl.8   d3, {d27}, d3
 +
-+        vmov.i64 q12, #0
++        vdup.i16 q13, r4
 +
 +        vzip.8   q0, q1
 +
-+        vdup.i16 q13, r4
-+
 +        @ Avoid overwrite whilst widening
 +        vaddw.s8 q2, q6, d2
 +        vaddw.s8 q3, q7, d3
@@ -7360,19 +5901,19 @@ index 0000000000..8c32cb23e7
 +@ q15.u8 #128
 +
 +function edge_16b_body_8
-+        vcgt.u8  q3,  q1,  q0   @ c > a -> -1 , otherwise 0
-+        vcgt.u8  q0,  q1        @ a > c -> -1 , otherwise 0
-+        vcgt.u8  q9,  q1,  q2   @ c > b -> -1 , otherwise 0
-+        vcgt.u8  q10, q2,  q1   @ c < b -> -1 , otherwise 0
++        vcgt.u8  q9,  q0,  q1   @ a > c -> -1 , otherwise 0
++        vadd.u8  q9,  q14, q9
++        vcgt.u8  q0,  q1,  q0   @ c > a -> -1 , otherwise 0
++        vsub.u8  q9,  q9,  q0
++        vcgt.u8  q0,  q2,  q1   @ c < b -> -1 , otherwise 0
++        vadd.u8  q9,  q9,  q0
++        vcgt.u8  q0,  q1,  q2   @ c > b -> -1 , otherwise 0
++        vsub.u8  q0,  q9,  q0
 +
-+        vsub.s8  q0,  q3
-+        vsub.s8  q10, q9
-+        vadd.s8  q0,  q10       @ a = sign(c-a)
-+
-+        vadd.s8  q0,  q14
-+        vuzp.8   d0,  d1
 +        vadd.s8  q3,  q1, q15   @ Add -128 so we can use saturating signed add
 +
++        vuzp.8   d0,  d1
++
 +        vtbl.8   d0,  {d16}, d0
 +        vtbl.8   d1,  {d17}, d1
 +
@@ -7394,21 +5935,20 @@ index 0000000000..8c32cb23e7
 +@ q14.u8 #2
 +@ q15.u16 max
 +function edge_16b_body_16
-+        vcgt.u16 q3, q1, q0     @ c > a -> -1 , otherwise 0
-+        vcgt.u16 q0, q1         @ a > c -> -1 , otherwise 0
-+        vsub.s16 q0, q3         @ a = sign(c-a)
-+        vcgt.u16 q3, q1, q2     @ c > b -> -1 , otherwise 0
-+        vsub.s16 q0, q3
-+        vcgt.u16 q3, q2, q1     @ c < b -> -1 , otherwise 0
-+        vadd.s16 q0, q3         @ a = sign(c-a) + sign(c-b)
++        vcgt.u16 q9, q0, q1     @ a > c -> -1 , otherwise 0
++        vadd.u16 q9, q14, q9
++        vcgt.u16 q0, q1, q0     @ c > a -> -1 , otherwise 0
++        vsub.u16 q9, q9, q0
++        vcgt.u16 q0, q2, q1     @ c < b -> -1 , otherwise 0
++        vadd.u16 q9, q9, q0
++        vcgt.u16 q0, q1, q2     @ c > b -> -1 , otherwise 0
++        vsub.u16 q0, q9, q0
 +
 +        vmovn.s16 d0, q0
 +        @ d1 will have random contents that we transform but
 +        @ that doesn't matter as we then discard them
 +        vuzp.8   d0, d1
 +
-+        vadd.s8  q0, q0, q14
-+
 +        vtbl.8   d0, {d16}, d0
 +        vtbl.8   d1, {d17}, d1
 +
@@ -7434,52 +5974,53 @@ index 0000000000..8c32cb23e7
 +@   int height)                       [sp, #sp_base + 8]
 +
 +.macro  edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0
-+        push     {r4-r6, lr}    @ 16 bytes
-+.set sp_base, 16
 +
 +@ Build translate registers
 +@ As translate values can only be 0-4 we don't care about junk in the rest
 +@ of the register
-+        mov      r12, #2
 +.if \is_chroma
-+        ldr      r4, [sp, #16]
-+.set sp_base, sp_base + 4
-+.endif
-+        vld1.8   {d16[2]}, [r3], r12
-+        vld1.8   {d16[0]}, [r3], r12
-+        vld1.8   {d16[1]}, [r3], r12
-+        vld1.8   {d16[3]}, [r3], r12
++        ldr      ip, [sp, #0]
++        push     {r4-r6, lr}    @ 16 bytes
++        vld1.8   {d16[2]}, [r3]
++        add      r3, r3, #2
++        vld1.8   {d17[2]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[0]}, [r3]
++        add      r3, r3, #2
++        vld1.8   {d17[0]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[1]}, [r3]
++        add      r3, r3, #2
++        vld1.8   {d17[1]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[3]}, [r3]
++        add      r3, r3, #2
++        vld1.8   {d17[3]}, [ip]
++        add      ip, ip, #2
 +        vld1.8   {d16[4]}, [r3]
-+.if \is_chroma
-+        vld1.8   {d17[2]}, [r4], r12
-+        vld1.8   {d17[0]}, [r4], r12
-+        vld1.8   {d17[1]}, [r4], r12
-+        vld1.8   {d17[3]}, [r4], r12
-+        vld1.8   {d17[4]}, [r4]
-+.else
-+        vmov     d17, d16
-+.endif
-+
-+@ Setup constant registers
-+.if \bit_depth > 8
-+        movw     r4, (1 << \bit_depth) - 1
-+.endif
-+.if \setup_16b
-+.if \bit_depth > 8
-+        vmov.i64 q12, #0
-+        vdup.16  q15, r4
-+.else
-+        vmov.u8  q15, #128
-+.endif
-+        vmov.u8  q14, #2
-+.endif
++        vld1.8   {d17[4]}, [ip]
 +        movw     r3, EDGE_SRC_STRIDE
++.set sp_base, 20
++.else
++        add      ip, r3, #4
++        vld1.8   {d16[1]}, [r3]
++        add      r3, r3, #2
++        vld1.8   {d17[0]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[0]}, [r3]
++        add      r3, r3, #6
++        vld1.8   {d17[1]}, [ip]
++        vld1.8   {d16[2]}, [r3]
++        movw     r3, EDGE_SRC_STRIDE
++        push     {r4-r6, lr}    @ 16 bytes
++        vzip.8   d16, d17
++        vmov     d17, d16
++.set sp_base, 16
++.endif
 +
-+@ If setup_64b we need the xlat table on the stack and q4-q7 saved
++@ If setup_64b we need the xlat table on the stack
 +.if \setup_64b
 +        sub      r5, sp, #16
-+        vpush    {q4-q8}        @ 80 bytes, q8 pushed first
-+.set sp_base, sp_base + 80
 +.endif
 +
 +@ Get jump address
@@ -7487,18 +6028,40 @@ index 0000000000..8c32cb23e7
 +@ If we may have w4 then we add a 2nd jump table after the 1st
 +.if \check_w4
 +        ldr      r12, [sp, #sp_base + 4]        @ width
-+        cmp      r12, #8
-+.endif
-+        ldr      r12, [sp, #sp_base + 0]        @ e0
 +        adr      r6, \jump_tab
-+.if \check_w4
++        ldr      lr, [sp, #sp_base + 0]        @ e0
++        cmp      r12, #8
 +        it lt
 +        addlt    r6, #16
++.else
++        ldr      lr, [sp, #sp_base + 0]        @ e0
++        adr      r6, \jump_tab
 +.endif
-+        ldr      r6, [r6, r12, lsl #2]
 +
 +        ldr      r12, [sp, #sp_base + 8]        @ height
 +
++.if \bit_depth > 8
++        movw     r4, (1 << \bit_depth) - 1
++.endif
++.if \setup_16b
++.if \bit_depth > 8
++        vmov.i64 q12, #0
++        vdup.16  q15, r4
++        vmov.u16 q14, #2
++.else
++        vmov.u8  q15, #128
++        vmov.u8  q14, #2
++.endif
++.endif
++
++@ If setup_64b we need q4-q7 saved.
++.if \setup_64b
++        vpush    {q4-q8}        @ 80 bytes, q8 pushed first
++.set sp_base, sp_base + 80
++.endif
++
++        ldr      r6, [r6, lr, lsl #2]
++
 +@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
 +.if \do2
 +        push     {r0, r1, r6, r12}
@@ -7529,18 +6092,20 @@ index 0000000000..8c32cb23e7
 +
 +
 +.macro  edge_64b_e0, body_fn, pb
-+        mov      r6, lr
 +        sub      r1, #8
++        mov      r6, lr
 +1:      vldm     r1, {d7-d16}
-+        subs     r12, #1
-+        add      r1, r3
 +        // load a
 +        vext.8   q0,  q3,  q4, #(16 - \pb)
++        add      r1, r3
 +        vext.8   q1,  q4,  q5, #(16 - \pb)
++        subs     r12, #1
 +        vext.8   q2,  q5,  q6, #(16 - \pb)
 +        vext.8   q3,  q6,  q7, #(16 - \pb)
++        pld      [r1]
 +        // load b
 +        vext.8   q11, q7,  q8, #\pb     @ Avoid overwrite
++        pld      [r1, #64]
 +        vext.8   q8,  q4,  q5, #\pb
 +        vext.8   q9,  q5,  q6, #\pb
 +        vext.8   q10, q6,  q7, #\pb
@@ -7552,424 +6117,671 @@ index 0000000000..8c32cb23e7
 +.endm
 +
 +.macro  edge_32bx2_e0, body_fn, pb
-+        mov      r6, lr
-+
-+1:      subs     r12, #2
-+
-+        vld1.8   {q4-q5}, [r1]
-+        sub      r1, #\pb
-+        vld1.8   {q0-q1}, [r1]
-+        add      r1, #(\pb * 2)
-+        vld1.8   {q8-q9}, [r1], r3
-+        sub      r1, #\pb
-+        vld1.8   {q6-q7}, [r1]
-+        sub      r1, #\pb
-+        vld1.8   {q2-q3}, [r1]
-+        add      r1, #(\pb * 2)
-+        vld1.8   {q10-q11}, [r1], r3
-+        sub      r1, #\pb
-+
++        add      r6, r1, r3
++        push     {r7,lr}
++        sub      r1, #8
++        add      r7, r0, r2
++        lsl      r2, #1
++1:      vldmia   r1, {d7-d12}
++        // load a
++        vext.8   q0, q3, q4, #16 - \pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q1, q4, q5, #16 - \pb
++        subs     r12, #2
++        // load b
++        vext.8   q8, q4, q5, #\pb
++        vext.8   q9, q5, q6, #\pb
++        vldr     d25, [r6, #-8]
++        vldmia   r6, {d12-d15}
++        vldr     d26, [r6, #32]
++        // load a
++        vext.8   q2, q12, q6, #16 - \pb
++        add      r6, r6, r3, lsl #1
++        vext.8   q3, q6, q7, #16 - \pb
++        // load b
++        vext.8   q10, q6, q7, #\pb
++        vext.8   q11, q7, q13, #\pb
 +        bl       \body_fn
-+
-+        vst1.8   {q0,q1}, [r0], r2
-+        vst1.8   {q2,q3}, [r0], r2
-+
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
 +        bgt      1b
-+        bx       r6
++        pop      {r7,pc}
 +.endm
 +
 +.macro  edge_16b_e0, body_fn, pb
++        sub      r1, #8
 +        mov      r6, lr
-+        sub      r1, #\pb
-+        sub      r3, #\pb * 2
-+
-+1:      subs     r12, #1
-+
-+        vld1.64  {q0}, [r1]             @ load a
-+        add      r1, #\pb
-+        vld1.64  {q1}, [r1, :128]       @ load c
-+        add      r1, #\pb
-+        vld1.64  {q2}, [r1], r3         @ load b
++1:      vldmia   r1, {d1-d4}
++        add      r1, r3
++        subs     r12, #1
++        vext.8   q0, q0, q1, #16 - \pb
++        vext.8   q2, q1, q2, #\pb
 +
 +        bl       \body_fn
-+        vst1.8   {q0}, [r0], r2
++        vst1.8   {q0}, [r0, :128], r2
 +        bgt      1b
 +        bx       r6
 +.endm
 +
 +.macro  edge_8bx2_e0, body_fn, pb
-+        mov      r6, lr
-+
-+1:      subs     r12, #2
-+
-+        vld1.8   {d2}, [r1, :64]
-+        sub      r1, #\pb
-+        vld1.8   {d0}, [r1]
-+        add      r1, #(\pb * 2)
-+        vld1.8   {d4}, [r1], r3
-+        sub      r1, #\pb
-+        vld1.8   {d3}, [r1, :64]
-+        sub      r1, #\pb
-+        vld1.8   {d1}, [r1]
-+        add      r1, #(\pb * 2)
-+        vld1.8   {d5}, [r1], r3
-+        sub      r1, #\pb
++        add      r6, r1, r3
++        push     {r7,lr}
++        sub      r1, #8
++        add      r7, r0, r2
++        lsl      r2, #1
++1:      vldmia   r1, {d1-d2}
++        vldmia   r6, {d3-d4}
++        vldr     d6, [r1, #16]
++        subs     r12, #2
++        vldr     d7, [r6, #-8]
++        add      r1, r1, r3, lsl #1
++        vext.8   d0, d1, d2, #8 - \pb
++        add      r6, r6, r3, lsl #1
++        vext.8   d5, d3, d4, #\pb
++        vext.8   d4, d2, d6, #\pb
++        vext.8   d1, d7, d3, #8 - \pb
 +
 +        bl       \body_fn
-+
 +        vst1.8   {d0}, [r0, :64], r2
-+        vst1.8   {d1}, [r0, :64], r2
-+
++        vst1.8   {d1}, [r7, :64], r2
 +        bgt      1b
-+        bx       r6
++        pop      {r7,pc}
 +.endm
 +
 +.macro  edge_4bx4_e0, body_fn, pb
-+        mov      r6, lr
++        add      r6, r1, r3
++        push     {r7,lr}
++        add      r7, r0, r2
++        lsl      r2, #1
 +
-+1:      subs     r12, #4
-+
-+        vld1.32  {d2[0]}, [r1]
-+        sub      r1, #\pb
-+        vld1.32  {d0[0]}, [r1]
-+        add      r1, #(\pb * 2)
-+        vld1.32  {d4[0]}, [r1], r3      @ R
-+        vld1.32  {d4[1]}, [r1]
-+        sub      r1, #\pb
-+        vld1.32  {d2[1]}, [r1]
-+        sub      r1, #\pb
-+        vld1.32  {d0[1]}, [r1], r3      @ L
-+        vld1.32  {d1[0]}, [r1]
-+        add      r1, #\pb
-+        vld1.32  {d3[0]}, [r1]
-+        add      r1, #\pb
-+        vld1.32  {d5[0]}, [r1], r3      @ R
-+        vld1.32  {d5[1]}, [r1]
-+        sub      r1, #(\pb * 2)
-+        vld1.32  {d1[1]}, [r1]
-+        add      r1, #\pb
-+        vld1.32  {d3[1]}, [r1], r3      @ M
++        tst      r1, #4
++        bne      2f
++1:      // r1 (and assumed r6) are 64-bit aligned
++        vldr     d2, [r1]
++        vldr     d0, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        vldr     d20, [r6]
++        subs     r12, #4
++        vldr     d18, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d3, [r1]
++        vshr.u64 d4, d2, #\pb * 8
++        vldr     d1, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        vldr     d21, [r6]
++        vext.8   d0, d0, d2, #8 - \pb
++        vldr     d19, [r6,#-8]
++        add      r6, r6, r3, lsl #1
++        vshr.u64 d22, d20, #\pb * 8
++        vext.8   d18, d18, d20, #8 - \pb
++        vshr.u64 d5, d3, #\pb * 8
++        vext.8   d1, d1, d3, #8 - \pb
++        vshr.u64 d23, d21, #\pb * 8
++        vext.8   d19, d19, d21, #8 - \pb
++        vsli.64  q1, q10, #32
++        vsli.64  q2, q11, #32
++        vsli.64  q0, q9, #32
 +
 +        bl       \body_fn
-+
-+        vst1.32  {d0[0]}, [r0], r2
-+        vst1.32  {d0[1]}, [r0], r2
-+        vst1.32  {d1[0]}, [r0], r2
-+        vst1.32  {d1[1]}, [r0], r2
-+
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
 +        bgt      1b
-+        bx       r6
++        pop      {r7,pc}
++
++2:      // r1 (and assumed r6) are 32-bit but not 64-bit aligned
++        vldr     d20, [r1, #-4]
++        vldr     d22, [r1, #4]
++        add      r1, r1, r3, lsl #1
++        vldr     d2, [r6, #-4]
++        subs     r12, #4
++        vldr     d4, [r6, #4]
++        add      r6, r6, r3, lsl #1
++        vldr     d21, [r1, #-4]
++        vshl.i64 d18, d20, #\pb * 8
++        vldr     d23, [r1, #4]
++        add      r1, r1, r3, lsl #1
++        vldr     d3, [r6, #-4]
++        vext.8   d22, d20, d22, #\pb
++        vldr     d5, [r6, #4]
++        add      r6, r6, r3, lsl #1
++        vshl.i64 d0, d2, #\pb * 8
++        vext.8   d4, d2, d4, #\pb
++        vshl.i64 d19, d21, #\pb * 8
++        vext.8   d23, d21, d23, #\pb
++        vshl.i64 d1, d3, #\pb * 8
++        vext.8   d5, d3, d5, #\pb
++        vsri.64  q1, q10, #32
++        vsri.64  q0, q9, #32
++        vsri.64  q2, q11, #32
++
++        bl       \body_fn
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
++        bgt      2b
++        pop      {r7,pc}
 +.endm
 +
 +
 +.macro  edge_64b_e1, body_fn
-+        mov      r6, lr
 +        sub      r1, r3
++        push     {lr}
++        add      r6, r1, #32
 +        // load a
-+        vld1.8   {q0-q1}, [r1, :128]!
-+        vld1.8   {q2-q3}, [r1, :128], r3
-+        sub      r1, #32
++        vld1.8   {q0-q1}, [r1, :256], r3
++        vld1.8   {q2-q3}, [r6, :256], r3
 +        // load c
-+        vld1.8   {q4-q5}, [r1, :128]!
-+        vld1.8   {q6-q7}, [r1, :128], r3
-+        sub      r1, #32
-+1:      subs     r12, #1
-+        // load b
-+        vld1.8   {q8-q9}, [r1, :128]!
-+        vld1.8   {q10-q11}, [r1, :128], r3
-+        sub      r1, #32
++        vld1.8   {q4-q5}, [r1, :256], r3
++        vld1.8   {q6-q7}, [r6, :256], r3
++1:      // load b
++        vld1.8   {q8-q9}, [r1, :256], r3
++        subs     r12, #1
++        vld1.8   {q10-q11}, [r6, :256], r3
 +        bl       \body_fn
 +        vstm     r0, {q0-q3}
-+        add      r0, r0, r2
 +        // copy c to a
 +        vmov.64  q0, q4
++        pld      [r1, r3]
 +        vmov.64  q1, q5
++        pople    {lr}
 +        vmov.64  q2, q6
++        bxle     lr
 +        vmov.64  q3, q7
++        add      r0, r0, r2
 +        // copy b to c
 +        vmov.64  q4, q8
 +        vmov.64  q5, q9
 +        vmov.64  q6, q10
 +        vmov.64  q7, q11
-+        bgt      1b
-+        bx       r6
++        b        1b
 +.endm
 +
 +.macro  edge_32bx2_e1, body_fn
++        sub      r6, r1, r3
++        vld1.8   {q2-q3}, [r1, :256], r3
++        vld1.8   {q0-q1}, [r6, :256]
 +        mov      r6, lr
-+        sub      r1, r3
-+        // load a
-+        vld1.8   {q0-q1}, [r1, :128], r3
-+        vld1.8   {q4-q5}, [r1, :128], r3
 +
-+1:      subs     r12, #2
-+        @ Given the data duplication here we could obviously do better than
++1:      @ Given the data duplication here we could obviously do better than
 +        @ using the generic body_fn but it almost certainly isn't worth it
-+        vmov     q2, q4
-+        vmov     q3, q5
-+        vld1.8   {q8-q9}, [r1, :128], r3
-+        vld1.8   {q10-q11}, [r1, :128], r3
++        vld1.8   {q8-q9}, [r1, :256], r3
++        subs     r12, #2
++        vmov     q4, q2
++        vmov     q5, q3
++        vld1.8   {q10-q11}, [r1, :256], r3
 +        vmov     q6, q8
 +        vmov     q7, q9
 +
 +        bl       \body_fn
 +
-+        vst1.8   {q0,q1}, [r0], r2
-+        vst1.8   {q2,q3}, [r0], r2
-+
-+        // copy c to a
-+        vmov.64  q0, q8
-+        vmov.64  q1, q9
-+
-+        // copy b to c
-+        vmov.64  q4, q10
-+        vmov.64  q5, q11
-+        bgt      1b
-+        bx       r6
++        vst1.8   {q0-q1}, [r0, :256], r2
++        // copy b to a
++        vmov     q0, q8
++        vmov     q1, q9
++        vst1.8   {q2-q3}, [r0, :256], r2
++        vmov     q2, q10
++        bxle     r6
++        vmov     q3, q11
++        b        1b
 +.endm
 +
 +.macro  edge_16b_e1, body_fn
-+        mov      r6, lr
-+        sub      r1, r3
-+        // load a
-+        vld1.8   {q0}, [r1, :128], r3
++        sub      r6, r1, r3
 +        // load c
 +        vld1.8   {q1}, [r1, :128], r3
-+1:      subs     r12, #1
-+        // load b
++        // load a
++        vld1.8   {q0}, [r6, :128]
++        mov      r6, lr
++1:      // load b
 +        vld1.8   {q2}, [r1, :128], r3
 +        bl       \body_fn
-+        vst1.8   {q0}, [r0], r2
++        vst1.8   {q0}, [r0, :128], r2
++        subs     r12, #1
 +        // copy c to a
 +        vmov.64  q0, q1
++        bxle     r6
 +        // copy b to c
 +        vmov.64  q1, q2
-+        bgt      1b
-+        bx       r6
++        b        1b
 +.endm
 +
 +.macro  edge_8bx2_e1, body_fn
-+        mov      r6, lr
-+        sub      r1, r3
-+        // load a
-+        vld1.8   {d0}, [r1, :64], r3
-+        vld1.8   {d2}, [r1, :64], r3
-+
-+1:      subs     r12, #2
-+        @ Given the data duplication here we could obviously do better than
++        sub      r6, r1, r3
++        lsl      r3, #1
++        push     {r7, lr}
++        vld1.8   {d1}, [r1, :64], r3
++        vld1.8   {d0}, [r6, :64], r3
++        add      r7, r0, r2
++        lsl      r2, #1
++1:      @ Given the data duplication here we could obviously do better than
 +        @ using the generic body_fn but it almost certainly isn't worth it
-+        vmov.64  d1, d2
-+        vld1.8   {d4}, [r1, :64], r3
++        vld1.8   {d4}, [r6, :64], r3
++        vmov     d2, d1
 +        vld1.8   {d5}, [r1, :64], r3
-+        vmov.64  d3, d4
++        subs     r12, #2
++        vmov     d3, d4
 +
 +        bl       \body_fn
 +
-+        vst1.8   {d0}, [r0], r2
-+        vst1.8   {d1}, [r0], r2
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
 +
-+        // copy c to a
-+        vmov.64  d0, d4
-+        // copy b to c
-+        vmov.64  d2, d5
++        // copy b to a
++        vmov     q0, q2
 +        bgt      1b
-+        bx       r6
++        pop      {r7, pc}
 +.endm
 +
 +.macro  edge_4bx4_e1, body_fn
-+        mov      r6, lr
-+debug_me:
-+        sub      r1, r3
-+        // load a
-+        vld1.32  {d0[0]}, [r1], r3
-+        vld1.32  {d0[1]}, [r1], r3
-+
-+1:      subs     r12, #4
-+        @ Given the data duplication here we could probably do better than
-+        @ using the generic body_fn but it almost certainly isn't worth it
-+        vld1.32  {d4[0]}, [r1], r3
-+        vld1.32  {d4[1]}, [r1], r3
-+        vld1.32  {d5[0]}, [r1], r3
-+        vld1.32  {d5[1]}, [r1], r3
-+
-+        vmov.32  d1, d4
++        sub      r6, r1, r3
++        lsl      r3, #1
++        push     {r7, lr}
++        vld1.32  {d0[1]}, [r1, :32], r3
++        add      r7, r0, r2
++        vld1.32  {d0[0]}, [r6, :32], r3
++        lsl      r2, #1
++        vld1.32  {d4[1]}, [r1, :32], r3
++        vld1.32  {d4[0]}, [r6, :32], r3
++        vld1.32  {d5[1]}, [r1, :32], r3
++        vld1.32  {d5[0]}, [r6, :32], r3
++        vmov     d1, d4
 +        vext.32  d2, d0, d4, #1
++        subs     r12, #4
++        vmov     d22, d5
 +        vext.32  d3, d4, d5, #1
++        b        2f
 +
++1:      vst1.32  {d0[0]}, [r0, :32], r2
++        vext.32  d2, d22, d4, #1
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vmov     d0, d22
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vext.32  d3, d4, d5, #1
++        vst1.32  {d1[1]}, [r7, :32], r2
++        vmov     d1, d4
++        vmov     d22, d5
++2:      @ Given the data duplication here we could probably do better than
++        @ using the generic body_fn but it almost certainly isn't worth it
 +        bl       \body_fn
++        ble      3f
++        vld1.32  {d4[0]}, [r6, :32], r3
++        subs     r12, #4
++        vld1.32  {d4[1]}, [r1, :32], r3
++        vld1.32  {d5[0]}, [r6, :32], r3
++        vld1.32  {d5[1]}, [r1, :32], r3
++        b        1b
 +
-+        vst1.32  {d0[0]}, [r0], r2
-+        vst1.32  {d0[1]}, [r0], r2
-+        vst1.32  {d1[0]}, [r0], r2
-+        vst1.32  {d1[1]}, [r0], r2
-+
-+        vmov.32  d0, d5
-+        bgt      1b
-+        bx       r6
++3:      vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32]
++        vst1.32  {d1[1]}, [r7, :32]
++        pop      {r7, pc}
 +.endm
 +
 +.macro  edge_64b_e2, body_fn, pb
-+        mov      r6, lr
-+        sub      r1, #32
-+        sub      r3, #(32 - \pb)
++        push     {lr}
++        sub      r6, r1, r3
++        // load c and a
++        vld1.8   {q4-q5}, [r1, :128]
++        vldr     d25, [r6, #-8]
++        vldmia   r6, {d16-d23}
++        vext.8   q0, q12, q8, #16 - \pb
++        add      r6, r1, #32
++        vext.8   q1, q8, q9, #16 - \pb
++        add      r1, r1, r3
++        vext.8   q2, q9, q10, #16 - \pb
++        vld1.8   {q6-q7}, [r6, :128]
++        sub      r6, r1, r3
++        vext.8   q3, q10, q11, #16 - \pb
 +
-+1:      sub      r1, r3
-+        // load a
-+        // TODO: fix unaligned load
-+        //       don't reload a like in eo1
-+        vld1.8   {q0-q1}, [r1]!
-+        vld1.8   {q2-q3}, [r1], r3
++1:      // load b
++        vldmia   r1, {d16-d24}
++        vext.8   q8, q8, q9, #\pb
++        pld      [r1, r3]
++        vext.8   q9, q9, q10, #\pb
 +        subs     r12, #1
-+        // load  c
-+        vld1.8   {q4-q5}, [r1, :128]!
-+        vld1.8   {q6-q7}, [r1, :128], r3
-+        // load  b
-+        vld1.8   {q8-q9}, [r1]!
-+        vld1.8   {q10-q11}, [r1]
-+        sub      r1, #(64 + \pb)
++        vext.8   q10, q10, q11, #\pb
++        vext.8   q11, q11, q12, #\pb
 +        bl       \body_fn
-+        vstm     r0, {q0-q3}
++        // next a is mostly available in c
++        vldr     d25, [r6, #-8]
++        vstmia   r0, {q0-q3}
++        vext.8   q3, q6, q7, #16 - \pb
++        pople    {lr}
++        vext.8   q2, q5, q6, #16 - \pb
++        bxle     lr
++        vext.8   q1, q4, q5, #16 - \pb
++        add      r6, r6, r3
++        vext.8   q0, q12, q4, #16 - \pb
 +        add      r0, r0, r2
-+        bgt      1b
-+
-+        add      r3, #(32 - \pb)
-+        bx       r6
++        // next c is mostly available in b
++        vldr     d8, [r1]
++        vext.8   d9, d16, d17, #8 - \pb
++        vext.8   q5, q8, q9, #16 - \pb
++        add      r1, r1, r3
++        vext.8   q6, q9, q10, #16 - \pb
++        pld      [r6, #-8]
++        vext.8   q7, q10, q11, #16 - \pb
++        b        1b
 +.endm
 +
 +.macro  edge_32bx2_e2, body_fn, pb
-+        mov      r6, lr
-+        sub      r1, #\pb
-+
-+1:      sub      r1, r3
-+        vld1.8   {q0-q1}, [r1], r3
-+        vld1.8   {q2-q3}, [r1]
++        sub      r6, r1, r3
++        push     {r7, lr}
++        add      r7, r0, r2
++        lsl      r2, #1
++        // load a and first 32b of c
++        vld1.8   {q4-q5}, [r1, :256]
++        vldr     d25, [r6, #-8]
++        vld1.8   {q13-q14}, [r6, :256]
++        vldr     d31, [r1, #-8]
++        add      r6, r6, r3, lsl #1
++        vext.8   q0, q12, q13, #16 - \pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q1, q13, q14, #16 - \pb
++        vext.8   q2, q15, q4, #16 - \pb
++        vext.8   q3, q4, q5, #16 - \pb
++1:
++        // load second 32b of c and second 32b of b
++        vldmia   r6, {d12-d16}
++        vldmia   r1, {d20-d24}
++        // first 32b of b is mostly available in second 32b of c
++        vext.8   q9, q7, q8, #\pb
 +        subs     r12, #2
-+        // load  c
-+        add      r1, #\pb
-+        vld1.8   {q4-q5}, [r1, :128], r3
-+        vld1.8   {q6-q7}, [r1, :128]
-+        // load  b
-+        add      r1, #\pb
-+        vld1.8   {q8-q9}, [r1], r3
-+        vld1.8   {q10-q11}, [r1]
-+        sub      r1, #(\pb * 2)
++        vext.8   q8, q6, q7, #\pb
++        vext.8   q10, q10, q11, #\pb
++        vext.8   q11, q11, q12, #\pb
 +
 +        bl       \body_fn
 +
-+        vst1.8   {q0-q1}, [r0], r2
-+        vst1.8   {q2-q3}, [r0], r2
-+        bgt      1b
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
++        ble      2f
 +
-+        bx       r6
++        vldr     d25, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d8, [r1]
++        vext.8   d9, d20, d21, #8 - \pb
++        vldr     d31, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        // first 32b of a is mostly available in second 32b of c
++        vext.8   q1, q6, q7, #16 - \pb
++        vext.8   q0, q12, q6, #16 - \pb
++        // first 32b of c is mostly available in second 32b of b
++        vext.8   q5, q10, q11, #16 - \pb
++        // second 32b of a is mostly available in first 32b of c
++        vext.8   q2, q15, q4, #16 - \pb
++        vext.8   q3, q4, q5, #16 - \pb
++        b        1b
++
++2:      pop      {r7, pc}
 +.endm
 +
 +.macro  edge_16b_e2, body_fn, pb
-+        mov      r6, lr
-+        add     r3, #\pb
-+
-+1:      sub      r1, r3
-+        // load a
-+        vld1.8   {q0}, [r1], r3
-+        subs     r12, #1
-+        // load  c
++        push     {lr}
++        sub      r6, r1, r3
 +        vld1.8   {q1}, [r1, :128], r3
-+        // load  b
-+        vld1.8   {q2}, [r1]
-+        sub      r1, #\pb
++        vldr     d19, [r6, #-8]
++        vld1.8   {q10}, [r6, :128], r3
++
++1:      vldmia   r1, {d4-d6}
++        vext.8   q0, q9, q10, #16 - \pb
++        subs     r12, #1
++        vext.8   q2, q2, q3, #\pb
 +        bl       \body_fn
-+        vst1.8   {q0}, [r0], r2
-+        bgt      1b
-+        bx       r6
++        vst1.8   {q0}, [r0, :128], r2
++        ble      2f
++        vmov     q10, q1
++        vldr     d2, [r1]
++        add      r1, r1, r3
++        vldr     d19, [r6, #-8]
++        add      r6, r6, r3
++        vext.8   d3, d4, d5, #8 - \pb
++        b        1b
++
++2:      pop      {pc}
 +.endm
 +
 +.macro  edge_8bx2_e2, body_fn, pb
-+        mov      r6, lr
-+        sub      r1, #\pb
++        sub      r6, r1, r3
++        push     {r7, lr}
++        add      r7, r0, r2
++        lsl      r2, #1
++        vldr     d18, [r6, #-8]
++        vldr     d19, [r6]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #-8]
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldmia   r6, {d3-d4}
++        vld1.8   {d21-d22}, [r1, :128]
 +
-+1:      sub      r1, r3
-+        vld1.8   {d0}, [r1], r3
-+        vld1.8   {d1}, [r1]
++1:      vext.8   d0, d18, d19, #8 - \pb
++        vext.8   d4, d3, d4, #\pb
++        vext.8   d1, d20, d2, #8 - \pb
 +        subs     r12, #2
-+        // load  c
-+        add      r1, #\pb
-+        vld1.8   {d2}, [r1, :64], r3
-+        vld1.8   {d3}, [r1, :64]
-+        // load  b
-+        add      r1, #\pb
-+        vld1.8   {d4}, [r1], r3
-+        vld1.8   {d5}, [r1]
-+        sub      r1, #(\pb * 2)
++        vext.8   d5, d21, d22, #\pb
 +
 +        bl       \body_fn
 +
-+        vst1.8   {d0}, [r0], r2
-+        vst1.8   {d1}, [r0], r2
-+        bgt      1b
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++        ble      2f
 +
-+        bx       r6
++        vldr     d18, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #-8]
++        vmov     d19, d3
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldmia   r6, {d3-d4}
++        vld1.8   {d21-d22}, [r1, :128]
++        b        1b
++
++2:      pop      {r7, pc}
 +.endm
 +
 +.macro  edge_4bx4_e2, body_fn, pb
-+        mov      r6, lr
-+        sub      r1, #\pb
++        sub      r6, r1, r3
++        push     {r7-r9, lr}
++        add      r8, r1, r3
++        sub      r6, r6, #\pb
++        add      r8, r8, #\pb
++        add      r7, r0, r2
++        lsl      r2, #1
 +
-+1:      sub      r1, r3
-+        @ line 0 {d0[0], -,     -    }  r1 lo
-+        vld1.32  {d0[0]}, [r1], r3
++1:      vld1.32  {d0[0]}, [r6], r3
 +        subs     r12, #4
-+        @ Line 1 {d0[1], d2[0], -    }  r1 lo
-+        vld1.32  {d0[1]}, [r1]
-+        add      r1, #\pb
 +        vld1.32  {d2[0]}, [r1], r3
-+        @ Line 2 {d1[0], d2[1], d4[0]}  r1 mid
-+        vld1.32  {d2[1]}, [r1]
-+        sub      r1, #\pb
-+        vld1.32  {d1[0]}, [r1]
-+        add      r1, #\pb * 2
-+        vld1.32  {d4[0]}, [r1], r3
-+        @ Line 2 {d1[1], d3[0], d4[1]}  r1 hi
-+        vld1.32  {d4[1]}, [r1]
-+        sub      r1, #\pb * 2
-+        vld1.32  {d1[1]}, [r1]
-+        add      r1, #\pb
++        vld1.32  {d4[0]}, [r8], r3
++        vld1.32  {d0[1]}, [r6], r3
++        vld1.32  {d2[1]}, [r1], r3
++        vld1.32  {d4[1]}, [r8], r3
++        vld1.32  {d1[0]}, [r6], r3
 +        vld1.32  {d3[0]}, [r1], r3
-+        @ Line 3 {-,     d3[1], d5[0]}  r1 mid
-+        vld1.32  {d3[1]}, [r1]
-+        add      r1, #\pb
-+        vld1.32  {d5[0]}, [r1], r3
-+        @ Line 4 {-,      -,    d5[1]}  r1 hi
-+        vld1.32  {d5[1]}, [r1]
-+        sub      r1, #(\pb * 2)
++        vld1.32  {d5[0]}, [r8], r3
++        vld1.32  {d1[1]}, [r6], r3
++        vld1.32  {d3[1]}, [r1], r3
++        vld1.32  {d5[1]}, [r8], r3
 +
 +        bl       \body_fn
 +
-+        vst1.32  {d0[0]}, [r0], r2
-+        vst1.32  {d0[1]}, [r0], r2
-+        vst1.32  {d1[0]}, [r0], r2
-+        vst1.32  {d1[1]}, [r0], r2
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
 +        bgt      1b
 +
-+        bx       r6
++        pop      {r7-r9,pc}
 +.endm
 +
 +.macro  edge_64b_e3, body_fn, pb
-+        @ e3 is the same as e2 but with the X offset reversed
-+        edge_64b_e2 \body_fn, (-\pb)
++        push     {lr}
++        sub      r6, r1, r3
++        // load c and a
++        vld1.8   {q4-q5}, [r1, :128]
++        vldmia   r6, {d16-d24}
++        vext.8   q0, q8, q9, #\pb
++        add      r6, r1, #32
++        vext.8   q1, q9, q10, #\pb
++        add      r1, r1, r3
++        vext.8   q2, q10, q11, #\pb
++        vld1.8   {q6-q7}, [r6, :128]
++        sub      r6, r1, r3
++        vext.8   q3, q11, q12, #\pb
++
++1:      // load b
++        vldr     d17, [r1, #-8]
++        vldmia   r1, {d18-d25}
++        vext.8   q8, q8, q9, #16 - \pb
++        pld      [r1, r3]
++        vext.8   q9, q9, q10, #16 - \pb
++        subs     r12, #1
++        vext.8   q10, q10, q11, #16 - \pb
++        vext.8   q11, q11, q12, #16 - \pb
++        bl       \body_fn
++        // next a is mostly available in c
++        vldr     d24, [r6, #64]
++        vstmia   r0, {q0-q3}
++        vext.8   q0, q4, q5, #\pb
++        pople    {lr}
++        vext.8   q1, q5, q6, #\pb
++        bxle     lr
++        vext.8   q2, q6, q7, #\pb
++        add      r6, r6, r3
++        vext.8   q3, q7, q12, #\pb
++        add      r0, r0, r2
++        // next c is mostly available in b
++        vext.8   d14, d22, d23, #\pb
++        vldr     d15, [r1, #56]
++        vext.8   q4, q8, q9, #\pb
++        add      r1, r1, r3
++        vext.8   q5, q9, q10, #\pb
++        vext.8   q6, q10, q11, #\pb
++        b        1b
 +.endm
 +
 +.macro  edge_32bx2_e3, body_fn, pb
-+        @ e3 is the same as e2 but with the X offset reversed
-+        edge_32bx2_e2 \body_fn, (-\pb)
++        sub      r6, r1, r3
++        push     {r7, lr}
++        add      r7, r0, r2
++        lsl      r2, #1
++        // load a and first 32b of c
++        vldmia   r1, {d8-d12}
++        vldmia   r6, {d24-d28}
++        vext.8   q2, q4, q5, #\pb
++        add      r6, r6, r3, lsl #1
++        vext.8   q3, q5, q6, #\pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q0, q12, q13, #\pb
++        vext.8   q1, q13, q14, #\pb
++1:
++        // load second 32b of c and second 32b of b
++        vldr     d25, [r6, #-8]
++        subs     r12, #2
++        vldmia   r6, {d12-d15}
++        vldr     d27, [r1, #-8]
++        vldmia   r1, {d20-d23}
++        // first 32b of b is mostly available in second 32b of c
++        vext.8   q8, q12, q6, #16 - \pb
++        vext.8   q9, q6, q7, #16 - \pb
++        vext.8   q11, q10, q11, #16 - \pb
++        vext.8   q10, q13, q10, #16 - \pb
++
++        bl       \body_fn
++
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
++        ble      2f
++
++        vldr     d24, [r6, #32]
++        add      r6, r6, r3, lsl #1
++        vldr     d11, [r1, #24]
++        vext.8   d10, d22, d23, #\pb
++        vldr     d30, [r1, #32]
++        add      r1, r1, r3, lsl #1
++        // first 32b of a is mostly available in second 32b of c
++        vext.8   q0, q6, q7, #\pb
++        vext.8   q1, q7, q12, #\pb
++        // first 32b of c is mostly available in second 32b of b
++        vext.8   q4, q10, q11, #\pb
++        // second 32b of a is mostly available in first 32b of c
++        vext.8   q3, q5, q15, #\pb
++        vext.8   q2, q4, q5, #\pb
++        b        1b
++
++2:      pop      {r7, pc}
 +.endm
 +
 +.macro  edge_16b_e3, body_fn, pb
-+        @ e3 is the same as e2 but with the X offset reversed
-+        edge_16b_e2 \body_fn, (-\pb)
++        push     {lr}
++        sub      r6, r1, r3
++        vld1.8   {q1}, [r1, :128], r3
++        vldmia   r6, {d18-d20}
++        add      r6, r6, r3
++
++1:      vldr     d5, [r1, #-8]
++        vld1.8   {q3}, [r1, :128]
++        subs     r12, #1
++        vext.8   q0, q9, q10, #\pb
++        vext.8   q2, q2, q3, #16 - \pb
++        bl       \body_fn
++        vst1.8   {q0}, [r0, :128], r2
++        ble      2f
++        vmov     q9, q1
++        vldr     d3, [r1, #8]
++        add      r1, r1, r3
++        vldr     d20, [r6, #16]
++        add      r6, r6, r3
++        vext.8   d2, d4, d5, #\pb
++        b        1b
++
++2:      pop      {pc}
 +.endm
 +
 +.macro  edge_8bx2_e3, body_fn, pb
-+        @ e3 is the same as e2 but with the X offset reversed
-+        edge_8bx2_e2 \body_fn, (-\pb)
++        sub      r6, r1, r3
++        push     {r7, lr}
++        add      r7, r0, r2
++        lsl      r2, #1
++        vld1.8   {d18-d19}, [r6]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #8]
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldr     d4, [r6, #-8]
++        vldr     d3, [r6]
++        vldr     d21, [r1, #-8]
++        vldr     d22, [r1]
++
++1:      vext.8   d0, d18, d19, #\pb
++        vext.8   d4, d4, d3, #8 - \pb
++        vext.8   d1, d2, d20, #\pb
++        subs     r12, #2
++        vext.8   d5, d21, d22, #8 - \pb
++
++        bl       \body_fn
++
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++        ble      2f
++
++        vldr     d19, [r6, #8]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #8]
++        vmov     d18, d3
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldr     d4, [r6, #-8]
++        vldr     d3, [r6]
++        vldr     d21, [r1, #-8]
++        vldr     d22, [r1]
++        b        1b
++
++2:      pop      {r7, pc}
 +.endm
 +
 +.macro  edge_4bx4_e3, body_fn, pb
@@ -8349,80 +7161,6 @@ index 1bf1c620d6..ccfa991f60 100644
      const uint8_t *bytestream_start;
      const uint8_t *bytestream;
      const uint8_t *bytestream_end;
-diff --git a/libavcodec/cllc.c b/libavcodec/cllc.c
-index af0f6da2e9..bd491c0c55 100644
---- a/libavcodec/cllc.c
-+++ b/libavcodec/cllc.c
-@@ -34,6 +34,10 @@
- #define VLC_DEPTH 2
- 
- 
-+#define VLC_BITS 7
-+#define VLC_DEPTH 2
-+
-+
- typedef struct CLLCContext {
-     AVCodecContext *avctx;
-     BswapDSPContext bdsp;
-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
-index 6a13bbbf0e..478b7c0ffc 100644
---- a/libavcodec/codec_desc.c
-+++ b/libavcodec/codec_desc.c
-@@ -1665,6 +1665,41 @@ static const AVCodecDescriptor codec_descriptors[] = {
-         .props     = AV_CODEC_PROP_LOSSLESS,
-         .mime_types= MT("image/png"),
-     },
-+    {
-+        .id        = AV_CODEC_ID_CFHD,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "cfhd",
-+        .long_name = NULL_IF_CONFIG_SMALL("Cineform HD"),
-+        .props     = AV_CODEC_PROP_LOSSY,
-+    },
-+    {
-+        .id        = AV_CODEC_ID_TRUEMOTION2RT,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "truemotion2rt",
-+        .long_name = NULL_IF_CONFIG_SMALL("Duck TrueMotion 2.0 Real Time"),
-+        .props     = AV_CODEC_PROP_LOSSY,
-+    },
-+    {
-+        .id        = AV_CODEC_ID_MAGICYUV,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "magicyuv",
-+        .long_name = NULL_IF_CONFIG_SMALL("MagicYUV Lossless Video"),
-+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
-+    },
-+    {
-+        .id        = AV_CODEC_ID_SHEERVIDEO,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "sheervideo",
-+        .long_name = NULL_IF_CONFIG_SMALL("BitJazz SheerVideo"),
-+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
-+    },
-+    {
-+        .id        = AV_CODEC_ID_YLC,
-+        .type      = AVMEDIA_TYPE_VIDEO,
-+        .name      = "ylc",
-+        .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
-+        .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
-+    },
- 
-     /* various PCM "codecs" */
-     {
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index dd0a965af0..053325c26b 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -115,7 +115,7 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
-                     goto found;
-                 }
-             } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
--                       nalu_type == H264_NAL_IDR_SLICE) {
-+                       nalu_type == H264_NAL_IDR_SLICE)) {
-                 state += 8;
-                 continue;
-             }
 diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
 index 0b1195dc3e..5ef81fa739 100644
 --- a/libavcodec/mmaldec.c
@@ -8461,7 +7199,7 @@ index 8da2a9735e..9089f9b4ea 100644
      { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
      { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
 diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
-index d181b74570..76e844caa8 100644
+index d181b74570..c52c450956 100644
 --- a/libavcodec/rawenc.c
 +++ b/libavcodec/rawenc.c
 @@ -24,6 +24,7 @@
@@ -8477,7 +7215,7 @@ index d181b74570..76e844caa8 100644
  #include "libavutil/imgutils.h"
  #include "libavutil/internal.h"
 +#include "libavutil/avassert.h"
-+#if CONFIG_RPI
++#if CONFIG_SAND
 +#include "libavutil/rpi_sand_fns.h"
 +#endif
  
@@ -8487,7 +7225,7 @@ index d181b74570..76e844caa8 100644
      return 0;
  }
  
-+#if CONFIG_RPI
++#if CONFIG_SAND
 +static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
 +                      const AVFrame *frame)
 +{
@@ -8543,7 +7281,7 @@ index d181b74570..76e844caa8 100644
      if (ret < 0)
          return ret;
  
-+#if CONFIG_RPI
++#if CONFIG_SAND
 +    if (av_rpi_is_sand_frame(frame)) {
 +        ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame);
 +        *got_packet = (ret == 0);
@@ -8556,10 +7294,10 @@ index d181b74570..76e844caa8 100644
      if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
 diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
 new file mode 100644
-index 0000000000..e498c1a3eb
+index 0000000000..e02c26fea6
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2381 @@
+@@ -0,0 +1,2332 @@
 +/*
 + * HEVC CABAC decoding
 + *
@@ -8611,6 +7349,17 @@ index 0000000000..e498c1a3eb
 +// code size.
 +#define USE_N_END_1 1
 +
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS  22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55)  but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS  23
++#endif
++
 +#if ARCH_ARM
 +#include "arm/rpi_hevc_cabac.h"
 +#endif
@@ -9154,6 +7903,16 @@ index 0000000000..e498c1a3eb
 +}
 +#endif
 +
++static inline int cabac_overflow(const CABACContext * const cc)
++{
++    av_assert0(cc->bytestream >= cc->bytestream_start);
++    return cc->bytestream >= cc->bytestream_end + 4;
++}
++
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
++{
++    return cabac_overflow(&lc->cc);
++}
 +
 +#if !USE_BY22
 +// If no by22 then _by22 functions will revert to normal and so _peek/_flush
@@ -9179,17 +7938,6 @@ index 0000000000..e498c1a3eb
 +// O(1) nature of the code more worthwhile.
 +
 +
-+#if !USE_BY22_DIV
-+// * 1/x @ 32 bits gets us 22 bits of accuracy
-+#define CABAC_BY22_PEEK_BITS  22
-+#else
-+// A real 32-bit divide gets us another bit
-+// If we have a 64 bit int & a unit time divider then we should get a lot
-+// of bits (55)  but that is untested and it is unclear if it would give
-+// us a large advantage
-+#define CABAC_BY22_PEEK_BITS  23
-+#endif
-+
 +// Bypass block start
 +// Must be called before _by22_peek is used as it sets the CABAC environment
 +// into the correct state.  _by22_finish must be called to return to 'normal'
@@ -9274,30 +8022,19 @@ index 0000000000..e498c1a3eb
 +#endif  // USE_BY22
 +
 +
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts)
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
 +{
-+    // ???? Does this work with tiles + WPP? (No)
-+    // **** Need to save rice state too
-+    // pred_qpy is handled by get_qPy_pred and lc->first_qp_group
-+    if (s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        (ctb_addr_ts % s->ps.sps->ctb_width == 2 ||
-+         (s->ps.sps->ctb_width == 2 &&
-+          ctb_addr_ts % s->ps.sps->ctb_width == 0))) {
-+        memcpy(s->cabac_state, lc->cabac_state, HEVC_CONTEXTS);
-+    }
++    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
++    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
 +}
 +
 +static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
 +{
-+    memcpy(lc->cabac_state, s->cabac_state, HEVC_CONTEXTS);
++    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
++    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
 +}
 +
-+static int cabac_reinit(HEVCRpiLocalContext *lc)
-+{
-+    return skip_bytes(&lc->cc, 0) == NULL ? AVERROR_INVALIDDATA : 0;
-+}
-+
-+static int cabac_init_decoder(HEVCRpiLocalContext * const lc)
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
 +{
 +    GetBitContext * const gb = &lc->gb;
 +    skip_bits(gb, 1);
@@ -9331,68 +8068,19 @@ index 0000000000..e498c1a3eb
 +        lc->stat_coeff[i] = 0;
 +}
 +
-+int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts)
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
 +{
-+    if (ctb_addr_ts == s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]) {
-+        int ret = cabac_init_decoder(lc);
-+        if (ret < 0)
-+            return ret;
-+        if (s->sh.dependent_slice_segment_flag == 0 ||
-+            (s->ps.pps->tiles_enabled_flag &&
-+             s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]))
-+            cabac_init_state(s, lc);
-+
-+        if (!s->sh.first_slice_in_pic_flag &&
-+            s->ps.pps->entropy_coding_sync_enabled_flag) {
-+            if (ctb_addr_ts % s->ps.sps->ctb_width == 0) {
-+                if (s->ps.sps->ctb_width == 1)
-+                    cabac_init_state(s, lc);
-+                else if (s->sh.dependent_slice_segment_flag == 1)
-+                    load_states(s, lc);
-+            }
-+        }
-+    } else {
-+        if (s->ps.pps->tiles_enabled_flag &&
-+            s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1]) {
-+            if (!lc->wpp_init) {
-+                int ret;
-+                if (s->threads_number == 1)  // **** Ummm... can only be 1 in our world but this is a wpp test
-+                    ret = cabac_reinit(lc);
-+                else
-+                    ret = cabac_init_decoder(lc);
-+                if (ret < 0)
-+                    return ret;
-+            }
-+            lc->wpp_init = 0;
-+
-+            cabac_init_state(s, lc);
-+        }
-+        if (s->ps.pps->entropy_coding_sync_enabled_flag) {
-+            if (ctb_addr_ts % s->ps.sps->ctb_width == 0) {  // ** Tiles + WPP bust
-+                // If wpp_init is set then we have been set up in the correct pos
-+                if (!lc->wpp_init) {
-+                    int ret;
-+                    // * Strong argument for putting the read terminate & align
-+                    //   at the end of the previous block (where it logically
-+                    //   resides) rather than here
-+                    get_cabac_terminate(&lc->cc);
-+                    if (s->threads_number == 1)
-+                        ret = cabac_reinit(lc);
-+                    else
-+                        ret = cabac_init_decoder(lc);
-+                    if (ret < 0)
-+                        return ret;
-+                }
-+                lc->wpp_init = 0;
-+
-+                if (s->ps.sps->ctb_width == 1)
-+                    cabac_init_state(s, lc);
-+                else
-+                    load_states(s, lc);
-+            }
-+        }
++    if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0)
++    {
++        lc->qPy_pred = s->sh.slice_qp;
++        cabac_init_state(s, lc);
 +    }
-+    return 0;
++    else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0)
++    {
++        lc->qPy_pred = s->sh.slice_qp;
++        load_states(s, lc);
++    }
++    lc->cabac_init_req = 0;
 +}
 +
 +#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
@@ -10737,7 +9425,8 @@ index 0000000000..e498c1a3eb
 +            }
 +        }
 +    } while ((i = next_subset(lc, i, c_idx_nz,
-+        significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0);
++                              significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
++             !cabac_overflow(&lc->cc));
 +
 +    if (lc->cu.cu_transquant_bypass_flag) {
 +        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
@@ -12134,10 +10823,10 @@ index 0000000000..a1d6d56b04
 +}
 diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
 new file mode 100644
-index 0000000000..9db79e658f
+index 0000000000..93f3530ff5
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,769 @@
+@@ -0,0 +1,761 @@
 +/*
 + * HEVC video decoder
 + *
@@ -12188,13 +10877,9 @@ index 0000000000..9db79e658f
 +    lc->na.cand_up       = (lc->ctb_up_flag   || y0b);
 +    lc->na.cand_left     = (lc->ctb_left_flag || x0b);
 +    lc->na.cand_up_left  = (!x0b && !y0b) ? lc->ctb_up_left_flag : lc->na.cand_left && lc->na.cand_up;
-+    lc->na.cand_up_right_sap =
-+            ((x0b + nPbW) == (1 << s->ps.sps->log2_ctb_size)) ?
-+                    lc->ctb_up_right_flag && !y0b : lc->na.cand_up;
-+    lc->na.cand_up_right =
-+            lc->na.cand_up_right_sap
-+                     && (x0 + nPbW) < lc->end_of_tiles_x;
-+    lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_tiles_y) ? 0 : lc->na.cand_left;
++    lc->na.cand_up_right = (x0 + nPbW) >= lc->end_of_ctb_x ?
++                    (lc->ctb_up_right_flag && !y0b) : lc->na.cand_up;
++    lc->na.cand_bottom_left = ((y0 + nPbH) >= lc->end_of_ctb_y) ? 0 : lc->na.cand_left;
 +}
 +
 +/*
@@ -12436,7 +11121,7 @@ index 0000000000..9db79e658f
 +    const int cand_left        = lc->na.cand_left;
 +    const int cand_up_left     = lc->na.cand_up_left;
 +    const int cand_up          = lc->na.cand_up;
-+    const int cand_up_right    = lc->na.cand_up_right_sap;
++    const int cand_up_right    = lc->na.cand_up_right;
 +
 +    const int xA1    = x0 - 1;
 +    const int yA1    = y0 + nPbH - 1;
@@ -12503,7 +11188,6 @@ index 0000000000..9db79e658f
 +
 +    // above right spatial merge candidate
 +    is_available_b0 = AVAILABLE(cand_up_right, B0) &&
-+                      xB0 < s->ps.sps->width &&
 +                      PRED_BLOCK_AVAILABLE(B0) &&
 +                      !is_diff_mer(s, xB0, yB0, x0, y0);
 +
@@ -12517,7 +11201,6 @@ index 0000000000..9db79e658f
 +
 +    // left bottom spatial merge candidate
 +    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
-+                      yA0 < s->ps.sps->height &&
 +                      PRED_BLOCK_AVAILABLE(A0) &&
 +                      !is_diff_mer(s, xA0, yA0, x0, y0);
 +
@@ -12750,7 +11433,7 @@ index 0000000000..9db79e658f
 +    const int cand_left        = lc->na.cand_left;
 +    const int cand_up_left     = lc->na.cand_up_left;
 +    const int cand_up          = lc->na.cand_up;
-+    const int cand_up_right    = lc->na.cand_up_right_sap;
++    const int cand_up_right    = lc->na.cand_up_right;
 +    ref_idx_curr       = LX;
 +    ref_idx            = mv->ref_idx[LX];
 +    pred_flag_index_l0 = LX;
@@ -12761,7 +11444,6 @@ index 0000000000..9db79e658f
 +    yA0 = y0 + nPbH;
 +
 +    is_available_a0 = AVAILABLE(cand_bottom_left, A0) &&
-+                      yA0 < s->ps.sps->height &&
 +                      PRED_BLOCK_AVAILABLE(A0);
 +
 +    //left spatial merge candidate
@@ -12816,7 +11498,6 @@ index 0000000000..9db79e658f
 +    yB0    = y0 - 1;
 +
 +    is_available_b0 =  AVAILABLE(cand_up_right, B0) &&
-+                       xB0 < s->ps.sps->width &&
 +                       PRED_BLOCK_AVAILABLE(B0);
 +
 +    // above spatial merge candidate
@@ -13099,10 +11780,10 @@ index 0000000000..4b4d032a16
 +#endif /* AVCODEC_RPI_HEVC_PARSE_H */
 diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
 new file mode 100644
-index 0000000000..f65efa1015
+index 0000000000..d28ae0ec92
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_ps.c
-@@ -0,0 +1,1712 @@
+@@ -0,0 +1,1756 @@
 +/*
 + * HEVC Parameter Set decoding
 + *
@@ -14367,9 +13048,10 @@ index 0000000000..f65efa1015
 +    av_freep(&pps->col_idxX);
 +    av_freep(&pps->ctb_addr_rs_to_ts);
 +    av_freep(&pps->ctb_addr_ts_to_rs);
-+    av_freep(&pps->tile_pos_rs);
++    av_freep(&pps->tile_pos_ts);
 +    av_freep(&pps->tile_size);
 +    av_freep(&pps->tile_id);
++    av_freep(&pps->ctb_ts_flags);
 +    av_freep(&pps->min_tb_addr_zs_tab);
 +
 +    av_freep(&pps);
@@ -14466,13 +13148,17 @@ index 0000000000..f65efa1015
 +    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
 +    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
 +    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
-+    pps->tile_size         = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_size));
++    pps->tile_size         = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
++    pps->tile_pos_ts       = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
++    pps->ctb_ts_flags      = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_ts_flags));
 +    pps->min_tb_addr_zs_tab = av_malloc_array((sps->tb_mask+2) * (sps->tb_mask+2), sizeof(*pps->min_tb_addr_zs_tab));
 +    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-+        !pps->tile_id || !pps->min_tb_addr_zs_tab) {
++        !pps->tile_id || !pps->min_tb_addr_zs_tab || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
 +        return AVERROR(ENOMEM);
 +    }
 +
++    memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
++
 +    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
 +        int tb_x   = ctb_addr_rs % sps->ctb_width;
 +        int tb_y   = ctb_addr_rs / sps->ctb_width;
@@ -14506,24 +13192,62 @@ index 0000000000..f65efa1015
 +        pps->ctb_addr_ts_to_rs[val]         = ctb_addr_rs;
 +    }
 +
-+    for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
-+        for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
-+            for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
-+                for (x = pps->col_bd[i]; x < pps->col_bd[i + 1]; x++)
-+                    pps->tile_id[pps->ctb_addr_rs_to_ts[y * sps->ctb_width + x]] = tile_id;
++    {
++        uint8_t * pflags = pps->ctb_ts_flags;
++        uint16_t * ptid = pps->tile_id;
 +
-+    pps->tile_pos_rs = av_malloc_array(tile_id, sizeof(*pps->tile_pos_rs));
-+    if (!pps->tile_pos_rs)
-+        return AVERROR(ENOMEM);
-+
-+    for (j = 0; j < pps->num_tile_rows; j++)
-+        for (i = 0; i < pps->num_tile_columns; i++)
++        for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
 +        {
-+            pps->tile_size[j * pps->num_tile_columns + i] =
-+                pps->column_width[i] * pps->row_height[j];
-+            pps->tile_pos_rs[j * pps->num_tile_columns + i] =
-+                pps->row_bd[j] * sps->ctb_width + pps->col_bd[i];
++            for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
++            {
++                const unsigned int tile_w = pps->column_width[i];
++
++                pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                for (x = 0; x != tile_w; ++x) {
++                    pflags[x] |= CTB_TS_FLAGS_TOT;
++                }
++
++                for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
++                {
++                    pflags[0] |= CTB_TS_FLAGS_SOTL;
++
++                    if (pps->entropy_coding_sync_enabled_flag)
++                    {
++                        if (pps->column_width[i] != 1)
++                            pflags[1] |= CTB_TS_FLAGS_CSAVE;
++                        else
++                            pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                        if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
++                            pflags[0] |= CTB_TS_FLAGS_CLOAD;
++                    }
++
++                    for (x = 0; x != tile_w; ++x)
++                        *ptid++ = tile_id;
++
++                    pflags += tile_w;
++                    pflags[-1] |= CTB_TS_FLAGS_EOTL;
++                    if (i + 1 == pps->num_tile_columns)
++                        pflags[-1] |= CTB_TS_FLAGS_EOL;
++                }
++
++                pflags[-1] |= CTB_TS_FLAGS_EOT;
++            }
 +        }
++    }
++
++    {
++        unsigned int ts = 0;
++        for (j = 0; j < pps->num_tile_rows; j++)
++            for (i = 0; i < pps->num_tile_columns; i++)
++            {
++                const unsigned int size = pps->column_width[i] * pps->row_height[j];
++                pps->tile_size[j * pps->num_tile_columns + i] = size;
++                pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
++                ts += size;
++            }
++    }
 +
 +    log2_diff = sps->log2_ctb_size - sps->log2_min_tb_size;
 +    pps->min_tb_addr_zs = &pps->min_tb_addr_zs_tab[1*(sps->tb_mask+2)+1];
@@ -14780,6 +13504,7 @@ index 0000000000..f65efa1015
 +    if (get_bits_left(gb) < 0) {
 +        av_log(avctx, AV_LOG_ERROR,
 +               "Overread PPS by %d bits\n", -get_bits_left(gb));
++        ret = AVERROR_INVALIDDATA;
 +        goto err;
 +    }
 +
@@ -14817,10 +13542,10 @@ index 0000000000..f65efa1015
 +}
 diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
 new file mode 100644
-index 0000000000..1600076a69
+index 0000000000..989f8953b4
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_ps.h
-@@ -0,0 +1,437 @@
+@@ -0,0 +1,446 @@
 +/*
 + * HEVC parameter set parsing
 + *
@@ -14868,7 +13593,7 @@ index 0000000000..1600076a69
 +    uint8_t nb_refs;
 +} LongTermRPS;
 +
-+typedef struct SliceHeader {
++typedef struct RpiSliceHeader {
 +    unsigned int pps_id;
 +
 +    ///< address (in raster order) of the first block in the current slice segment
@@ -14941,9 +13666,7 @@ index 0000000000..1600076a69
 +
 +    int16_t luma_offset_l1[16];
 +    int16_t chroma_offset_l1[16][2];
-+
-+    int slice_ctb_addr_rs;
-+} SliceHeader;
++} RpiSliceHeader;
 +
 +typedef struct HEVCWindow {
 +    unsigned int left_offset;
@@ -15138,6 +13861,15 @@ index 0000000000..1600076a69
 +    int data_size;
 +} HEVCRpiSPS;
 +
++#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
++#define CTB_TS_FLAGS_EOTL       (1U << 1)
++#define CTB_TS_FLAGS_EOL        (1U << 2)
++#define CTB_TS_FLAGS_EOT        (1U << 3)
++#define CTB_TS_FLAGS_CSAVE      (1U << 4)
++#define CTB_TS_FLAGS_CIREQ      (1U << 5)     // Cabac init request
++#define CTB_TS_FLAGS_TOT        (1U << 6)
++#define CTB_TS_FLAGS_CLOAD      (1U << 7)
++
 +typedef struct HEVCRpiPPS {
 +    unsigned int sps_id; ///< seq_parameter_set_id
 +
@@ -15198,19 +13930,21 @@ index 0000000000..1600076a69
 +    uint8_t log2_sao_offset_scale_chroma;
 +
 +    // Inferred parameters
-+    unsigned int *column_width;  ///< ColumnWidth
-+    unsigned int *row_height;    ///< RowHeight
-+    unsigned int *col_bd;        ///< ColBd
-+    unsigned int *row_bd;        ///< RowBd
-+    int *col_idxX;
++    uint16_t *column_width;  ///< ColumnWidth
++    uint16_t *row_height;    ///< RowHeight
++    uint16_t *col_bd;        ///< ColBd
++    uint16_t *row_bd;        ///< RowBd
++    uint16_t *col_idxX;
 +
-+    int *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
-+    int *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
-+    int *tile_id;           ///< TileId
-+    int *tile_pos_rs;       ///< TilePosRS
-+    int *tile_size;         ///< TileSize
++    // We can limit these to uint16_t given our other size limits
++    uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
++    uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
++    uint16_t *tile_id;           ///< TileId
++    uint16_t *tile_pos_ts;       ///< TilePosTS
++    uint16_t *tile_size;         ///< TileSize
 +    int *min_tb_addr_zs;    ///< MinTbAddrZS
 +    int *min_tb_addr_zs_tab;///< MinTbAddrZS
++    uint8_t * ctb_ts_flags;
 +
 +    uint8_t data[4096];
 +    int data_size;
@@ -15384,7 +14118,7 @@ index 0000000000..7fa6af1cdf
 +}
 diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
 new file mode 100644
-index 0000000000..ef15784317
+index 0000000000..d7745711ab
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_refs.c
 @@ -0,0 +1,515 @@
@@ -15668,7 +14402,7 @@ index 0000000000..ef15784317
 +
 +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
 +{
-+    SliceHeader *sh = &s->sh;
++    RpiSliceHeader *sh = &s->sh;
 +
 +    uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
 +    uint8_t list_idx;
@@ -15905,10 +14639,10 @@ index 0000000000..ef15784317
 +}
 diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
 new file mode 100644
-index 0000000000..c98b0804ed
+index 0000000000..c5133a8a88
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_sei.c
-@@ -0,0 +1,364 @@
+@@ -0,0 +1,368 @@
 +/*
 + * HEVC Supplementary Enhancement Information messages
 + *
@@ -16235,12 +14969,16 @@ index 0000000000..c98b0804ed
 +    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
 +
 +    while (byte == 0xFF) {
++       if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
++           return AVERROR_INVALIDDATA;
 +        byte          = get_bits(gb, 8);
 +        payload_type += byte;
 +    }
 +    byte = 0xFF;
 +    while (byte == 0xFF) {
-+        byte          = get_bits(gb, 8);
++        if (get_bits_left(gb) < 8 + 8LL*payload_size)
++            return AVERROR_INVALIDDATA;
++         byte          = get_bits(gb, 8);
 +        payload_size += byte;
 +    }
 +    if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
@@ -16416,7 +15154,7 @@ index 0000000000..41e4a20127
 +#endif /* AVCODEC_RPI_HEVC_SEI_H */
 diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c
 new file mode 100644
-index 0000000000..4f1d6c71f2
+index 0000000000..fe506c8ad0
 --- /dev/null
 +++ b/libavcodec/rpi_hevc_shader.c
 @@ -0,0 +1,1570 @@
@@ -17988,7 +16726,7 @@ index 0000000000..4f1d6c71f2
 +// ::mc_end
 +};
 +#ifdef __HIGHC__
-+#pragma Align_to(8, rpi_shader)
++#pragma Align_to(8, ff_hevc_rpi_shader)
 +#endif
 diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
 new file mode 100644
@@ -27686,10 +26424,10 @@ index 0000000000..56d5206827
 +};
 diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
 new file mode 100644
-index 0000000000..00bd911a86
+index 0000000000..0ad64f9f19
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,5630 @@
+@@ -0,0 +1,5679 @@
 +/*
 + * HEVC video Decoder
 + *
@@ -27772,8 +26510,20 @@ index 0000000000..00bd911a86
 +#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
 +#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
 +
-+#define QPU_C_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX)
-+#define QPU_Y_COMMANDS (((HEVC_RPI_MAX_WIDTH * 64) / (4 * 4))     + 2 * QPU_N_MAX)
++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
++
++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
++
++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
++
++// Total cmds to allocate - allow for slack & setup
++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
++
++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
 +
 +// The QPU code for UV blocks only works up to a block width of 8
 +#define RPI_CHROMA_BLOCK_WIDTH 8
@@ -28951,7 +27701,7 @@ index 0000000000..00bd911a86
 +static int hls_slice_header(HEVCRpiContext *s)
 +{
 +    GetBitContext *gb = &s->HEVClc->gb;
-+    SliceHeader *sh   = &s->sh;
++    RpiSliceHeader *sh   = &s->sh;
 +    int i, ret;
 +
 +    // Coded parameters
@@ -29310,8 +28060,16 @@ index 0000000000..00bd911a86
 +                return AVERROR(ENOMEM);
 +            }
 +            for (i = 0; i < sh->num_entry_point_offsets; i++) {
-+                unsigned val = get_bits_long(gb, offset_len);
-+                sh->entry_point_offset[i] = val + 1; // +1; // +1 to get the size
++                uint32_t val_minus1 = get_bits_long(gb, offset_len);
++                if (val_minus1 > (1 << 28))
++                {
++                    // We can declare offsets of > 2^28 bad without loss of generality
++                    // Will check actual bounds wrt NAL later, but this keeps
++                    // the values within bounds we can deal with easily
++                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
++                    return AVERROR_INVALIDDATA;
++                }
++                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
 +            }
 +            if (s->threads_number > 1 && (s->ps.pps->num_tile_rows > 1 || s->ps.pps->num_tile_columns > 1)) {
 +                s->enable_parallel_tiles = 0; // TODO: you can enable tiles in parallel here
@@ -29344,13 +28102,6 @@ index 0000000000..00bd911a86
 +        return AVERROR_INVALIDDATA;
 +    }
 +
-+    sh->slice_ctb_addr_rs = sh->slice_segment_addr;
-+
-+    if (!s->sh.slice_ctb_addr_rs && s->sh.dependent_slice_segment_flag) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Impossible slice segment.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
 +    if (get_bits_left(gb) < 0) {
 +        av_log(s->avctx, AV_LOG_ERROR,
 +               "Overread slice header by %d bits\n", -get_bits_left(gb));
@@ -29504,14 +28255,17 @@ index 0000000000..00bd911a86
 +                    lc->tu.cu_qp_delta = -lc->tu.cu_qp_delta;
 +            lc->tu.is_cu_qp_delta_coded = 1;
 +
-+            if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) ||
-+                lc->tu.cu_qp_delta >  (25 + s->ps.sps->qp_bd_offset / 2)) {
++// Was:
++//            if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) ||
++//                if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset / 2) ||
++// 2016 standard says:
++            if (lc->tu.cu_qp_delta < -(26 + s->ps.sps->qp_bd_offset) ||
++                lc->tu.cu_qp_delta > 25) {
 +                av_log(s->avctx, AV_LOG_ERROR,
 +                       "The cu_qp_delta %d is outside the valid range "
 +                       "[%d, %d].\n",
 +                       lc->tu.cu_qp_delta,
-+                       -(26 + s->ps.sps->qp_bd_offset / 2),
-+                        (25 + s->ps.sps->qp_bd_offset / 2));
++                       -(26 + s->ps.sps->qp_bd_offset), 25);
 +                return AVERROR_INVALIDDATA;
 +            }
 +
@@ -29953,11 +28707,25 @@ index 0000000000..00bd911a86
 +static HEVCRpiInterPredQ *
 +rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
 +{
-+    HEVCRpiInterPredQ * yp = ipe->q + ipe->curr;
-+    HEVCRpiInterPredQ * ypt = yp + 1;
-+    for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) {
-+        if (ypt->load < yp->load)
++    HEVCRpiInterPredQ * yp = NULL;
++    HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
++    const unsigned int max_fill = ipe->max_fill;
++    unsigned int load = UINT_MAX;
++
++    for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
++        // We will always have enough room between the Qs but if we are
++        // running critically low due to poor scheduling then use fill size
++        // rather than load to determine QPU.  This has obvious dire
++        // performance implications but (a) it is better than crashing
++        // and (b) it should (almost) never happen
++        const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
++        const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
++
++        if (tload < load)
++        {
 +            yp = ypt;
++            load = tload;
++        }
 +    }
 +
 +    yp->load += load_val;
@@ -29980,7 +28748,9 @@ index 0000000000..00bd911a86
 +    }
 +}
 +
-+// Returns 0 on success, -1 if Q is dangerously full
++// Returns 0 on success
++// We no longer check for Q fullness as we have emergency code in ctu alloc
++// * However it might be an idea to have some means of spotting that we've used it
 +static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
 +{
 +    if (!ipe->used_grp)
@@ -29994,12 +28764,6 @@ index 0000000000..00bd911a86
 +    ipe->used = 1;
 +    ipe->used_grp = 0;
 +
-+    for (unsigned int i = 0; i != ipe->n_grp; ++i) {
-+        HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr;
-+        if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) {
-+            return -1;
-+        }
-+    }
 +    return 0;
 +}
 +
@@ -31029,44 +29793,38 @@ index 0000000000..00bd911a86
 +static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
 +                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
 +{
-+    const int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
-+    const int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+    const int ctb_addr_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
-+    const int idxX = s->ps.pps->col_idxX[x_ctb >> s->ps.sps->log2_ctb_size];
++    const unsigned int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
++    const unsigned int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
++    const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++    const unsigned int line_w = s->ps.sps->ctb_width;
 +
 +    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
 +
-+    lc->end_of_tiles_x = idxX + 1 >= s->ps.pps->num_tile_columns ? s->ps.sps->width :
-+        (s->ps.pps->col_bd[idxX + 1] << s->ps.sps->log2_ctb_size);
-+
-+    if (ctb_addr_ts == 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[ctb_addr_ts - 1] ||
-+        (s->ps.pps->entropy_coding_sync_enabled_flag && (x_ctb >> s->ps.sps->log2_ctb_size) == s->ps.pps->col_bd[idxX]))
-+    {
-+//        lc->first_qp_group = 1;
-+        lc->qPy_pred = s->sh.slice_qp;
-+    }
-+
-+    lc->end_of_tiles_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
++    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
 +
 +    lc->boundary_flags = 0;
 +
-+    if (x_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - 1]])
++    if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
 +        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
 +    if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
 +        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
-+    if (y_ctb <= 0 || s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]])
++    if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
 +        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
-+    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - s->ps.sps->ctb_width])
++    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
 +        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
 +
 +    lc->ctb_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0;
 +    lc->ctb_up_flag   = (lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0;
-+    lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
-+        (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width);
 +
-+    lc->ctb_up_right_flag = ((y_ctb > 0) && (x_ctb + ctb_size) < lc->end_of_tiles_x &&
-+        (ctb_addr_in_slice+1 >= s->ps.sps->ctb_width) &&
-+        (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1 - s->ps.sps->ctb_width]]));
++    // Use line width rather than tile width for addr_in_slice test as
++    // addr_in_slice is in raster units
++    lc->ctb_up_left_flag = (lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++        (ctb_addr_rs_in_slice >= line_w + 1);
++
++    lc->ctb_up_right_flag = (ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++        (ctb_addr_rs_in_slice + 1 >= line_w);
 +}
 +
 +
@@ -31091,11 +29849,10 @@ index 0000000000..00bd911a86
 +
 +    // Flush (SAO)
 +    if (y > y0) {
-+        const int tile_end = y_end ||
-+            s->ps.pps->tile_id[jb->ctu_ts_last] != s->ps.pps->tile_id[jb->ctu_ts_last + 1];
 +        const unsigned int xl = x0 > ctb_size ? x0 - ctb_size : 0;
 +        const unsigned int yt = y0 > ctb_size ? y0 - ctb_size : 0;
-+        const unsigned int yb = tile_end ? bound_b : y - ctb_size;
++        const unsigned int yb = (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0 ?
++            bound_b : y - ctb_size;
 +
 +        rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
 +        rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
@@ -31169,7 +29926,7 @@ index 0000000000..00bd911a86
 +                break;
 +
 +            default:
-+                av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++                av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
 +                abort();
 +        }
 +    }
@@ -31531,16 +30288,21 @@ index 0000000000..00bd911a86
 +
 +    // * Sizeof the union structure might be overkill but at the moment it
 +    //   is correct (it certainly isn't going to be too small)
-+    // *** really should add per ctu sync words to be accurate
++    // Set max fill to slack/2 from the end of the Q
++    // If we exceed this in any Q then we will schedule by size (which should
++    // mean that we never use that Q again part from syncs)
++    // * Given how agressive the overflow resonse is we could maybe put the
++    //   threshold even nearer the end, but I don't expect us to ever hit
++    //   it on any real stream anyway.
 +
 +    rpi_inter_pred_alloc(&jb->chroma_ip,
 +                         QPU_N_MAX, QPU_N_GRP,
-+                         QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
-+                         QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
++                         QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
++                         QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2);
 +    rpi_inter_pred_alloc(&jb->luma_ip,
 +                         QPU_N_MAX,  QPU_N_GRP,
-+                         QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
-+                         QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
++                         QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
++                         QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2);
 +
 +    return jb;
 +}
@@ -31673,15 +30435,17 @@ index 0000000000..00bd911a86
 +
 +static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
 +{
-+    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
 +    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
 +
 +    // Check for obvious disasters
-+    if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) {
++    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
 +        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
 +        return AVERROR_INVALIDDATA;
 +    }
 +
++    // If dependent then ctb_addr_ts != 0 from previous check
 +    if (s->sh.dependent_slice_segment_flag) {
 +        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
 +        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
@@ -31691,7 +30455,7 @@ index 0000000000..00bd911a86
 +    }
 +
 +    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+        s->ps.pps->tile_id[ctb_addr_ts] + s->sh.num_entry_point_offsets >= tiles)
++        tile_id + s->sh.num_entry_point_offsets >= tiles)
 +    {
 +        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
 +        return AVERROR_INVALIDDATA;
@@ -31700,20 +30464,21 @@ index 0000000000..00bd911a86
 +    // Tiled stuff must start at start of tile if it has multiple entry points
 +    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
 +        s->sh.num_entry_point_offsets != 0 &&
-+        s->sh.slice_ctb_addr_rs != s->ps.pps->tile_pos_rs[s->ps.pps->tile_id[ctb_addr_ts]])
++        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
 +    {
 +        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
 +        return AVERROR_INVALIDDATA;
 +    }
 +
-+    // Setup any required decode vars
-+    if (!s->sh.dependent_slice_segment_flag)
-+        lc->qPy_pred = s->sh.slice_qp;
++    ff_hevc_rpi_cabac_init_decoder(lc);
 +
++    // Setup any required decode vars
++    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
++
++//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
 +    lc->qp_y = s->sh.slice_qp;
 +
 +    // General setup
-+    lc->wpp_init = 0;
 +    lc->bt_line_no = 0;
 +    lc->ts = ctb_addr_ts;
 +    return 0;
@@ -31722,6 +30487,7 @@ index 0000000000..00bd911a86
 +static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
 +{
 +    const GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
 +    int i, j;
 +
 +    const unsigned int length = nal->size;
@@ -31729,38 +30495,46 @@ index 0000000000..00bd911a86
 +    unsigned int cmpt;
 +    unsigned int startheader;
 +
-+    if (s->sh.num_entry_point_offsets == 0) {
++    if (sh->num_entry_point_offsets == 0) {
++        s->data = NULL;
 +        return 0;
 +    }
 +
-+    for (j = 0, cmpt = 0, startheader = offset + s->sh.entry_point_offset[0]; j < nal->skipped_bytes; j++) {
++    // offset in slice header includes emulation prevention bytes.
++    // Unfortunately those have been removed by the time we get here so we
++    // have to compensate.  The nal layer keeps a track of where they were.
++    for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
 +        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
 +            startheader--;
 +            cmpt++;
 +        }
 +    }
 +
-+    for (i = 1; i < s->sh.num_entry_point_offsets; i++) {
-+        offset += (s->sh.entry_point_offset[i - 1] - cmpt);
-+        for (j = 0, cmpt = 0, startheader = offset
-+             + s->sh.entry_point_offset[i]; j < nal->skipped_bytes; j++) {
++    for (i = 1; i < sh->num_entry_point_offsets; i++) {
++        offset += (sh->entry_point_offset[i - 1] - cmpt);
++        for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
 +            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
 +                startheader--;
 +                cmpt++;
 +            }
 +        }
-+        s->sh.size[i - 1] = s->sh.entry_point_offset[i] - cmpt;
-+        s->sh.offset[i - 1] = offset;
-+    }
-+    if (s->sh.num_entry_point_offsets != 0) {
-+        offset += s->sh.entry_point_offset[s->sh.num_entry_point_offsets - 1] - cmpt;
-+        if (length < offset) {
-+            av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++        if (sh->entry_point_offset[i] <= cmpt) {
++            av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
 +            return AVERROR_INVALIDDATA;
 +        }
-+        s->sh.size[s->sh.num_entry_point_offsets - 1] = length - offset;
-+        s->sh.offset[s->sh.num_entry_point_offsets - 1] = offset;
++        sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
++        sh->offset[i - 1] = offset;
 +    }
++
++    offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
++    if (length < offset) {
++        av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++        return AVERROR_INVALIDDATA;
++    }
++    sh->size[sh->num_entry_point_offsets - 1] = length - offset;
++    sh->offset[sh->num_entry_point_offsets - 1] = offset;
++
++    // Remember data start pointer as we won't have nal later
 +    s->data = nal->data;
 +    return 0;
 +}
@@ -31787,10 +30561,11 @@ index 0000000000..00bd911a86
 +        const int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
 +        const int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size;
 +        int q_full;
++        const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
 +
 +        hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
 +
-+        ff_hevc_rpi_cabac_init(s, lc, ctb_addr_ts);
++        ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
 +
 +        hls_sao_param(s, lc, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size);
 +
@@ -31800,28 +30575,40 @@ index 0000000000..00bd911a86
 +
 +        more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
 +
++        if (ff_hevc_rpi_cabac_overflow(lc))
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
++            more_data = AVERROR_INVALIDDATA;
++        }
++
 +        if (more_data < 0) {
-+            s->tab_slice_address[ctb_addr_rs] = -1;
++            s->tab_slice_address[ctb_addr_rs] = -1;  // Mark slice as broken
 +            return more_data;
 +        }
 +
-+        // Inc TS to next.
-+        // N.B. None of the other position vars have changed
-+        ctb_addr_ts++;
-+        ff_hevc_rpi_save_states(s, lc, ctb_addr_ts);
++        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
++             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
++        {
++            if (get_cabac_terminate(&lc->cc) < 0 ||
++                skip_bytes(&lc->cc, 0) == NULL)
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
++                return -1;
++            }
++        }
++
++        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
++            ff_hevc_rpi_save_states(s, lc);
 +
 +        // Report progress so we can use our MVs in other frames
-+        if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) {
++        if (s->threads_type == FF_THREAD_FRAME && (ctb_flags & CTB_TS_FLAGS_EOL) != 0)
 +            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
-+        }
 +
 +        // End of line || End of tile line || End of tile
 +        // (EoL covers end of frame for our purposes here)
-+        q_full = x_ctb + ctb_size >= s->ps.sps->width ||
-+            s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts] != ctb_addr_rs + 1 ||
-+            s->ps.pps->tile_id[ctb_addr_ts - 1] != s->ps.pps->tile_id[ctb_addr_ts];
++        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
 +
-+        // Allocate QPU chuncks on fixed size 64 pel boundries rather than
++        // Allocate QPU chunks on fixed size 64 pel boundaries rather than
 +        // whatever ctb_size is today.
 +        // * We might quite like to continue to 64 pel vertical too but that
 +        //   currently confuses WPP
@@ -31837,11 +30624,14 @@ index 0000000000..00bd911a86
 +                // * This is very annoying (and slow) to cope with in WPP so
 +                //   we treat it as an error there (no known stream triggers this
 +                //   with the current buffer sizes).  Non-wpp should cope fine.
-+                av_log(s, AV_LOG_WARNING,  "%s: Q full before EoL\n", __func__);
++                av_log(s->avctx, AV_LOG_WARNING,  "%s: Q full before EoL\n", __func__);
 +                q_full = 1;
 +            }
 +        }
 +
++        // Inc TS to next.
++        ctb_addr_ts++;
++
 +        if (q_full)
 +        {
 +            // Do job
@@ -31897,6 +30687,8 @@ index 0000000000..00bd911a86
 +    // Always need to store where we are in the bitstream
 +    dst_lc->ts = src_lc->ts;
 +    dst_lc->gb = src_lc->gb;
++    // Cabac init request will be built at start of next slice
++
 +    // Need to store context if we might have a dependent seg
 +    if (is_dep)
 +    {
@@ -31927,7 +30719,7 @@ index 0000000000..00bd911a86
 +        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
 +            INT_MAX :
 +        is_tile ?
-+            s->ps.pps->ctb_addr_rs_to_ts[s->ps.pps->tile_pos_rs[tile_id + line_inc]] :
++            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
 +            lc->ts + lc->bt_line_width * line_inc;
 +    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
 +    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
@@ -31951,14 +30743,11 @@ index 0000000000..00bd911a86
 +            return err;
 +
 +        ff_init_cabac_decoder(&lc->cc, data, len);
-+
-+        lc->wpp_init = 1;  // Stop ff_hevc_rpi_cabac_init trying to read non-existant termination bits
 +    }
 +
 +    // We should never be processing a dependent slice here so reset is good
 +    // ?? These probably shouldn't be needed (as they should be set by later
 +    //    logic) but do seem to be required
-+    lc->qPy_pred = s->sh.slice_qp;
 +    lc->qp_y = s->sh.slice_qp;
 +
 +    do
@@ -32007,7 +30796,7 @@ index 0000000000..00bd911a86
 +                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
 +            {
 +                if (err == 0) {
-+                    av_log(s, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++                    av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
 +                    err = AVERROR_INVALIDDATA;
 +                }
 +                worker_free(s, lc);
@@ -32113,7 +30902,7 @@ index 0000000000..00bd911a86
 +        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
 +        const unsigned int tile = tile0 + line;
 +
-+        lc->ts = pps->ctb_addr_rs_to_ts[pps->tile_pos_rs[tile]];
++        lc->ts = pps->tile_pos_ts[tile];
 +        lc->bt_line_no = line;
 +        lc->bt_is_tile = 1;
 +        lc->bt_line_width = line_ts_width(s, lc->ts);
@@ -32134,10 +30923,10 @@ index 0000000000..00bd911a86
 +
 +        if ((err = rpi_run_one_line(s, lc, 0)) < 0) {  // Never first tile/wpp
 +            if (lc->bt_terminate) {
-+                av_log(s, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++                av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
 +                break;
 +            }
-+            av_log(s, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
++            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
 +        }
 +    }
 +
@@ -32341,7 +31130,7 @@ index 0000000000..00bd911a86
 +
 +fail:
 +    // Cleanup
-+    av_log(s, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
 +    // Free our job & wait for temination
 +    worker_free(s, lc);
 +    worker_wait(s, lc);
@@ -32716,7 +31505,7 @@ index 0000000000..00bd911a86
 +    /* split the input packet into NAL units, so we know the upper bound on the
 +     * number of slices in the frame */
 +    ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
-+                                s->nal_length_size, s->avctx->codec_id, 1);
++                                s->nal_length_size, s->avctx->codec_id, 0);
 +    if (ret < 0) {
 +        av_log(s->avctx, AV_LOG_ERROR,
 +               "Error splitting the input into NAL units.\n");
@@ -32965,7 +31754,7 @@ index 0000000000..00bd911a86
 +
 +    av_freep(&s->sei.picture_hash.md5_ctx);
 +
-+    av_freep(&s->cabac_state);
++    av_freep(&s->cabac_save);
 +
 +#if RPI_EXTRA_BIT_THREADS
 +    bit_threads_kill(s);
@@ -33059,12 +31848,10 @@ index 0000000000..00bd911a86
 +        ff_hevc_rpi_progress_init_state(s->progress_states + i);
 +    }
 +
-+    s->cabac_state = av_malloc(HEVC_CONTEXTS);
-+    if (!s->cabac_state)
++    if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
 +        goto fail;
 +
-+    s->output_frame = av_frame_alloc();
-+    if (!s->output_frame)
++     if ((s->output_frame = av_frame_alloc()) == NULL)
 +        goto fail;
 +
 +    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
@@ -33091,7 +31878,7 @@ index 0000000000..00bd911a86
 +    return 0;
 +
 +fail:
-+    av_log(s, AV_LOG_ERROR, "%s: Failed\n", __func__);
++    av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
 +    hevc_decode_free(avctx);
 +    return AVERROR(ENOMEM);
 +}
@@ -33197,13 +31984,13 @@ index 0000000000..00bd911a86
 +        HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
 +        if (jbg == NULL)
 +        {
-+            av_log(s, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
 +            return -1;
 +        }
 +
 +        if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
 +        {
-+            av_log(s, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++            av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
 +            return -1;
 +        }
 +    }
@@ -33322,10 +32109,10 @@ index 0000000000..00bd911a86
 +
 diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
 new file mode 100644
-index 0000000000..f61b29e669
+index 0000000000..2201017cb3
 --- /dev/null
 +++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,1054 @@
+@@ -0,0 +1,1061 @@
 +/*
 + * HEVC video decoder
 + *
@@ -33445,6 +32232,12 @@ index 0000000000..f61b29e669
 +#define HEVC_RPI_MAX_HEIGHT     1088
 +
 +
++// Min CTB size is 16
++#if ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16) >= (1 << 16)
++#error Check CTB translation array el sizes (currently uint16_t)
++#endif
++
++
 +/**
 + * Value of the luma sample at position (x, y) in the 2D array tab.
 + */
@@ -33629,14 +32422,13 @@ index 0000000000..f61b29e669
 +    uint8_t cu_transquant_bypass_flag;
 +} CodingUnit;
 +
-+typedef struct NeighbourAvailable {
-+    int cand_bottom_left;
-+    int cand_left;
-+    int cand_up;
-+    int cand_up_left;
-+    int cand_up_right;
-+    int cand_up_right_sap;
-+} NeighbourAvailable;
++typedef struct RpiNeighbourAvailable {
++    char cand_bottom_left;
++    char cand_left;
++    char cand_up;
++    char cand_up_left;
++    char cand_up_right;
++} RpiNeighbourAvailable;
 +
 +typedef struct PredictionUnit {
 +    int mpm_idx;
@@ -33708,12 +32500,14 @@ index 0000000000..f61b29e669
 +
 +typedef struct HEVCRpiLocalContextIntra {
 +    TransformUnit tu;
-+    NeighbourAvailable na;
++    RpiNeighbourAvailable na;
 +} HEVCRpiLocalContextIntra;
 +
 +typedef struct HEVCRpiLocalContext {
 +    TransformUnit tu;  // Moved to start to match HEVCRpiLocalContextIntra (yuk!)
-+    NeighbourAvailable na;
++    RpiNeighbourAvailable na;
++
++    CABACContext cc;
 +
 +    // Vars that allow us to locate everything from just an lc
 +    struct HEVCRpiContext * context;  // ??? make const ???
@@ -33739,37 +32533,24 @@ index 0000000000..f61b29e669
 +
 +    struct HEVCRpiJob * jb0;
 +    char unit_done;  // Set once we have dealt with this slice
-+//    char max_done;
 +    char bt_is_tile;
 +    char last_progress_good;
-+
-+    char wpp_init;   // WPP/Tile bitstream init has happened
++    char cabac_init_req;
 +
 +    uint8_t cabac_state[HEVC_CONTEXTS];
-+
 +    uint8_t stat_coeff[4];
-+
-+//    uint8_t first_qp_group;
-+
 +    GetBitContext gb;
-+    CABACContext cc;
 +
 +    int8_t qp_y;
 +    int8_t curr_qp_y;
-+
-+    int qPy_pred;
++    int8_t qPy_pred;
 +
 +    uint8_t ctb_left_flag;
 +    uint8_t ctb_up_flag;
 +    uint8_t ctb_up_right_flag;
 +    uint8_t ctb_up_left_flag;
-+    int     end_of_tiles_x;
-+    int     end_of_tiles_y;
-+    /* +7 is for subpixel interpolation, *2 for high bit depths */
-+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+    /* The extended size between the new edge emu buffer is abused by SAO */
-+    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++    int     end_of_ctb_x;
++    int     end_of_ctb_y;
 +
 +    int ct_depth;
 +    CodingUnit cu;
@@ -33781,7 +32562,14 @@ index 0000000000..f61b29e669
 +#define BOUNDARY_UPPER_TILE     (1 << 3)
 +    /* properties of the boundary of the current CTB for the purposes
 +     * of the deblocking filter */
-+    int boundary_flags;
++    unsigned int boundary_flags;
++
++    /* +7 is for subpixel interpolation, *2 for high bit depths */
++    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++    /* The extended size between the new edge emu buffer is abused by SAO */
++    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
 +} HEVCRpiLocalContext;
 +
 +
@@ -33999,6 +32787,11 @@ index 0000000000..f61b29e669
 +} HEVCRpiStats;
 +#endif
 +
++typedef struct HEVCRpiCabacState  // Type of the HEVCRpiContext.cabac_save buffer
++{
++    uint8_t rice[4];               // NOTE(review): presumably mirrors lc->stat_coeff[4] - confirm
++    uint8_t state[HEVC_CONTEXTS];  // Same size as lc->cabac_state; presumably the saved context states - confirm
++} HEVCRpiCabacState;
 +
 +typedef struct HEVCRpiContext {
 +    const AVClass *c;  // needed by private avoptions
@@ -34069,7 +32862,7 @@ index 0000000000..f61b29e669
 +    unsigned int dvq_n;
 +#endif
 +
-+    uint8_t *cabac_state;
++    HEVCRpiCabacState *cabac_save;
 +
 +    /** 1 if the independent slice segment header was successfully parsed */
 +    uint8_t slice_initialized;
@@ -34087,7 +32880,7 @@ index 0000000000..f61b29e669
 +    ///< candidate references for the current frame
 +    RefPicList rps[5];
 +
-+    SliceHeader sh;
++    RpiSliceHeader sh;
 +    SAOParams *sao;
 +    DBParams *deblock;
 +    enum HEVCNALUnitType nal_unit_type;
@@ -34195,8 +32988,9 @@ index 0000000000..f61b29e669
 + */
 +int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
 +
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc, int ctb_addr_ts);
-+int ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, int ctb_addr_ts);
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
 +int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc);
 +int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
 +int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
@@ -34269,7 +33063,7 @@ index 0000000000..f61b29e669
 +                                const int c_idx);
 +
 +void ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
-+
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
 +
 +extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
 +extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
@@ -40543,22 +39337,6 @@ index 0000000000..26fb3be999
 +
 +#endif
 +
-diff --git a/libavcodec/snowdec.c b/libavcodec/snowdec.c
-index 13668c2105..bebf9024ec 100644
---- a/libavcodec/snowdec.c
-+++ b/libavcodec/snowdec.c
-@@ -405,6 +405,11 @@ static int decode_header(SnowContext *s){
-         s->qbias = 0;
-         return AVERROR_INVALIDDATA;
-     }
-+    if (FFABS(s->qbias) > 127) {
-+        av_log(s->avctx, AV_LOG_ERROR, "qbias %d is too large\n", s->qbias);
-+        s->qbias = 0;
-+        return AVERROR_INVALIDDATA;
-+    }
- 
-     return 0;
- }
 diff --git a/libavcodec/utils.c b/libavcodec/utils.c
 index 9551f312e7..a1f68b8e30 100644
 --- a/libavcodec/utils.c
@@ -40604,18 +39382,169 @@ index 9551f312e7..a1f68b8e30 100644
  AVCodec *avcodec_find_decoder_by_name(const char *name)
  {
      AVCodec *p;
-diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c
-index f0f849b326..cd97974748 100644
---- a/libavfilter/avfilter.c
-+++ b/libavfilter/avfilter.c
-@@ -995,6 +995,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args)
-                    "options, but options were provided: %s.\n", args);
-             return AVERROR(EINVAL);
-         }
-+        printf("=== args='%s'\n", args);
+diff --git a/libavfilter/Makefile b/libavfilter/Makefile
+index d2f0495f37..56bb87f851 100644
+--- a/libavfilter/Makefile
++++ b/libavfilter/Makefile
+@@ -323,6 +323,7 @@ OBJS-$(CONFIG_TONEMAP_FILTER)                += vf_tonemap.o
+ OBJS-$(CONFIG_TRANSPOSE_FILTER)              += vf_transpose.o
+ OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
+ OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
++OBJS-$(CONFIG_UNSAND_FILTER)                 += vf_unsand.o
+ OBJS-$(CONFIG_UNSHARP_FILTER)                += vf_unsharp.o
+ OBJS-$(CONFIG_USPP_FILTER)                   += vf_uspp.o
+ OBJS-$(CONFIG_VAGUEDENOISER_FILTER)          += vf_vaguedenoiser.o
+diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
+index 9b672a7a7e..d92e47e651 100644
+--- a/libavfilter/allfilters.c
++++ b/libavfilter/allfilters.c
+@@ -334,6 +334,7 @@ static void register_all(void)
+     REGISTER_FILTER(TRANSPOSE,      transpose,      vf);
+     REGISTER_FILTER(TRIM,           trim,           vf);
+     REGISTER_FILTER(UNPREMULTIPLY,  unpremultiply,  vf);
++    REGISTER_FILTER(UNSAND,         unsand,         vf);
+     REGISTER_FILTER(UNSHARP,        unsharp,        vf);
+     REGISTER_FILTER(USPP,           uspp,           vf);
+     REGISTER_FILTER(VAGUEDENOISER,  vaguedenoiser,  vf);
+diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
+index 4304c06847..7bed282dff 100644
+--- a/libavfilter/avfiltergraph.c
++++ b/libavfilter/avfiltergraph.c
+@@ -31,6 +31,9 @@
+ #include "libavutil/internal.h"
+ #include "libavutil/opt.h"
+ #include "libavutil/pixdesc.h"
++#if CONFIG_UNSAND_FILTER
++#include "libavutil/rpi_sand_fns.h"
++#endif
  
- #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR
-             if (   !strcmp(filter->filter->name, "format")     ||
+ #define FF_INTERNAL_FIELDS 1
+ #include "framequeue.h"
+@@ -420,6 +423,19 @@ static int can_merge_formats(AVFilterFormats *a_arg,
+     }
+ }
+ 
++#if CONFIG_UNSAND_FILTER
++/* Return 1 if the format list holds at least one sand pixel format, else 0 */
++static int has_sand_format(const AVFilterFormats * const ff)
++{
++    int idx = 0;
++    while (idx != ff->nb_formats) {
++        if (av_rpi_is_sand_format(ff->formats[idx++]))
++            return 1;
++    }
++    return 0;
++}
++#endif
++
+ /**
+  * Perform one round of query_formats() and merging formats lists on the
+  * filter graph.
+@@ -460,6 +476,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+         for (j = 0; j < filter->nb_inputs; j++) {
+             AVFilterLink *link = filter->inputs[j];
+             int convert_needed = 0;
++            unsigned int extra_convert_tried = 0;
+ 
+             if (!link)
+                 continue;
+@@ -507,12 +524,15 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+             )
+ #undef MERGE_DISPATCH
+ 
+-            if (convert_needed) {
++            while (convert_needed) {
+                 AVFilterContext *convert;
+                 AVFilter *filter;
+                 AVFilterLink *inlink, *outlink;
+                 char scale_args[256];
+                 char inst_name[30];
++                int can_retry = 0;
++
++                convert_needed = 0;
+ 
+                 if (graph->disable_auto_convert) {
+                     av_log(log_ctx, AV_LOG_ERROR,
+@@ -525,19 +545,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+                 /* couldn't merge format lists. auto-insert conversion filter */
+                 switch (link->type) {
+                 case AVMEDIA_TYPE_VIDEO:
+-                    if (!(filter = avfilter_get_by_name("scale"))) {
+-                        av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
+-                               "not present, cannot convert pixel formats.\n");
+-                        return AVERROR(EINVAL);
+-                    }
+-
+-                    snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
+-                             scaler_count++);
++#if CONFIG_UNSAND_FILTER
++                    // Only try each extra conversion once
++                    // The unsand output pad should never trigger has_sand_format
++                    // but it is better to be safe
++                    if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
++                        if (!(filter = avfilter_get_by_name("unsand"))) {
++                            av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
++                                   "not present, cannot convert pixel formats.\n");
++                            return AVERROR(EINVAL);
++                        }
++
++                        snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
++                                 scaler_count++);
++
++                        if ((ret = avfilter_graph_create_filter(&convert, filter,
++                                                                inst_name, "", NULL,
++                                                                graph)) < 0)
++                            return ret;
+ 
+-                    if ((ret = avfilter_graph_create_filter(&convert, filter,
+-                                                            inst_name, graph->scale_sws_opts, NULL,
+-                                                            graph)) < 0)
+-                        return ret;
++                        extra_convert_tried |= 1;
++                        can_retry = 1;
++                    }
++                    else
++#endif
++                    {
++                        if (!(filter = avfilter_get_by_name("scale"))) {
++                            av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
++                                   "not present, cannot convert pixel formats.\n");
++                            return AVERROR(EINVAL);
++                        }
++
++                        snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
++                                 scaler_count++);
++
++                        if ((ret = avfilter_graph_create_filter(&convert, filter,
++                                                                inst_name, graph->scale_sws_opts, NULL,
++                                                                graph)) < 0)
++                            return ret;
++                    }
+                     break;
+                 case AVMEDIA_TYPE_AUDIO:
+                     if (!(filter = avfilter_get_by_name("aresample"))) {
+@@ -583,9 +629,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+                     av_assert0(outlink-> in_channel_layouts->refcount > 0);
+                     av_assert0(outlink->out_channel_layouts->refcount > 0);
+                 }
+-                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type) ||
+-                    !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++                // If we have added an extra filter we must merge the input
++                // side but we can have another go at the output
++                if (!ff_merge_formats( inlink->in_formats,  inlink->out_formats,  inlink->type))
++                    ret = AVERROR(ENOSYS);
++                else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
++                {
++                    if (can_retry) {
++                        link = outlink;
++                        convert_needed = 1;
++                        continue;
++                    }
+                     ret = AVERROR(ENOSYS);
++                }
+                 if (inlink->type == AVMEDIA_TYPE_AUDIO &&
+                     (!ff_merge_samplerates(inlink->in_samplerates,
+                                            inlink->out_samplerates) ||
 diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
 index ad5aedd5f7..0d2df8b870 100644
 --- a/libavfilter/buffersrc.c
@@ -40629,19 +39558,248 @@ index ad5aedd5f7..0d2df8b870 100644
                                   frame->format);
          break;
      case AVMEDIA_TYPE_AUDIO:
+diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
+new file mode 100644
+index 0000000000..64578b7ac4
+--- /dev/null
++++ b/libavfilter/vf_unsand.c
+@@ -0,0 +1,232 @@
++/*
++ * Copyright (c) 2007 Bobby Bingham
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * unsand video filter: converts sand pixel formats to planar YUV
++ */
++
++#include <string.h>
++
++#include "libavutil/internal.h"
++#include "libavutil/mem.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/opt.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct UnsandContext {
++    const AVClass *class;  // only member; ff_vf_unsand sets .priv_class = &unsand_class for this context
++} UnsandContext;
++
++static av_cold void uninit(AVFilterContext *ctx)
++{
++    // Teardown callback: UnsandContext owns no resources so there is nothing to free
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++    // Init callback: there is no per-instance state to set up,
++    // so this always reports success (0)
++    return 0;
++}
++
++
++// Input pad callback: convert a sand frame to planar and pass it downstream.
++// Takes ownership of 'in'; a frame already in the output format is forwarded
++// untouched.  Returns ff_filter_frame()'s result, or a negative value on error.
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++    AVFilterLink * const dst_link = link->dst->outputs[0];
++    AVFrame *converted;
++    int err;
++
++    // Same format on both sides - nothing to convert
++    if (in->format == dst_link->format)
++        return ff_filter_frame(dst_link, in);
++
++    // The destination only needs to hold the cropped picture
++    converted = ff_get_video_buffer(dst_link,
++                                    av_frame_cropped_width(in),
++                                    av_frame_cropped_height(in));
++    if (converted == NULL)
++        err = AVERROR(ENOMEM);
++    else if (av_rpi_sand_to_planar_frame(converted, in) != 0)
++        err = -1;
++    else
++        err = 0;
++
++    // The source frame is finished with whether or not conversion worked
++    av_frame_free(&in);
++
++    if (err != 0) {
++        av_frame_free(&converted);
++        return err;
++    }
++    return ff_filter_frame(dst_link, converted);
++}
++
++#if 0
++// Debug aid: print the numeric ids of a format list (or "NULL") on one line
++static void dump_fmts(const AVFilterFormats * fmts)
++{
++    int idx = 0;
++    if (fmts == NULL) {
++        printf("NULL\n");
++        return;
++    }
++    while (idx < fmts->nb_formats)
++        printf(" %d", fmts->formats[idx++]);
++    printf("\n");
++}
++#endif
++
++// Format negotiation callback.
++// Accepts whatever the source offers and advertises the same list downstream
++// with SAND128 replaced by YUV420P and SAND64_10 replaced by YUV420P10.
++// Returns 0 on success or when not linked at both ends, AVERROR(EAGAIN) while
++// the source has not yet set its formats, or a negative AVERROR on failure.
++static int query_formats(AVFilterContext *ctx)
++{
++    int ret;
++
++    // If we aren't connected at both ends then just do nothing
++    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
++        return 0;
++
++    // Our output formats depend on our input formats and we can't/don't
++    // want to convert between bit depths so we need to wait for the source
++    // to have an opinion before we do
++    if (ctx->inputs[0]->in_formats == NULL)
++        return AVERROR(EAGAIN);
++
++    // Accept anything
++    if (ctx->inputs[0]->out_formats == NULL &&
++        (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
++        return ret;
++
++    // Generate a container if we don't already have one
++    if (ctx->outputs[0]->in_formats == NULL)
++    {
++        // Somewhat rubbish way of ensuring we have a good structure
++        static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
++        AVFilterFormats *formats = ff_make_format_list(out_fmts);
++
++        if (formats == NULL)
++            return AVERROR(ENOMEM);
++        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
++            return ret;
++    }
++
++    // Replace old format list with new filtered list derived from what our
++    // input says it can do, with the sand formats filtered out
++    {
++        const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
++        AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
++        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++        int i;
++        int n = 0;
++        int seen_420p = 0;
++        int seen_420p10 = 0;
++
++        // Bail out before the old list is touched if we cannot build a new one
++        if (dst_fmts == NULL)
++            return AVERROR(ENOMEM);
++
++        for (i = 0; i < src_ff->nb_formats; ++i) {
++            const enum AVPixelFormat f = src_ff->formats[i];
++
++            switch (f){
++                // Emit YUV420P once however many of these two appear
++                case AV_PIX_FMT_YUV420P:
++                case AV_PIX_FMT_SAND128:
++                    if (!seen_420p) {
++                        seen_420p = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++                    }
++                    break;
++                // Emit YUV420P10 once however many of these two appear
++                case AV_PIX_FMT_SAND64_10:
++                case AV_PIX_FMT_YUV420P10:
++                    if (!seen_420p10) {
++                        seen_420p10 = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++                    }
++                    break;
++                default:
++                    dst_fmts[n++] = f;
++                    break;
++            }
++        }
++
++        av_freep(&dst_ff->formats);
++        dst_ff->formats = dst_fmts;
++        dst_ff->nb_formats = n;
++    }
++
++    return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x)
++static const AVOption unsand_options[] = {
++    { NULL }  // terminator only: the table defines no options
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = {
++    {
++        .name             = "default",
++        .type             = AVMEDIA_TYPE_VIDEO,
++        .filter_frame = filter_frame,  // incoming frames are handled by filter_frame()
++    },
++    { NULL }  // end of pad list
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = {
++    {
++        .name = "default",
++        .type = AVMEDIA_TYPE_VIDEO  // single video output; no callbacks are set on this pad
++    },
++    { NULL }  // end of pad list
++};
++
++AVFilter ff_vf_unsand = {
++    .name          = "unsand",  // the name avfiltergraph.c passes to avfilter_get_by_name() when auto-inserting
++    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++    .init          = init,
++    .uninit        = uninit,
++
++    .query_formats = query_formats,  // rewrites the output format list from the input's formats
++
++    .priv_size     = sizeof(UnsandContext),
++    .priv_class    = &unsand_class,  // defined by AVFILTER_DEFINE_CLASS(unsand)
++
++    .inputs        = avfilter_vf_unsand_inputs,
++    .outputs       = avfilter_vf_unsand_outputs,
++};
++
 diff --git a/libavformat/utils.c b/libavformat/utils.c
-index 1a7996c4fd..271e70ed84 100644
+index 1a7996c4fd..8119fc07f7 100644
 --- a/libavformat/utils.c
 +++ b/libavformat/utils.c
-@@ -750,7 +750,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
-         int default_stream_index = av_find_default_stream_index(s);
-         if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
-             for (i = 0; i < s->nb_streams; i++) {
--                if (av_find_program_from_stream(s, NULL, i))
-+                if (0 && av_find_program_from_stream(s, NULL, i))
-                     continue;
-                 s->streams[i]->pts_wrap_reference = pts_wrap_reference;
-                 s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
 @@ -2940,6 +2940,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
      return 1;
  }
@@ -40736,14 +39894,14 @@ index 1a7996c4fd..271e70ed84 100644
          if (!options)
              av_dict_free(&thread_opt);
 diff --git a/libavutil/Makefile b/libavutil/Makefile
-index 65e285a701..2ca778c59f 100644
+index 65e285a701..4909d2682e 100644
 --- a/libavutil/Makefile
 +++ b/libavutil/Makefile
 @@ -165,6 +165,7 @@ OBJS-$(CONFIG_QSV)                   += hwcontext_qsv.o
  OBJS-$(CONFIG_LIBDRM)                   += hwcontext_drm.o
  OBJS-$(CONFIG_LZO)                      += lzo.o
  OBJS-$(CONFIG_OPENCL)                   += opencl.o opencl_internal.o
-+OBJS-$(CONFIG_RPI)                      += rpi_sand_fns.o
++OBJS-$(CONFIG_SAND)                     += rpi_sand_fns.o
  OBJS-$(CONFIG_VAAPI)                    += hwcontext_vaapi.o
  OBJS-$(CONFIG_VIDEOTOOLBOX)             += hwcontext_videotoolbox.o
  OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
@@ -40831,7 +39989,7 @@ index 73b6bd0b14..d907de3f1c 100644
   * @}
   */
 diff --git a/libavutil/frame.c b/libavutil/frame.c
-index d5fd2932e3..151a33a24d 100644
+index d5fd2932e3..b127cd833b 100644
 --- a/libavutil/frame.c
 +++ b/libavutil/frame.c
 @@ -16,6 +16,8 @@
@@ -40847,7 +40005,7 @@ index d5fd2932e3..151a33a24d 100644
  #include "imgutils.h"
  #include "mem.h"
  #include "samplefmt.h"
-+#if CONFIG_RPI
++#if CONFIG_SAND
 +#include "rpi_sand_fns.h"
 +#endif
  
@@ -40857,7 +40015,7 @@ index d5fd2932e3..151a33a24d 100644
          (frame->crop_top + frame->crop_bottom) >= frame->height)
          return AVERROR(ERANGE);
  
-+#if CONFIG_RPI
++#if CONFIG_SAND
 +    // Sand cannot be cropped - do not try
 +    if (av_rpi_is_sand_format(frame->format))
 +        return 0;
@@ -41128,15 +40286,16 @@ index 0000000000..52d52a2a83
 +
 diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
 new file mode 100644
-index 0000000000..b8bfad915e
+index 0000000000..3e31ef77ec
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,96 @@
+@@ -0,0 +1,151 @@
 +#include "config.h"
 +#include <stdint.h>
 +#include <string.h>
 +#include "rpi_sand_fns.h"
 +#include "avassert.h"
++#include "frame.h"
 +
 +#define PW 1
 +#include "rpi_sand_fn_pw.h"
@@ -41228,12 +40387,66 @@ index 0000000000..b8bfad915e
 +    }
 +}
 +
++// Convert sand src to planar dst; src cropping is applied during the copy.
++// Returns -1 for an unsupported format pair, else av_frame_copy_props()'s result.
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
++{
++    const int w = av_frame_cropped_width(src);
++    const int h = av_frame_cropped_height(src);
++    const int x = src->crop_left;
++    const int y = src->crop_top;
++    int rv = -1;  // Stays -1 unless a supported src/dst format pair is converted
++
++    switch (src->format){
++        case AV_PIX_FMT_SAND128:
++            if (dst->format != AV_PIX_FMT_YUV420P)
++                break;
++            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++                                     src->data[0],
++                                     av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                     x, y, w, h);
++            av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
++                                     dst->data[2], dst->linesize[2],
++                                     src->data[1],
++                                     av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                     x/2, y/2,  w/2, h/2);
++            rv = 0;
++            break;
++        case AV_PIX_FMT_SAND64_10:
++            if (dst->format != AV_PIX_FMT_YUV420P10)
++                break;
++            av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
++                                     src->data[0],
++                                     av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                     x*2, y, w*2, h);
++            av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
++                                     dst->data[2], dst->linesize[2],
++                                     src->data[1],
++                                     av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                     x, y/2,  w, h/2);
++            rv = 0;
++            break;
++        default:
++            break;
++    }
++
++    if (rv == 0)
++        rv = av_frame_copy_props(dst, src);
++
++    // av_frame_copy_props() copies src's crop fields into dst, so zero dst's
++    // crop only now: the data written above has already been cropped.
++    dst->crop_top = 0;
++    dst->crop_left = 0;
++    dst->crop_bottom = 0;
++    dst->crop_right = 0;
++    return rv;
++}
 diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
 new file mode 100644
-index 0000000000..ebaa2b6d08
+index 0000000000..1f50b68ea8
 --- /dev/null
 +++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,131 @@
+@@ -0,0 +1,136 @@
 +#ifndef AVUTIL_RPI_SAND_FNS
 +#define AVUTIL_RPI_SAND_FNS
 +
@@ -41286,6 +40499,11 @@ index 0000000000..ebaa2b6d08
 +                         unsigned int w, unsigned int h, const unsigned int shr);
 +
 +
++// dst must contain required pixel format & allocated data buffers
++// Cropping on the src buffer will be honoured and dst crop will be set to zero
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
++
++
 +static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
 +{
 +#ifdef RPI_ZC_SAND128_ONLY
@@ -41365,50 +40583,6 @@ index 0000000000..ebaa2b6d08
 +
 +#endif
 +
-diff --git a/libswscale/input.c b/libswscale/input.c
-index bb2f4933ec..de5a17bc7f 100644
---- a/libswscale/input.c
-+++ b/libswscale/input.c
-@@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV,
-     }
- }
- 
-+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
-+                       const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
-+                       int width, uint32_t *unused)
-+{
-+    // NIF
-+}
-+
- #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
- 
- static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
-@@ -1124,6 +1131,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
-     case AV_PIX_FMT_P016BE:
-         c->chrToYV12 = p016BEToUV_c;
-         break;
-+    case AV_PIX_FMT_SAND128:
-+    case AV_PIX_FMT_SAND64_10:
-+        c->chrToYV12 = sand128ToUV_c;  // NIF
-+        break;
-     }
-     if (c->chrSrcHSubSample) {
-         switch (srcFormat) {
-diff --git a/libswscale/utils.c b/libswscale/utils.c
-index dcab707de6..5b24de889a 100644
---- a/libswscale/utils.c
-+++ b/libswscale/utils.c
-@@ -256,6 +256,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
-     [AV_PIX_FMT_P010BE]      = { 1, 1 },
-     [AV_PIX_FMT_P016LE]      = { 1, 0 },
-     [AV_PIX_FMT_P016BE]      = { 1, 0 },
-+#if CONFIG_RPI
-+    [AV_PIX_FMT_SAND128]     = { 1, 0 },
-+    [AV_PIX_FMT_SAND64_10]   = { 1, 0 },
-+#endif
- };
- 
- int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
 new file mode 100644
 index 0000000000..b1e99a6a89
@@ -41442,7 +40616,7 @@ index 0000000000..b1e99a6a89
 +
 diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
 new file mode 100644
-index 0000000000..5e7ed4da9d
+index 0000000000..e176c503f9
 --- /dev/null
 +++ b/pi-util/conf_h265.2016.csv
 @@ -0,0 +1,193 @@
@@ -41556,7 +40730,7 @@ index 0000000000..5e7ed4da9d
 +1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
 +1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
 +1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
-+2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
 +2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
 +1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
 +1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
@@ -41622,7 +40796,7 @@ index 0000000000..5e7ed4da9d
 +1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5
 +1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5
 +1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5
-+2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5
++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5
 +1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5
 +1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt
 +1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
index 1d1fd1690e..5eac8a1bcd 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1004-added_upstream_mvc_patches.patch
@@ -1,4 +1,4 @@
-From 8f170986cda0695f28eb2cd4e863aaae0e14d19f Mon Sep 17 00:00:00 2001
+From e75d7807cc97b3ddd8d8f6fe2fcf3dc4de58863f Mon Sep 17 00:00:00 2001
 From: Hendrik Leppkes <h.leppkes@gmail.com>
 Date: Sat, 9 Jan 2016 16:34:09 +0100
 Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles
@@ -32,12 +32,12 @@ index 6c4b011b5c..8f1f5a3e53 100644
  #define FF_PROFILE_VC1_SIMPLE   0
  #define FF_PROFILE_VC1_MAIN     1
 diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
-index 478b7c0ffc..ff10f3b2bc 100644
+index 6a13bbbf0e..03ae4838d2 100644
 --- a/libavcodec/codec_desc.c
 +++ b/libavcodec/codec_desc.c
-@@ -1700,6 +1700,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
-         .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"),
-         .props     = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
+@@ -1665,6 +1665,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
+         .props     = AV_CODEC_PROP_LOSSLESS,
+         .mime_types= MT("image/png"),
      },
 +    {
 +        .id        = AV_CODEC_ID_H264_MVC,
@@ -78,7 +78,7 @@ index 53cbcfb543..f93f06fcfb 100644
 2.14.1
 
 
-From 00de72f97e8f69f5d4c614bff956ec726f97fa2e Mon Sep 17 00:00:00 2001
+From 51f6cec0b87840c32482df5d2b09f50d503d2b2b Mon Sep 17 00:00:00 2001
 From: Hendrik Leppkes <h.leppkes@gmail.com>
 Date: Sat, 9 Jan 2016 16:34:40 +0100
 Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs
@@ -116,7 +116,7 @@ index 86df5eb9b3..22c4f1d82a 100644
  
  #endif /* AVCODEC_H264_H */
 diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index 053325c26b..855c74896e 100644
+index dd0a965af0..855c74896e 100644
 --- a/libavcodec/h264_parser.c
 +++ b/libavcodec/h264_parser.c
 @@ -62,6 +62,7 @@ typedef struct H264ParseContext {
@@ -139,7 +139,7 @@ index 053325c26b..855c74896e 100644
                      goto found;
                  }
              } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
--                       nalu_type == H264_NAL_IDR_SLICE)) {
+-                       nalu_type == H264_NAL_IDR_SLICE) {
 +                       nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
                  state += 8;
 +
@@ -195,7 +195,7 @@ index 053325c26b..855c74896e 100644
 2.14.1
 
 
-From bbf5daa149ccc2c462be1bd5f6f710eba0e82094 Mon Sep 17 00:00:00 2001
+From 6edab559331e83ad11e7940233dbbaae121e528c Mon Sep 17 00:00:00 2001
 From: Hendrik Leppkes <h.leppkes@gmail.com>
 Date: Tue, 28 Nov 2017 16:12:12 +0000
 Subject: [PATCH 3/4] h264_parser: force grabing a new timestamp until a frame
@@ -223,10 +223,10 @@ index 855c74896e..90a99a19a8 100644
 2.14.1
 
 
-From 3a0ebb0f7473a9a5ab93e01f7261862a3d324e50 Mon Sep 17 00:00:00 2001
+From 2263d8d3a16ccf886c3692597331779a726373b5 Mon Sep 17 00:00:00 2001
 From: popcornmix <popcornmix@gmail.com>
-Date: Tue, 28 Nov 2017 18:32:08 +0000
-Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC
+Date: Sun, 21 Jan 2018 20:31:31 +0000
+Subject: [PATCH 4/4] extract_extradata_bsf: Support H264_MVC
 
 ---
  libavcodec/extract_extradata_bsf.c | 8 +++++---