From 4bb4dcab1d5050ae6ef122afd76b57243f661a95 Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Wed, 24 May 2017 22:30:03 +0100 Subject: [PATCH] ffmpeg: update hevc commits --- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 11994 ++++++++++------ ...e6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch | 1 - 2 files changed, 7681 insertions(+), 4314 deletions(-) diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index b4c15b782a..96cfa9ae30 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -11,7 +11,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..7a86d7e 100644 +index 9ffd833..e2474e5 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -52,7 +52,7 @@ index 9ffd833..7a86d7e 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,169 @@ static int restore_tty; +@@ -158,6 +182,182 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -100,7 +100,7 @@ index 9ffd833..7a86d7e 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h) ++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +{ + MMAL_COMPONENT_T* display; + MMAL_DISPLAYREGION_T region = @@ -111,7 +111,7 @@ index 9ffd833..7a86d7e 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); + + bcm_host_init(); // TODO is this needed? + mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); @@ -121,7 +121,7 @@ index 9ffd833..7a86d7e 100644 + + { + MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = MMAL_ENCODING_I420; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? 
MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; + format->es->video.height = geo.height_y; + format->es->video.crop.x = 0; @@ -138,7 +138,7 @@ index 9ffd833..7a86d7e 100644 + mmal_port_enable(display->input[0],display_cb_input); + mmal_port_enable(display->control,display_cb_control); + -+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y); ++ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + return display; +} @@ -168,12 +168,24 @@ index 9ffd833..7a86d7e 100644 +#ifdef RPI_ZERO_COPY +{ + const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; ++ } + + buf->user_data = fr_buf; + buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->alloc_size = -+ buf->length = av_rpi_zc_numbytes(fr_buf); -+ ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++#if 0 ++ { ++ unsigned int n; ++ for (n = 0; n < fr->width; n += 128) { ++ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ } ++ } ++#endif + ++rpi_display_count; +} +#else @@ -208,6 +220,7 @@ index 9ffd833..7a86d7e 100644 + +static void display_exit(MMAL_COMPONENT_T* display) +{ ++// sleep(120); + if (display) { + mmal_component_destroy(display); + } @@ -222,7 +235,7 @@ index 9ffd833..7a86d7e 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. -@@ -540,6 +727,11 @@ static void ffmpeg_cleanup(int ret) +@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -234,7 +247,7 @@ index 9ffd833..7a86d7e 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +743,9 @@ static void ffmpeg_cleanup(int ret) +@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); @@ -244,7 +257,7 @@ index 9ffd833..7a86d7e 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +776,7 @@ static void ffmpeg_cleanup(int ret) +@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -252,7 +265,7 @@ index 9ffd833..7a86d7e 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1140,15 @@ static void do_video_out(AVFormatContext *s, +@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -260,7 +273,7 @@ index 9ffd833..7a86d7e 100644 + if (next_picture && ist != NULL) + { + if (!rpi_display) -+ rpi_display = display_init(0,0,next_picture->width,next_picture->height); ++ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); + display_frame(ist->dec_ctx, rpi_display, next_picture); + } +#endif @@ -268,7 +281,7 @@ index 9ffd833..7a86d7e 100644 if (filter->inputs[0]->frame_rate.num > 0 && filter->inputs[0]->frame_rate.den > 0) duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2754,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ 
-282,22 +295,23 @@ index 9ffd833..7a86d7e 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..40d22d2 100644 +index fd0d1f0..1740768 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,11 @@ NAME = avcodec +@@ -5,6 +5,12 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ ++ rpi_shader_cmd.h \ + rpi_mailbox.h \ + rpi_hevc_transform.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +48,10 @@ OBJS = allcodecs.o \ +@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ @@ -308,18 +322,22 @@ index fd0d1f0..40d22d2 100644 vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1087,11 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + ++QASM := $(SUBDIR)../pi-util/qasm.py ++ ++ifneq ("$(wildcard $(QASM))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(SUBDIR)../pi-util/qasm.py -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++endif + -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h ++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 54efaad..02a89c3 100644 --- a/libavcodec/allcodecs.c @@ -333,12 +351,14 @@ index 54efaad..02a89c3 100644 REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..1354c14 100644 +index a4ceca7..cafd25d 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -132,8 +132,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevc_misc_neon.o \ arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ @@ -1027,18 +1047,592 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S +new file mode 100644 +index 0000000..373576b +--- /dev/null ++++ b/libavcodec/arm/hevc_misc_neon.S +@@ -0,0 +1,62 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ vmov.i64 q8, #0 ++ adr r12, zc_tab ++ vmov.i64 q9, #0 ++ tst r0, #63 ++ vmov.i64 q10, #0 ++ add r0, #63 ++ vmov.i64 q11, #0 ++ and r0, #~63 ++ ldr pc, [r12, r1, lsl #2] ++ ++zc_tab: ++ .word zc_lc2 ++ .word zc_lc3 ++ .word zc_lc4 ++ .word zc_lc5 ++ ++@ 4*4*2: "32 bytes" 64 or 0 depending on dst address ++zc_lc2: ++ it eq ++ vstmeq r0, {q8-q11} ++ bx lr ++ ++@ 16*16*2 = 512 = 64 * 8 ++zc_lc4: ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++@ 8*8*2 = 128 ++zc_lc3: ++ vstm r0!, {q8-q11} ++ vstm r0, {q8-q11} ++ bx lr ++ ++@ 
32*32*2 = 2048 = 128 * 16
++zc_lc5:
++ vmov.i64 q12, #0
++ vmov.i64 q13, #0
++ vmov.i64 q14, #0
++ vmov.i64 q15, #0
++ mov r2, #4
++1:
++ vstm r0!, {q8-q15}
++ subs r2, #1
++ vstm r0!, {q8-q15}
++ vstm r0!, {q8-q15}
++ vstm r0!, {q8-q15}
++ bne 1b
++ bx lr
++
++endfunc
++
diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S
-index 166bddb..a088cc3 100644
+index 166bddb..9bd0a42 100644
--- a/libavcodec/arm/hevcdsp_deblock_neon.S
+++ b/libavcodec/arm/hevcdsp_deblock_neon.S
-@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+@@ -15,7 +15,7 @@
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+@@ -31,6 +31,9 @@
+ bxeq lr
+ .endm
+
++@ Uses: d2, d4, d18, d19
++@ Returns: d2, d4
++@ Modifies: d0-d7, d22-d25
+ .macro hevc_loop_filter_chroma_body
+ vsubl.u8 q3, d4, d2
+ vsubl.u8 q11, d18, d19
+@@ -49,6 +52,33 @@
+ vqmovun.s16 d4, q2
+ .endm
+
++
++@ Uses r2[0:7], r2[8:15]
++@ Modifies: d0-d7, d22-d25
++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1
++ vsubl.u8 q3, \Q0, \P0
++ vsubl.u8 q11, \P1, \Q1
++ vshl.i16 q3, #2
++ vadd.i16 q11, q3
++
++ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all)
++ vdup.16 d0, r2
++ vmovl.u8 q0, d0
++ vuzp.16 d0, d1
++
++ vrshr.s16 q11, q11, #3
++ vneg.s16 q12, q0
++ vmovl.u8 q2, \Q0
++ vmin.s16 q11, q11, q0
++ vmax.s16 q11, q11, q12
++ vaddw.u8 q1, q11, \P0
++ vsub.i16 q2, q11
++ vqmovun.s16 \P0, q1
++ vqmovun.s16 \Q0, q2
++.endm
++
++
++
+ .macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+@@ -60,15 +90,17 @@
+ lsr r3, #16
+ .endm
+
+-.macro hevc_loop_filter_luma_body
++@ Uses: r2, r3, r12
++@ Modifies: r5, r6, r7, r8, r9
++function hevc_loop_filter_luma_body
++ vmovl.u8 q15, d23
++ vmovl.u8 q14, d22
++ vmovl.u8 q13, d21
++ vmovl.u8 q12, d20
++ vmovl.u8 q11, d19
++ vmovl.u8 q10, d18
++ vmovl.u8 q9, d17
+ vmovl.u8 q8, d16
+- vmovl.u8 q9, d18
+- vmovl.u8 q10, d20
+- vmovl.u8 q11, d22
+- vmovl.u8 q12, d24
+- vmovl.u8 q13, d26
+- vmovl.u8 q14, d28
+- vmovl.u8 q15, d30
+
+ vadd.i16 q7, q9, q11
+ vadd.i16 q6, q14, q12
+@@ -77,7 +109,6 @@
+ vabd.s16 q7, q7, q10
+ vabd.s16 q6, q6, q13
+
+-
+ vdup.16 q0, r2
+ vmov q4, q7
+ vmov q5, q6
+@@ -152,7 +183,7 @@
+
+ and r9, r8, r7
+ cmp r9, #0
+- beq weakfilter_\@
++ beq weakfilter_
+
+ vadd.i16 q2, q11, q12
+ vadd.i16 q4, q9, q8
+@@ -210,11 +241,11 @@
+ vbit q13, q3, q5
+ vbit q14, q2, q5
+
+-weakfilter_\@:
++weakfilter_:
+ mvn r8, r8
+ and r9, r8, r7
+ cmp r9, #0
+- beq ready_\@
++ beq ready_
+
+ vdup.16 q4, r2
+
+@@ -275,75 +306,345 @@ weakfilter_\@:
+ vbit q11, q0, q5
+ vbit q12, q4, q5
+
+-ready_\@:
++ready_:
+ vqmovun.s16 d16, q8
+- vqmovun.s16 d18, q9
+- vqmovun.s16 d20, q10
+- vqmovun.s16 d22, q11
+- vqmovun.s16 d24, q12
+- vqmovun.s16 d26, q13
+- vqmovun.s16 d28, q14
+- vqmovun.s16 d30, q15
+-.endm
++ vqmovun.s16 d17, q9
++ vqmovun.s16 d18, q10
++ vqmovun.s16 d19, q11
++ vqmovun.s16 d20, q12
++ vqmovun.s16 d21, q13
++ vqmovun.s16 d22, q14
++ vqmovun.s16 d23, q15
++ mov pc, lr
++endfunc
++
++@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
++function ff_hevc_v_loop_filter_luma2_neon_8, export=1
++ hevc_loop_filter_luma_start
++ push {r4-r10,lr} @ 8 regs = 32 bytes
++
++ ldr r4, [sp, #40]
++ b v_loop_luma_common
++endfunc
++
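The entry point above passes its no_p/no_q arguments (sp[0]/sp[4]) through to the shared v_loop_luma_common code below, which folds the four flag bytes into one register (b0:no_p[0], b8:no_p[1], b16:no_q[0], b24:no_q[1]) so each partial store block can be skipped with a single tst. A minimal C sketch of that packing, for reference only — pack_no_pq is an illustrative name, not part of the patch:

    #include <stdint.h>

    /* Mirrors the two ldrh loads + orr at v_loop_luma_common.  Note the
     * flag bytes arrive as 2/0 rather than 1/0, hence the byte-wide tst
     * masks (#0xff, #0xff00, #0xff0000, #0xff000000) before each store. */
    static inline uint32_t pack_no_pq(const uint8_t no_p[2], const uint8_t no_q[2])
    {
        return (uint32_t)no_p[0]
             | (uint32_t)no_p[1] << 8
             | (uint32_t)no_q[0] << 16
             | (uint32_t)no_q[1] << 24;
    }

A non-zero byte means "do not write that quadrant back", so the store paths test the mask and step the pointer past the skipped rows instead of writing them.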
+ + function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ ++ sub r4, r0, #4 ++v_loop_luma_common: ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] ++ + vpush {d8-d15} +- sub r0, #4 +- vld1.8 {d16}, [r0], r1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- hevc_loop_filter_luma_body +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- vst1.8 {d16}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0], r1 +- vst1.8 {d30}, [r0] ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 ++ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 ++ ++ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ ++ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ ++ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ ++ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ ++ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ ++ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] ++ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ ++ bl hevc_loop_filter_luma_body ++ ++ neg r1, r1 ++ ++ @ no_p[1] ++ tst r10, #0xff00 ++ itt ne ++ addne r4, r4, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ ++1: ++ @ no_q[1] ++ tst r10, #0xff000000 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++2: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 3f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ ++3: ++ @ no_q[0] ++ tst r10, #0xff0000 ++ bne 4f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ ++4: ++bypasswrite: + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ pop {r4-r10,pc} + endfunc + ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); 
sp[4]
++@
++@ Src should always be on 8 byte boundary & all in the same slice
++
+ function ff_hevc_h_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+- push {r5-r11}
++ push {r4-r10,lr}
++
+ vpush {d8-d15}
+ sub r0, r0, r1, lsl #2
++
+ vld1.8 {d16}, [r0], r1
++ vld1.8 {d17}, [r0], r1
+ vld1.8 {d18}, [r0], r1
++ vld1.8 {d19}, [r0], r1
+ vld1.8 {d20}, [r0], r1
++ vld1.8 {d21}, [r0], r1
+ vld1.8 {d22}, [r0], r1
+- vld1.8 {d24}, [r0], r1
+- vld1.8 {d26}, [r0], r1
+- vld1.8 {d28}, [r0], r1
+- vld1.8 {d30}, [r0], r1
+- sub r0, r0, r1, lsl #3
+- add r0, r1
+- hevc_loop_filter_luma_body
+- vst1.8 {d18}, [r0], r1
+- vst1.8 {d20}, [r0], r1
+- vst1.8 {d22}, [r0], r1
+- vst1.8 {d24}, [r0], r1
+- vst1.8 {d26}, [r0], r1
+- vst1.8 {d28}, [r0]
+-bypasswrite:
++ vld1.8 {d23}, [r0]
++
++ bl hevc_loop_filter_luma_body
++
+ vpop {d8-d15}
+- pop {r5-r11}
+- bx lr
++
++ neg r1, r1
++ add r0, r0, r1
++
++ @ Why this isn't a bitmask to start with I have no idea...
++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
++ ldr r5, [sp, #32]
++ ldrh r10, [r5]
++ ldr r5, [sp, #36]
++ ldrh r5, [r5]
++ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
++ bne 1f
++
++ vst1.8 {d22}, [r0], r1
++ vst1.8 {d21}, [r0], r1
++ vst1.8 {d20}, [r0], r1
++ vst1.8 {d19}, [r0], r1
++ vst1.8 {d18}, [r0], r1
++ vst1.8 {d17}, [r0]
++
++ pop {r4-r10,pc}
++
++@ Partial write
++1:
++ vmov r2, r3, d22
++ vmov r4, r5, d21
++ vmov r6, r7, d20
++
++ tst r10, #0xff0000
++ ittt eq
++ streq r2, [r0]
++ streq r4, [r0, r1]
++ streq r6, [r0, r1, lsl # 1]
++
++ add r0, r0, #4
++ tst r10, #0xff000000
++ ittt eq
++ streq r3, [r0]
++ streq r5, [r0, r1]
++ streq r7, [r0, r1, lsl # 1]
++
++ vmov r2, r3, d19
++ vmov r4, r5, d18
++ vmov r6, r7, d17
++ add r0, r0, r1
++ add r0, r0, r1, lsl # 1
++
++ tst r10, #0xff00
++ ittt eq
++ streq r3, [r0]
++ streq r5, [r0, r1]
++ streq r7, [r0, r1, lsl # 1]
++
++ tst r10, #0xff
++ ittt eq
++ streq r2, [r0, #-4]!
++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ pop {r4-r10,pc} ++ + endfunc + ++@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_h_loop_filter_uv_neon_8, export=1 ++ sub r0, r0, r1, lsl #1 ++ vld2.8 {d16,d17}, [r0], r1 ++ vld2.8 {d18,d19}, [r0], r1 ++ vld2.8 {d26,d27}, [r0], r1 ++ vld2.8 {d28,d29}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ cmp r3, #0 ++ bne 1f ++ vst2.8 {d18,d19}, [r0], r1 ++ vst2.8 {d26,d27}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.8 d18, d19 ++ vzip.8 d26, d27 ++ sub r1, r1, #8 ++ ++ tst r3, #1 ++ bne 1f ++ vst1.8 {d18}, [r0] ++1: add r0, r0, #8 ++ tst r3, #2 ++ bne 2f ++ vst1.8 {d19}, [r0] ++2: add r0, r0, r1 ++ ++ tst r3, #4 ++ bne 1f ++ vst1.8 {d26}, [r0] ++1: add r0, r0, #8 ++ tst r3, #8 ++ it ne ++ bxne lr ++ vst1.8 {d27}, [r0] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_v_loop_filter_uv2_neon_8, export=1 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 ++ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] ++ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] ++ ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ ++ neg r1, r1 ++ ++ ldr r2, [sp, #0] ++ ++ @ p[1] ++ tst r2, #2 ++ itt ne ++ addne r3, r3, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 ++ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ ++1: ++ @ q[1] ++ tst r2, #8 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 ++ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++2: ++ @ p[0] ++ tst r2, #1 ++ bne 3f ++ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] ++ ++3: ++ @ q[0] ++ tst r2, #4 ++ it ne ++ bxne lr ++ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vst4.8 {d26[0], d27[0], d28[0], 
d29[0]}, [r0] ++ ++ bx lr ++endfunc ++ ++ + function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 +@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 vst1.8 {d4}, [r0] bx lr endfunc + -+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ * MvField *curr, MvField *neigh, uint8_t *bs) ++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i ++ * int *curr_rpl0, int *curr_ ++ * MvField *curr, MvField *ne + */ +function ff_hevc_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 @@ -1159,6 +1753,7 @@ index 166bddb..a088cc3 100644 +90: mov a3, #1 + b 11b +endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 index 0000000..00eab9e @@ -1503,10 +2098,10 @@ index 0000000..00eab9e + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..49c70dd 100644 +index 5591807..b6c48ee 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,6 +22,8 @@ +@@ -22,11 +22,26 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -1515,7 +2110,25 @@ index 5591807..49c70dd 100644 void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -@@ -43,6 +45,21 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++#ifdef RPI ++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++#endif ++ + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); +@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); @@ -1533,11 +2146,21 @@ index 5591807..49c70dd 100644 +void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++ ++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, ++ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++ 
++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -1553,7 +2176,7 @@ index 5591807..49c70dd 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } @@ -1599,6 +2222,50 @@ index 5591807..49c70dd 100644 + } +} + ++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ // Width 32 already dealt with ++ // width 16 code works in double lines ++ if (width == 16 && (height & 1) == 0) { ++ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, ++ sao_offset_val_u, sao_left_class_u, ++ sao_offset_val_v, sao_left_class_v, ++ width, height); ++ } ++ else ++ { ++ const int shift = 3; // BIT_DEPTH - 5 ++ int k, y, x; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int8_t offset_table_u[32] = { 0 }; ++ int8_t offset_table_v[32] = { 0 }; ++ ++ stride_src /= sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ for (k = 0; k < 4; k++) ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ ++ } ++ } ++} ++ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *_sao_offset_val, int eo, int width, int height) @@ -1677,6 +2344,54 @@ index 5591807..49c70dd 100644 + } + } +} ++ ++ ++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ ++ if (width == 32 && (height & 7) == 0) { ++ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); ++ } ++ else ++ { ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ int8_t sao_offset_val_u[8]; // padding of 3 for vld ++ int8_t sao_offset_val_v[8]; // padding of 3 for vld ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ++ for (x = 0; x < 5; x++) { ++ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; ++ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; ++ } ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++ } ++} +#undef CMP + +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, @@ -1686,18 +2401,36 @@ index 5591807..49c70dd 100644 av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { if (bit_depth == 8) { -@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + int x; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8; ++#endif + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { + c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; ++ c->sao_band_filter_c[x] = 
ff_hevc_sao_band_c_neon_wrapper; + c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; ++ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; + } ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -1719,7 +2452,7 @@ index 5591807..49c70dd 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } @@ -1731,10 +2464,10 @@ index 5591807..49c70dd 100644 } diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..9c7808d +index 0000000..08a021d --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,510 @@ +@@ -0,0 +1,862 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -1860,24 +2593,186 @@ index 0000000..9c7808d + +function ff_hevc_sao_band_w64_neon_8, export=1 + init_sao_band -+1: subs r12, #1 -+ pld [r1, r3] -+ vld1.8 {q8-q9}, [r1, :128]! -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sub r1, #32 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128]! 
-+ vst1.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 -+ bne 1b + -+ bx lr ++ push {r4, lr} ++ subs r12, #1 ++ mov r4, r1 ++ it ne ++ addne r4, r3 ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ pld [r4] ++ vshr.u8 q12, q8, #3 ++ vshr.u8 q13, q9, #3 ++ add r1, r3 ++ vshr.u8 q14, q10, #3 ++ vshr.u8 q15, q11, #3 ++ sao_band_64 ++ it ne ++ addne r4, r3 ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} +endfunc + ++ ++@ ff_hevc_sao_band_c_w64_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++@ As this is often done in-place on the frame buffer it is worth preloading ++@ the pixel values but we want to beware of loading ouside our buffer to avoid ++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++ ++function ff_hevc_sao_band_c_neon_8, export=1 ++ mov r12, sp ++ push {r4-r8, lr} // 24 bytes ++ ++ ldm r12, {r4-r7} ++ ++ add r4, #2 ++ add r6, #2 ++ vld1.16 {d16}, [r4] @ Unaligned ++ lsl r5, r5, #3 ++ vld1.16 {d18}, [r6] ++ pld [r1] ++ vmov.i8 d17, #0 ++ mov r4, r1 ++ vmov.i8 d19, #0 ++ lsl r7, r7, #3 ++ vdup.8 q1, r5 ++ ldr r5, [r12, #16] @ width ++ vdup.8 q2, r7 ++ ldr r12, [r12, #20] ++ vqmovn.s16 d0, q8 ++ cmp r5, #16 @ At some point we may want a table lookup ++ vqmovn.s16 d1, q9 ++ vmov.i8 q3, #128 ++ beq 16f ++ ++ @ d0 U lookup ++ @ d1 V lookup ++ @ q1 U raw offset ++ @ q2 V raw offset ++ @ q3 #128 ++ ++ @ r4 = r1 = src - Inteded for preload pointer ++ @ r12 = height ++ ++ @ Might (unlikely) be called with height == 1 ++ subs r12, #1 ++ it ne ++ addne r4, r3 ++ ++1: ++ subs r12, #1 ++ vld2.8 {q8-q9}, [r1, :128]! ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ sub r1, #32 ++ vsub.u8 q15, q11, q2 ++ pld [r4] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r3 @ Do not inc on final pass ++ vst2.8 {q8-q9}, [r0, :128]! 
++ vst2.8 {q10-q11}, [r0, :128], r2 ++ sub r0, #32 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++@ -- width 16 (UV pairs) -- ++16: ++ subs r12, #2 ++ it ne ++ addne r4, r4, r3, lsl #1 ++ ++1: ++ subs r12, #2 ++ vld2.8 {q8-q9}, [r1, :128], r3 ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ pld [r4] ++ vsub.u8 q15, q11, q2 ++ pld [r4, r3] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r4, r3, lsl #1 ++ vst2.8 {q8-q9}, [r0, :128], r2 ++ vst2.8 {q10-q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 + vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 + vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 @@ -1887,71 +2782,120 @@ index 0000000..9c7808d + vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +.endm + -+.macro table64 -+ vmov.s8 q13, #2 // 2 to all elements -+ vmov.32 d24[0], r4 // load offset table from general registers -+ vmov.32 d24[1], r5 // load rest of offset table -+ -+ vadd.s8 q0, q13 -+ vadd.s8 q1, q13 -+ vadd.s8 q2, q13 -+ vadd.s8 q3, q13 -+ -+ vmov.u8 q15, #128 // s8 #-128 -+ vtbl.8 d0, {d24}, d0 -+ vadd.s8 q13, q4, q15 -+ vtbl.8 d1, {d24}, d1 -+ vadd.s8 q14, q5, q15 -+ vtbl.8 d2, {d24}, d2 -+ vqadd.s8 q0, q13 -+ vtbl.8 d3, {d24}, d3 -+ vqadd.s8 q1, q14 -+ vtbl.8 d4, {d24}, d4 -+ vadd.s8 q13, q6, q15 -+ vtbl.8 d5, {d24}, d5 -+ vadd.s8 q14, q7, q15 -+ vtbl.8 d6, {d24}, d6 -+ vqadd.s8 q2, q13 -+ vtbl.8 d7, {d24}, d7 -+ vqadd.s8 q3, q14 -+ vsub.s8 q0, q15 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ vst1.8 {q0-q1}, [r0, :128]! 
-+ vst1.8 {q2-q3}, [r0, :128], r2 -+ sub r0, #32 -+.endm + +// input +// a in q0 - q3 +// c in q4 - q7 +// b in q8 - q11 -+// offset table in r7 and r5 ++// offset table r4,r5 and r6,r7 ++// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +// output in q0 - q3 +// clobbers q12 - q15 -+.macro edge_w64_body -+ diff32 q12, q13, q0, q1, q0, q1, q4, q5 -+ diff32 q0, q1, q14, q15, q8, q9, q4, q5 + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 ++@ a <- c <- b ++@ ++@ It appears that Neon can stall if you try and use results too soon so we try to ++@ spread our instruction out + -+ diff32 q14, q15, q2, q3, q2, q3, q6, q7 -+ diff32 q2, q3, q12, q13, q10, q11, q6, q7 ++.macro edgeidx64 ++ ++ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q1, q5 ++ vcgt.u8 q2, q2, q6 ++ vcgt.u8 q3, q3, q7 ++ ++ vsub.s8 q0, q0, q12 // a = sign(c-a) ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q0, q12 ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q2, q14 ++ vadd.s8 q3, q3, q15 ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ @ whilst vmov dn, rm, rn exists it is a vfp instruction ++ @ and causes a stall till neon pipe empty - so don't do that! ++ vmov d26[0], r4 ++ vmov d26[1], r5 ++ vmov d27[0], r6 ++ vmov d27[1], r7 ++ vadd.s8 q2, q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q0, q12 ++ vqadd.s8 q1, q1, q14 ++ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q0, q15 ++ vqadd.s8 q2, q2, q12 ++ vqadd.s8 q3, q3, q14 ++ vsub.s8 q1, q1, q15 ++ vsub.s8 q2, q2, q15 ++ vsub.s8 q3, q3, q15 + -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ table64 +.endm + ++function edge_w64_body ++ edgeidx64 ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bx lr ++endfunc ++ +.macro init_edge_64 -+ push {r4-r5} -+ ldr r12, [sp, #8] // height -+ ldr r5, [sp, #12] // sao_offset_val_table -+ ldr r4, [r5] -+ add r5, #4 -+ ldr r5, [r5] ++ push {r4-r8,lr} ++ ldr r12, [sp, #24] // height ++ ldr r5, [sp, #28] // sao_offset_val_table ++ ldrd r4, r5, [r5] ++ mov r6, r4 ++ mov r7, r5 +.endm + +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 @@ -1974,11 +2918,10 @@ index 0000000..9c7808d + vext.8 q9, q5, q6, #1 + vext.8 q10, q6, q7, #1 + vext.8 q11, q7, q12, #1 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 @@ -1998,7 +2941,7 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1, :128]! 
+ vld1.8 {q10-q11}, [r1, :128], r3 + sub r1, #32 -+ edge_w64_body ++ bl edge_w64_body + // copy c to a + vmov.64 q0, q4 + vmov.64 q1, q5 @@ -2011,8 +2954,7 @@ index 0000000..9c7808d + vmov.64 q7, q11 + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 @@ -2036,11 +2978,10 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #33 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 @@ -2064,13 +3005,157 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #31 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + ++ ++@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( ++@ uint8_t *_dst, r0 ++@ uint8_t *_src, r1 ++@ ptrdiff_t stride_dst, r2 ++@ ptrdiff_t stride_src, r3 ++@ int height, sp[0] ++@ int16_t *sao_offset_table_u, sp[4] ++@ int16_t *sao_offset_table_v); sp[8] ++@ int eo sp[12] ++ ++function ff_hevc_sao_edge_c_w64_neon_8, export=1 ++ push {r4-r8,lr} // 6 reg = 24 ++ ldr r5, [sp, #28] // sao_offset_val_table_u ++ ldr r7, [sp, #32] // sao_offset_val_table_v ++ ++ @ Load and rearrange offsets ++ @ Also "convert" from 16bit to 8bit ++ ldrb r4, [r5, #2] ++ ldrb r8, [r5, #4] ++ ldrb r6, [r7, #2] ++ ldrb r12, [r7, #4] ++ orr r4, r4, r8, lsl #8 ++ orr r6, r6, r12, lsl #8 ++ ldrb r8, [r5, #6] ++ ldrb r12, [r7, #6] ++ orr r4, r4, r8, lsl #24 ++ orr r6, r6, r12, lsl #24 ++ ldrb r5, [r5, #8] ++ ldrb r7, [r7, #8] ++ ++ ldr r12, [sp, #36] // e0 ++ adr r8, edge_c_tbl_w64 ++ ldr r8, [r8, r12, lsl #2] ++ ++ ldr r12, [sp, #24] // height ++ vpush {d8-d15} ++ mov pc, r8 ++ ++edge_c_tbl_w64: ++ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 ++ ++ff_hevc_sao_edge_c_eo0_w64_neon_8: ++ sub r1, #8 ++1: subs r12, #1 ++ vld1.64 {d7}, [r1, :64]! ++ vld1.64 {q4-q5}, [r1, :128]! // load c ++ vld1.64 {q6-q7}, [r1, :128]! ++ vld1.64 {d24}, [r1, :64], r3 ++ sub r1, #72 ++ // load a ++ vext.8 q0, q3, q4, #14 ++ vext.8 q1, q4, q5, #14 ++ vext.8 q2, q5, q6, #14 ++ vext.8 q3, q6, q7, #14 ++ // load b ++ vext.8 q8, q4, q5, #2 ++ vext.8 q9, q5, q6, #2 ++ vext.8 q10, q6, q7, #2 ++ vext.8 q11, q7, q12, #2 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo1_w64_neon_8: ++ sub r1, r3 ++ // load a ++ vldm r1, {q0-q3} ++ add r1, r3 ++ // load c ++ vldm r1, {q4-q7} ++ add r1, r3 ++1: subs r12, #1 ++ // load b ++ vldm r1, {q8-q11} ++ add r1, r3 ++ bl edge_w64_body ++ // copy c to a ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo2_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ sub r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #30 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ add r1, #2 ++ vld1.8 {q8-q9}, [r1]! 
++ vld1.8 {q10-q11}, [r1] ++ sub r1, #34 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo3_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ add r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #34 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ sub r1, #2 ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #30 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++endfunc ++ ++ +.macro init_edge_32 + ldr r12, [sp, #4] // sao_offset_val_table + vld1.32 {d31}, [r12] @@ -2187,7 +3272,7 @@ index 0000000..9c7808d + vext.8 q7, q11, q12, #8 + vext.8 q5, q10, q11, #7 + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 + table32 @@ -2227,7 +3312,7 @@ index 0000000..9c7808d + vext.8 q14, q12, q10, #7 + + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 @@ -2439,26 +3524,21 @@ index ce4bab2..b9b0c78 100644 + .split = h264_split, +}; diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..88dd40b 100644 +index b478065..955e426 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c -@@ -41,8 +41,186 @@ +@@ -41,8 +41,196 @@ #include "hevc.h" #include "profiles.h" +#ifdef RPI + #include "rpi_qpu.h" -+ #include "rpi_user_vcsm.h" -+ // Move Inter prediction into separate pass -+ #define RPI_INTER -+ -+ #ifdef RPI_INTER_QPU -+ // Define RPI_MULTI_MAILBOX to use the updated mailbox that can launch both QPU and VPU -+ #define RPI_MULTI_MAILBOX -+ #endif ++ #include "rpi_shader.h" ++ #include "rpi_shader_cmd.h" ++ #include "rpi_zc.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory -+ // RPI_CACHE_UNIF_MVS doesn't seem to make much difference, so left undefined. 
++ #define RPI_CACHE_UNIF_MVS 1 + + // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) + //#define RPI_SIMULATE_QPUS @@ -2466,19 +3546,24 @@ index b478065..88dd40b 100644 + #include "pthread.h" + #endif + -+ static void rpi_execute_dblk_cmds(HEVCContext *s); -+ static void rpi_execute_transform(HEVCContext *s); -+ static void rpi_launch_vpu_qpu(HEVCContext *s); -+ static void rpi_execute_pred_cmds(HEVCContext *s); -+ static void rpi_execute_inter_cmds(HEVCContext *s); -+ static void rpi_begin(HEVCContext *s); -+ static void flush_frame(HEVCContext *s,AVFrame *frame); -+ static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job); ++ static void worker_core(HEVCContext * const s); + ++ // We can pred any block height but annoyingly if we we do then the TMU cache ++ // explodes and it goes even slower :-( ++ #if 0 ++ #define Y_P_MAX_H 16 ++ #define Y_B_MAX_H 16 ++ #else ++ #define Y_P_MAX_H 64 ++ #define Y_B_MAX_H 64 ++ #endif +#endif + +// #define DISABLE_MC + ++#define DISABLE_CHROMA 0 ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + +#ifndef av_mod_uintp2 @@ -2488,46 +3573,66 @@ index b478065..88dd40b 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif ++ ++#define Y_B_ONLY 0 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; + -+#ifdef RPI_INTER_QPU ++#if RPI_INTER ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) + +// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks +// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks +// For each block of 64*64 the smallest block size is 8x4 +// We also need an extra command for the setup information + -+#define RPI_CHROMA_COMMAND_WORDS 12 -+#define UV_COMMANDS_PER_QPU ((1 + 3*RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) ++#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + -+#define RPI_LUMA_COMMAND_WORDS 10 -+#define Y_COMMANDS_PER_QPU ((1+2*RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) -+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + +// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values -+static const uint32_t rpi_filter_coefs[8][1] = { -+ { ENCODE_COEFFS( 0, 64, 0, 0) }, -+ { ENCODE_COEFFS( 2, 58, 10, 2) }, -+ { ENCODE_COEFFS( 4, 54, 16, 2) }, -+ { ENCODE_COEFFS( 6, 46, 28, 4) }, -+ { ENCODE_COEFFS( 4, 36, 36, 4) }, -+ { ENCODE_COEFFS( 4, 28, 46, 6) }, -+ { ENCODE_COEFFS( 2, 16, 54, 4) }, -+ { ENCODE_COEFFS( 2, 10, 58, 2) } ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) +}; + ++#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) ++ +#endif + + +#ifdef RPI_WORKER + ++typedef struct worker_global_env_s ++{ ++ volatile int arm_load; ++ pthread_mutex_t lock; ++ ++ unsigned int arm_y; ++ unsigned int arm_c; ++ unsigned int gpu_y; ++ unsigned int gpu_c; ++} worker_global_env_t; ++ ++static worker_global_env_t 
worker_global_env = ++{ ++ .lock = PTHREAD_MUTEX_INITIALIZER ++}; ++ ++ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); + @@ -2606,17 +3711,7 @@ index b478065..88dd40b 100644 + break; + } + LOG_ENTER -+ // printf("%d %d %d : %d %d %d %d\n",s->poc, x_ctb, y_ctb, s->num_pred_cmds,s->num_mv_cmds,s->num_coeffs[2] >> 8,s->num_coeffs[3] >> 10); -+ rpi_launch_vpu_qpu(s); -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); ++ worker_core(s); + + worker_complete_job(s); + LOG_EXIT @@ -2629,7 +3724,7 @@ index b478065..88dd40b 100644 /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). -@@ -55,6 +233,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { @@ -2662,36 +3757,40 @@ index b478065..88dd40b 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -91,6 +295,87 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; +#ifdef RPI -+ int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); -+ int coefs_per_luma = 64*64*24*RPI_NUM_CHUNKS; -+ int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; -+ int coefs_per_row = coefs_per_luma + coefs_per_chroma; ++ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); ++ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; ++ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; + int job; + + av_assert0(sps); -+ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->ctu_per_y_chan = s->max_ctu_count / 12; -+ s->ctu_per_uv_chan = s->max_ctu_count / 8; ++// s->max_ctu_count = sps->ctb_width; ++// printf("CTB with=%d\n", sps->ctb_width); ++// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); ++ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; ++ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ + for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } ++ for(job=0;jobcoeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; ++ if (!s->coeffs_buf_arm[job][0]) ++ goto fail; ++ ++ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data ++ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; ++ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; ++ if (!s->coeffs_buf_arm[job][2]) ++ goto fail; ++ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. ++ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; ++ } + } +#endif +#ifdef RPI_DEBLOCK_VPU @@ -2738,8 +3837,6 @@ index b478065..88dd40b 100644 + + dvq->uv_setup_arm = (void*)p_arm; + dvq->uv_setup_vc = (void*)p_vc; -+ -+ dvq->cmd_id = -1; + } + + s->dvq_n = 0; @@ -2750,7 +3847,7 @@ index b478065..88dd40b 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -137,6 +422,29 @@ fail: +@@ -137,6 +434,29 @@ fail: return AVERROR(ENOMEM); } @@ -2780,7 +3877,52 @@ index b478065..88dd40b 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -674,6 +982,11 @@ static int hls_slice_header(HEVCContext *s) +@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, + static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) + { + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; + int ret, i; + + pic_arrays_free(s); +@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND128; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ret = ff_thread_get_format(s->avctx, pix_fmts); + if (ret < 0) + goto fail; ++ + s->avctx->pix_fmt = ret; + } + else { +@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> sps->hshift[c_idx]; + int h = sps->height >> sps->vshift[c_idx]; ++ // ******** Very very nasty allocation kludge for plaited Chroma + s->sao_pixel_buffer_h[c_idx] = +- av_malloc((w * 2 * sps->ctb_height) << ++ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << + sps->pixel_shift); + s->sao_pixel_buffer_v[c_idx] = +- av_malloc((h * 2 * sps->ctb_width) << ++ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << + sps->pixel_shift); + } + } +@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { pred_weight_table(s, gb); } @@ -2792,33 +3934,42 @@ index b478065..88dd40b 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand 
< 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1244,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ ++ // U & V done on U call in the case of sliced frames ++ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ return; ++ + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; + HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->x = x0; -+ cmd->y = y0; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; -+ cmd->mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ } else { ++ cmd->c_idx = c_idx; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ } ++ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); ++ } ++ else { + s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); + } ++ +} +#endif + static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1275,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -2831,7 +3982,7 @@ index b478065..88dd40b 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1365,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -2843,7 +3994,7 @@ index b478065..88dd40b 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1398,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -2855,7 +4006,7 @@ index b478065..88dd40b 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1431,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -2867,7 +4018,7 @@ index b478065..88dd40b 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1445,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -2879,7 +4030,7 @@ index b478065..88dd40b 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1461,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -2926,17 +4077,162 @@ index b478065..88dd40b 100644 } } } -@@ -1332,6 +1703,93 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -1275,47 +1675,120 @@ do { + return 0; + } + +-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ ++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) + { +- HEVCLocalContext *lc = s->HEVClc; + GetBitContext gb; +- int cb_size = 1 << log2_cb_size; +- int stride0 = s->frame->linesize[0]; +- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +- int stride1 = s->frame->linesize[1]; +- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- int stride2 = s->frame->linesize[2]; +- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +- +- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + +- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + +- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * +- s->ps.sps->pcm.bit_depth_chroma; +- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); + int ret; + +- if (!s->sh.disable_deblocking_filter_flag) +- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +- + ret = init_get_bits(&gb, pcm, length); + if (ret < 0) + return ret; + +- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +- if (s->ps.sps->chroma_format_idc) { +- s->hevcdsp.put_pcm(dst1, stride1, ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++ s->frame->linesize[0], ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> 
s->ps.sps->vshift[1]), ++ s->frame->linesize[1], + cb_size >> s->ps.sps->hshift[1], + cb_size >> s->ps.sps->vshift[1], + &gb, s->ps.sps->pcm.bit_depth_chroma); +- s->hevcdsp.put_pcm(dst2, stride2, +- cb_size >> s->ps.sps->hshift[2], +- cb_size >> s->ps.sps->vshift[2], +- &gb, s->ps.sps->pcm.bit_depth_chroma); + } ++ else ++#endif ++ { ++ const int stride0 = s->frame->linesize[0]; ++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++ const int stride1 = s->frame->linesize[1]; ++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ const int stride2 = s->frame->linesize[2]; ++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; ++ ++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ if (s->ps.sps->chroma_format_idc) { ++ s->hevcdsp.put_pcm(dst1, stride1, ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ s->hevcdsp.put_pcm(dst2, stride2, ++ cb_size >> s->ps.sps->hshift[2], ++ cb_size >> s->ps.sps->vshift[2], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } + ++ } + return 0; + } + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) ++{ ++ int16_t * const coeffs = (buf_no != 3) ? ++ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : ++ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; ++ s->num_coeffs[s->pass0_job][buf_no] += n; ++ return coeffs; ++} ++#endif ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); ++ ++ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ // Copy coeffs ++ const int blen = (length + 7) >> 3; ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. 
++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++#endif ++ ++ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); ++} ++ + /** + * 8.5.3.2.2.1 Luma sample unidirectional interpolation process + * +@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) * @param luma_offset additive offset applied to the luma prediction value */ -+#ifdef RPI_INTER -+#define RPI_REDIRECT(fn) (s->enable_rpi ? rpi_ ## fn : fn) ++#if RPI_INTER +static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref, const Mv *mv, int x_off, int y_off, + int block_w, int block_h, int luma_weight, int luma_offset) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; + cmd->cmd = RPI_CMD_LUMA_UNI; + cmd->dst = dst; + cmd->dststride = dststride; @@ -2953,9 +4249,10 @@ index b478065..88dd40b 100644 + +static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, + AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) ++ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, ++ const struct MvField * const current_mv) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; + cmd->cmd = RPI_CMD_LUMA_BI; + cmd->dst = dst; + cmd->dststride = dststride; @@ -2973,17 +4270,17 @@ index b478065..88dd40b 100644 + cmd->ref_idx[1] = current_mv->ref_idx[1]; +} + -+static void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, int reflist, -+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int chroma_weight, int chroma_offset) ++static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, ++ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, ++ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) +{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; + cmd->cmd = RPI_CMD_CHROMA_UNI; + cmd->dst = dst0; + cmd->dststride = dststride; + cmd->src = src0; + cmd->srcstride = srcstride; -+ cmd->mv = current_mv->mv[reflist]; ++ cmd->mv = *mv; + cmd->x_off = x_off; + cmd->y_off = y_off; + cmd->block_w = block_w; @@ -2992,10 +4289,10 @@ index b478065..88dd40b 100644 + cmd->offset = chroma_offset; +} + -+static void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, struct MvField *current_mv, int cidx) ++static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, ++ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) +{ -+ HEVCMvCmd *cmd = 
s->unif_mv_cmds[s->pass0_job] + s->num_mv_cmds[s->pass0_job]++; ++ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; + cmd->cmd = RPI_CMD_CHROMA_BI+cidx; + cmd->dst = dst0; + cmd->dststride = dststride; @@ -3013,14 +4310,12 @@ index b478065..88dd40b 100644 + cmd->ref_idx[1] = current_mv->ref_idx[1]; +} + -+#else -+#define RPI_REDIRECT(fn) fn +#endif + static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, AVFrame *ref, const Mv *mv, int x_off, int y_off, int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1805,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -3031,7 +4326,7 @@ index b478065..88dd40b 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1855,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -3040,7 +4335,7 @@ index b478065..88dd40b 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1417,6 +1879,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -3051,7 +4346,7 @@ index b478065..88dd40b 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +1968,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -3062,7 +4357,7 @@ index b478065..88dd40b 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2036,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -3073,13 +4368,422 @@ index b478065..88dd40b 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2167,14 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } -static void hls_prediction_unit(HEVCContext *s, int x0, int y0, - int nPbW, int nPbH, - int log2_cb_size, int partIdx, int idx) ++ ++#if RPI_INTER ++ ++static HEVCRpiLumaPred * ++rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) ++{ ++ 
HEVCRpiLumaPred * yp = s->curr_pred_y; ++ HEVCRpiLumaPred * ypt = yp + 1; ++ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { ++ if (ypt->load < yp->load) ++ yp = ypt; ++ } ++ ++// yp->load += load_val; ++ ++yp->load; ++ return yp; ++} ++ ++static void ++rpi_pred_y(HEVCContext *const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const Mv *const mv, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ ++// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++// mv, x0, y0, nPbW, nPbH, ++// weight_mul, weight_offset); ++ ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); ++ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ ++ // Potentially we could change the assembly code to support taller sizes in one go ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) ++ { ++ const uint32_t src_yx_y = y1_m3 + start_y; ++ int start_x = 0; ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. ++ ++ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; ++ ++ last_y8_lx->next_src2_x = x1_m3; ++ last_y8_lx->next_src2_y = src_yx_y; ++ last_y8_lx->next_src2_base = src_vc_address_y; ++ last_y8_p->p.w += bw; ++ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); ++ last_y8_p->p.wo2 = wo; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_lx = NULL; ++ start_x = bw; ++#if RPI_TSTATS ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } ++#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); ++ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; ++ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ cmd_y[-1].next_fn = s->qpu_filter; ++ cmd_lx->next_src1_x = x1_m3 + start_x; ++ cmd_lx->next_src1_y = src_yx_y; ++ cmd_lx->next_src1_base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ cmd_lx->next_src2_x = MC_DUMMY_X; ++ cmd_lx->next_src2_y = MC_DUMMY_Y; ++ cmd_lx->next_src2_base = s->qpu_dummy_frame; ++ } ++ else ++ { ++ cmd_lx->next_src2_x = x1_m3 + start_x + 8; ++ cmd_lx->next_src2_y = src_yx_y; ++ cmd_lx->next_src2_base = src_vc_address_y; ++ } ++ cmd_y->p.w = bw; ++ cmd_y->p.h = bh; ++ cmd_y->p.mymx21 = my2_mx2_my_mx; ++ 
cmd_y->p.wo1 = wo; ++ cmd_y->p.wo2 = wo; ++ cmd_y->p.dst_addr = dst_addr + start_x; ++ yp->last_lx = cmd_y; ++ yp->qpu_mc_curr = cmd_y + 1; ++ ++ if (bw == 8) { ++ s->last_y8_lx = cmd_lx; ++ s->last_y8_p = cmd_y; ++ } ++ } ++ } ++ } ++} ++ ++static void ++rpi_pred_y_b(HEVCContext * const s, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct MvField *const mv_field, ++ AVFrame *const src_frame, ++ AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++// rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, ++// mv, x0, y0, nPbW, nPbH, ++// src_frame2, mv2, mv_field); ++ { ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2) - 3; ++ const int y1 = y0 + (mv->y >> 2) - 3; ++ const int x2 = x0 + (mv2->x >> 2) - 3; ++ const int y2 = y0 + (mv2->y >> 2) - 3; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + ++ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ uint32_t dst = get_vc_address_y(s->frame) + y_off; ++ const uint32_t src1_base = get_vc_address_y(src_frame); ++ const uint32_t src2_base = get_vc_address_y(src_frame2); ++ ++ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); ++ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; ++ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ cmd_y[-1].next_fn = s->qpu_filter_b; ++ cmd_lx->next_src1_x = x1 + start_x; ++ cmd_lx->next_src1_y = y1 + start_y; ++ cmd_lx->next_src1_base = src1_base; ++ cmd_lx->next_src2_x = x2 + start_x; ++ cmd_lx->next_src2_y = y2 + start_y; ++ cmd_lx->next_src2_base = src2_base; ++ cmd_y->p.w = FFMIN(nPbW - start_x, 8); ++ cmd_y->p.h = bh; ++ cmd_y->p.mymx21 = my2_mx2_my_mx; ++ cmd_y->p.wo1 = wo1; ++ cmd_y->p.wo2 = wo2; ++ cmd_y->p.dst_addr = dst + start_x; ++ yp->last_lx = cmd_y; ++ yp->qpu_mc_curr = cmd_y + 1; ++ } ++ dst += s->frame->linesize[0] * 16; ++ } ++ } ++} ++ ++ ++static HEVCRpiChromaPred * ++rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) ++{ ++ HEVCRpiChromaPred * cp = s->curr_pred_c; ++ HEVCRpiChromaPred * cpt = cp + 1; ++ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { ++ if (cpt->load < cp->load) ++ cp = cpt; ++ } ++ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet ++ ++cp->load; ++// cp->load += 
load_val; ++ return cp; ++} ++ ++static void ++rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const Mv * const mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++#if 0 ++ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[0], c_offsets[0]); ++ ++ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], ++ x0_c, y0_c, nPbW_c, nPbH_c, mv, ++ c_weights[1], c_offsets[1]); ++#endif ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ ++ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ { ++ const int bh = FFMIN(nPbH_c-start_y, 16); ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); ++ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; ++ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ u[-1].next_fn = s->qpu_filter_uv; ++ last_l0->next_src_x = x1_c + start_x; ++ last_l0->next_src_y = y1_c + start_y; ++ last_l0->next_src_base_c = src_base_u; ++ u[0].p.h = bh; ++ u[0].p.w = bw; ++ u[0].p.coeffs_x = x_coeffs; ++ u[0].p.coeffs_y = y_coeffs; ++ u[0].p.wo_u = wo_u; ++ u[0].p.wo_v = wo_v; ++ u[0].p.dst_addr_c = dst_base_u + start_x * 2; ++ cp->last_l0 = u; ++ cp->qpu_mc_curr = u + 1; ++ } ++ ++ dst_base_u += s->frame->linesize[1] * 16; ++ } ++ } ++ return; ++} ++ ++static void ++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct MvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++#if 0 ++ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ ++ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, ++ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); ++#endif ++ { ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; ++ ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - 
vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ ++ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { ++ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); ++ ++ // We are allowed 3/4 powers of two as well as powers of 2 ++ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); ++ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; ++ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; ++ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; ++ ++ u[-1].next_fn = s->qpu_filter_uv_b0; ++ last_l0->next_src_x = x1_c + start_x; ++ last_l0->next_src_y = y1_c + start_y; ++ last_l0->next_src_base_c = get_vc_address_u(src_frame); ++ ++ u[0].next_fn = 0; // Ignored - 2 block cmd ++ u[0].next_src_x = x2_c + start_x; ++ u[0].next_src_y = y2_c + start_y; ++ u[0].next_src_base_c = get_vc_address_u(src_frame2); ++ ++ u[0].b0.h = (bh<16 ? bh : 16); ++ u[0].b0.w = (bwnext_src_x = x2_c + start_x; ++ last_l1->next_src_y = y2_c + start_y; ++ last_l1->next_src_base_c = get_vc_address_u(src_frame2); ++ ++ u[1].b1.dummy0 = 0; // w,h inherited from b0 ++ u[1].b1.coeffs_x = coefs1_x; ++ u[1].b1.coeffs_y = coefs1_y; ++ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); ++ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); ++ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; ++ ++ cp->last_l0 = u; ++ cp->last_l1 = u + 1; ++ cp->qpu_mc_curr = u + 2; ++ } ++ ++ dst_base_u += s->frame->linesize[1] * 16; ++ } ++ } ++} ++#endif ++ ++ ++ +static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, + const int nPbW, const int nPbH, + const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) @@ -3092,7 +4796,7 @@ index b478065..88dd40b 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1718,8 +2192,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -3102,315 +4806,112 @@ index b478065..88dd40b 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,16 +2236,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const Mv * const mv = ¤t_mv.mv[0]; -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 
16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(ref0->frame); -+ uint32_t * y = s->curr_y_mvs; -+ -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ const uint32_t src_yx_hi = ((y1_m3 + start_y) << 16); -+ -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ const int bw = nPbW-start_x; -+ const int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_yx_hi | ((x1_m3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l0[current_mv.ref_idx[0]]; -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); + } else +#endif + { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref0->frame, ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, ¤t_mv.mv[0], x0, y0, nPbW, nPbH, s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]]); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], -+#ifdef RPI_INTER_QPU -+ if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] * 2 + 1, -+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], + 
chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref0->frame->data[2], ref0->frame->linesize[2], - 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1]); - } -@@ -1782,17 +2328,89 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, -+#ifdef RPI_LUMA_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const int reflist = 1; -+ const Mv *mv = ¤t_mv.mv[reflist]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ int my2_mx2_my_mx = (my_mx << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=16) { -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + 8 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = ( (bw<16 ? bw : 16) << 16 ) + (bh<16 ? 
bh : 16); -+ *y++ = my2_mx2_my_mx; -+ *y++ = s->sh.luma_weight_l1[current_mv.ref_idx[reflist]]; -+ *y++ = s->sh.luma_offset_l1[current_mv.ref_idx[reflist]] * 2 + 1; -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter; -+ } -+ } -+ s->curr_y_mvs = y; ++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); + } else +#endif -+ + { -+ RPI_REDIRECT(luma_mc_uni)(s, dst0, s->frame->linesize[0], ref1->frame, ++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, ¤t_mv.mv[1], x0, y0, nPbW, nPbH, s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]]); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ const int reflist = 1; -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = ¤t_mv.mv[reflist]; -+ const intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ const intptr_t _mx = mx << (1 - hshift); -+ const intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ uint32_t * u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ const int bw = nPbW_c-start_x; -+ const int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( (bwsh.chroma_offset_l1[current_mv.ref_idx[reflist]][0] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l1[current_mv.ref_idx[reflist]][1] * 2 + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[reflist]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_uni)(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], + chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); - -- chroma_mc_uni(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], -+ RPI_REDIRECT(chroma_mc_uni)(s, dst2, s->frame->linesize[2], ref1->frame->data[2], ref1->frame->linesize[2], - 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, - s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1]); - } -@@ -1802,15 +2420,118 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1802,11 +2818,31 @@ static 
void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; - luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, -+#ifdef RPI_LUMA_QPU -+ if (s->enable_rpi && 0) { -+ const Mv *mv = ¤t_mv.mv[0]; -+ int mx = mv->x & 3; -+ int my = mv->y & 3; -+ int my_mx = (my<<8) + mx; -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ int mx2 = mv2->x & 3; -+ int my2 = mv2->y & 3; -+ int my2_mx2 = (my2<<8) + mx2; -+ int my2_mx2_my_mx = (my2_mx2 << 16) + my_mx; -+ int x1 = x0 + (mv->x >> 2); -+ int y1 = y0 + (mv->y >> 2); -+ int x2 = x0 + (mv2->x >> 2); -+ int y2 = y0 + (mv2->y >> 2); -+ uint32_t *y = s->curr_y_mvs; -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y1 - 3 + start_y) << 16) + ( (x1 - 3 + start_x) & 0xffff); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref0->frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = ((y2 - 3 + start_y) << 16) + ( (x2 - 3 + start_x) & 0xffff); // Second fetch is for ref1 -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(ref1->frame); -+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ -+ *y++ = PACK2(s->sh.luma_weight_l1[current_mv.ref_idx[1]], -+ s->sh.luma_weight_l0[current_mv.ref_idx[0]]); -+ *y++ = s->sh.luma_offset_l0[current_mv.ref_idx[0]] + -+ s->sh.luma_offset_l1[current_mv.ref_idx[1]] + 1; -+ -+ *y++ = (get_vc_address_y(s->frame) + x0 + start_x + (start_y + y0) * s->frame->linesize[0]); -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->mc_filter_b; -+ } -+ } -+ s->curr_y_mvs = y; ++#if RPI_INTER ++ if (s->enable_rpi) { ++ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); + } else +#endif + { -+ RPI_REDIRECT(luma_mc_bi)(s, dst0, s->frame->linesize[0], ref0->frame, ++ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ref1->frame, ¤t_mv.mv[1], ¤t_mv); + } if (s->ps.sps->chroma_format_idc) { -- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + if (s->enable_rpi) { -+ int hshift = s->ps.sps->hshift[1]; -+ int vshift = s->ps.sps->vshift[1]; -+ const Mv *mv = ¤t_mv.mv[0]; -+ intptr_t mx = av_mod_uintp2(mv->x, 2 + hshift); -+ intptr_t my = av_mod_uintp2(mv->y, 2 + vshift); -+ intptr_t _mx = mx << (1 - hshift); -+ intptr_t _my = my << (1 - vshift); // Fractional part of motion vector -+ int x1_c = x0_c + (mv->x >> (2 + hshift)); -+ int y1_c = y0_c + (mv->y >> (2 + hshift)); -+ -+ const Mv *mv2 = ¤t_mv.mv[1]; -+ intptr_t mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ intptr_t my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ intptr_t _mx2 = mx2 << (1 - hshift); -+ intptr_t _my2 = my2 << (1 - vshift); // Fractional part of motion vector -+ -+ int x2_c = x0_c + (mv2->x >> (2 + hshift)); -+ int y2_c = y0_c + (mv2->y >> (2 + hshift)); -+ -+ -+ uint32_t *u = s->curr_u_mvs; -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) { -+ for(int start_x=0;start_x < nPbW_c;start_x+=RPI_CHROMA_BLOCK_WIDTH) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b0; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x1_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y1_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref0->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = 
get_vc_address_v(ref0->frame); -+ *u++ = ( (bwsh.chroma_weight_l0[current_mv.ref_idx[0]][0]; // Weight L0 U -+ *u++ = s->sh.chroma_weight_l0[current_mv.ref_idx[0]][1]; // Weight L0 V -+ *u++ = 0; // Intermediate results are not written back in first pass of B filtering -+ *u++ = 0; -+ -+ u++[-RPI_CHROMA_COMMAND_WORDS] = s->mc_filter_uv_b; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = x2_c - 1 + start_x; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = y2_c - 1 + start_y; -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_u(ref1->frame); -+ u++[-RPI_CHROMA_COMMAND_WORDS] = get_vc_address_v(ref1->frame); -+ *u++ = ( (bwsh.chroma_offset_l0[current_mv.ref_idx[0]][0] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0]); -+ *u++ = PACK2(s->sh.chroma_offset_l0[current_mv.ref_idx[0]][1] + -+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]][1] + 1, -+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][1]); -+ *u++ = (get_vc_address_u(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[1]); -+ *u++ = (get_vc_address_v(s->frame) + x0_c + start_x + (start_y + y0_c) * s->frame->linesize[2]); -+ } -+ } -+ s->curr_u_mvs = u; ++ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); + return; + } +#endif -+ RPI_REDIRECT(chroma_mc_bi)(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, + chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -- chroma_mc_bi(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, -+ RPI_REDIRECT(chroma_mc_bi)(s, dst2, s->frame->linesize[2], ref0->frame, ref1->frame, - x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 1); - } - } -@@ -2304,6 +3025,734 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) + intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); + ret = hls_pcm_sample(s, x0, y0, log2_cb_size); + if (s->ps.sps->pcm.loop_filter_disable_flag) ++ { + set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } + + if (ret < 0) + return ret; +@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } @@ -3427,6 +4928,7 @@ index b478065..88dd40b 100644 + s->num_dblk_cmds[job] = 0; +} + ++#if 0 +static void rpi_execute_transform(HEVCContext *s) +{ + int i=2; @@ -3442,7 +4944,7 @@ index b478065..88dd40b 100644 + s->hevcdsp.idct[5-2](coeffs, 32); + }*/ + -+ gpu_cache_flush(&s->coeffs_buf_accelerated[job]); ++ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); + s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], + s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], + s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); @@ -3453,12 +4955,16 @@ index b478065..88dd40b 100644 + for(i=0;i<4;i++) + s->num_coeffs[job][i] = 0; +} ++#endif + -+static void rpi_execute_pred_cmds(HEVCContext *s) ++ ++// I-pred, transform_and_add for all blocks 
types done here ++// All ARM ++static void rpi_execute_pred_cmds(HEVCContext * const s) +{ + int i; + int job = s->pass1_job; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[job]; ++ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; +#ifdef RPI_WORKER + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else @@ -3466,43 +4972,65 @@ index b478065..88dd40b 100644 +#endif + + for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { -+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); -+ if (cmd->type == RPI_PRED_INTRA) { -+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->mode; -+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; -+ lc->na.cand_left = (cmd->na >> 3) & 1; -+ lc->na.cand_up_left = (cmd->na >> 2) & 1; -+ lc->na.cand_up = (cmd->na >> 1) & 1; -+ lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->x, cmd->y, cmd->c_idx); -+ } else { ++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++ ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; ++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; ++ lc->na.cand_left = (cmd->na >> 3) & 1; ++ lc->na.cand_up_left = (cmd->na >> 2) & 1; ++ lc->na.cand_up = (cmd->na >> 1) & 1; ++ lc->na.cand_up_right = (cmd->na >> 0) & 1; ++ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ else ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ break; ++ ++ case RPI_PRED_ADD_RESIDUAL: ++ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +#ifdef RPI_PRECLEAR -+ int trafo_size = 1 << cmd->size; -+#endif -+ s->hevcdsp.transform_add[cmd->size-2](cmd->dst, cmd->buf, cmd->stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, trafo_size * trafo_size * sizeof(int16_t)); // Clear coefficients here while they are in the cache ++ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache +#endif ++ break; ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); + } + } + s->num_pred_cmds[job] = 0; +} + -+static void rpi_execute_inter_cmds(HEVCContext *s) ++// Do any inter-pred that we want to do in software ++// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here ++// All ARM ++static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) +{ -+ int job = s->pass1_job; -+ HEVCMvCmd *cmd = s->unif_mv_cmds[job]; -+ int n,cidx; ++ unsigned int cidx; + AVFrame myref; + AVFrame myref1; + struct MvField mymv; -+ if (s->num_mv_cmds[job] > RPI_MAX_MV_CMDS) { -+ printf("Overflow inter_cmds\n"); -+ exit(-1); -+ } -+ for(n = s->num_mv_cmds[job]; n>0 ; n--, cmd++) { ++ ++ for(; n>0 ; n--, cmd++) { ++ av_assert0(0); ++ + switch(cmd->cmd) { + case RPI_CMD_LUMA_UNI: ++ if (b_only) ++ break; + myref.data[0] = cmd->src; + myref.linesize[0] = cmd->srcstride; + luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, 
&cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); @@ -3519,6 +5047,8 @@ index b478065..88dd40b 100644 + &myref1, &cmd->mv1, &mymv); + break; + case RPI_CMD_CHROMA_UNI: ++ if (b_only) ++ break; + mymv.mv[0] = cmd->mv; + chroma_mc_uni(s, cmd->dst, + cmd->dststride, cmd->src, cmd->srcstride, 0, @@ -3540,618 +5070,385 @@ index b478065..88dd40b 100644 + break; + } + } -+ s->num_mv_cmds[job] = 0; +} + -+static void rpi_do_all_passes(HEVCContext *s) ++static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) +{ -+ // Kick off QPUs and VPUs -+ rpi_launch_vpu_qpu(s); -+ // Perform luma inter prediction -+ rpi_execute_inter_cmds(s); -+ // Wait for transform completion -+ vpu_wait(s->vpu_id); -+ // Perform intra prediction and residual reconstruction -+ rpi_execute_pred_cmds(s); -+ // Perform deblocking for CTBs in this row -+ rpi_execute_dblk_cmds(s); -+ // Prepare next batch -+ rpi_begin(s); ++ const int job = s->pass1_job; ++ ++ if (!qpu_luma || luma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); ++ s->num_mv_cmds_y[job] = 0; ++ if (!qpu_chroma || chroma_b_only) ++ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); ++ s->num_mv_cmds_c[job] = 0; +} + +#endif + +#ifdef RPI ++// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ ++#if RPI_INTER + int job = s->pass0_job; + int i; -+#ifdef RPI_INTER_QPU -+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; + -+ for(i=0;i<8;i++) { -+ s->u_mvs[job][i] = s->mvs_base[job][i]; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = pic_width; -+ *s->u_mvs[job][i]++ = pic_height; -+ *s->u_mvs[job][i]++ = s->frame->linesize[1]; -+ *s->u_mvs[job][i]++ = s->frame->linesize[2]; -+ *s->u_mvs[job][i]++ = s->sh.chroma_log2_weight_denom + 6; -+ *s->u_mvs[job][i]++ = 0; -+ *s->u_mvs[job][i]++ = i; // Select section of VPM (avoid collisions with 3d unit) -+ } -+ s->curr_u_mvs = s->u_mvs[job][0]; -+#endif ++ const uint16_t pic_width_y = s->ps.sps->width; ++ const uint16_t pic_height_y = s->ps.sps->height; + -+#ifdef RPI_LUMA_QPU -+ for(i=0;i<12;i++) { -+ // This needs to have a generally similar structure to the -+ // actual filter code as various pipelined bits need to land correctly -+ // when inserted by the filter requests -+ s->y_mvs[job][i] = s->y_mvs_base[job][i]; -+ *s->y_mvs[job][i]++ = 0; // y_x -+ *s->y_mvs[job][i]++ = 0; // ref_y_base -+ *s->y_mvs[job][i]++ = 0; // y2_x2 -+ *s->y_mvs[job][i]++ = 0; // ref_y2_base -+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch -+ *s->y_mvs[job][i]++ = s->sh.luma_log2_weight_denom + 6; // weight demon + 6 -+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block -+ *s->y_mvs[job][i]++ = 0; // Next kernel ++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; ++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ ++ for(i=0; i < QPU_N_UV;i++) { ++ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; ++ qpu_mc_pred_c_t * u = cp->qpu_mc_base; ++ ++ // Chroma setup is a double block with L0 fetch ++ // and other stuff in the 1st 
block and L1 fetch ++ // in the 2nd along with a lot of dummy vars ++ // This could be packed a lot tighter but it would make ++ // L0, L1 management a lot harder ++ ++ u->next_fn = 0; ++ u->next_src_x = 0; ++ u->next_src_y = 0; ++ u->next_src_base_c = 0; ++ u->s0.pic_cw = pic_width_c; ++ u->s0.pic_ch = pic_height_c; ++ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); ++ u->s0.stride1 = s->frame->linesize[1]; ++ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; ++ u->s0.dummy0 = 0; ++ cp->last_l0 = u; ++ ++u; ++ ++ u->next_fn = 0; ++ u->next_src_x = 0; ++ u->next_src_y = 0; ++ u->next_src_base_c = 0; ++ u->s1.dummy0 = 0; ++ u->s1.dummy1 = 0; ++ u->s1.dummy2 = 0; ++ u->s1.dummy3 = 0; ++ u->s1.dummy4 = 0; ++ u->s1.dummy5 = 0; ++ cp->last_l1 = u; ++ ++u; ++ ++ cp->load = 0; ++ cp->qpu_mc_curr = u; + } -+ s->curr_y_mvs = s->y_mvs[job][0]; ++ s->curr_pred_c = NULL; ++ ++ for(i=0;i < QPU_N_Y;i++) { ++ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; ++ qpu_mc_pred_y_t * y = yp->qpu_mc_base; ++ ++ y->next_src1_x = 0; ++ y->next_src1_y = 0; ++ y->next_src1_base = 0; ++ y->next_src2_x = 0; ++ y->next_src2_y = 0; ++ y->next_src2_base = 0; ++ y->s.pic_h = pic_height_y; ++ y->s.pic_w = pic_width_y; ++ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); ++ y->s.stride1 = s->frame->linesize[0]; ++ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->s.dummy0 = 0; ++ y->next_fn = 0; ++ yp->last_lx = y; ++ ++y; ++ ++ yp->load = 0; ++ yp->qpu_mc_curr = y; ++ } ++ s->curr_pred_y = NULL; ++ s->last_y8_p = NULL; ++ s->last_y8_lx = NULL; +#endif + s->ctu_count = 0; +} +#endif + -+#ifdef RPI_SIMULATE_QPUS + -+static int32_t clipx(int x,int FRAME_WIDTH) ++#if RPI_INTER ++static unsigned int mc_terminate_y(HEVCContext * const s, const int job) +{ -+ if (x<=0) return 0; -+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; -+ return x; -+} ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit); ++ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); ++ unsigned int tc = 0; ++ HEVCRpiJob * const jb = s->jobs + job; + -+static int32_t clipy(int y,int FRAME_HEIGHT) -+{ -+ if (y<=0) return 0; -+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; -+ return y; -+} ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_Y; ++i) { ++ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; ++ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; + -+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) -+{ -+ int32_t vsum = 0; -+ int x, y; ++ // We will always have had L0 if we have L1 so only test L0 ++ if (px != yp->qpu_mc_base) ++ tc = 1; + -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; ++ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr + -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ px->next_src1_x = MC_DUMMY_X; ++ px->next_src1_y = MC_DUMMY_Y; ++ px->next_src1_base = s->qpu_dummy_frame; ++ px->next_src2_x = MC_DUMMY_X; ++ px->next_src2_y = MC_DUMMY_Y; ++ px->next_src2_base = s->qpu_dummy_frame; + -+ vsum += lumaFilter[my][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+round)>>denom)+offset; -+ -+ return av_clip_uint8( vsum ); -+}*/ -+ -+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int chromaFilterH[4]; -+ int chromaFilterV[4]; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ for(i=0;i<4;i++) { -+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; -+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; -+ } -+ -+ for (y = 0; y < 4; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 4; x++) -+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += chromaFilterV[y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; -+ -+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) -+{ -+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; -+ int pitch = frame->linesize[cIdx]; -+ uint32_t base = cIdx == 0 ? get_vc_address_y(frame) : -+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame); -+ if (p>=base && pdata[cIdx] + (p-base); -+ } -+ return NULL; -+} -+ -+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx) -+{ -+ SliceHeader *sh = &s->sh; -+ uint8_t *arm = test_frame(s,p,s->frame,cIdx); -+ int i; -+ if (arm) return arm; -+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE) -+ { -+ for(i=0;inb_refs[L0];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx); -+ if (arm) return arm; ++ yp->last_lx = NULL; + } -+ } -+ if (sh->slice_type == B_SLICE) { -+ for(i=0;inb_refs[L1];i++) { -+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx); -+ if (arm) return arm; -+ } -+ } -+ printf("Frame 0x%x not found! 
Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT)); -+ exit(-1); -+ return NULL; ++ ++ return tc; +} + -+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p) ++#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c ++#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) ++ ++static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) +{ -+ uint32_t next_kernel; -+ uint32_t x0; -+ uint32_t y0; -+ uint8_t *ref_u_base; -+ uint8_t *ref_v_base; -+ uint32_t frame_width = p[5]; -+ uint32_t frame_height = p[6]; -+ uint32_t pitch = p[7]; -+ uint32_t dst_pitch = p[8]; -+ int32_t offset_before = p[9]; -+ int32_t denom = p[10]; -+ uint32_t vpm_id = p[11]; -+ uint32_t tmp_u_dst[256]; -+ uint32_t tmp_v_dst[256]; -+ while(1) { -+ p += 12; -+ next_kernel = p[0-12]; -+ x0 = p[1-12]; -+ y0 = p[2-12]; -+ if (next_kernel==s->mc_filter_uv || next_kernel==s->mc_filter_uv_b0 || next_kernel==s->mc_filter_uv_b) { -+ int x,y; -+ uint32_t width_height = p[5]; -+ uint32_t hcoeffs = p[6]; -+ uint32_t vcoeffs = p[7]; -+ uint32_t offset_weight_u = p[8]; -+ uint32_t offset_weight_v = p[9]; -+ uint8_t *this_u_dst; -+ uint8_t *this_v_dst; -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ ref_u_base = compute_arm_addr(s,p[3-12],1); -+ ref_v_base = compute_arm_addr(s,p[4-12],2); -+ if (next_kernel!=s->mc_filter_uv_b0) -+ { -+ this_u_dst = compute_arm_addr(s,p[10],1); -+ this_v_dst = compute_arm_addr(s,p[11],2); -+ } -+ for (y=0; ymc_filter_uv) { -+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } else if (next_kernel==s->mc_filter_uv_b0) { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height); -+ tmp_u_dst[x+y*16] = refa; -+ tmp_v_dst[x+y*16] = refb; -+ } else { -+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height); -+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height); -+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa); -+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; ++ unsigned int i; ++ const uint32_t exit_fn = qpu_fn(mc_exit_c); ++ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); ++ unsigned int tc = 0; ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ // Add final commands to Q ++ for(i = 0; i != QPU_N_UV; ++i) { ++ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; ++ qpu_mc_pred_c_t *const p0 = cp->last_l0; ++ qpu_mc_pred_c_t *const p1 = cp->last_l1; ++ ++ // We will always have had L0 if we have L1 so only test L0 ++ if (p0 != cp->qpu_mc_base) ++ tc = 1; ++ ++ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->next_src_x = MC_DUMMY_X; ++ p0->next_src_y = MC_DUMMY_Y; ++ p0->next_src_base_c = s->qpu_dummy_frame; ++ p1->next_src_x = MC_DUMMY_X; ++ p1->next_src_y = MC_DUMMY_Y; ++ p1->next_src_base_c = s->qpu_dummy_frame;; ++ ++ cp->last_l0 = NULL; ++ cp->last_l1 = NULL; + } -+ } ++ ++ return tc; +} -+ -+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel) -+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan) -+{ -+ uint32_t next_kernel; -+ int y_x,y2_x2; -+ int x0; -+ int y0; -+ int x2; -+ int y2; -+ uint32_t *p0 = p; -+ uint8_t *ref_y_base; -+ uint8_t *ref_y2_base; -+ uint32_t frame_width_height = p[4]; -+ uint32_t frame_width = frame_width_height>>16; -+ uint32_t frame_height = (frame_width_height<<16)>>16; -+ uint32_t pitch = p[5]; -+ uint32_t dst_pitch = p[6]; -+ int offset_shift = p[7]; -+ int32_t offset_before = offset_shift>>16; -+ int32_t denom = (offset_shift<<16)>>16; -+ while(1) { -+ p += 9; -+ next_kernel = p[8-9]; -+ y_x = p[0-9]; -+ x0 = (y_x<<16)>>16; -+ y0 = y_x>>16; -+ y2_x2 = p[2-9]; -+ x2 = (y2_x2<<16)>>16; -+ y2 = y2_x2>>16; -+ -+ if (next_kernel==s->mc_filter || next_kernel==s->mc_filter_b) { -+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) -+ int x,y; -+ uint32_t width_height = p[4]; -+ uint32_t my2_mx2_my_mx = p[5]; -+ uint32_t offset_weight = p[6]; -+ uint8_t *this_dst = compute_arm_addr(s,p[7],0); -+ uint32_t width = width_height >> 16; -+ uint32_t height = (width_height << 16) >> 16; -+ uint8_t *dst_base = s->frame->data[0]; -+ ref_y_base = compute_arm_addr(s,p[1-9],0); -+ ref_y2_base = compute_arm_addr(s,p[3-9],0); -+ for (y=0; ymc_filter) { -+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height); -+ refa = av_clip_uint8(refa); -+ this_dst[x+y*dst_pitch] = refa; -+ } -+ else { -+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height); -+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height); -+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb); -+ } -+ } -+ } -+ } else { -+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) ); -+ break; -+ } -+ } -+} -+ -+static void rpi_simulate_inter_qpu(HEVCContext *s) -+{ -+ // First run the transform as normal -+ int i; -+ rpi_execute_transform(s); -+ for(i=0;i<8;i++) -+ { -+ rpi_simulate_inter_chroma(s,s->mvs_base[i]); -+ } -+ for(i=0;i<12;i++) -+ { -+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i); -+ } -+} -+ +#endif + -+#ifdef RPI_INTER_QPU ++#ifdef RPI + -+static void rpi_launch_vpu_qpu(HEVCContext *s) ++ ++static void flush_frame(HEVCContext *s,AVFrame *frame) +{ -+ int k; -+ int job = s->pass1_job; -+ int i; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; -+#ifdef RPI_LUMA_QPU -+ uint32_t *y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; -+#endif -+ if (s->sh.slice_type == I_SLICE) { -+#ifdef RPI_MULTI_MAILBOX -+ rpi_execute_transform(s); -+ return; -+#endif -+ } -+ for(k=0;k<8;k++) { -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our 
code) - this is needed as the texture requests are pipelined -+ s->u_mvs[job][k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for V -+ av_assert0(s->u_mvs[job][k] - s->mvs_base[job][k] < UV_COMMANDS_PER_QPU); -+ } ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} + -+ s->u_mvs[job][8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore + -+#ifdef RPI_LUMA_QPU -+ for(k=0;k<12;k++) { -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+1] = qpu_get_fn(QPU_MC_SETUP_UV); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->y_mvs[job][k][-RPI_LUMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP_UV); // Also need a dummy for second request -+ s->y_mvs[job][k][-1] = qpu_get_fn(QPU_MC_EXIT); // Add exit command (Final uniform) -+ av_assert0(s->y_mvs[job][k] - s->y_mvs_base[job][k] < Y_COMMANDS_PER_QPU); -+ } -+ s->y_mvs[job][12-1][-1] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT12); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+#endif ++// Core execution tasks ++static void worker_core(HEVCContext * const s) ++{ ++ worker_global_env_t * const wg = &worker_global_env; ++ int arm_cost = 0; ++// vpu_qpu_wait_h sync_c; ++ vpu_qpu_wait_h sync_y; ++ int qpu_luma = 0; ++ int qpu_chroma = 0; ++ int gpu_load; ++ int arm_load; ++ static const int arm_const_cost = 2; + -+#ifdef RPI_SIMULATE_QPUS -+ rpi_simulate_inter_qpu(s); -+ return; -+#endif ++// static int z = 0; + -+#ifdef RPI_MULTI_MAILBOX -+#ifdef RPI_CACHE_UNIF_MVS -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],&s->y_unif_mvs_ptr[job], &s->unif_mvs_ptr[job], job); -+#else -+ flush_frame3(s, s->frame,&s->coeffs_buf_accelerated[job],NULL,NULL, job); -+#endif ++ const int job = s->pass1_job; ++ unsigned int flush_start = 0; ++ unsigned int flush_count = 0; + -+#if 1 -+ { -+ unsigned int i; -+ uint32_t * p; -+ uint32_t code = qpu_get_fn(QPU_MC_SETUP_UV); -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ code = qpu_get_fn(QPU_MC_SETUP); -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ s->vpu_id = vpu_qpu_post_code2(vpu_get_fn(), ++ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(), + vpu_get_constants(), + s->coeffs_buf_vc[job][2], + s->num_coeffs[job][2] >> 8, + s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], + s->num_coeffs[job][3] >> 10, -+ 0, -+ // QPU job 1 -+ QPU_N_UV, -+ mail_uv, -+ // QPU job 2 -+ QPU_N_Y, -+ mail_y -+ ); ++ 0); ++ ++ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); + } + ++ ++#if RPI_INTER ++ pthread_mutex_lock(&wg->lock); ++ ++// ++z; ++ gpu_load = vpu_qpu_current_load(); ++ arm_load = avpriv_atomic_int_get(&wg->arm_load); ++#if 0 // Y_B_ONLY ++ qpu_luma = gpu_load + 2 < arm_load; ++ qpu_chroma = gpu_load < arm_load 
+ 8; ++#elif 0 ++ qpu_luma = gpu_load < arm_load + 2; ++ qpu_chroma = gpu_load < arm_load + 8; +#else -+ s->vpu_id = vpu_qpu_post_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], s->num_coeffs[job][3] >> 10, 0, -+ qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+#ifdef RPI_LUMA_QPU -+ qpu_get_fn(QPU_MC_SETUP), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][0 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][1 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][2 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][3 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][4 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][5 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][6 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][7 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][8 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][9 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][10 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)), -+ (uint32_t)(y_unif_vc+(s->y_mvs_base[job][11 ] - (uint32_t*)s->y_unif_mvs_ptr[job].arm)) -+#else -+ 0, -+ 0,0,0,0, -+ 0,0,0,0, -+ 0,0,0,0 -+#endif -+ ); -+#endif -+ for(i=0;i<4;i++) -+ s->num_coeffs[job][i] = 0; -+#else -+#error Code rotted here -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[job][0 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][1 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][2 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][3 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][4 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][5 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][6 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[job][7 ] - (uint32_t*)s->unif_mvs_ptr[job].arm)) -+ ); ++ qpu_chroma = 1; ++ qpu_luma = 1; +#endif + ++ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); + ++ wg->gpu_c += qpu_chroma; ++ wg->gpu_y += qpu_luma; ++ wg->arm_c += !qpu_chroma; ++ wg->arm_y += !qpu_luma; ++ ++ ++// if ((z & 511) == 0) { ++// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); ++// } ++ ++ ++ { ++ int (*d)[2] = 
s->dblk_cmds[job];
++ unsigned int high=(*d)[1];
++ int n;
++
++ flush_start = high;
++ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) {
++ unsigned int y = (*d)[1];
++ flush_start = FFMIN(flush_start, y);
++ high=FFMAX(high,y);
++ }
++ // Avoid flushing past end of frame
++ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start;
++ }
++
++#if !DISABLE_CHROMA
++ if (qpu_chroma && mc_terminate_uv(s, job) != 0)
++ {
++ HEVCRpiJob * const jb = s->jobs + job;
++ const uint32_t code = qpu_fn(mc_setup_c);
++ uint32_t * p;
++ unsigned int i;
++ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS];
++
++ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) {
++ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm);
++ *p++ = code;
++ }
++
++ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv);
++
++#if RPI_CACHE_UNIF_MVS
++ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1);
++ }
++#endif
++
++// We can take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++// vpu_qpu_job_add_sync_this(vqj, &sync_c);
++
++ if (qpu_luma && mc_terminate_y(s, job) != 0)
++ {
++ HEVCRpiJob * const jb = s->jobs + job;
++ const uint32_t code = qpu_fn(mc_setup);
++ uint32_t * p;
++ unsigned int i;
++ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS];
++
++ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) {
++ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm);
++ *p++ = code;
++ }
++
++ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y);
++
++#if RPI_CACHE_UNIF_MVS
++ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++#endif
++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0);
++ }
++
++ pthread_mutex_unlock(&wg->lock);
++
++#endif
++
++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
++
++ // Having accumulated some commands - do them
++ rpi_cache_flush_finish(rfe);
++ vpu_qpu_job_finish(vqj);
++
++ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //????
Surely we haven't done the smaller ++ ++#if Y_B_ONLY ++ if (qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#endif ++ // Perform inter prediction ++ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); ++ ++ // Wait for transform completion ++ ++ // Perform intra prediction and residual reconstruction ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); ++#if Y_B_ONLY ++ if (!qpu_luma) ++ vpu_qpu_wait(&sync_y); ++#else ++ vpu_qpu_wait(&sync_y); ++#endif ++ rpi_execute_pred_cmds(s); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s); ++ ++ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} -+#else + -+#ifdef RPI -+static void rpi_launch_vpu_qpu(HEVCContext *s) ++static void rpi_do_all_passes(HEVCContext *s) +{ -+ rpi_execute_transform(s); -+} -+#endif -+ -+#endif -+ -+#ifdef RPI -+ -+#ifndef RPI_FAST_CACHEFLUSH -+#error RPI_FAST_CACHEFLUSH is broken -+static void flush_buffer(AVBufferRef *bref) { -+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(bref); -+ gpu_cache_flush(p); -+} -+#endif -+ -+static void flush_frame(HEVCContext *s,AVFrame *frame) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int n = s->ps.sps->height; -+ int curr_y = 0; -+ int curr_uv = 0; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+#endif ++ // Do the various passes - common with the worker code ++ worker_core(s); ++ // Prepare next batch ++ rpi_begin(s); +} + -+static void flush_frame3(HEVCContext *s,AVFrame *frame,GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2, int job) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int n; -+ int curr_y; -+ int curr_uv; -+ int n_uv; -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(s->frame); -+ int sz,base; -+ int (*d)[2] = s->dblk_cmds[job]; -+ int low=(*d)[1]; -+ int high=(*d)[1]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ int y = (*d)[1]; -+ low=FFMIN(low,y); -+ high=FFMAX(high,y); -+ } -+ curr_y = low; -+ n = high+(1 << s->ps.sps->log2_ctb_size); -+ curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ n_uv = n >> s->ps.sps->vshift[1]; + -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)(p.arm) + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)(p.arm) + base; -+ iocache.s[1].size = sz; -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = 
s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)(p.arm) + base; -+ iocache.s[2].size = sz; -+ -+ iocache.s[3].handle = p0->vcsm_handle; -+ iocache.s[3].cmd = 3; // clean+invalidate -+ iocache.s[3].addr = (int) p0->arm; -+ iocache.s[3].size = p0->numbytes; -+ if (p1) { -+ iocache.s[4].handle = p1->vcsm_handle; -+ iocache.s[4].cmd = 3; // clean+invalidate -+ iocache.s[4].addr = (int) p1->arm; -+ iocache.s[4].size = p1->numbytes; -+ } -+ if (p2) { -+ iocache.s[5].handle = p2->vcsm_handle; -+ iocache.s[5].cmd = 3; // clean+invalidate -+ iocache.s[5].addr = (int) p2->arm; -+ iocache.s[5].size = p2->numbytes; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer(frame->buf[0]); -+ flush_buffer(frame->buf[1]); -+ flush_buffer(frame->buf[2]); -+ gpu_cache_flush3(p0, p1, p2); -+#endif -+} + +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3762,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 -+ && !s->ps.pps->cross_component_prediction_enabled_flag; ++ s->enable_rpi = s->ps.sps->bit_depth == 8 && ++ s->frame->format == AV_PIX_FMT_SAND128 && ++ !s->ps.pps->cross_component_prediction_enabled_flag; + + if (!s->enable_rpi) { + if (s->ps.pps->cross_component_prediction_enabled_flag) @@ -4163,7 +5460,7 @@ index b478065..88dd40b 100644 if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2326,6 +3786,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } @@ -4178,26 +5475,25 @@ index b478065..88dd40b 100644 while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2341,7 +3809,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; + hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); + ++ + ff_hevc_cabac_init(s, ctb_addr_ts); + + hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); +@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#ifdef RPI_INTER_QPU -+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % 8]; -+#endif -+#ifdef RPI_LUMA_QPU -+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % 12]; ++#if RPI_INTER ++ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV; ++ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; +#endif + more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + -+#ifdef RPI_INTER_QPU -+ s->u_mvs[s->pass0_job][s->ctu_count % 8]= s->curr_u_mvs; -+#endif -+#ifdef 
RPI_LUMA_QPU -+ s->y_mvs[s->pass0_job][s->ctu_count % 12] = s->curr_y_mvs; -+#endif -+ +#ifdef RPI + if (s->enable_rpi) { + //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); @@ -4207,14 +5503,18 @@ index b478065..88dd40b 100644 + s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; + s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; + s->ctu_count++; -+ //printf("%d %d/%d job=%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job); + + if ( s->ctu_count >= s->max_ctu_count ) { +#ifdef RPI_WORKER -+ if (s->used_for_ref) { ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ ++// worker_wait(s); + // Split work load onto separate threads so we make as rapid progress as possible with this frame + // Pass on this job to worker thread + worker_submit_job(s); ++ + // Make sure we have space to prepare the next job + worker_pass0_ready(s); + @@ -4236,7 +5536,7 @@ index b478065..88dd40b 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2350,9 +3868,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -4261,12 +5561,25 @@ index b478065..88dd40b 100644 + rpi_do_all_passes(s); + } + ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ +#endif + if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +3925,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -4278,16 +5591,32 @@ index b478065..88dd40b 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2767,6 +4310,16 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; -+ s->used_for_ref = !(s->nal_unit_type == NAL_TRAIL_N || ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. 
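/*
 * Editor's sketch (not part of the patch): the hunk below encodes the rule
 * described in the comment above it. Rewritten as a standalone predicate,
 * assuming the FFmpeg 3.x HEVC decoder names used throughout this patch
 * (HEVCContext, NAL_*); the helper name itself is hypothetical:
 */
static inline int nal_may_be_referenced(const HEVCContext * const s)
{
    /* _N (sub-layer non-reference) pictures can still be referenced by
     * pictures with a higher temporal_id, so they are only discardable
     * when no higher temporal sub-layer exists. */
    if (s->ps.sps->max_sub_layers > s->temporal_id + 1)
        return 1;
    return !(s->nal_unit_type == NAL_TRAIL_N ||
             s->nal_unit_type == NAL_TSA_N   ||
             s->nal_unit_type == NAL_STSA_N  ||
             s->nal_unit_type == NAL_RADL_N  ||
             s->nal_unit_type == NAL_RASL_N);
}
/* s->used_for_ref below is exactly this value; when it is false and
 * avctx->skip_frame >= AVDISCARD_NONREF the whole access unit is dropped. */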
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !(s->nal_unit_type == NAL_TRAIL_N || + s->nal_unit_type == NAL_TSA_N || + s->nal_unit_type == NAL_STSA_N || + s->nal_unit_type == NAL_RADL_N || + s->nal_unit_type == NAL_RASL_N); + ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif + if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { + s->is_decoded = 0; + break; @@ -4295,27 +5624,30 @@ index b478065..88dd40b 100644 if (s->max_ra == INT_MAX) { if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { s->max_ra = s->poc; -@@ -2891,9 +4444,17 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) + } } - fail: +-fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) ++fail: // Also success path + if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, s->ps.sps->height); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } else if (s->ref) { -+#ifdef RPI_INTER_QPU ++ } ++#if RPI_INTER ++ else if (s->ref && s->enable_rpi) { + // When running single threaded we need to flush the whole frame + flush_frame(s,s->frame); -+#endif + } ++#endif return ret; } -@@ -3064,6 +4625,41 @@ fail: +@@ -3064,6 +4764,41 @@ fail: return AVERROR(ENOMEM); } @@ -4357,7 +5689,7 @@ index b478065..88dd40b 100644 static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4671,32 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); @@ -4368,29 +5700,26 @@ index b478065..88dd40b 100644 +#endif + + for(i=0;iunif_mv_cmds[i]); -+ av_freep(&s->univ_pred_cmds[i]); + -+#ifdef RPI_INTER_QPU -+ if (s->unif_mvs[i]) { -+ gpu_free( &s->unif_mvs_ptr[i] ); -+ s->unif_mvs[i] = 0; -+ } -+#endif -+#ifdef RPI_LUMA_QPU -+ if (s->y_unif_mvs[i]) { -+ gpu_free( &s->y_unif_mvs_ptr[i] ); -+ s->y_unif_mvs[i] = 0; -+ } ++ av_freep(&s->unif_mv_cmds_y[i]); ++ av_freep(&s->unif_mv_cmds_c[i]); ++ av_freep(&s->univ_pred_cmds[i]); ++ ++#if RPI_INTER ++ gpu_free(&s->jobs[i].chroma_mvs_gptr); ++ gpu_free(&s->jobs[i].luma_mvs_gptr); +#endif + } + ++ vpu_qpu_term(); ++ ++ av_rpi_zc_uninit(avctx); +#endif + for (i = 0; i < 3; i++) { av_freep(&s->sao_pixel_buffer_h[i]); av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4738,23 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } @@ -4410,75 +5739,76 @@ index b478065..88dd40b 100644 { HEVCContext *s = avctx->priv_data; int i; -+ int job; ++#ifdef RPI ++ unsigned int job; ++#endif s->avctx = avctx; -@@ -3129,6 +4764,78 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; +#ifdef RPI -+ for(job=0;jobunif_mv_cmds[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS); -+ if (!s->unif_mv_cmds[job]) ++ // Whilst FFmpegs init fn is only called once the close fn is called as ++ // many times as we have threads (init_thread_copy is called for the ++ // threads). 
So to match init & term put the init here where it will be ++ // called by both init & copy ++ av_rpi_zc_init(avctx); ++ ++ if (vpu_qpu_init() != 0) ++ goto fail; ++ ++ for(job = 0; job < RPI_MAX_JOBS; job++) { ++ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); ++ if (!s->unif_mv_cmds_y[job]) ++ goto fail; ++ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); ++ if (!s->unif_mv_cmds_c[job]) + goto fail; + s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); + if (!s->univ_pred_cmds[job]) + goto fail; + } + -+#ifdef RPI_INTER_QPU ++#if RPI_INTER + // We divide the image into blocks 256 wide and 64 high + // We support up to 2048 widths + // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted + // Also add space for the startup command for each stream. + -+ { -+ int uv_commands_per_qpu = UV_COMMANDS_PER_QPU; -+ uint32_t *p; -+ for(job=0;jobunif_mvs_ptr[job] ); ++ for (job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); ++ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +#else -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); ++ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); ++ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +#endif -+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; + -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs[job]; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[job][i] = p; -+ p += uv_commands_per_qpu; -+ } ++ { ++ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; ++ for(i = 0; i < QPU_N_UV; i++) { ++ jb->chroma_mvs[i].qpu_mc_base = p; ++ jb->chroma_mvs[i].qpu_mc_curr = p; ++ p += UV_COMMANDS_PER_QPU; ++ } + } -+ s->mc_filter_uv = qpu_get_fn(QPU_MC_FILTER_UV); -+ s->mc_filter_uv_b0 = qpu_get_fn(QPU_MC_FILTER_UV_B0); -+ s->mc_filter_uv_b = qpu_get_fn(QPU_MC_FILTER_UV_B); -+ } -+ -+#endif -+#ifdef RPI_LUMA_QPU -+ for(job=0;joby_unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached( 12 * y_commands_per_qpu * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#endif -+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->y_unif_mvs[job]; -+ for(i = 0; i < 12; i++) { -+ s->y_mvs_base[job][i] = p; -+ p += y_commands_per_qpu; ++ { ++ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; ++ for(i = 0; i < QPU_N_Y; i++) { ++ jb->luma_mvs[i].qpu_mc_base = p; ++ jb->luma_mvs[i].qpu_mc_curr = p; ++ p += Y_COMMANDS_PER_QPU; ++ } + } + } -+ s->mc_filter = qpu_get_fn(QPU_MC_FILTER); -+ s->mc_filter_b = qpu_get_fn(QPU_MC_FILTER_B); ++ s->qpu_filter_uv = qpu_fn(mc_filter_uv); ++ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); ++ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame ++ s->qpu_filter = qpu_fn(mc_filter); ++ s->qpu_filter_b = qpu_fn(mc_filter_b); +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + @@ -4493,8 +5823,30 @@ index b478065..88dd40b 100644 s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; +@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) + } + + if((avctx->active_thread_type & 
FF_THREAD_FRAME) && avctx->thread_count > 1) +- s->threads_type = FF_THREAD_FRAME; +- else +- s->threads_type = FF_THREAD_SLICE; ++ s->threads_type = FF_THREAD_FRAME; ++ else ++ s->threads_type = FF_THREAD_SLICE; + + return 0; + } +@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { + .update_thread_context = hevc_update_thread_context, + .init_thread_copy = hevc_init_thread_copy, + .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++// 0, ++// AV_CODEC_CAP_FRAME_THREADS, + AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, + .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), + }; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..6b03ea8 100644 +index be91010..dd7d152 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h @@ -23,6 +23,9 @@ @@ -4507,37 +5859,53 @@ index be91010..6b03ea8 100644 #include "libavutil/buffer.h" #include "libavutil/md5.h" -@@ -37,6 +40,29 @@ +@@ -37,6 +40,45 @@ #include "thread.h" #include "videodsp.h" +// define RPI to split the CABAC/prediction/transform into separate stages -+#ifdef RPI ++#ifndef RPI ++ ++ #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 ++ ++#else + + #include "rpi_qpu.h" -+ // Define RPI_INTER_QPU to use QPU for chroma inter prediction -+ #define RPI_INTER_QPU ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU + -+ #ifdef RPI_INTER_QPU -+ // Define RPI_LUMA_QPU to also use QPU for luma inter prediction -+ #define RPI_LUMA_QPU -+ #endif -+ -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ #define RPI_MAX_JOBS 2 + // Define RPI_WORKER to launch a worker thread for pixel processing tasks + #define RPI_WORKER ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ + // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. 
DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +// #define RPI_DEBLOCK_VPU + -+#endif ++ #define RPI_VPU_DEBLOCK_CACHED 1 + -+#define RPI_VPU_DEBLOCK_CACHED 1 ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ #define RPI_TSTATS 0 ++#endif + #define MAX_DPB_SIZE 16 // A.4.1 #define MAX_REFS 16 -@@ -660,17 +686,6 @@ typedef struct CodingUnit { +@@ -660,17 +702,6 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; @@ -4555,7 +5923,7 @@ index be91010..6b03ea8 100644 typedef struct NeighbourAvailable { int cand_bottom_left; int cand_left; -@@ -747,7 +762,17 @@ typedef struct HEVCFrame { +@@ -747,7 +778,17 @@ typedef struct HEVCFrame { uint8_t flags; } HEVCFrame; @@ -4573,7 +5941,7 @@ index be91010..6b03ea8 100644 uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -762,7 +787,6 @@ typedef struct HEVCLocalContext { +@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { int qPy_pred; @@ -4581,7 +5949,7 @@ index be91010..6b03ea8 100644 uint8_t ctb_left_flag; uint8_t ctb_up_flag; -@@ -779,7 +803,6 @@ typedef struct HEVCLocalContext { +@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -4589,7 +5957,7 @@ index be91010..6b03ea8 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +813,80 @@ typedef struct HEVCLocalContext { +@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; @@ -4601,13 +5969,15 @@ index be91010..6b03ea8 100644 +// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed -+#define RPI_NUM_CHUNKS 1 ++#define RPI_NUM_CHUNKS 4 ++#define RPI_CHUNK_SIZE 12 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code -+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*24) ++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) + +// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi -+#define RPI_MAX_MV_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) ++#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) ++#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +// Each block can have an intra prediction and a transform_add command +#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +// Worst case is 16x16 CTUs @@ -4624,53 +5994,118 @@ index be91010..6b03ea8 100644 + +// Command for inter prediction +typedef struct HEVCMvCmd { -+ int cmd; -+ uint8_t *dst; -+ ptrdiff_t dststride; -+ uint8_t *src; -+ ptrdiff_t srcstride; -+ Mv mv; -+ int x_off; -+ int y_off; -+ int block_w; -+ int block_h; -+ int weight; -+ int offset; -+ uint8_t *src1; -+ ptrdiff_t srcstride1; -+ Mv mv1; ++ uint8_t cmd; ++ uint8_t block_w; ++ uint8_t block_h; + int8_t ref_idx[2]; ++ uint16_t dststride; ++ uint16_t srcstride; ++ uint16_t srcstride1; ++ int16_t weight; ++ int16_t offset; ++ int16_t x_off; ++ int16_t y_off; ++ uint8_t *src; ++ uint8_t *src1; ++ uint8_t *dst; ++ Mv mv; ++ Mv mv1; +} HEVCMvCmd; + + +// Command for intra prediction and transform_add of predictions to coefficients -+#define RPI_PRED_TRANSFORM_ADD 0 -+#define RPI_PRED_INTRA 1 ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = 
RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_INTRA, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ +typedef struct HEVCPredCmd { -+ uint8_t size; + uint8_t type; -+ uint8_t na; -+ uint8_t c_idx; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t na; // i_pred - but left here as they pack well ++ uint8_t c_idx; // i_pred + union { -+ uint8_t *dst; // RPI_PRED_TRANSFORM_ADD -+ uint32_t x; // RPI_PRED_INTRA -+ }; -+ union { -+ int16_t *buf; // RPI_PRED_TRANSFORM_ADD -+ uint32_t y; // RPI_PRED_INTRA -+ }; -+ union { -+ enum IntraPredMode mode; // RPI_PRED_TRANSFORM_ADD -+ uint32_t stride; // RPI_PRED_INTRA ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint32_t stride; ++ } ta; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; + }; +} HEVCPredCmd; + +#endif ++ ++#ifdef RPI ++ ++struct qpu_mc_pred_c_s; ++struct qpu_mc_pred_y_s; ++ ++typedef struct HEVCRpiLumaPred ++{ ++ struct qpu_mc_pred_y_s *qpu_mc_base; ++ struct qpu_mc_pred_y_s *qpu_mc_curr; ++ struct qpu_mc_pred_y_s *last_lx; ++ unsigned int load; ++} HEVCRpiLumaPred; ++ ++typedef struct HEVCRpiChromaPred ++{ ++ struct qpu_mc_pred_c_s *qpu_mc_base; ++ struct qpu_mc_pred_c_s *qpu_mc_curr; ++ struct qpu_mc_pred_c_s *last_l0; ++ struct qpu_mc_pred_c_s *last_l1; ++ unsigned int load; ++} HEVCRpiChromaPred; ++ ++typedef struct HEVCRpiJob { ++ GPU_MEM_PTR_T chroma_mvs_gptr; ++ GPU_MEM_PTR_T luma_mvs_gptr; ++ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; ++ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; ++} HEVCRpiJob; ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++#endif + typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -798,13 +895,107 @@ typedef struct HEVCContext { +@@ -798,13 +978,103 @@ typedef struct HEVCContext { HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; HEVCLocalContext *HEVClc; @@ -4688,7 +6123,8 @@ index be91010..6b03ea8 100644 + +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds[RPI_MAX_JOBS]; ++ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; ++ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; + HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; + int buf_width; + GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; @@ -4697,7 +6133,8 @@ index be91010..6b03ea8 100644 + unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; + int num_coeffs[RPI_MAX_JOBS][4]; + int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds[RPI_MAX_JOBS]; ++ int num_mv_cmds_y[RPI_MAX_JOBS]; ++ int num_mv_cmds_c[RPI_MAX_JOBS]; + int num_pred_cmds[RPI_MAX_JOBS]; + int num_dblk_cmds[RPI_MAX_JOBS]; + int vpu_id; @@ -4707,29 +6144,23 @@ index be91010..6b03ea8 100644 + int max_ctu_count; // Number of CTUs when we trigger a round of processing + int ctu_per_y_chan; // Number of CTUs per luma QPU + int ctu_per_uv_chan; // Number of CTUs per chroma QPU -+#ifdef RPI_INTER_QPU -+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands + -+ // _base pointers are to the start of the row -+ uint32_t 
*mvs_base[RPI_MAX_JOBS][8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[RPI_MAX_JOBS][8]; -+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma -+ // Function pointers -+ uint32_t mc_filter_uv; -+ uint32_t mc_filter_uv_b0; -+ uint32_t mc_filter_uv_b; ++ HEVCRpiJob jobs[RPI_MAX_JOBS]; ++#if RPI_TSTATS ++ HEVCRpiStats tstats; +#endif -+#ifdef RPI_LUMA_QPU -+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ uint32_t *y_mvs_base[RPI_MAX_JOBS][12]; -+ uint32_t *y_mvs[RPI_MAX_JOBS][12]; -+ uint32_t *curr_y_mvs; // Current uniform stream for luma ++#if RPI_INTER ++ HEVCRpiChromaPred * curr_pred_c; ++ HEVCRpiLumaPred * curr_pred_y; ++ struct qpu_mc_pred_y_s * last_y8_p; ++ struct qpu_mc_pred_y_s * last_y8_lx; ++ + // Function pointers -+ uint32_t mc_filter; -+ uint32_t mc_filter_b; ++ uint32_t qpu_filter_uv; ++ uint32_t qpu_filter_uv_b0; ++ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory ++ uint32_t qpu_filter; ++ uint32_t qpu_filter_b; +#endif + +#ifdef RPI_WORKER @@ -4766,7 +6197,7 @@ index be91010..6b03ea8 100644 + int (*vpu_cmds_arm)[6]; // r0-r5 for each command + int vpu_cmds_vc; + -+ int cmd_id; ++ vpu_qpu_wait_h cmd_id; + } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; + + struct dblk_vpu_q_s * dvq; @@ -4779,7 +6210,7 @@ index be91010..6b03ea8 100644 uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1113,9 @@ typedef struct HEVCContext { +@@ -922,6 +1192,9 @@ typedef struct HEVCContext { uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; @@ -4789,22 +6220,38 @@ index be91010..6b03ea8 100644 } HEVCContext; int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1242,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, int c_idx); -+#ifdef RPI_INTER_QPU -+extern void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n); ++#if RPI_INTER ++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +#endif + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); +@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; + extern const uint8_t ff_hevc_diag_scan8x8_x[64]; + extern const uint8_t ff_hevc_diag_scan8x8_y[64]; + ++#ifdef RPI ++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++#endif ++ + #endif /* AVCODEC_HEVC_H */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 05b2821..e2f1f4e 100644 +index 05b2821..733efde 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c -@@ -21,14 +21,72 @@ +@@ -21,14 +21,76 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -4817,6 +6264,10 @@ index 05b2821..e2f1f4e 100644 #include "hevc.h" +#include "cabac_functions.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide @@ -4878,7 +6329,7 @@ index 05b2821..e2f1f4e 100644 /** * number of bin by SyntaxElement. 
*/ -@@ -445,6 +503,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { { 28, 36, 43, 49, 54, 58, 61, 63, }, }; @@ -5090,7 +6541,7 @@ index 05b2821..e2f1f4e 100644 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) { if (s->ps.pps->entropy_coding_sync_enabled_flag && -@@ -863,19 +1126,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); } @@ -5116,7 +6567,7 @@ index 05b2821..e2f1f4e 100644 } int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -@@ -891,14 +1154,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); } @@ -5133,7 +6584,7 @@ index 05b2821..e2f1f4e 100644 ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ctx_shift = (log2_size + 1) >> 2; } else { -@@ -929,22 +1192,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, return value; } @@ -5159,7 +6610,7 @@ index 05b2821..e2f1f4e 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1223,366 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -5172,7 +6623,7 @@ index 05b2821..e2f1f4e 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -5213,7 +6664,7 @@ index 05b2821..e2f1f4e 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -5359,7 +6810,7 @@ index 05b2821..e2f1f4e 100644 +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ + return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); - } ++} +#endif + + @@ -5454,6 +6905,45 @@ index 05b2821..e2f1f4e 100644 + return i; +} + ++#ifdef RPI ++static void rpi_add_residual(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ unsigned int stride = frame->linesize[c_idx]; ++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = rpi_sliced_frame(frame); ++ uint8_t * dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(frame, x, y) : ++ rpi_sliced_frame_pos_c(frame, x, y); ++ ++// if (c_idx != 0) { ++// return; ++// } ++ if (s->enable_rpi) { ++ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ } ++ else if (!is_sliced || c_idx == 0) { ++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else if (c_idx == 1) { ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else { ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } + } ++#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, @@ -5483,17 +6973,20 @@ index 05b2821..e2f1f4e 100644 + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * scan_xy_off; ++#ifndef RPI ptrdiff_t stride = s->frame->linesize[c_idx]; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; - uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; -+#ifdef RPI -+ //***** transform_skip_flag decoded later! -+ int use_vpu = s->enable_rpi && !lc->cu.cu_transquant_bypass_flag /* && !transform_skip_flag*/ && !lc->tu.cross_pf && log2_trafo_size>=4; -+#endif - int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); - uint8_t significant_coeff_group_flag[8][8] = {{0}}; ++#endif ++#ifdef RPI ++ int use_vpu; ++#endif ++ int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero int explicit_rdpcm_flag = 0; int explicit_rdpcm_dir_flag; @@ -5508,38 +7001,11 @@ index 05b2821..e2f1f4e 100644 int pred_mode_intra = (c_idx == 0) ? lc->tu.intra_pred_mode : lc->tu.intra_pred_mode_c; +- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); + int prev_sig = 0; + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ -+#ifdef RPI -+ if (s->enable_rpi) { -+ int n = trafo_size * trafo_size; -+ if (use_vpu) { -+ // We support size 4 and size 5. -+ // Size 4 grows from the front (Coeffs_buf_arm[2] points to start of buf) -+ // Size 5 grows from the back (Coeffs_buf_arm[3] points to end of buf) -+ // num_coeffs is indexed by log2_trafo_size-2 -+ if (log2_trafo_size == 4) -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] + s->num_coeffs[s->pass0_job][log2_trafo_size - 2]; -+ else -+ coeffs = s->coeffs_buf_arm[s->pass0_job][log2_trafo_size - 2] - s->num_coeffs[s->pass0_job][log2_trafo_size - 2] - n; -+ s->num_coeffs[s->pass0_job][log2_trafo_size - 2] += n; -+ } else { -+ coeffs = s->coeffs_buf_arm[s->pass0_job][0] + s->num_coeffs[s->pass0_job][0]; -+ s->num_coeffs[s->pass0_job][0] += n; -+ } -+ } -+ // We now do the memset after transform_add while we know the data is cached. 
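/*
 * Editor's sketch (not part of the patch): the deleted block above and the
 * rpi_alloc_coeff_buf() call that replaces it share one per-job coefficient
 * arena between two transform sizes: 16x16 coefficients (buf_no 2) are bumped
 * up from the start of the buffer while 32x32 coefficients (buf_no 3) are
 * bumped down from the end, so both pack into a single GPU allocation. A
 * plausible shape for the allocator, reconstructed from the deleted code
 * (the real implementation lives elsewhere in the patch and is not shown in
 * this hunk):
 */
int16_t *rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n)
{
    const int job = s->pass0_job;
    int16_t * const base = s->coeffs_buf_arm[job][buf_no];
    int16_t *p;

    if (buf_no != 3) {
        /* Front bump allocator: next free element follows what is in use. */
        p = base + s->num_coeffs[job][buf_no];
    } else {
        /* Back bump allocator: base points one past the end of the buffer,
         * matching the VPU job setup in worker_core() above, which locates
         * the 32x32 data at coeffs_buf_vc[job][3] - sizeof(int16_t)*count. */
        p = base - s->num_coeffs[job][buf_no] - n;
    }
    s->num_coeffs[job][buf_no] += n;
    return p;
}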
-+ #ifdef RPI_PRECLEAR -+ #else -+ memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+ #endif -+#else - memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); -+#endif -+ + // Derive QP for dequant @@ -5549,7 +7015,7 @@ index 05b2821..e2f1f4e 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1598,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -5570,7 +7036,7 @@ index 05b2821..e2f1f4e 100644 } if (c_idx == 0) { -@@ -1100,39 +1643,73 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -5641,6 +7107,9 @@ index 05b2821..e2f1f4e 100644 + may_hide_sign = 0; } ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && - (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { - explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); @@ -5658,7 +7127,7 @@ index 05b2821..e2f1f4e 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1737,113 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -5715,14 +7184,41 @@ index 05b2821..e2f1f4e 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -- int implicit_non_zero_coeff = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } ++ ++ i = num_last_subset; ++ do { + int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ int n_end; -- uint8_t significant_coeff_flag_idx[16]; + uint8_t significant_coeff_flag_idx[16]; - uint8_t nb_significant_coeff_flag = 0; - - x_cg = scan_x_cg[i]; @@ -5734,8 +7230,7 @@ index 05b2821..e2f1f4e 100644 - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; - +- - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -5744,13 +7239,8 @@ index 05b2821..e2f1f4e 100644 - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); - } -+ i = num_last_subset; -+ do { -+ int implicit_non_zero_coeff = 0; -+ int n_end; - +- - last_scan_pos = num_coeff - offset - 1; -+ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -5836,7 +7326,7 @@ index 05b2821..e2f1f4e 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1286,34 +1857,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -5885,12 +7375,11 @@ index 05b2821..e2f1f4e 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1890,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; -- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -5938,6 +7427,9 @@ index 05b2821..e2f1f4e 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -5948,9 +7440,6 @@ index 05b2821..e2f1f4e 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -5961,13 +7450,18 @@ index 05b2821..e2f1f4e 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -5975,11 +7469,7 @@ index 05b2821..e2f1f4e 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - +- - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -6064,10 +7554,6 @@ index 05b2821..e2f1f4e 100644 + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -6088,6 +7574,10 @@ index 05b2821..e2f1f4e 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; @@ -6188,7 +7678,7 @@ index 05b2821..e2f1f4e 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2078,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -6197,7 +7687,7 @@ index 05b2821..e2f1f4e 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2086,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, for (i = 0; i < 8; i++) FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); } @@ -6205,7 +7695,7 @@ index 05b2821..e2f1f4e 100644 s->hevcdsp.transform_skip(coeffs, log2_trafo_size); if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2096,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { @@ -6233,7 +7723,7 @@ index 05b2821..e2f1f4e 100644 int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2129,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, col_limit = FFMIN(24, col_limit); s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); } @@ -6241,26 +7731,20 @@ index 05b2821..e2f1f4e 100644 } } if (lc->tu.cross_pf) { -@@ -1510,6 +2139,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, 
coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ if (s->enable_rpi) { -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_TRANSFORM_ADD; -+ cmd->size = log2_trafo_size; -+ cmd->buf = coeffs; -+ cmd->dst = dst; -+ cmd->stride = stride; -+ return; -+ } -+#endif ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++#else s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); ++#endif } + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..55a0315 100644 +index 1f33b0c..3143b4f 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -6281,14 +7765,78 @@ index 1f33b0c..55a0315 100644 #include "bit_depth_template.c" +#ifdef RPI -+#include "rpi_user_vcsm.h" +#include "rpi_qpu.h" ++#include "rpi_zc.h" +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -273,6 +284,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) + return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; + } + ++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) ++{ ++#ifdef RPI ++ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#else ++ return s->ps.sps->pixel_shift; ++#endif ++} ++ + static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, + intptr_t stride_dst, intptr_t stride_src) + { +@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, + int stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) + { +- int sh = s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, + int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); + int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); + int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); ++ int len = (min_pu_size >> hshift) << sh; + for (y = y_min; y < y_max; y++) { + for (x = x_min; x < x_max; x++) { + if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { + int n; +- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); + for (n = 0; n < (min_pu_size >> vshift); n++) { + memcpy(src, dst, len); + src += stride_src; +@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, + + static void sao_filter_CTB(HEVCContext *s, int x, int y) + { +- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 
1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + HEVCLocalContext *lc = s->HEVClc; + int c_idx; + int edges[4]; // 0 left 1 top 2 right 3 bottom +@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + uint8_t right_tile_edge = 0; + uint8_t up_tile_edge = 0; + uint8_t bottom_tile_edge = 0; ++#ifdef RPI ++ const int sliced = rpi_sliced_frame(s->frame); ++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); ++#else ++ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); ++#endif + + edges[0] = x_ctb == 0; + edges[1] = y_ctb == 0; edges[2] = x_ctb == s->ps.sps->ctb_width - 1; edges[3] = y_ctb == s->ps.sps->ctb_height - 1; @@ -6299,7 +7847,301 @@ index 1f33b0c..55a0315 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -496,6 +511,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + +- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { + int x0 = x >> s->ps.sps->hshift[c_idx]; + int y0 = y >> s->ps.sps->vshift[c_idx]; + int stride_src = s->frame->linesize[c_idx]; +@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); + int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); + int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +- int stride_dst; ++ ptrdiff_t stride_dst; + uint8_t *dst; + ++#ifdef RPI ++ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++#else ++ const unsigned int sh = s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); ++#endif ++ + switch (sao->type_idx[c_idx]) { + case SAO_BAND: + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (s->ps.pps->transquant_bypass_enable_flag || + (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +- dst = lc->edge_emu_buffer; +- stride_dst = 2*MAX_PB_SIZE; +- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); ++ dst = lc->edge_emu_buffer; ++ stride_dst = 2*MAX_PB_SIZE; ++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); + } else { +- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } + } + sao->type_idx[c_idx] = SAO_APPLIED; + break; +@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + { + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +- int left_edge = edges[0]; + int top_edge = edges[1]; +- int right_edge = edges[2]; + int bottom_edge = edges[3]; +- int sh = s->ps.sps->pixel_shift; +- int left_pixels, right_pixels; + + stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; + dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + + if (!top_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; + uint8_t *dst1; +- int src_idx, pos; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +- dst1 = dst - stride_dst - (left << sh); +- src1[0] = src - stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? 
src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); + } + } + if (!bottom_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; +- uint8_t *dst1; +- int src_idx, pos; ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; + +- dst1 = dst + height * stride_dst - (left << sh); +- src1[0] = src + height * stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); + } + } +- left_pixels = 0; +- if (!left_edge) { ++ if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- left_pixels = 1; ++ copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); + } + } +- right_pixels = 0; +- if (!right_edge) { ++ if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- right_pixels = 1; ++ copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); + } + } + +- copy_CTB(dst - (left_pixels << sh), +- src - (left_pixels << sh), +- (width + left_pixels + right_pixels) << sh, ++ copy_CTB(dst, ++ src, ++ width << sh, + height, stride_dst, stride_src); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); +- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +- sao->eo_class[c_idx], width, height); +- s->hevcdsp.sao_edge_restore[restore](src, dst, +- stride_src, stride_dst, +- sao, +- edges, width, +- height, c_idx, +- vert_edge, +- horiz_edge, +- diag_edge); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, 
height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; +@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + ++// Returns 2 or 0. + static int get_pcm(HEVCContext *s, int x, int y) + { + int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + uint8_t *src; + int x, y; + int chroma, beta; +- int32_t c_tc[2], tc[2]; ++ int32_t c_tc[4], tc[2]; + uint8_t no_p[2] = { 0 }; + uint8_t no_q[2] = { 0 }; + +@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -6315,27 +8157,81 @@ index 1f33b0c..55a0315 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -539,6 +563,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int num16 = (y>>4)*s->setup_width + (x>>4); -+ int a = ((y>>3) & 1) << 1; -+ int b = (x>>3) & 1; -+ setup = s->dvq->y_setup_arm[num16]; -+ setup[0][b][0][a] = beta; -+ setup[0][b][0][a + 1] = beta; -+ setup[0][b][1][a] = tc[0]; -+ setup[0][b][1][a + 1] = tc[1]; -+ } else +@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + 4); + no_q[0] = get_pcm(s, x, y); + no_q[1] = get_pcm(s, x, y + 4); +- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +- s->hevcdsp.hevc_v_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); ++ } ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ ++ // This copes properly with no_p/no_q ++ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q, ++ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ } ++ else +#endif - s->hevcdsp.hevc_v_loop_filter_luma(src, - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); -@@ -571,6 +608,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ { ++ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ // Standard DSP code is broken if no_p / no_q is set ++ s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ else ++#ifdef RPI_DEBLOCK_VPU ++ if (s->enable_rpi_deblock) { ++ uint8_t (*setup)[2][2][4]; ++ int num16 = (y>>4)*s->setup_width + (x>>4); ++ int a = ((y>>3) & 1) << 1; ++ int b = (x>>3) & 1; ++ setup = s->dvq->y_setup_arm[num16]; ++ setup[0][b][0][a] = beta; ++ setup[0][b][0][a + 1] = beta; ++ setup[0][b][1][a] = tc[0]; ++ setup[0][b][1][a + 1] = tc[1]; ++ } else ++#endif ++ { ++ s->hevcdsp.hevc_v_loop_filter_luma(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ } + } + } + +@@ -561,7 +697,12 @@ static void 
deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#endif ++ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + 4, y - 1); +@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -6355,7 +8251,113 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -605,9 +655,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + } + + if (s->ps.sps->chroma_format_idc) { ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ const int v = 2; ++ const int h = 2; ++ ++ // vertical filtering chroma ++ for (y = y0; y < y_end; y += 8 * v) { ++ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; ++ unsigned int no_f = 0; ++ ++ // tc_offset here should be set to cur_tc_offset I think ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x - 1, y) ? 1 : 0) | ++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, ++ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ no_f); ++ } ++ } ++ ++ if (y == 0) ++ continue; ++ ++ // horizontal filtering chroma ++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; ++ x_end2 = x_end; ++ if (x_end != s->ps.sps->width) ++ x_end2 = x_end - 8 * h; ++ ++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; ++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ unsigned int no_f = 0; ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x, y - 1) ? 1 : 0) | ++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x + 4 * h, y) ? 
8 : 0); ++ ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, no_f); ++ } ++ } ++ } ++ } ++ else ++#endif + for (chroma = 1; chroma <= 2; chroma++) { + int h = 1 << s->ps.sps->hshift[chroma]; + int v = 1 << s->ps.sps->vshift[chroma]; +@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6379,7 +8381,21 @@ index 1f33b0c..55a0315 100644 } } -@@ -638,6 +702,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? 
++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + (4 * h), y - 1); +@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -6399,7 +8415,7 @@ index 1f33b0c..55a0315 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +725,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -6469,7 +8485,7 @@ index 1f33b0c..55a0315 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +735,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -6479,8 +8495,9 @@ index 1f33b0c..55a0315 100644 - int i, j, bs; + int i, j; + RefPicList *rpl = s->ref->refPicList; -+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2; -+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size; ++ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); ++ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup ++ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep + int y_pu = y0 >> log2_min_pu_size; + int x_pu = x0 >> log2_min_pu_size; + MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; @@ -6494,7 +8511,7 @@ index 1f33b0c..55a0315 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +761,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -6571,7 +8588,7 @@ index 1f33b0c..55a0315 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +821,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -6586,9 +8603,7 @@ index 1f33b0c..55a0315 100644 - int xq_pu = x0 >> log2_min_pu_size; - int xp_tu = (x0 - 1) >> log2_min_tu_size; - int xq_tu = x0 >> log2_min_tu_size; -+ rpl; -+ MvField *left = curr - 1; - +- - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int y_pu = (y0 + i) >> log2_min_pu_size; - int y_tu = (y0 + i) >> log2_min_tu_size; @@ -6606,18 +8621,20 @@ index 1f33b0c..55a0315 100644 - s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; - } - } -+ if (is_intra) { -+ for (j = 0; j < (1 << log2_trafo_size); j += 4) -+ bs[j * s->bs_width >> 2] = 2; - +- - if (log2_trafo_size > log2_min_pu_size && !is_intra) { - RefPicList *rpl = s->ref->refPicList; -- ++ rpl; ++ MvField *left = curr - 1; + - // bs for TU internal horizontal PU boundaries - for (j = 8; j < (1 << log2_trafo_size); j += 8) { - int yp_pu = (y0 + j - 1) >> 
log2_min_pu_size; - int yq_pu = (y0 + j) >> log2_min_pu_size; -- ++ if (is_intra) { ++ for (j = 0; j < (1 << log2_trafo_size); j += 4) ++ bs[j * s->bs_width >> 2] = 2; + - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int x_pu = (x0 + i) >> log2_min_pu_size; - MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; @@ -6674,137 +8691,42 @@ index 1f33b0c..55a0315 100644 } } } -@@ -840,11 +877,196 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR -+#if !defined(RPI_FAST_CACHEFLUSH) -+#if defined(RPI_LUMA_QPU) || defined(RPI_DEBLOCK_VPU) -+static void flush_buffer_y(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_y(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_u(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_u(frame); -+ gpu_cache_flush(&p); -+} -+ -+static void flush_buffer_v(const AVFrame * const frame) { -+ GPU_MEM_PTR_T p = get_gpu_mem_ptr_v(frame); -+ gpu_cache_flush(&p); -+} -+#endif -+#endif -+ -+ +#ifdef RPI_DEBLOCK_VPU -+#error Not fixed yet -+ +// ff_hevc_flush_buffer_lines +// flushes and invalidates all pixel rows in [start,end-1] +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = start; -+ int n = end; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ if (flush_chroma) { -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; -+ } -+ if (flush_luma) { -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+ } -+ vcsm_clean_invalid( &iocache ); -+#else -+ if (flush_chroma) { -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+ } -+ if (flush_luma) { -+ flush_buffer_y(s->frame); -+ } -+#endif ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_finish(rfe); +} +#endif + -+#ifdef RPI_INTER_QPU -+void ff_hevc_flush_buffer(HEVCContext *s, ThreadFrame *f, int n) ++#if RPI_INTER ++ ++// Flush some lines of a reference frames ++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n) +{ + if (s->enable_rpi && s->used_for_ref) { -+ // TODO make this use ff_hevc_flush_buffer_lines -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ int curr_y = ((int *)f->progress->data)[0]; -+ int curr_uv = curr_y >> s->ps.sps->vshift[1]; -+ int n_uv = n >> s->ps.sps->vshift[1]; -+ int sz,base; -+ 
GPU_MEM_PTR_T p; -+ if (curr_uv < 0) curr_uv = 0; -+ if (n_uv<=curr_uv) { return; } -+ sz = s->frame->linesize[1] * (n_uv-curr_uv); -+ base = s->frame->linesize[1] * curr_uv; -+ p = get_gpu_mem_ptr_u(s->frame); -+ iocache.s[0].handle = p.vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int)p.arm + base; -+ iocache.s[0].size = sz; -+ p = get_gpu_mem_ptr_v(s->frame); -+ iocache.s[1].handle = p.vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int)p.arm + base; -+ iocache.s[1].size = sz; ++ const int d0 = ((int *)f->progress->data)[0]; ++ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+#ifdef RPI_LUMA_QPU -+ p = get_gpu_mem_ptr_y(s->frame); -+ sz = s->frame->linesize[0] * (n-curr_y); -+ base = s->frame->linesize[0] * curr_y; -+ iocache.s[2].handle = p.vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int)p.arm + base; -+ iocache.s[2].size = sz; -+#endif -+ vcsm_clean_invalid( &iocache ); -+#else -+ flush_buffer_u(s->frame); -+ flush_buffer_v(s->frame); -+#ifdef RPI_LUMA_QPU -+ flush_buffer_y(s->frame); -+#endif -+ -+#endif -+ //memcpy(s->dummy.arm,s->frame->data[0],2048*64); -+ //memcpy(s->dummy.arm,s->frame->data[1],1024*32); -+ //memcpy(s->dummy.arm,s->frame->data[2],1024*32); ++ if (curr_y < (unsigned int)f->f->height) { ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); ++ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_finish(rfe); ++ } + } +} +#endif + +#ifdef RPI_DEBLOCK_VPU -+#error XXX +/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +{ @@ -6833,16 +8755,19 @@ index 1f33b0c..55a0315 100644 + s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); + s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; + s->dvq->vpu_cmds_arm[2][5] = 4; ++ + // Call VPU -+ s->dvq->cmd_id = vpu_post_code2( vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5, 0); // 5 means to do all the commands ++ { ++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); ++ vpu_qpu_job_finish(vqj); ++ } + + s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); + s->dvq = s->dvq_ents + s->dvq_n; + -+ if (s->dvq->cmd_id != -1) { -+ vpu_wait(s->dvq->cmd_id); -+ s->dvq->cmd_id = -1; -+ } ++ vpu_qpu_wait(&s->dvq->cmd_id); +} + +#endif @@ -6871,14 +8796,14 @@ index 1f33b0c..55a0315 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1075,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s,&s->ref->tf, y); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif ff_thread_report_progress(&s->ref->tf, y, 0); + } @@ -6886,14 +8811,14 @@ index 1f33b0c..55a0315 100644 if (x_end && y_end) { 
sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) -+ if (s->threads_type & FF_THREAD_FRAME ) { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size); ++ if (s->threads_type == FF_THREAD_FRAME ) { ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + } + } -+ } else if (s->threads_type & FF_THREAD_FRAME && x_end) { ++ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) @@ -6904,15 +8829,15 @@ index 1f33b0c..55a0315 100644 + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } + } else { -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); } - } else if (s->threads_type & FF_THREAD_FRAME && x_end) +#else -+#ifdef RPI_INTER_QPU -+ ff_hevc_flush_buffer(s, &s->ref->tf, y + ctb_size - 4); ++#if RPI_INTER ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); + // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); @@ -6922,10 +8847,23 @@ index 1f33b0c..55a0315 100644 void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..6882a8d 100644 +index 83f2ec2..bcf53dc 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -989,6 +989,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + switch (sps->bit_depth) { + case 8: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; + break; +@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, sps->amp_enabled_flag = get_bits1(gb); sps->sao_enabled = get_bits1(gb); @@ -6935,7 +8873,7 @@ index 83f2ec2..6882a8d 100644 if (sps->pcm_enabled_flag) { sps->pcm.bit_depth = get_bits(gb, 4) + 1; diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..a6534a9 100644 +index 9d773d9..c4d7250 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -7059,7 +8997,68 @@ index 9d773d9..a6534a9 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -257,6 +371,8 @@ int i = 0; +@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) + ++#ifndef RPI ++#define SLICED_LOOP_FILTERS(depth) ++#else ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#endif ++ ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ + hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ + hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ + hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ + hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ + hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ + hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ +@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ + hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ + \ ++ hevcdsp->sao_band_filter_c[0] = \ ++ hevcdsp->sao_band_filter_c[1] = \ ++ hevcdsp->sao_band_filter_c[2] = \ ++ hevcdsp->sao_band_filter_c[3] = \ ++ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[0] = \ ++ hevcdsp->sao_edge_filter_c[1] = \ ++ hevcdsp->sao_edge_filter_c[2] = \ ++ hevcdsp->sao_edge_filter_c[3] = \ ++ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ ++ \ + QPEL_FUNCS(depth); \ + 
QPEL_UNI_FUNCS(depth); \ + QPEL_BI_FUNCS(depth); \ +@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + EPEL_UNI_FUNCS(depth); \ + EPEL_BI_FUNCS(depth); \ + \ ++ SLICED_LOOP_FILTERS(depth); \ + hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ + hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +@@ -257,6 +404,8 @@ int i = 0; break; } @@ -7069,10 +9068,10 @@ index 9d773d9..a6534a9 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..e221e54 100644 +index 9f1f6dd..639ecf1 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h -@@ -42,6 +42,17 @@ typedef struct SAOParams { +@@ -42,11 +42,26 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -7090,21 +9089,742 @@ index 9f1f6dd..e221e54 100644 typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext { ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); + +- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); ++ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); + + void (*transform_skip)(int16_t *coeffs, int16_t log2_size); + +@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { + + void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); + + /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ + void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int sao_eo_class, int width, int height); ++ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); + + void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, ++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); + + void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); ++#ifdef RPI ++ void (*hevc_v_loop_filter_luma2)(uint8_t * 
_pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++#endif ++ + void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, + int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, + MvField *curr, MvField *neigh, uint8_t *bs); } HEVCDSPContext; void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); +diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +index b840d17..32b9e47 100644 +--- a/libavcodec/hevcdsp_template.c ++++ b/libavcodec/hevcdsp_template.c +@@ -26,6 +26,9 @@ + #include "bit_depth_template.c" + #include "hevcdsp.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif + + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height + } + } + ++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, ++ GetBitContext *gb, int pcm_bit_depth) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++ ++ dst = (pixel *)_dst + 1; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++} ++ ++ + static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride, int size) + { +@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe + } + } + ++static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + *res); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ + static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, + FUNC(transquant_bypass)(_dst, coeffs, stride, 32); + } + ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst, res, stride, 32); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 
8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); ++} ++ + + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) + { +@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, width = _width, height = _height; + +@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; +- int y_stride_dst = stride_dst * (height - 1); +- int y_stride_src = stride_src * (height - 1); ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, init_y = 0, width = _width, height = _height; + +@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + init_y = 1; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; +- int y_stride_dst = stride_dst * (height - 1); +- int y_stride_src = stride_src * (height - 1); ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + 
y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } + } + ++ ++// --- Plaited chroma versions ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ 
int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#endif ++ ++ + #undef CMP + + //////////////////////////////////////////////////////////////////////////////// +@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + #undef TQ1 + #undef TQ2 + #undef TQ3 ++ ++#ifdef RPI ++ ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], ++ const uint8_t _no_p[2], const uint8_t _no_q[2], ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 
4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef 
Q0 ++#undef Q1 ++ ++ ++#endif ++ +diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +index 02c1766..cea16ea 100644 +--- a/libavcodec/hevcpred.c ++++ b/libavcodec/hevcpred.c +@@ -24,6 +24,7 @@ + + #include "hevcpred.h" + ++#define PRED_C 0 + #define BIT_DEPTH 8 + #include "hevcpred_template.c" + #undef BIT_DEPTH +@@ -39,13 +40,37 @@ + #define BIT_DEPTH 12 + #include "hevcpred_template.c" + #undef BIT_DEPTH ++#undef PRED_C ++ ++#ifdef RPI ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++#endif + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + { + #undef FUNC + #define FUNC(a, depth) a ## _ ## depth + +-#define HEVC_PRED(depth) \ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ + hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ + hpc->pred_angular[3] = FUNC(pred_angular_3, depth); + ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ ++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ ++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ ++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ ++#ifdef RPI ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++#else ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); ++#endif ++ + switch (bit_depth) { + case 9: + HEVC_PRED(9); +diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +index eb17663..00ba3f9 100644 +--- a/libavcodec/hevcpred.h ++++ b/libavcodec/hevcpred.h +@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int mode); ++#ifdef RPI ++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); ++ ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++#endif + } HEVCPredContext; + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..28d2653 100644 +index 6ae87cc..c14dddd 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,6 +20,8 @@ +@@ -20,13 +20,55 @@ * Foundation, Inc., 51 Franklin Street, 
Fifth Floor, Boston, MA 02110-1301 USA */ @@ -7113,7 +9833,54 @@ index 6ae87cc..28d2653 100644 #include "libavutil/pixdesc.h" #include "bit_depth_template.c" -@@ -69,8 +71,11 @@ do { \ + #include "hevcpred.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ ++#define DUMP_PRED 0 ++ + #define POS(x, y) src[(x) + stride * (y)] + ++#if PRED_C ++ ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++ ++#if BIT_DEPTH == 8 ++#undef BIT_DEPTH ++#define BIT_DEPTH 16 ++#include "bit_depth_template.c" ++#undef FUNC ++#define FUNC(a) FUNC3(a, 8, _c) ++#else ++#undef FUNC ++#define FUNC FUNCC ++#endif ++ ++#endif ++ ++#if DUMP_PRED ++#ifndef DEBUG_ONCE ++#define DEBUG_ONCE ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++#endif ++ + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, + int log2_size, int c_idx) + { +@@ -69,8 +111,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) @@ -7126,17 +9893,399 @@ index 6ae87cc..28d2653 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -114,6 +119,10 @@ do { \ +@@ -79,15 +124,23 @@ do { \ + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = size << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +- int x = x0 >> hshift; +- int y = y0 >> vshift; ++ const int x = x0 >> hshift; ++ const int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); + +- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++#if defined(RPI) ++ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ (pixel*)s->frame->data[c_idx] + x + y * stride : ++ c_idx == 0 ? 
++ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : ++ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++#else + pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; ++#endif + + int min_pu_width = s->ps.sps->min_pu_width; + +@@ -95,14 +148,20 @@ do { \ + lc->tu.intra_pred_mode; + pixel4 a; + pixel left_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; ++#endif + pixel top_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++#endif + + pixel *left = left_array + 1; + pixel *top = top_array + 1; ++#if !PRED_C + pixel *filtered_left = filtered_left_array + 1; + pixel *filtered_top = filtered_top_array + 1; ++#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; +@@ -114,6 +173,26 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; ++ pixel * src_l = src - 1; ++ pixel * src_u = src - stride; ++ pixel * src_ur = src_u + size; ++ +#ifdef DISABLE_INTRA + return; +#endif ++ ++#if defined(RPI) ++ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++#endif + if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); +@@ -163,23 +242,24 @@ do { \ + top[-1] = 128; + } + if (cand_up_left) { +- left[-1] = POS(-1, -1); ++ left[-1] = src_l[-stride]; + top[-1] = left[-1]; + } + if (cand_up) +- memcpy(top, src - stride, size * sizeof(pixel)); ++ // Always good - even with sand ++ memcpy(top, src_u, size * sizeof(pixel)); + if (cand_up_right) { +- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), ++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], + size - top_right_size); + } + if (cand_left) + for (i = 0; i < size; i++) +- left[i] = POS(-1, i); ++ left[i] = src_l[stride * i]; + if (cand_bottom_left) { + for (i = size; i < size + bottom_left_size; i++) +- left[i] = POS(-1, i); +- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), ++ left[i] = src_l[stride * i]; ++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], + size - bottom_left_size); + } + +@@ -268,7 +348,11 @@ do { \ + cand_up_left = 1; + cand_left = 1; + } else { // No samples available ++#if PRED_C && BIT_DEPTH == 16 ++ left[-1] = 0x8080; ++#else + left[-1] = (1 << (BIT_DEPTH - 1)); ++#endif + EXTEND(top, left[-1], 2 * size); + EXTEND(left, left[-1], 2 * size); + } +@@ -287,6 +371,9 @@ do { \ + top[-1] = left[-1]; + + // Filtering process ++ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // worry about chroma smoothing for that case ++#if !PRED_C + if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && size != 4){ + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +@@ -342,13 +429,46 @@ do { \ + mode); + break; + } ++#else ++ 
switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif + } + ++#if !PRED_C || BIT_DEPTH == 16 + #define INTRA_PRED(size) \ + static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ + { \ + FUNC(intra_pred)(s, x0, y0, size, c_idx); \ + } ++#else ++#define INTRA_PRED(size) \ ++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#endif + + INTRA_PRED(2) + INTRA_PRED(3) +@@ -357,6 +477,7 @@ INTRA_PRED(5) + + #undef INTRA_PRED + ++#if !PRED_C + static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, ptrdiff_t stride, + int trafo_size) +@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to + POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); + } ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif + ++#if !PRED_C || BIT_DEPTH == 16 + #define PRED_PLANAR(size)\ + static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ + { \ + FUNC(pred_planar)(src, top, left, stride, size + 2); \ + } ++#else ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ ++ abort(); \ ++} ++#endif + + PRED_PLANAR(0) + PRED_PLANAR(1) +@@ -386,6 +540,7 @@ PRED_PLANAR(3) + + #undef PRED_PLANAR + ++#if !PRED_C + static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size, int c_idx) +@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + POS(0, y) = (left[y] + 3 * dc + 2) >> 2; + } + } ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; 
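++ // (Editor's note, not in the original patch) In this PRED_C variant each
++ // element of src/top/left is an interleaved {U,V} byte pair (c8_dst_ptr_t /
++ // c8_src_ptr_t above), so the DC value is accumulated and written separately
++ // per component - dc0 for U and dc1 for V - in the loops that follow.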
++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; + ++ } ++ } ++} ++#endif ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif ++ ++#if !PRED_C + static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, +@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + +- static const int intra_pred_angle[] = { +- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +- }; +- static const int inv_angle[] = { +- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +- -630, -910, -1638, -4096 +- }; +- + int angle = intra_pred_angle[mode - 2]; + pixel ref_array[3 * MAX_TB_SIZE + 4]; + pixel *ref_tmp = ref_array + size; +@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + } + } + } ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c8_dst_ptr_t ref_tmp = ref_array + size; ++ c8_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * 
ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif + + static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 099a8c5..bdff2d2 100644 --- a/libavcodec/mmaldec.c @@ -7181,6 +10330,87 @@ index 3adf28d..2f9195f 100644 if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index bfa2537..1bca89e 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + ++ /* RPI */ ++#ifdef RPI ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++#endif ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d837056..81256b5 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) ++{ ++ for (int y = 0; y != frame->height / 2; ++y) { ++ for (int x = 0; x < frame->width; x += frame->linesize[0]) { ++ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; ++ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; ++ for (int i = 0; i < w; ++i) ++ *dst++ = p[i * 2]; ++ } ++ } ++ return dst; ++} ++ ++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ int size = frame->width * frame->height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ // Luma is "easy" ++ for (int y = 0; y != frame->height; ++y) { ++ for (int x = 0; x < frame->width; x += frame->linesize[0]) { ++ const int w = FFMIN(frame->linesize[0], frame->width - x); ++ memcpy(dst, ++ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); ++ dst += w; ++ } ++ } ++ // Chroma is dull ++ dst = cpy_sand_c(dst, frame, 0); ++ dst = cpy_sand_c(dst, frame, 1); ++ ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + if (ret < 0) + return ret; + ++ if (frame->format == AV_PIX_FMT_SAND128) { ++ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ *got_packet = (ret == 0); ++ return ret; ++ } ++ + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) + return ret; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h new file mode 100644 index 0000000..4309f1c @@ -11182,10 +14412,10 @@ index 0000000..5543093 + pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..3904efc +index 0000000..0255f5d --- /dev/null +++ b/libavcodec/rpi_mailbox.c -@@ -0,0 +1,340 @@ +@@ -0,0 +1,149 @@ +/* 
+Copyright (c) 2012, Broadcom Europe Ltd. +All rights reserved. @@ -11213,6 +14443,8 @@ index 0000000..3904efc +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + ++#ifdef RPI ++ +#include +#include +#include @@ -11220,7 +14452,6 @@ index 0000000..3904efc +#include +#include +#include -+#include +#include + +#include @@ -11230,75 +14461,7 @@ index 0000000..3904efc +#define DEVICE_FILE_NAME "/dev/vcio" + +#include "rpi_mailbox.h" -+ -+#define PAGE_SIZE (4*1024) -+ -+// Shared memory will not be cached in ARM cache -+void *mapmem_shared(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_SHARED/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+// Unshared memory will be faster as lives in ARM cache, but requires cache flushing -+void *mapmem_private(unsigned base, unsigned size) -+{ -+ int mem_fd; -+ unsigned offset = base % PAGE_SIZE; -+ base = base - offset; -+ /* open /dev/mem */ -+ if ((mem_fd = open("/dev/mem", O_RDWR|O_SYNC) ) < 0) { -+ printf("can't open /dev/mem\nThis program should be run as root. Try prefixing command with: sudo\n"); -+ return NULL; -+ } -+ void *mem = mmap( -+ 0, -+ size, -+ PROT_READ|PROT_WRITE, -+ MAP_PRIVATE/*|MAP_FIXED*/, -+ mem_fd, -+ base); -+#ifdef DEBUG -+ printf("base=0x%x, mem=%p\n", base, mem); -+#endif -+ if (mem == MAP_FAILED) { -+ printf("mmap error %d\n", (int)mem); -+ return NULL; -+ } -+ close(mem_fd); -+ return (char *)mem + offset; -+} -+ -+void unmapmem(void *addr, unsigned size) -+{ -+ int s = munmap(addr, size); -+ if (s != 0) { -+ printf("munmap error %d\n", s); -+ exit (-1); -+ } -+} ++//#include + +/* + * use ioctl to send mbox property message @@ -11320,47 +14483,7 @@ index 0000000..3904efc + return ret_val; +} + -+unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000c; // (the tag id) -+ p[i++] = 12; // (size of the buffer) -+ p[i++] = 12; // (size of the data) -+ p[i++] = size; // (num bytes? or pages?) 
-+ p[i++] = align; // (alignment) -+ p[i++] = flags; // (MEM_FLAG_L1_NONALLOCATING) -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_free(int file_desc, unsigned handle) -+{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x3000f; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = handle; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned mem_lock(int file_desc, unsigned handle) ++unsigned mbox_mem_lock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11379,7 +14502,7 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned mem_unlock(int file_desc, unsigned handle) ++unsigned mbox_mem_unlock(int file_desc, unsigned handle) +{ + int i=0; + unsigned p[32]; @@ -11398,117 +14521,30 @@ index 0000000..3904efc + return p[5]; +} + -+unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5) ++#define GET_VCIMAGE_PARAMS 0x30044 ++ ++int mbox_get_image_params(int fd, VC_IMAGE_T * img) +{ -+ int i=0; -+ unsigned p[32]; -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request ++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32]; ++ uint32_t * p = buf; ++ void * rimg; ++ int rv; + -+ p[i++] = 0x30010; // (the tag id) -+ p[i++] = 28; // (size of the buffer) -+ p[i++] = 28; // (size of the data) -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; ++ *p++ = 0; // size ++ *p++ = 0; // process request ++ *p++ = GET_VCIMAGE_PARAMS; ++ *p++ = sizeof(*img); ++ *p++ = sizeof(*img); ++ rimg = p; ++ memcpy(p, img, sizeof(*img)); ++ p += sizeof(*img) / sizeof(*p); ++ *p++ = 0; // End tag ++ buf[0] = (p - buf) * sizeof(*p); + -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size ++ rv = mbox_property(fd, buf); ++ memcpy(img, rimg, sizeof(*img)); + -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned qpu_enable(int file_desc, unsigned enable) -+{ -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ -+ p[i++] = 0x30012; // (the tag id) -+ p[i++] = 4; // (size of the buffer) -+ p[i++] = 4; // (size of the data) -+ p[i++] = enable; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30011; // (the tag id) -+ p[i++] = 16; // (size of the buffer) -+ p[i++] = 16; // (size of the data) -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return p[5]; -+} -+ -+void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, 
unsigned r4_2, unsigned r5_2) { -+ int i=0; -+ unsigned p[32]; -+ -+ p[i++] = 0; // size -+ p[i++] = 0x00000000; // process request -+ p[i++] = 0x30018; // (the tag id) -+ p[i++] = 88; // (size of the buffer) -+ p[i++] = 88; // (size of the data) -+ -+ p[i++] = num_qpus; -+ p[i++] = control; -+ p[i++] = noflush; -+ p[i++] = timeout; // ms -+ -+ p[i++] = num_qpus_2; -+ p[i++] = control_2; -+ p[i++] = noflush_2; -+ p[i++] = timeout_2; // ms -+ -+ p[i++] = code; -+ p[i++] = r0; -+ p[i++] = r1; -+ p[i++] = r2; -+ p[i++] = r3; -+ p[i++] = r4; -+ p[i++] = r5; -+ -+ p[i++] = code_2; -+ p[i++] = r0_2; -+ p[i++] = r1_2; -+ p[i++] = r2_2; -+ p[i++] = r3_2; -+ p[i++] = r4_2; -+ p[i++] = r5_2; -+ -+ p[i++] = 0x00000000; // end tag -+ p[0] = i*sizeof *p; // actual size -+ -+ mbox_property(file_desc, p); -+ return; ++ return rv; +} + +int mbox_open() { @@ -11526,55 +14562,80 @@ index 0000000..3904efc +void mbox_close(int file_desc) { + close(file_desc); +} ++ ++#endif ++ diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..5898102 +index 0000000..b316878 --- /dev/null +++ b/libavcodec/rpi_mailbox.h -@@ -0,0 +1,25 @@ +@@ -0,0 +1,58 @@ +#ifndef RPI_MAILBOX_H +#define RPI_MAILBOX_H + ++/* The image structure. */ ++typedef struct vc_image_extra_uv_s { ++ void *u, *v; ++ int vpitch; ++} VC_IMAGE_EXTRA_UV_T; ++ ++typedef union { ++ VC_IMAGE_EXTRA_UV_T uv; ++// VC_IMAGE_EXTRA_RGBA_T rgba; ++// VC_IMAGE_EXTRA_PAL_T pal; ++// VC_IMAGE_EXTRA_TF_T tf; ++// VC_IMAGE_EXTRA_BAYER_T bayer; ++// VC_IMAGE_EXTRA_MSBAYER_T msbayer; ++// VC_IMAGE_EXTRA_CODEC_T codec; ++// VC_IMAGE_EXTRA_OPENGL_T opengl; ++} VC_IMAGE_EXTRA_T; ++ ++ ++typedef struct VC_IMAGE_T { ++ unsigned short type; /* should restrict to 16 bits */ ++ unsigned short info; /* format-specific info; zero for VC02 behaviour */ ++ unsigned short width; /* width in pixels */ ++ unsigned short height; /* height in pixels */ ++ int pitch; /* pitch of image_data array in bytes */ ++ int size; /* number of bytes available in image_data array */ ++ void *image_data; /* pixel data */ ++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */ ++ void *metadata; /* metadata header for the image */ ++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */ ++ int mem_handle; /* the mem handle for relocatable memory storage */ ++ int metadata_size; /* size of metadata of each channel in bytes */ ++ int channel_offset; /* offset of consecutive channels in bytes */ ++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */ ++ uint8_t num_channels; /* number of channels (2 for stereo) */ ++ uint8_t current_channel;/* the channel this header is currently pointing to */ ++ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/ ++ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header ++ into a linked-mulitchannel image */ ++ uint8_t channel_index; /* index of the channel this header represents while ++ it is being linked. 
*/ ++ uint8_t _dummy[3]; /* pad struct to 64 bytes */ ++} VC_IMAGE_T; ++ ++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; ++ ++ +extern int mbox_open(void); +extern void mbox_close(int file_desc); + -+extern unsigned get_version(int file_desc); -+extern unsigned mem_alloc(int file_desc, unsigned size, unsigned align, unsigned flags); -+extern unsigned mem_free(int file_desc, unsigned handle); -+extern unsigned mem_lock(int file_desc, unsigned handle); -+extern unsigned mem_unlock(int file_desc, unsigned handle); -+extern void *mapmem_shared(unsigned base, unsigned size); -+extern void *mapmem_private(unsigned base, unsigned size); -+extern void unmapmem(void *addr, unsigned size); ++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); ++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + -+extern unsigned execute_code(int file_desc, unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5); -+extern unsigned execute_qpu(int file_desc, unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout); -+extern void execute_multi(int file_desc, -+ unsigned num_qpus, unsigned control, unsigned noflush, unsigned timeout, -+ unsigned num_qpus_2, unsigned control_2, unsigned noflush_2, unsigned timeout_2, -+ unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, -+ unsigned code_2, unsigned r0_2, unsigned r1_2, unsigned r2_2, unsigned r3_2, unsigned r4_2, unsigned r5_2); -+extern unsigned qpu_enable(int file_desc, unsigned enable); ++int mbox_get_image_params(int fd, VC_IMAGE_T * img); + +#endif diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..365f4a6 +index 0000000..7c0eedd --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,993 @@ +@@ -0,0 +1,902 @@ +#ifdef RPI -+// Use vchiq service for submitting jobs -+#define GPUSERVICE -+ -+// This works better than the mmap in that the memory can be cached, but requires a kernel modification to enable the device. 
-+// define RPI_TIME_TOTAL_QPU to print out how much time is spent in the QPU code -+//#define RPI_TIME_TOTAL_QPU -+// define RPI_TIME_TOTAL_VPU to print out how much time is spent in the VPI code -+//#define RPI_TIME_TOTAL_VPU -+// define RPI_TIME_TOTAL_POSTED to print out how much time is spent in the multi execute QPU/VPU combined -+#define RPI_TIME_TOTAL_POSTED -+ +#include +#include +#include @@ -11587,27 +14648,35 @@ index 0000000..365f4a6 +#include +#include + ++#include ++ +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" +#include "rpi_hevc_transform.h" ++#include "rpi_zc.h" + -+#include "rpi_user_vcsm.h" -+#ifdef GPUSERVICE +#pragma GCC diagnostic push +// Many many redundant decls in the header files +#pragma GCC diagnostic ignored "-Wredundant-decls" +#include "interface/vmcs_host/vc_vchi_gpuserv.h" +#pragma GCC diagnostic pop -+#endif + -+// QPU profile flags -+#define NO_FLUSH 1 -+#define CLEAR_PROFILE 2 -+#define OUTPUT_COUNTS 4 ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + -+#define FLAGS_FOR_PROFILING (NO_FLUSH) ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 + ++// QPU "noflush" flags ++// a mixture of flushing & profiling ++ ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results ++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling ++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + +// On Pi2 there is no way to access the VPU L2 cache +// GPU_MEM_FLG should be 4 for uncached memory. 
(Or C for alias to allocate in the VPU L2 cache) @@ -11664,65 +14733,223 @@ index 0000000..365f4a6 +{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} +}; + ++// Code/constants on GPU +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; + unsigned int vpu_code[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; -+ int open_count; // Number of allocated video buffers -+ int mb; // Mailbox handle -+ int vc; // Address in GPU memory -+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task -+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task +}; + ++#define CFE_ENTS_PER_A 8 ++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices ++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 ++// allow 128 ++#define CFE_ENT_COUNT 128 ++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) ++ ++struct rpi_cache_flush_env_s { ++ unsigned int n; ++ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++}; ++ ++#define WAIT_COUNT_MAX 16 ++ ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; ++ ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; ++ ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ unsigned int cost; ++ struct vq_wait_s * next; ++} vq_wait_t; ++ ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; ++ ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); ++ ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int mb; ++ unsigned int current_load; ++ GPU_MEM_PTR_T code_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; ++ +// Stop more than one thread trying to allocate memory or use the processing resources at once +static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; -+static volatile struct GPU* gpu = NULL; -+static GPU_MEM_PTR_T gpu_mem_ptr; ++static gpu_env_t * gpu = NULL; + -+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED) -+static unsigned int Microseconds(void) { ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ++static int64_t ns_time(void) ++{ + struct timespec ts; -+ unsigned int x; -+ static unsigned int base = 0; -+ clock_gettime(CLOCK_REALTIME, &ts); -+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000; -+ if (base==0) base=x; -+ return x-base; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; +} ++ ++ ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 ++ ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" ++ ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } ++ ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 - tto->total[0]), ++ 
T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++ const int n = --tto->count;
++ av_assert0(n >= 0);
++ tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++ tto_print(&ttw->active, now, ttw->start0, "Active");
++ tto_print(&ttw->wait, now, ttw->start0, " Wait");
++}
++
+#endif
+
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
++// GPU memory alloc fns (internal)
++
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
++
++ // We have to hope that everything has terminated...
++ gpu = NULL;
++
++ vc_gpuserv_deinit();
++
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr);
++
++ vcsm_exit();
++
++ mbox_close(ge->mb);
++
++ vq_wait_pool_deinit(&ge->wait_pool);
++
++ free(ge);
++}
++
+
+// Connect to QPU, returns 0 on success.
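++// (Editor's note, not in the original patch) gpu_init() runs at most once,
++// under gpu_mutex via gpu_lock_ref(): it opens the mailbox, sets up the wait
++// pool and vcsm, then copies the QPU shader, the VPU transform code and the
++// transform matrices into a single uncached GPU memory block (code_gm_ptr),
++// whose bus address is later returned by vpu_get_fn()/vpu_get_constants().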
-+static int gpu_init(volatile struct GPU **gpu) { -+ int mb = mbox_open(); -+ int vc; ++static int gpu_init(gpu_env_t ** const gpu) { + volatile struct GPU* ptr; -+ if (mb < 0) -+ return -1; -+#ifndef RPI_ASYNC -+ if (qpu_enable(mb, 1)) return -2; -+#endif ++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t)); ++ *gpu = NULL; ++ ++ if (ge == NULL) ++ return -1; ++ ++ if ((ge->mb = mbox_open()) < 0) ++ return -1; ++ ++ vq_wait_pool_init(&ge->wait_pool); ++ + vcsm_init(); -+ vc_gpuserv_init(); -+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb); -+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm; -+ memset((void*)ptr, 0, sizeof *ptr); -+ vc = gpu_mem_ptr.vc; + -+ ptr->mb = mb; -+ ptr->vc = vc; ++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr); ++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm; + -+ printf("GPU allocated at 0x%x\n",vc); -+ -+ *gpu = ptr; ++ // Zero everything so we have zeros between the code bits ++ memset((void *)ptr, 0, sizeof(*ptr)); + + // Now copy over the QPU code into GPU memory + { -+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV); ++ int num_bytes = (char *)mc_end - (char *)rpi_shader; + av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int)); + memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes); + } @@ -11735,106 +14962,56 @@ index 0000000..365f4a6 + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); + -+#ifdef RPI_ASYNC -+ { -+ int err; -+ vpu_async_tail = 0; -+ vpu_async_head = 0; -+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL); -+ //printf("Created thread\n"); -+ if (err) { -+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n"); -+ return -4; -+ } -+ -+ { -+ struct sched_param param = {0}; -+ int policy = 0; -+ -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ policy = SCHED_FIFO; -+ param.sched_priority = sched_get_priority_max(SCHED_FIFO); -+ -+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ -+ if (pthread_setschedparam(vpu_thread, policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0) -+ { -+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n"); -+ } -+ else -+ { -+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n", -+ policy, -+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" , -+ param.sched_priority); -+ } -+ } -+ } -+ -+ } -+ -+ } -+#endif -+ ++ *gpu = ge; + return 0; +} + -+// Returns 1 if the gpu is currently idle -+static int gpu_idle(void) -+{ -+ int ret = pthread_mutex_trylock(&gpu_mutex); -+ if (ret==0) { -+ pthread_mutex_unlock(&gpu_mutex); -+ return 1; -+ } -+ return 0; -+} + -+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. 
-+static void gpu_lock(void) { -+ pthread_mutex_lock(&gpu_mutex); -+ -+ if (gpu==NULL) { -+ gpu_init(&gpu); -+ } -+} + +static void gpu_unlock(void) { + pthread_mutex_unlock(&gpu_mutex); +} + -+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; ++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary. ++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); ++ ++ av_assert0(gpu != NULL); ++ return gpu; +} + ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); ++ ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } ++ ++ ++gpu->open_count; ++ return gpu; ++} ++ ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); ++ ++ gpu_unlock(); ++} ++ ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert0(gpu != NULL); ++ return gpu; ++} ++ ++// Public gpu fns ++ +// Allocate memory on GPU +// Fills in structure
containing ARM pointer, videocore handle, videocore memory address, numbytes +// Returns 0 on success. @@ -11843,732 +15020,532 @@ index 0000000..365f4a6 +int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+int gpu_get_mailbox(void) -+{ -+ av_assert0(gpu); -+ return gpu->mb; -+} -+ -+// Call this to clean and invalidate a region of memory -+void gpu_cache_flush(const GPU_MEM_PTR_T * const p) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p->arm; -+ iocache.s[0].size = p->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp = vcsm_lock(p->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2) -+{ -+#ifdef RPI_FAST_CACHEFLUSH -+ struct vcsm_user_clean_invalid_s iocache = {}; -+ iocache.s[0].handle = p0->vcsm_handle; -+ iocache.s[0].cmd = 3; // clean+invalidate -+ iocache.s[0].addr = (int) p0->arm; -+ iocache.s[0].size = p0->numbytes; -+ iocache.s[1].handle = p1->vcsm_handle; -+ iocache.s[1].cmd = 3; // clean+invalidate -+ iocache.s[1].addr = (int) p1->arm; -+ iocache.s[1].size = p1->numbytes; -+ iocache.s[2].handle = p2->vcsm_handle; -+ iocache.s[2].cmd = 3; // clean+invalidate -+ iocache.s[2].addr = (int) p2->arm; -+ iocache.s[2].size = p2->numbytes; -+ vcsm_clean_invalid( &iocache ); -+#else -+ void *tmp; -+ tmp = vcsm_lock(p0->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p1->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+ tmp = vcsm_lock(p2->vcsm_handle); -+ vcsm_unlock_ptr(tmp); -+#endif -+} -+ -+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); -+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); -+ av_assert0(p->vcsm_handle); -+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); -+ av_assert0(p->vc_handle); -+ p->arm = vcsm_lock(p->vcsm_handle); -+ av_assert0(p->arm); -+ p->vc = mem_lock(gpu->mb, p->vc_handle); -+ av_assert0(p->vc); -+ return 0; -+} -+ +// This allocates data that will be +// Cached in ARM L2 +// Uncached in VPU L2 +int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) +{ + int r; -+ gpu_lock(); -+ r = gpu_malloc_cached_internal(numbytes, p); -+ gpu->open_count++; ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p); + gpu_unlock(); + return r; +} + -+static void gpu_term(void) -+{ -+ int mb; -+ -+ if (gpu==NULL) -+ return; -+ mb = gpu->mb; -+ -+ // ??? 
Tear down anything needed for gpuexecute -+ -+ qpu_enable(mb, 0); -+ gpu_free_internal(&gpu_mem_ptr); -+ -+ vc_gpuserv_deinit(); -+ vcsm_exit(); -+ -+ mbox_close(mb); -+ gpu = NULL; -+} -+ -+void gpu_free_internal(GPU_MEM_PTR_T *p) { -+ int mb = gpu->mb; -+ mem_unlock(mb,p->vc_handle); -+ vcsm_unlock_ptr(p->arm); -+ vcsm_free(p->vcsm_handle); -+} -+ -+void gpu_free(GPU_MEM_PTR_T *p) { -+ gpu_lock(); -+ -+ gpu_free_internal(p); -+ -+ gpu->open_count--; -+ if (gpu->open_count==0) { -+ printf("Closing GPU\n"); -+ gpu_term(); -+ gpu = NULL; -+ } -+ gpu_unlock(); ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_free_internal(ge->mb, p); ++ gpu_unlock_unref(ge); +} + +unsigned int vpu_get_fn(void) { + // Make sure that the gpu is initialized -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ return gpu->vc + offsetof(struct GPU,vpu_code); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); +} + +unsigned int vpu_get_constants(void) { -+ if (gpu==NULL) { -+ gpu_lock(); ++ av_assert0(gpu != NULL); ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even); ++} ++ ++int gpu_get_mailbox(void) ++{ ++ av_assert0(gpu); ++ return gpu->mb; ++} ++ ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init() ++{ ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); ++ if (rfe == NULL) ++ return NULL; ++ ++ rfe->n = 0; ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ if (rfe != NULL) ++ free(rfe); ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ unsigned int na; ++ unsigned int nr; ++ ++ // Clear any reamaining ents in the final block ++ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) ++ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ ++ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) ++ { ++ if (vcsm_clean_invalid(rfe->a + na) != 0) ++ rc = -1; ++ } ++ ++ free(rfe); ++ ++ if (rc == 0) ++ return 0; ++ ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || gm->numbytes == 0) ++ return; ++ ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm; ++ a->s[n].size = gm->numbytes; ++ ++rfe->n; ++ } ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; ++ ++// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); ++ ++ { ++ struct 
vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm + offset; ++ a->s[n].size = size; ++ ++rfe->n; ++ } ++} ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * start_line; ++ const unsigned int y_size = frame->linesize[0] * n; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(n <= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++ ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!rpi_sliced_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++  unsigned int i;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_init(&wp->pool[i].sem, 0, 0);
++    wp->pool[i].next = wp->pool + i + 1;
++  }
++  wp->head = wp->pool + 0;
++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++  unsigned int i;
++  wp->head = NULL;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_destroy(&wp->pool[i].sem);
++    wp->pool[i].next = NULL;
++  }
++}
++
++
++// If sem_init actually takes time then maybe we want a pool...
++static vq_wait_t * vq_wait_new(const unsigned int cost)
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  vq_wait_t * const wait = ge->wait_pool.head;
++  ge->wait_pool.head = wait->next;
++  ge->current_load += cost;
++  wait->cost = cost;
++  wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  tto_start(&ge->ttw.active, ns_time());
++#endif
++
++  gpu_unlock();
++  return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait)
++{
++  gpu_env_t * const ge = gpu_lock();
++  wait->next = ge->wait_pool.head;
++  ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    trace_time_wait_t * const ttw = &ge->ttw;
++    const int64_t now = ns_time();
++    ++ttw->jcount;
++    tto_end(&ttw->wait, now);
++
++    if (ttw->start0 == 0)
++    {
++      ttw->start0 = ttw->active.start[0];
++      ttw->last_update = ttw->start0;
++    }
++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++    {
++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++      ttw_print(ttw, now);
++    }
++  }
++#endif
++  gpu_unlock_unref(ge);
++}
++
++static void vq_wait_wait(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    const int64_t now = ns_time();
++    gpu_env_t * const ge = gpu_lock();
++    tto_start(&ge->ttw.wait, now);
++    gpu_unlock();
++  }
++#endif
++
++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++    /* loop */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
++  if (wait->cost != 0)
++#endif
++  {
++    gpu_env_t *const ge = gpu_lock();
++    ge->current_load -= wait->cost;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    tto_end(&ge->ttw.active, ns_time());
++#endif
 +    gpu_unlock();
 +  }
++
++  sem_post(&wait->sem);
 +}
 +
++
++
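The vq_wait_* helpers above are static to this file, but their intended lifecycle is worth spelling out (comment sketch only, describing the code above, not new API):

// vq_wait_t lifecycle, as implemented above:
//   wait = vq_wait_new(cost);   // take a pooled sem, add cost to current_load
//   ...GPU job completes; its callback runs vq_wait_post(wait)...
//   vq_wait_wait(wait);         // sem_wait, retried on EINTR
//   vq_wait_delete(wait);       // return the sem to the pool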
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
++{
++  unsigned int n;
++  unsigned int mask;
++  unsigned int cost;
++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++  return vqj;
 +}
 +
 +void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
 +{
++  memset(vqj, 0, sizeof(*vqj));
++  free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++  struct gpu_job_s * const j = vqj->j + vqj->n++;
++  av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++  return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++  if (vpu_code != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    j->command = EXECUTE_VPU;
++    j->u.v.q[0] = vpu_code;
++    j->u.v.q[1] = r0;
++    j->u.v.q[2] = r1;
++    j->u.v.q[3] = r2;
++    j->u.v.q[4] = r3;
++    j->u.v.q[5] = r4;
++    j->u.v.q[6] = r5;
 +  }
 +}
 +
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
 +{
++  if (n != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++    vqj->cost += cost;
 +
++    j->command = EXECUTE_QPU;
++    j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++#endif
++    j->u.q.timeout = 5000;
++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  }
 +}
 +
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
 +{
++  vq_wait_post(v);
 +}
 +
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
 +{
++  vq_wait_t * wait;
 +
++  if (vqj->mask == 0) {
++    *wait_h = NULL;
++    return;
++  }
 +
++  // We are going to want a sync object
++  wait = 
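Putting the pieces together, a caller would build and launch a combined VPU + QPU job roughly as below. Illustrative sketch only, not part of the patch; example_run_job and its error handling are made up, but every vpu_qpu_* call is the API defined here (mail must hold n * QPU_MAIL_EL_VALS words):

static int example_run_job(const uint32_t vpu_code, const uint32_t * const mail, const unsigned int n)
{
  vpu_qpu_wait_h sync;
  vpu_qpu_job_env_t * const vqj = vpu_qpu_job_new();
  if (vqj == NULL)
    return -1;
  vpu_qpu_job_add_vpu(vqj, vpu_code, 0, 0, 0, 0, 0, 0); // r0-r5 as the VPU code expects
  vpu_qpu_job_add_qpu(vqj, n, 0, mail);                 // cost 0: skip load tracking
  vpu_qpu_job_add_sync_this(vqj, &sync);                // attach a completion callback
  if (vpu_qpu_job_finish(vqj) != 0)                     // start + delete in one call
    return -1;
  vpu_qpu_wait(&sync);                                  // block; nulls sync after use
  return 0;
}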
vq_wait_new(vqj->cost);
 +
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++    av_assert0(j->callback.func == 0);
 +
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);
 +
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
 +
++  vqj->cost = 0;
++  vqj->mask = 0;
++  *wait_h = wait;
++}
 +
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++  return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
 +
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++  int rv;
++  rv = vpu_qpu_job_start(vqj);
++  vpu_qpu_job_delete(vqj);
++  return rv;
++}
 +
++unsigned int vpu_qpu_current_load(void)
++{
++  return gpu_ptr()->current_load;
++}
 +
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++  if (wait_h != NULL)
++  {
++    vq_wait_t * const wait = *wait_h;
++    if (wait != NULL) {
++      *wait_h = NULL;
++      vq_wait_wait(wait);
++      vq_wait_delete(wait);
++    }
++  }
++}
 +
++int vpu_qpu_init()
++{
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
 +
++  if (ge->init_count++ == 0)
++  {
++    vc_gpuserv_init();
++  }
 +
++  gpu_unlock();
++  return 0;
++}
 +
++void vpu_qpu_term()
++{
++  gpu_env_t * const ge = gpu_lock();
 +
++  if (--ge->init_count == 0) {
++    vc_gpuserv_deinit();
 +
-+    av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+    sem_wait(&sync0);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    ttw_print(&ge->ttw, ns_time());
 +#endif
 +  }
-+ 
else { -+ while ((int32_t)(post_done - (uint32_t)id) < 0) { -+ usleep(1000); -+ } -+ } ++ ++ gpu_unlock_unref(ge); +} + -+ -+unsigned int qpu_get_fn(int num) { -+ // Make sure that the gpu is initialized -+ unsigned int *fn; -+ if (gpu==NULL) { -+ printf("Preparing gpu\n"); -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ switch(num) { -+ case QPU_MC_SETUP: -+ fn = mc_setup; -+ break; -+ case QPU_MC_FILTER: -+ fn = mc_filter; -+ break; -+ case QPU_MC_EXIT: -+ fn = mc_exit; -+ break; -+ case QPU_MC_INTERRUPT_EXIT12: -+ fn = mc_interrupt_exit12; -+ break; -+ case QPU_MC_FILTER_B: -+ fn = mc_filter_b; -+ break; -+ //case QPU_MC_FILTER_HONLY: -+ // fn = mc_filter_honly; -+ // break; -+ case QPU_MC_SETUP_UV: -+ fn = mc_setup_uv; -+ break; -+ case QPU_MC_FILTER_UV: -+ fn = mc_filter_uv; -+ break; -+ case QPU_MC_FILTER_UV_B0: -+ fn = mc_filter_uv_b0; -+ break; -+ case QPU_MC_FILTER_UV_B: -+ fn = mc_filter_uv_b; -+ break; -+ case QPU_MC_INTERRUPT_EXIT8: -+ fn = mc_interrupt_exit8; -+ break; -+ case QPU_MC_END: -+ fn = mc_end; -+ break; -+ default: -+ printf("Unknown function\n"); -+ exit(-1); -+ } -+ return gpu->vc + 4*(int)(fn-rpi_shader); -+ //return code[num] + gpu->vc; -+} -+ -+#if 0 -+typedef unsigned int uint32_t; -+ -+typedef struct mvs_s { -+ GPU_MEM_PTR_T unif_mvs_ptr; -+ uint32_t *unif_mvs; // Base of memory for motion vector commands -+ -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[8]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[8]; -+ -+} HEVCContext; -+ -+#define RPI_CHROMA_COMMAND_WORDS 12 -+ -+static void rpi_inter_clear(HEVCContext *s) ++uint32_t qpu_fn(const int * const mc_fn) +{ -+ int i; -+ for(i=0;i<8;i++) { -+ s->u_mvs[i] = s->mvs_base[i]; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 0; -+ *s->u_mvs[i]++ = 128; // w -+ *s->u_mvs[i]++ = 128; // h -+ *s->u_mvs[i]++ = 128; // stride u -+ *s->u_mvs[i]++ = 128; // stride v -+ s->u_mvs[i] += 3; // Padding words -+ } ++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + -+static void rpi_execute_inter_qpu(HEVCContext *s) -+{ -+ int k; -+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc; -+ -+ for(k=0;k<8;k++) { -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined -+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V -+ } -+ -+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore -+ -+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV), -+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)), -+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm)) -+ ); -+} -+ -+void rpi_test_qpu(void) -+{ -+ HEVCContext mvs; -+ HEVCContext *s = &mvs; 
-+ int i; -+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS; -+ uint32_t *p; -+ printf("Allocate memory\n"); -+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr ); -+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs; -+ for(i = 0; i < 8; i++) { -+ s->mvs_base[i] = p; -+ p += uv_commands_per_qpu; -+ } -+ // Now run a simple program that should just quit immediately after a single texture fetch -+ rpi_inter_clear(s); -+ for(i=0;i<4;i++) { -+ printf("Launch QPUs\n"); -+ rpi_execute_inter_qpu(s); -+ printf("Done\n"); -+ } -+ printf("Free memory\n"); -+ gpu_free(&s->unif_mvs_ptr); -+ return; -+} -+#endif -+ -+#if 0 -+ -+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4}; -+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4}; -+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1}; -+ -+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24); -+ -+static uint8_t av_clip_uint8(int32_t a) -+{ -+ if (a&(~255)) return (-a)>>31; -+ else return a; -+} -+ -+static int32_t filter8(const uint8_t *data, int pitch) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += hcoeffs[x]*data[x + y * pitch]; -+ -+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning -+ } -+ -+ return av_clip_uint8( (vsum + 64) >> 7); -+} -+ -+// Note regression changes coefficients so is not thread safe -+//#define REGRESSION -+#ifdef REGRESSION -+#define CMAX 100 -+#else -+#define CMAX 2 -+#endif -+#define YMAX 16 -+ -+int rpi_test_shader(void) -+{ -+ int i, c; -+ -+ uint32_t *unifs; -+ -+ uint8_t *in_buffer; -+ uint8_t *out_buffer[2]; -+ -+ GPU_MEM_PTR_T unifs_ptr; -+ GPU_MEM_PTR_T in_buffer_ptr; -+ GPU_MEM_PTR_T out_buffer_ptr[2]; -+ -+ // Addresses in GPU memory of filter programs -+ uint32_t mc_setup = 0; -+ uint32_t mc_filter = 0; -+ uint32_t mc_exit = 0; -+ -+ int pitch = 0x500; -+ -+ if (gpu==NULL) { -+ gpu_lock(); -+ gpu_unlock(); -+ } -+ -+ printf("This needs to change to reflect new assembler\n"); -+ // Use table to compute locations of program start points -+ mc_setup = code[0] + gpu->vc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return -2; -+ } -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) { -+ return -3; -+ } -+ in_buffer = (uint8_t*)in_buffer_ptr.arm; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) { -+ return -4; -+ } -+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm; -+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm; -+ -+ for (c = 0; c < CMAX; c++) { -+ int xo[] = {rand()&31, rand()&31}; -+ -+#ifdef REGRESSION -+ for (i = 0; i < 8; i++) { -+ hcoeffs[i] = (int8_t)rand(); -+ vcoeffs[i] = (int8_t)rand(); -+ if (hcoeffs[i]==-128) -+ hcoeffs[i]++; -+ if (vcoeffs[i]==-128) -+ vcoeffs[i]++; -+ } -+#endif -+ -+ for (i = 0; i < 64*23; i++) { -+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]); -+ in_buffer[i] = rand(); -+ } -+ -+ // Clear output array -+ { -+ int b; -+ for(b=0;b<2;b++) { -+ for(i=0;i<16*16;i++) { -+ out_buffer[b][i] = 3; -+ } -+ } -+ } -+ -+ unifs[0] = mc_filter; -+ unifs[1] = 
in_buffer_ptr.vc+xo[0]+16; -+ unifs[2] = 64; // src pitch -+ unifs[3] = pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_filter; -+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = out_buffer_ptr[0].vc; -+ unifs[14] = mc_exit; -+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy -+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[20] = out_buffer_ptr[1].vc; -+ -+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem -+ -+ //qpu_run_shader(mc_setup, unifs_ptr.vc); -+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc); -+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]); -+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]); -+ -+ if (1) -+ { -+ int x, y, b; -+ int bad = 0; -+ -+ for (b=0; b<2; ++b) -+ for (y=0; yvc; -+ mc_filter = code[1] + gpu->vc; -+ mc_exit = code[2] + gpu->vc; -+ -+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) { -+ return; -+ } -+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr); -+ //out_buffer = (uint8_t*)out_buffer_ptr.arm; -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ out_buffer[x+y*dst_pitch] = 7; -+ } -+ }*/ -+ -+ unifs = (uint32_t*)unifs_ptr.arm; -+ -+ unifs[0] = mc_filter; -+ unifs[1] = (int)in_buffer_vc; -+ unifs[2] = src_pitch; // src pitch -+ unifs[3] = dst_pitch; // dst pitch -+ unifs[4] = 0; // Padding -+ unifs[5] = 0; -+ unifs[6] = 0; -+ unifs[7 ] = mc_exit; -+ unifs[8 ] = (int)in_buffer_vc; -+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]); -+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]); -+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]); -+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]); -+ unifs[13] = (int)dst_vc; -+ //unifs[13] = (int)out_buffer_ptr.vc; -+ -+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc); -+ -+ qpu_run_shader(mc_setup, unifs_ptr.vc); -+ -+ /*for (y=0; y<16; ++y) { -+ for (x=0; x<16; ++x) { -+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch]; -+ } -+ }*/ -+ -+ gpu_free(&unifs_ptr); -+ //gpu_free(&out_buffer_ptr); -+} -+ -+ -+ -+#endif -+ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..c6cdb2b +index 0000000..a95f7d9 --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,176 @@ +@@ -0,0 +1,200 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + -+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code -+// *** N.B. 
Code has rotted & crashes if this is unset (before this set of changes)
-+#define RPI_FAST_CACHEFLUSH
-+
 +#define RPI_ONE_BUF 1
 +
 +typedef struct gpu_mem_ptr_s {
@@ -12582,9 +15559,7 @@ index 0000000..c6cdb2b
 +// General GPU functions
 +extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
 +extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
 +
 +#include "libavutil/frame.h"
 +#if !RPI_ONE_BUF
@@ -12627,29 +15602,31 @@ index 0000000..c6cdb2b
 +  return av_buffer_get_opaque(frame->buf[0]);
 +}
 +
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
 +{
 +  return av_buffer_pool_opaque(frame->buf[n]);
 +}
 +
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++  const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++  return gm->vc + (frame->data[n] - gm->arm);
++}
++
 +
 +static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
++  return get_vc_address3(frame, 0);
 +}
 +
 +static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ?
-+    gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
-+    gpu_buf3_gmem(frame, 1)->vc;
++  return get_vc_address3(frame, 1);
 +}
 +
 +static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+  return gpu_is_buf1(frame) ?
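When a uniform stream for the QPU is being built, these helpers yield the per-plane bus addresses whichever buffer layout the frame uses. Illustrative sketch only, not part of the patch:

static void example_plane_addrs(const AVFrame * const frame, uint32_t vc[3])
{
  vc[0] = get_vc_address_y(frame);  // luma plane bus address
  vc[1] = get_vc_address_u(frame);  // chroma U
  vc[2] = get_vc_address_v(frame);  // chroma V
}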
-+    gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
-+    gpu_buf3_gmem(frame, 2)->vc;
++  return get_vc_address3(frame, 2);
 +}
 +
++#if 0
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.numbytes = frame->data[1] - frame->data[0];
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 0);
 +}
 +
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.arm += frame->data[1] - frame->data[0];
 +    g.vc += frame->data[1] - frame->data[0];
 +    g.numbytes = frame->data[2] - frame->data[1];  // We assume u & v are the same size
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 1);
 +}
 +
 +static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
 +  if (gpu_is_buf1(frame))
 +  {
 +    GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
 +    g.arm += frame->data[2] - frame->data[0];
 +    g.vc += frame->data[2] - frame->data[0];
 +    g.numbytes = frame->data[2] - frame->data[1];  // We assume u & v are the same size
 +    return g;
 +  }
 +  else
 +    return *gpu_buf3_gmem(frame, 2);
 +}
-+
 +#endif
++#endif
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++  RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++  RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++  RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
 +
 +
 +// QPU specific functions
-+extern void rpi_test_qpu(void);
++uint32_t qpu_fn(const int * const mc_fn);
 +
-+enum {
-+  QPU_MC_SETUP,
-+  QPU_MC_FILTER,
-+  QPU_MC_EXIT,
-+  QPU_MC_INTERRUPT_EXIT12,
-+  QPU_MC_FILTER_B,
-+  QPU_MC_FILTER_HONLY,
-+  QPU_MC_SETUP_UV,
-+  QPU_MC_FILTER_UV,
-+  QPU_MC_FILTER_UV_B0,
-+  QPU_MC_FILTER_UV_B,
-+  QPU_MC_INTERRUPT_EXIT8,
-+  QPU_MC_END
-+  };
-+extern unsigned int qpu_get_fn(int num);
-+
-+#define QPU_N_UV   8
-+#define QPU_N_Y    12
-+#define QPU_N_MAX  16
++#define QPU_N_GRP_UV 4
++#define QPU_N_UV     8
++#define QPU_N_GRP_Y  4  // 4 QPUs per TMU
++#define QPU_N_Y      12
 +
 +#define QPU_MAIL_EL_VALS  2
-+#define QPU_MAIL_EL_SIZE  (QPU_MAIL_EL_VALS * sizeof(uint32_t))
-+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
-+#define QPU_MAIL_SIZE     (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
 +
 +// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++
 +extern unsigned int vpu_get_fn(void);
 +extern unsigned int vpu_get_constants(void);
-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+  int qpu0_n, const uint32_t * qpu0_mail,
-+  int qpu1_n, const uint32_t * qpu1_mail);
 +
-+extern void vpu_wait( int id);
++// Waits for the previously posted job to complete and will null out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++unsigned int vpu_qpu_current_load(void);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
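vpu_qpu_init()/vpu_qpu_term() are refcounted, so a user pairs them around the lifetime of a decode instance, and the load counter can be checked before queueing more work. Illustrative sketch only, not part of the patch; EXAMPLE_LOAD_LIMIT and prev_sync are made-up names:

#define EXAMPLE_LOAD_LIMIT 64          // hypothetical threshold; cost units are caller-defined

static vpu_qpu_wait_h prev_sync;       // hypothetical per-decoder state

static int example_open(void)  { return vpu_qpu_init(); }  // first ref does vc_gpuserv_init()

static void example_throttle(void)
{
  if (vpu_qpu_current_load() > EXAMPLE_LOAD_LIMIT)
    vpu_qpu_wait(&prev_sync);          // drain the oldest outstanding job first
}

static void example_close(void) { vpu_qpu_term(); }        // last ref does vc_gpuserv_deinit()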
 +
 +// Simple test of shader code
 +extern int rpi_test_shader(void);
 +
 +extern void rpi_do_block(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch, unsigned char *pred);
 +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch);
 +
 +extern int gpu_get_mailbox(void);
++void gpu_ref(void);
++void gpu_unref(void);
 +
 +#endif
diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
new file mode 100644
-index 0000000..06fb166
+index 0000000..0898ecd
--- /dev/null
+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,629 @@
+@@ -0,0 +1,670 @@
 +#include "rpi_shader.h"
 +
 +#ifdef _MSC_VER
 +   #define MSVC_ALIGN
 +#endif
 +#ifdef __GNUC__
 +__attribute__((aligned(8)))
 +#endif
 +unsigned int rpi_shader[] = {
-+// ::mc_setup_uv
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 -+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 -+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 -+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 -+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9 -+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif -+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif -+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1 -+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15 -+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1 -+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1 -+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x -+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base ++// ::mc_setup_c ++/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif ++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 ++/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 ++/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 ++/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 ++/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 ++/* 
[0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 ++/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b ++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 ++/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 ++/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 ++/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 ++/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y ++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 ++/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 ++/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif ++/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif ++/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 ++/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a ++/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b ++/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif ++/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif ++/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif ++/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 ++/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 ++/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 ++/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 ++/* [0x00000210] */ 0x0c667c00, 0x10020667, // add 
ra_base2, ra_base2, r0 ++/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 ++/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y ++/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif ++/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif ++/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif ++/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 +// ::mc_filter_uv -+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28 -+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif -+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 ++/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 ++/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 ++/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a ++/* 
[0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif ++/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif ++/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif ++/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a ++/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b ++/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b ++/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c ++/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d ++/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 +// :uvloop -+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, 
r1, r0 ; mov -, vw_wait -+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 ++/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y ++/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 ++/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 ++/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 ++/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, 
// sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 ++/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 ++/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 ++/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 ++/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 ++/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +// ::mc_filter_uv_b0 -+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif -+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21 -+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif -+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a -+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b -+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c -+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d -+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 ++/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 ++/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 ++/* [0x000004a0] */ 0x9481e1f6, 
0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a ++/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif ++/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 ++/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a ++/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c ++/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 ++/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d ++/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif ++/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif +// :uvloop_b0 -+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18 -+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // 
brr.anyn -, r:uvloop_b0 -+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6 -+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_filter_uv_b -+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num -+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif -+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 -+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif -+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8 -+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21 -+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 ++/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y ++/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ 
"mul_used", 0 ++/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 ++/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 ++/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 ++/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 ++/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 ++/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 ++/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 ++/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 ++/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin ++/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif ++/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif ++/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 ++/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 ++/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 ++/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin ++/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++// :uv_b0_post12 ++/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 ++/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 ++/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 ++/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++/* 
[0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 ++// :uv_b0_post_fin ++/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num ++/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 ++/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif ++/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 ++/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif ++/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a ++/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif ++/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a ++/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b ++/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c ++/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d ++/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 ++/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +// :uvloop_b -+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 -+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b 
-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4 -+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14 -+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16 -+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0 -+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_exit -+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit8 -+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 ++/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next ++/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 ++/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, 
ra0.8a, r0 ++/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 ++/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 ++/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 ++/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 ++/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 ++/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 ++/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 ++/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 ++/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 ++/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 ++/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 ++/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 ++/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 ++/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 ++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 ++/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 ++/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++// ::mc_interrupt_exit8c ++/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 ++/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000908] */ 
0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16 -+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif -+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif -+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1 -+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1 -+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif -+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0 -+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8 -+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 -+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 -+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10 -+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3 -+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3 -+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3 -+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 -+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0 -+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 -+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000aa8] */ 0x00000040, 
0xe00207a7, // mov ra30, 64 -+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00 -+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24 -+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0 -+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0 -+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0 -+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2 -+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2 -+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2 -+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3 -+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1 -+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1 -+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base -+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 -+// :per_block_setup -+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif -+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num -+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next -+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next -+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b -+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 -+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov 
r2, unif -+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b -+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 -+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 -+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 -+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16 -+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3 -+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400 -+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d -+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c -+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif -+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13 -+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a -+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b -+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31 -+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c -+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9 -+// ::mc_filter -+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15 -+// :yloop -+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz 
ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00000f00] */ 0x4d286237, 
0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_b -+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16 -+// :yloopb -+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 -+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 -+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+/* [0x00001050] */ 0x40073031, 0xda00c9e3, 
// nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 -+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 -+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif ++/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_exit ++// ::mc_exit_c ++/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) ++/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_interrupt_exit12 -+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001170] */ 0x009e7000, 
0xa00009e7, // ldtmu0 -+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) ++/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop +// ::mc_exit1 -+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop ++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 ++/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 ++/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop ++/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_setup ++/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif ++/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif ++/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov 
rb_xpitch, unif ++/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch ++/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 ++/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b ++/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 ++/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 ++/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 ++/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 ++/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b ++/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 ++/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 ++/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 ++/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 ++/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 ++/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 ++/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000bf0] */ 0x0c9e7040, 
0x10021727, // add r_vpm, r0, r1 ++/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 ++/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 ++/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 ++/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch ++/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base ++/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 ++/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 ++/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch ++/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 ++// :per_block_setup ++/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num ++/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next ++/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next ++/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 ++/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 ++/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 ++/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b ++/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 ++/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 ++/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b ++/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif ++/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width ++/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height ++/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 ++/* 
[0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 ++/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif ++/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif ++/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 ++/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a ++/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d ++/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c ++/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d ++/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c ++/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d ++/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c ++/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c ++/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d ++/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c ++/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif ++/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b ++/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c ++/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 ++/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif ++/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 ++/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 ++/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d ++// ::mc_filter ++/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 ++// :yloop ++/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 ++/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* 
[0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait ++/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // 
sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 ++/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 ++/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 ++/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 ++/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 ++/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup ++/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 ++/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 ++/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 ++/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 ++/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop ++/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_filter_b ++// :yloopb ++/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 ++/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 ++/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch ++/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001188] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 ++/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000011a8] */ 
0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 ++/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 ++/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 ++/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 ++/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 ++/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait ++/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 ++/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 ++/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 ++/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 ++/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 ++/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 ++/* [0x000012f0] */ 
0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 ++/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup ++/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 ++/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 ++/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 ++/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 ++/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 ++/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 ++/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 ++/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 ++/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb ++/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop +// ::mc_end +}; +#ifdef __HIGHC__ @@ -13375,7 +16421,7 @@ index 0000000..06fb166 +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..9772796 +index 0000000..d17b9fd --- /dev/null +++ b/libavcodec/rpi_shader.h @@ -0,0 +1,19 @@ @@ -13384,26 +16430,33 @@ index 0000000..9772796 + +extern unsigned int rpi_shader[]; + -+#define mc_setup_uv (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 132) -+#define mc_filter_uv_b0 (rpi_shader + 274) -+#define mc_filter_uv_b (rpi_shader + 392) -+#define mc_exit (rpi_shader + 540) -+#define mc_interrupt_exit8 (rpi_shader + 558) -+#define mc_setup (rpi_shader + 588) -+#define mc_filter (rpi_shader + 872) -+#define mc_filter_b (rpi_shader + 992) -+#define mc_interrupt_exit12 (rpi_shader + 1114) -+#define mc_exit1 (rpi_shader + 1152) -+#define mc_end (rpi_shader + 1168) ++#define mc_setup_c (rpi_shader + 0) ++#define mc_filter_uv (rpi_shader + 152) ++#define mc_filter_uv_b0 (rpi_shader + 280) ++#define mc_interrupt_exit8c (rpi_shader + 554) ++#define mc_exit (rpi_shader + 582) ++#define mc_exit_c (rpi_shader + 582) ++#define mc_interrupt_exit12 (rpi_shader + 598) ++#define mc_exit1 (rpi_shader + 634) ++#define mc_setup (rpi_shader + 650) ++#define mc_filter (rpi_shader + 942) ++#define mc_filter_b (rpi_shader + 1094) ++#define mc_end (rpi_shader + 1246) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..aa9e1e7 +index 0000000..aa3fe47 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1098 @@ +@@ -0,0 +1,1259 @@ ++ ++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress ++# the warning that we are using rotation & ra/rb registers. r0..3 can be ++# rotated through all 16 elems ra regs can only be rotated through their ++# local 4. As it happens this is what is wanted here as we do not want the ++# constants from the other half of the calc. 
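++# An illustrative line of the same shape as the annotated ones in the
++# filter loops below:
++#
++#   nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++#
++# Without the annotation the assembler would warn about the quad-local
++# rotation of ra0; here that behaviour is exactly what is wanted.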
++
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
+#
+# rb0 rx_shift2
+# rb1 rb_y2_next
+#
+# rb4...rb7
+#
+# ra8...ra11 horizontal filter coefficient copies
+#
+# ra12...ra15 shifted versions of the current unfiltered row
+#
+# rb8...rb11 eight vertical filter coefficients
+
-+# ra4 y: Fiter, UV: 0x10000
++# ra4 y: Filter, UV: part of b0 -> b stash
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
+# rb14 L0 weight (U on left, V on right)
+# rb15 -- free --
+#
-+# ra16 clipped(row start address+elem_num)&~3
-+# ra17 per-channel shifts
++# ra16 width:height
++# ra17 ra_y:ra_xshift
+# ra18 L1 weight (Y)
-+# ra19 next ra17
++# ra19 ra_y_next:ra_xshift_next
+#
+# rb16 pitch
+# rb17 height + 1
-+# rb18 height + 3
-+# rb19 next ra16
++# rb18 max(height,16) + 3
++# rb19 frame_base2_next
+#
+# ra20 1
-+# ra21 ra_21
++# ra21 ra_y2_next:ra_y2 (luma); free (chroma)
+# ra22 ra_k256 256
-+# ra23 ra_y2_next ra_y2_next
++# ra23 0
+#
-+# rb20 0xffffff00
-+# rb21 vpm_setup for reading/writing 16bit results into VPM
++# rb20 -- free --
++# rb21 -- free --
+# rb22 rb_k255 255
-+# rb23 24
++# rb23 dest (Y)
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
@@ -13456,146 +16509,233 @@ index 0000000..aa9e1e7
+# rb30 frame height-1
+# rb31 used as temp to count loop iterations
+#
-+# ra24 clipped(row start address+8+elem_num)&~3
-+# ra25 per-channel shifts 2
++# ra24 src frame base
++# ra25 src frame base 2
+# ra26 next ra24
+# ra27 next ra25
-+# ra28 next y
-+# ra29 y for next texture access
-+# ra30 64
++# ra28 -- free --
++# ra29 -- free --
+#
-+# ra31 next kernel address
++# Use an even numbered register as a link register to avoid corrupting flags
++# ra30 next kernel address
++# ra31 chroma-B height+3; free otherwise
+
-+.set rb_frame_width_minus_1, rb25
-+.set rb_frame_height_minus_1, rb30
++.set rb_max_x, rb25
++.set rb_max_y, rb30
+.set rb_pitch, rb16
-+.set ra_x, ra16
++.set ra_width_height, ra16
++.set ra_width, ra16.16b
++.set ra_height, ra16.16a
+.set ra_y2, ra21.16a
+.set ra_y2_next, ra21.16b
+
-+.set rb_x_next, rb19
-+.set rx_frame_base2_next, rb19
++.set rb_base2_next, rb19
+
-+.set ra_frame_base, ra24
-+.set ra_frame_base_next, ra26
-+.set ra_xshift, ra17
++.set rb_dest, rb23
++.set ra_base, ra24
++.set ra_base_next, ra26
++.set ra_xshift, ra17.16a
+
-+.set ra_u2v_ref_offset, ra25
-+.set ra_frame_base2, ra25
++.set ra_base2, ra25
+
-+.set ra_xshift_next, ra19
-+.set rx_xshift2, rb0
-+.set rx_xshift2_next, rb1
++# Note ra_xy & ra_xy_next should have same structure!
++.set ra_xshift_next, ra19.16a
++.set rb_xshift2, rb0
++.set rb_xshift2_next, rb1
+
-+.set ra_u2v_dst_offset, ra27
-+
-+.set ra_y_next, ra28
-+.set ra_y, ra29
++.set ra_y_next, ra19.16b
++.set ra_y, ra17.16b
+
+.set ra_k1, ra20
++.set rb_xpitch, rb20
+.set rb_k255, rb22
+.set ra_k256, ra22
++.set ra_k0, ra23
++
++.set ra_link, ra30
+
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
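++# (so, for example, "shl r0, r0, i_shift16" shifts left by 16 while the
++# encoded immediate, -16, still fits the QPU's small-immediate range)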
+.set i_shift16, -16 +.set i_shift21, -11 ++.set i_shift23, -9 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, r_vpm, r_dma ++ mov r2, qpu_num ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ add r_dma, r0, r1 # DMA out ++.endm ++ ++# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 ++.macro m_calc_dma_regs_c, r_vpm, r_dma ++ mov r2, qpu_num ++ asr r1, r2, 1 ++ shl r1, r1, 5 ++ and r0, r2, 1 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into ++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ shl r0, r0, 6 ++ add r_dma, r0, r1 # DMA out ++.endm ++ + +################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_uv -+ -+# Read starting kernel -+mov ra31, unif ++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++::mc_setup_c ++ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) + +# Load first request location -+add ra_x, unif, elem_num # Store x -+mov ra_y, unif # Store y -+mov ra_frame_base, unif # Store frame u base -+nop -+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame ++ mov ra0, unif # next_x_y ++ ++ mov ra_base, unif # Store frame c base + +# Read image dimensions -+sub rb25,unif,1 -+sub rb30,unif,1 -+ -+# get source pitch -+mov rb16, unif -+ -+# get destination pitch -+mov r0, unif -+mov r1, vdw_setup_1(0) -+add rb24, r1, r0 ++ sub rb_max_x, unif, 1 # pic c width ++ sub rb_max_y, unif, 1 # pic c height + +# load constants ++ mov ra_k1, 1 ++ mov ra_k256, 256 ++ mov rb_k255, 255 ++ mov ra_k0, 0 + -+mov ra4, 0x10000 -+mov ra_k1, 1 -+mov ra_k256, 256 -+mov ra30, 64 ++# touch registers to keep simulator happy + -+mov rb20, 0xffffff00 -+mov rb_k255, 255 -+mov rb23, 24 ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 + -+# touch vertical context to keep simulator happy ++ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base + -+mov ra8, 0 -+mov ra9, 0 -+mov ra10, 0 -+mov ra11, 0 -+mov ra12, 0 -+mov ra13, 0 -+mov ra14, 0 -+mov ra15, 0 ++# ; ra12..15: vertical scroll registers ++# get source pitch ++ mov rb_xpitch, unif ; mov ra12, 0 # stride2 ++ mov rb_pitch, unif ; mov ra13, 0 # stride1 ++ mov r0, elem_num ; mov ra14, 0 ++# get destination vdw setup ++ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 + +# Compute base address for first and second access -+mov r0, ra_x # Load x -+max r0, r0, 0; mov r1, ra_y # Load y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base -+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset 
-+add ra_y, r1, 1 -+add r0, r0, r3 -+and r0, r0, ~3 -+max r1, r1, 0 ; mov ra_x, r0 # y -+min r1, r1, rb_frame_height_minus_1 -+# submit texture requests for first line -+add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+add t0s, r0, r1 ; mov ra_frame_base, r2 -+add t1s, r2, r1 ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base + -+mov r2, 9 -+add rb13, r2, unif # denominator -+mov -, unif # Unused ++ add r0, r0, ra0.16b # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++ and r1, r0, 1 ++ shl ra_xshift_next, r1, 4 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -2 ++ add r0, r0, r0 ; v8subs r1, r1, r1 ++ sub r1, r1, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_y ++ add ra_base, ra_base, r0 ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++# submit texture requests for first line ++ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t0s, ra_base, r0 ++ ++# submit texture requests for 2nd line ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t0s, ra_base, r0 ++ ++ add rb13, 9, unif # denominator ++ mov -, unif # Unused + +# Compute part of VPM to use for DMA output -+mov r2, unif -+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results) -+and r2, r2, 15 -+mov r1, r2 -+asr r1, r1, 2 -+shl r1, r1, 6 -+mov r0, r2 -+and r0, r0, 3 -+add r0, r0, r1 ++ m_calc_dma_regs_c rb28, rb27 + -+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+add rb28, r0, r1 # VPM 8bit storage -+asr r2, r0, 1 # r0 = bc0000d -+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit -+add rb21, r2, r1 # VPM for 16bit intermediates -+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+shl r0, r0, 5 -+add rb27, r0, r1 # DMA out ++# ----------------- ++# And again for L1, but only worrying about frame2 stuff + -+# submit texture requests for second line -+max r1, ra_y, 0 -+min r1, r1, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 -+bra -, ra31 -+nop ; mul24 r1, r1, rb_pitch -+add t0s, r1, ra_x -+add t1s, r1, ra_frame_base ++ mov ra_link, unif # Next fn + ++# Load first request location ++ mov ra0, unif # next_x_y ++ ++ mov ra_base2, unif # Store frame c base ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ mov ra_y2, ra0.16a # Store y ++ mov r0, ra0.16b # Load x ++ add r0, r0, elem_num # Add QPU slice ++ max r0, r0, 0 ; mov -, unif # Unused 0 ++ min r0, r0, rb_max_x ; mov -, unif # Unused 1 ++ ++# Get shift ++ and r1, r0, 1 ; mov -, unif # Unused 2 ++ shl rb_xshift2_next, r1, 4 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -2 ++ add r0, r0, r0 ; v8subs r1, r1, r1 ++ sub r1, r1, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_y2 ++ add ra_base2, ra_base2, r0 ++ ++ max r0, r1, 0 ++ min r0, r0, rb_max_y ++ ++# submit texture requests for first line ++ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 ++ ++# submit texture requests for 2nd line ++ ++ max r0, r1, 0 ; mov -, unif # Unused 4 ++ ++ bra -, ra_link ++ ++ min r0, r0, rb_max_y ; mov -, unif # Unused 5 ++ 
add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch ++ add t1s, ra_base2, r0 ++ ++# >>> ra_link ++ ++ ++.macro setf_nz_if_v ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++.endm + + +################################################################################ @@ -13605,51 +16745,51 @@ index 0000000..aa9e1e7 +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv -+mov ra31, unif ++ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, r1 ; mov vw_setup, rb28 -+add ra_frame_base_next, rb_x_next, r2 ++ mov ra2, unif ; mov r0, elem_num ++ ++ setf_nz_if_v # Also acts as delay slot for ra2 ++ ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ ++ shl ra_xshift_next, r0, 4 ++ ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 + +# set up VPM write -+# get width,height of block + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs ++ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight ++ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ; unpack filter coefficients + -+# unpack filter coefficients ++ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area ++ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight + -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ shl r1, r1, rb13 ; mov rb10, ra3.8c ++ mov r3, 0 ; mov rb11, ra3.8d # Loop count + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 -+shl rb14, ra1.16a, 1 # b14 = weight*2 ++ asr rb12, r1, 1 ++ shl rb14, ra1.16a, 1 # b14 = weight*2 + +# rb14 - weight L0 * 2 +# rb13 = weight denom + 6 + 9 +# rb12 = (((is P) ? 
offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) + -+# r2 is elem_num +# retrieve texture results and pick out bytes +# then submit two more texture requests + @@ -13658,123 +16798,114 @@ index 0000000..aa9e1e7 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++ min r2, r2, rb_max_y ++ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ setf_nz_if_v + +# apply horizontal filter -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 ++ mov ra14, ra15 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop + +# apply vertical filter and write to VPM + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 -+nop ; mul24 r1, r1, rb14 -+shl r1, r1, 8 ++ sub r1, r1, r0 ; mul24 r0, ra14, rb10 ++ add r1, r1, r0 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ++ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ++ nop ; mul24 r1, r1, rb14 ++ shl r1, r1, 8 + -+add r1, r1, rb12 -+brr.anyn -, r:uvloop -+asr r1, r1, rb13 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 ++ add r1, r1, rb12 ++ asr ra1.8as, r1, rb13 ++ nop ; mov r1, r1 << 8 ++ brr.anyn -, r:uvloop ++ asr ra1.8bs, r1, rb13 ++ mov -, vw_wait ++ mov vpm, ra1 + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW 
setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW ++# >>> + ++# DMA out for U & stash for V ++ bra -, ra_link ++ mov vw_setup, rb26 ++ mov vw_setup, rb29 ++ mov vw_addr, unif # u_dst_addr ++# >>> + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv_b0 -+mov ra31, unif ++ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov r1, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ; -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs -+mov ra_y_next, r1 ; mov vw_setup, rb21 ++ mov ra2, unif ; mov r0, elem_num + -+add ra_frame_base_next, rb_x_next, r2 ++ setf_nz_if_v # Also acts as delay slot for ra2 + -+# Need to have unsigned coeffs to so we can just unpack in the filter -+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the -+# filter code. 
Unpack into b regs for V ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height + -+# set up VPM write, we need to save 16bit precision ++ shl ra_xshift_next, r0, 4 + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs -+add rb26, r0, rb27 ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 + -+mov rb8, ra3.8a -+mov rb9, ra3.8b -+mov rb10, ra3.8c -+mov rb11, ra3.8d ++# set up VPM write + -+# r2 is elem_num -+# r3 is loop counter ++ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb17, r1, 1 ++ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ; unpack filter coefficients ++ ++ add r0, r0, r2 ; mov rb9, ra3.8b ++ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb26, r0, rb27 ++ ++ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ ++ mov rb14, unif # U weight ++ mov.ifnz rb14, unif # V weight + -+mov rb14, unif # U weight L0 -+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter +# rb14 unused in b0 but will hang around till the second pass + +# retrieve texture results and pick out bytes @@ -13785,108 +16916,143 @@ index 0000000..aa9e1e7 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next ++ min r2, r2, rb_max_y ++ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 
-+brr.anyn -, r:uvloop_b0 -+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop_b0 ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 ++ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b0 + -+# apply vertical filter and write to VPM ++# apply vertical filter and write to B-FIFO + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+sub.setf -, r3, rb18 -+brr.anyn -, r:uvloop_b0 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+asr vpm, r1, 6 -+# >>> .anyn uvloop_b0 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. ra15 write gap ++ sub r1, r1, r0 ; mov ra7, rb6 + -+# in pass0 we don't really need to save any results, but need to discard the uniforms -+# DMA out for U ++# FIFO goes: ++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b ++# This arrangement optimizes the inner loop FIFOs at the expense of making the ++# bulk shift between loops quite a bit nastier ++# a8 used as temp + -+bra -, ra31 -+mov -, unif # Delay 1 -+mov -, unif # Delay 2 -+nop # Delay 3 ++ sub.setf -, r3, ra31 ++ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad ++ brr.anyn -, r:uvloop_b0 ++ mov ra5, rb4 ; mov rb4, ra4 ++ mov ra4, rb5 ; mov rb5, ra6 ++ mov ra6, rb7 ; mov rb7, ra8 ++# >>> + ++# 1st half done all results now in the a/b4..7 fifo + -+################################################################################ ++# Need to bulk rotate FIFO for heights other than 16 ++# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with ++# we are allowed 3/4 cb_size w/h :-( + -+::mc_filter_uv_b -+mov ra31, unif ++# Destination uniforms discarded ++# At the end drop through to _b - we will always do b after b0 ++ ++ sub.setf -, 15, r3 # 12 + 3 of preroll ++ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) ++ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr ++ mov r0, i_shift16 ; mov ra_link, unif ++ mov r1, 0x10000 ++# >>> ++ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially ++# If h != 16 && h != 12 then h <= 8 so ++# shift 8 with discard (.16b = .16a on all regs) ++ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 ++ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 ++ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 ++# >>> ++ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 ++ ++ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N ++# Shift 4 ++ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 ++ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 ++ # If we shifted by 4 here then the max length remaining is 4 ++ # so that is it ++ ++ brr -, r:uv_b0_post_fin ++# Shift 2 ++ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 ++ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 ++ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 ++ # 6 / 2 so need 6 outputs 
++# >>> ++ ++:uv_b0_post12 ++# this one is annoying as we need to swap halves of things that don't ++# really want to be swapped ++ ++# b7a, a6a, b5a, a4a ++# b4a, a5a, b6a, a7a ++# b7b, a6b, b5b, a4b ++# b4b, a5b, b6b, a7b ++ ++ mov r2, ra6 ; mov r3, rb7 ++ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 ++ mov ra5, r2 ; mov rb4, r3 ++ ++ mov r2, ra4 ; mov r3, rb5 ++ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 ++ mov ra7, r2 ; mov rb6, r3 ++ ++:uv_b0_post_fin ++ ++##### L1 B processing + +# per-channel shifts were calculated on the *previous* invocation + -+# set up VPM write -+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+ +# get base addresses and per-channel shifts for *next* invocation -+add r0, unif, elem_num # x -+max r0, r0, 0 ; mov ra_y_next, unif # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base -+add r0, r0, r3 ; mov ra1, unif # width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs ++ mov ra2, unif ; mov r0, elem_num + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 ++ setf_nz_if_v # Also acts as delay slot for ra2 + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov -, unif # ; width_height + -+# r0 is currently height<<7 -+# For vr_setup we want height<<20 (so 20-7=13 additional bits) -+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs -+shr r3, r3, 8 -+add vr_setup, r3, rb21 ++ shl rb_xshift2_next, r0, 4 + -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ++ and r0, r0, -2 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r0 ; mov ra_y2_next, ra2.16a ++ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add rb_base2_next, r3, r0 + -+# get filter coefficients ++ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight ++ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+# Get offset & weight stuff -+ -+# The unif read occurs unconditionally, only the write is conditional -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ; -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ; -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ; -+ -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 ++ nop ; mov rb11, ra3.8d ++ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 ++ asr rb12, r1, 1 + +# ra1.16a used directly in the loop + @@ -13894,125 +17060,147 @@ index 0000000..aa9e1e7 +# then submit two more texture requests + +# r3 = 0 ++ +:uvloop_b +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ 
sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment ++ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next ++ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8subs r1, r1, rb20 -+add t1s, ra_frame_base, r2 ++ max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next ++ min r2, r2, rb_max_y ++ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context + +mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop_b -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ sub r0, r2, r3 ; mov r3, rb31 ++ sub.setf -, r3, 4 ; mov ra12, ra13 ++ brr.anyn -, r:uvloop_b ++ mov ra13, ra14 ; mul24 r1, ra14, rb9 ++ mov ra14, ra15 ; mul24 r2, ra15, rb10 ++ mov ra15, r0 ; mul24 r0, ra12, rb8 +# >>> .anyn uvloop_b + +# apply vertical filter and write to VPM + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it -+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000 -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 # shift2=6 ++ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) ++ add r1, r1, r2 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 ++ mov ra7, rb6 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 + -+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a -+nop ; mul24 r0, r0, rb14 ++ mov ra5, rb4 ; mul24 r1, r1, ra1.16a ++ add r1, r1, r0 ; mov rb4, ra4 + -+add r1, r1, r0 ; mov -, vw_wait -+shl r1, r1, 8 # Lose bad top 8 bits & sign extend ++ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) + -+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ sub.setf -, r3, ra31 ; mov ra6, rb7 ++ asr ra3.8as, r1, rb13 ++ nop ; mov r1, r1 << 8 ++ brr.anyn -, r:uvloop_b ++ asr ra3.8bs, r1, rb13 ++ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov ++ mov vpm, ra3 ++# >>> + -+brr.anyn -, r:uvloop_b -+asr r1, r1, rb13 # Delay 1 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 ++# DMA out + ++ bra -, ra_link ++ mov vw_setup, rb26 ++ mov vw_setup, rb29 ++ mov vw_addr, unif # c_dst_addr + -+# DMA out for U -+ -+mov vw_setup, rb26 # VDW setup 0 -+mov vw_setup, rb29 # Stride 
-+mov vw_addr, unif # start the VDW -+ -+# DMA out for V -+# We need to wait for the U to complete first, but have nothing useful to compute while we wait. -+# Could potentially push this write into the start of the next pipeline stage. -+mov r0, 16 -+mov -, vw_wait -+ -+bra -, ra31 -+add vw_setup, rb26, r0 # VDW setup 0 -+mov vw_setup, rb29 # Stride -+mov vw_addr, unif # start the VDW + +################################################################################ + +# mc_exit() + ++::mc_interrupt_exit8c ++ ldtmu0 ++ ldtmu1 ++ ldtmu1 ++ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW ++ ++ mov -,sacq(0) # 1 ++ mov -,sacq(0) # 2 ++ mov -,sacq(0) # 3 ++ mov -,sacq(0) # 4 ++ mov -,sacq(0) # 5 ++ mov -,sacq(0) # 6 ++ mov -,sacq(0) # 7 ++# mov -,sacq(0) # 8 ++# mov -,sacq(0) # 9 ++# mov -,sacq(0) # 10 ++# mov -,sacq(0) # 11 ++ ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 ++ ++# Chroma & Luma the same now ++::mc_exit_c +::mc_exit -+mov -, vw_wait # wait on the VDW ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+mov -,srel(0) ++ mov -,srel(0) + -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+nop ; nop ; thrend -+nop ; nop # delay slot 1 -+nop ; nop # delay slot 2 -+ -+# mc_interrupt_exit8() -+::mc_interrupt_exit8 -+mov -, vw_wait # wait on the VDW -+ -+ldtmu0 -+ldtmu1 -+ldtmu0 -+ldtmu1 -+ -+mov -,sacq(0) # 1 -+mov -,sacq(0) # 2 -+mov -,sacq(0) # 3 -+mov -,sacq(0) # 4 -+mov -,sacq(0) # 5 -+mov -,sacq(0) # 6 -+mov -,sacq(0) # 7 -+ -+nop ; nop ; thrend -+mov interrupt, 1; nop # delay slot 1 -+nop ; nop # delay slot 2 ++ nop ; nop ; thrend ++ nop ; nop # delay slot 1 ++ nop ; nop # delay slot 2 + + ++# mc_interrupt_exit12() ++::mc_interrupt_exit12 ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW ++ ++ mov -,sacq(0) # 1 ++ mov -,sacq(0) # 2 ++ mov -,sacq(0) # 3 ++ mov -,sacq(0) # 4 ++ mov -,sacq(0) # 5 ++ mov -,sacq(0) # 6 ++ mov -,sacq(0) # 7 ++ mov -,sacq(0) # 8 ++ mov -,sacq(0) # 9 ++ mov -,sacq(0) # 10 ++ mov -,sacq(0) # 11 ++ ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 + + ++::mc_exit1 ++ mov -, vw_wait # wait on the VDW ++ ++ ldtmu0 ++ ldtmu1 ++ ldtmu0 ++ ldtmu1 ++ nop ; nop ; thrend ++ mov interrupt, 1; nop # delay slot 1 ++ nop ; nop # delay slot 2 + +# LUMA CODE + @@ -14022,116 +17210,104 @@ index 0000000..aa9e1e7 +################################################################################ +# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) +::mc_setup -+ mov r3, 16 -+ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov ra8, unif # y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x ++ mov ra9, unif # ref_y_base ++ mov ra10, unif # y2_x2 ++ mov ra11, unif # ref_y2_base + +# Read image dimensions -+ mov r1, unif # width_height -+ shl r0,r1,r3 -+ asr r1,r1,r3 # width -+ asr r0,r0,r3 # height -+ sub rb_frame_width_minus_1,r1,1 -+ sub rb_frame_height_minus_1,r0,1 -+ -+# get source pitch -+ mov rb_pitch, unif # src_pitch ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++ sub rb_max_x, ra3.16b, 1 ++ sub rb_max_y, ra3.16a, 1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch -+ mov r0, unif # dst_pitch + mov r1, vdw_setup_1(0) -+ add rb24, r1, r0 ++ or rb24, r1, rb_pitch + +# Compute base address for 
first and second access -+ mov r1, ra8 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ mov r3, elem_num ++ add r0, ra8.16a, r3 # Load x + elem_num + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base ++ min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts -+ add ra_y, r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame0 (not including y offset) -+ max r1, r1, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t0s, r2, r1 ; mov ra_frame_base, r2 + -+ mov r1, ra10 # y_x -+ shl r0,r1,r3 # r0 is x<<16 -+ asr r1,r1,r3 # r1 is y -+ asr r0,r0,r3 # r0 is x -+ add r0, r0, elem_num # Load x ++ ++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++ ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ mov r1, ra8.16b # Load y ++ add ra_y, r1, 1 # Set for next ++ max r1, r1, 0 ++ min r1, r1, rb_max_y ++ ++# submit texture requests for first line ++ nop ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ++ ++ ++ # r3 still contains elem_num ++ add r0, ra10.16a, r3 # Load x + max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ add ra_y2, r1, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, r2, r0 # r2 is address for frame1 (not including y offset) -+ max r1, r1, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t1s, r2, r1 ; mov ra_frame_base2, r2 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts + ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base2, ra11, r0 ++ ++ mov r1, ra10.16b # Load y ++ add ra_y2, r1, 1 # Set for next ++ max r1, r1, 0 ++ min r1, r1, rb_max_y ++ ++# submit texture requests for first line ++ nop ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 + +# load constants + + mov ra_k1, 1 + mov ra_k256, 256 -+ mov ra30, 64 -+ -+ mov rb20, 0xffffff00 + mov rb_k255, 255 -+ mov rb23, 24 ++ mov ra_k0, 0 + +# touch vertical context to keep simulator happy + -+ mov ra8, 0 -+ mov ra9, 0 -+ mov ra10, 0 -+ mov ra11, 0 -+ mov ra12, 0 -+ mov ra13, 0 -+ mov ra14, 0 -+ mov ra15, 0 ++ mov ra8, 0 ; mov rb8, 0 ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 + +# Compute part of VPM to use -+ mov r2, qpu_num -+ mov r1, r2 -+ asr r1, r1, 2 -+ shl r1, r1, 6 -+ mov r0, r2 -+ and r0, r0, 3 -+ add r0, r0, r1 -+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add rb28, r0, r1 # VPM for saving data -+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later -+ shl r0, r0, 5 -+ add rb27, r0, r1 # Command for dma output ++ m_calc_dma_regs rb28, rb27 + +# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+ mov -, unif # Unused ++ add rb13, unif, 9 # unif = weight denom + 6 + +# submit texture requests for second line + max r1, ra_y, 0 -+ min r1, r1, rb_frame_height_minus_1 ++ min r1, r1, rb_max_y + add ra_y, ra_y, 1 
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t0s, r1, ra_frame_base
++ mov -, unif ; mul24 r1, r1, rb_pitch # unused ;
++ add t0s, r1, ra_base
+
+ max r1, ra_y2, 0
-+ min r1, r1, rb_frame_height_minus_1
++ min r1, r1, rb_max_y
+ add ra_y2, ra_y2, 1
-+ nop ; mul24 r1, r1, rb_pitch
-+ add t1s, r1, ra_frame_base2
++ nop ; mul24 r1, r1, rb_pitch
++ add t1s, r1, ra_base2
+
+# FALL THROUGH TO PER-BLOCK SETUP
+
+################################################################################
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ mov ra31, unif
++ mov ra_link, unif
++#### We do all the setup even if we are about to exit - reading junk from unif....
+
-+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
++ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
+
+# per-channel shifts were calculated on the *previous* invocation
+ mov ra_xshift, ra_xshift_next
-+ mov rx_xshift2, rx_xshift2_next
++ mov rb_xshift2, rb_xshift2_next
+
+# get base addresses and per-channel shifts for *next* invocation
+
-+ add r0, ra1.16a, r1 # Load x
++ add r0, ra1.16a, r3 # Load x
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ mov r3, 8 ; mov ra_y_next, ra1.16b
-+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
-+ add ra_frame_base_next, r2, r0
++ min r0, r0, rb_max_x
+
-+ add r0, ra1.16a, r1 # Load x
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add ra_base_next, unif, r0 # Base1
++ mov ra_y_next, ra1.16b # Load y
++ mov ra1, unif # x2_y2
++ nop # ra1 delay
++
++ add r0, ra1.16a, r3 # Load x2
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
-+ shl rx_xshift2_next, r0, 3 # Compute shifts
-+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
-+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
-+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
++ min r0, r0, rb_max_x
++
++ shl rb_xshift2_next, r0, 3 # Compute shifts
++ and r0, r0, -4
++ and r1, r0, r2
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 # Add stripe offsets
++ add rb_base2_next, unif, r0 # Base2
++ mov ra_y2_next, ra1.16b # Load y
++ mov ra_width_height, unif # width_height
+
+# set up VPM write
-+ mov vw_setup, rb28
++ mov vw_setup, rb28 # [ra1 delay]
+
+# get width,height of block (unif load above)
-+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
-+ add rb17, ra1.16a, 5
-+ add rb18, ra1.16a, 7
-+ shl r0, ra1.16a, 7
-+ add r0, r0, ra1.16b # Combine width and height of destination area
-+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width)
++ add rb17, ra_height, 5 ; mov r0, ra_height
++ mov r1, 16
++ min r0, r0, r1
++ add rb18, r0, 7
++ shl r0, r0, 7
++ add r0, r0, ra_width # Combine width and height of destination area
++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
-+ shl.ifz r0, r0, i_shift16 # Pick half to use
-+ shl ra8, r0, 3
++ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 
offset/weight ++ mov r2, 0x01040400 # [ra5 delay] ++ shl ra8, r0, 3 ; mov rb14, ra5.16a + +# Pack the 1st 4 filter coefs for H & V tightly + @@ -14187,9 +17379,8 @@ index 0000000..aa9e1e7 + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ mov r1,0x01040400 -+ ror ra2.8b, r1, ra8.8d -+ ror ra0.8b, r1, ra8.8c ++ ror ra2.8b, r2, ra8.8d ++ ror ra0.8b, r2, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -14215,37 +17406,44 @@ index 0000000..aa9e1e7 + ror ra3.8c, r1, ra8.8d + ror ra1.8c, r1, ra8.8c + -+# Extract weighted prediction information in parallel -+ + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 weight L1 (hi16)/weight L0 (lo16) -+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9 ++ ror ra3.8d, r1, ra8.8d ++ ror ra1.8d, r1, ra8.8c + -+# r3 = 16 from (long way) above -+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ; -+ asr ra18, r0, r3 ; mov rb5, ra3.8b -+ bra -, ra31 -+ shl r0, r0, r3 ; mov rb6, ra3.8c -+ mov r3, 0 ; mov rb7, ra3.8d # loop count ; -+ asr rb12, r1, 9 ++# Extract weighted prediction information in parallel ++# We are annoyingly A src limited here + -+# >>> branch ra31 ++ mov rb4, ra3.8a ; mov ra18, unif ++ mov rb5, ra3.8b ++ mov rb6, ra3.8c ++ mov.ifnz ra5, ra18 ++ ++ mov rb_dest, unif # Destination address ++ ++ bra -, ra_link ++ ++ shl r0, ra5.16b, rb13 # Offset calc ++ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ mov r3, 0 ; mov rb7, ra3.8d ++# >>> branch ra_link +# +# r3 = 0 -+# ra18 = weight L1 -+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble) -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# ra18.16a = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb12 = (((is P) ? 
offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++# rb13 = weight denom + 6 + 9 ++# rb14 = weight L0 + + +################################################################################ -+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + +::mc_filter -+# r0 = weight << 16; We want weight * 2 in rb14 -+ asr rb14, r0, 15 ++# ra5.16a = weight << 16; We want weight * 2 in rb14 ++ ++ shl rb14, ra5.16a, 1 + +# r3 = 0 + @@ -14261,20 +17459,20 @@ index 0000000..aa9e1e7 +# might be B where y != y2 so we must do full processing on both y and y2 + + sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++ mov.ifz ra_base, ra_base_next ; mov rb31, r3 + mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14283,21 +17481,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz 
r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
+ mov ra8, ra9 ; mov rb8, rb9
+ brr.anyn -, r:yloop
+ mov ra9, ra10 ; mov rb9, rb10
+ mov ra10, ra11 ; mov rb10, rb11
+ mov ra11, r0 ; mov rb11, r1
+# >>> .anyn yloop
+
+# apply vertical filter and write to VPM
+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+ add r1, r1, r0 ; mul24 r0, ra8, rb4
+ add r1, r1, r0 ; mul24 r0, ra9, rb5
+ sub r1, r1, r0 ; mul24 r0, ra10, rb6
+ add r1, r1, r0 ; mul24 r0, ra11, rb7
+ sub r1, r1, r0 ; mov r2, rb12
+
+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
+ asr r1, r1, 14
+ nop ; mul24 r1, r1, rb14
+ add r1, r1, r2 ; mov -, vw_wait
+ shl r1, r1, 8
+
+ brr.anyn -, r:yloop
+ add r1, r1, rb12 # Delay 1
+ asr r1, r1, rb13 # Delay 1
+ min r1, r1, rb_k255 # Delay 2
+ max vpm, r1, 0 # Delay 3
+# >>> branch.anyn yloop
+
-+# DMA out
++# If looping again then we consumed 16 height last loop
++ # rb29 (stride) remains constant
++ # rb17 remains const (based on total height)
++ # recalc rb26, rb18 based on new segment height
++ # N.B. r3 is loop counter still
+
-+ brr -, r:per_block_setup
++ mov r1, 16
++ sub r0, ra_height, r1
++ mov ra_height, r0
++ max.setf r0, r0, 0 # Done if Z now
++
++# DMA out
++ brr.anyz -, r:per_block_setup
+ mov vw_setup, rb26 # VDW setup 0 Delay 1
+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
++ mov vw_addr, rb_dest # start the VDW Delay 3
++# >>> .anyz per_block_setup
++
++ min r0, r0, r1
++ add rb18, rb18, r0
++ sub r0, r0, r1
++ shl r0, r0, i_shift23
++ add rb26, rb26, r0
++
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++
++ mov vw_setup, rb28 # Reset our VPM write pointer
++
++ brr -, r:yloop
++ nop
++ nop
++ nop
++# >>>
++
+
+
+
+################################################################################
+
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
+# In a P block, only the first half of coefficients contain used information.
+# At this point we have already issued two pairs of texture requests for the current block
+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
+# In direct mode the PU will only be 8x8-16x16 so it is tempting to special case that? Or only do 8x8 calls?
+# In the B case we need to do whole calculation twice (for L0 and L1 refs)
+
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
-+ asr rb14, r0, i_shift16
+# asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
+# If we knew there was no clipping then this code would get simpler.
+# Perhaps we could add on the pitch and clip using larger values? 
+ + sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 ++ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 ++ mov.ifz ra_base, ra_base_next ; mov rb31, r3 + mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + + max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 ++ min r2, r2, rb_max_y + add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20 ++ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14391,21 +17619,21 @@ index 0000000..aa9e1e7 + +# apply horizontal filter + nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + sub r0, r2, r3 ; mov r3, rb31 + + sub.setf -, r3, 8 ; mov r1, ra8 @@ -14417,7 +17645,6 @@ index 0000000..aa9e1e7 + # >>> .anyn yloopb + + # apply vertical filter and write to VPM -+ + nop ; mul24 r0, rb8, ra2.8a + nop ; mul24 r1, rb9, ra2.8b + sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c @@ -14433,7 +17660,7 @@ index 0000000..aa9e1e7 + + asr r1, r1, 14 + nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 + + add r1, r1, r0 ; mov -, vw_wait + shl r1, r1, 8 @@ -14443,548 +17670,164 @@ index 0000000..aa9e1e7 + min r1, r1, rb_k255 # Delay 2 + max vpm, r1, 0 # Delay 3 + ++ ++# If looping again 
the we consumed 16 height last loop ++ # rb29 (stride) remains constant ++ # rb17 remains const (based on total height) ++ # recalc rb26, rb18 based on new segment height ++ # N.B. r3 is loop counter still ++ ++ mov r1, 16 ++ sub r0, ra_height, r1 ++ mov ra_height, r0 ++ max.setf r0, r0, 0 # Done if Z now ++ +# DMA out -+ brr -, r:per_block_setup ++ brr.anyz -, r:per_block_setup + mov vw_setup, rb26 # VDW setup 0 Delay 1 + mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, unif # start the VDW Delay 3 ++ mov vw_addr, rb_dest # start the VDW Delay 3 ++# >>> .anyz per_block_setup ++ ++ min r0, r0, r1 ++ add rb18, rb18, r0 ++ sub r0, r0, r1 ++ shl r0, r0, i_shift23 ++ add rb26, rb26, r0 ++ ++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 ++ add rb_dest, rb_dest, r0 ++ ++ mov vw_setup, rb28 # Reset our VDM write pointer ++ ++ brr -, r:yloopb ++ nop ++ nop ++ nop + +################################################################################ + -+# mc_interrupt_exit12() -+::mc_interrupt_exit12 -+ mov -, vw_wait # wait on the VDW -+ -+ # Dummy wait to test instructions -+# mov r3,1000000 -+#:dummy_loop -+# sub.setf r3, r3, 1 -+# nop -+# nop -+# brr.anynn -, r:dummy_loop -+# nop -+# nop -+# nop -+ -+ ldtmu0 -+ ldtmu0 -+ ldtmu1 -+ ldtmu1 -+ -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+ mov -,sacq(0) # 8 -+ mov -,sacq(0) # 9 -+ mov -,sacq(0) # 10 -+ mov -,sacq(0) # 11 -+ -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ -+::mc_exit1 -+ mov -, vw_wait # wait on the VDW -+ -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ ldtmu1 -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ +::mc_end +# Do not add code here because mc_end must appear after all other code. -diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h +diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..db41a4d +index 0000000..27cbb59 --- /dev/null -+++ b/libavcodec/rpi_user_vcsm.h -@@ -0,0 +1,459 @@ -+/***************************************************************************** -+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved. -+* -+* This program is the proprietary software of Broadcom Corporation and/or -+* its licensors, and may only be used, duplicated, modified or distributed -+* pursuant to the terms and conditions of a separate, written license -+* agreement executed between you and Broadcom (an "Authorized License"). -+* Except as set forth in an Authorized License, Broadcom grants no license -+* (express or implied), right to use, or waiver of any kind with respect to -+* the Software, and Broadcom expressly reserves all rights in and to the -+* Software and all intellectual property rights therein. IF YOU HAVE NO -+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY -+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF -+* THE SOFTWARE. -+* -+* Except as expressly set forth in the Authorized License, -+* 1. This program, including its structure, sequence and organization, -+* constitutes the valuable trade secrets of Broadcom, and you shall use -+* all reasonable efforts to protect the confidentiality thereof, and to -+* use this information only in connection with your use of Broadcom -+* integrated circuit products. -+* 2. 
TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS" -+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR -+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH -+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL -+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS -+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS, -+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU -+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE. -+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS -+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT, -+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO -+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN -+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS -+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER -+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF -+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY. -+*****************************************************************************/ ++++ b/libavcodec/rpi_shader_cmd.h +@@ -0,0 +1,88 @@ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H + -+#ifndef __USER_VCSM__H__INCLUDED__ -+#define __USER_VCSM__H__INCLUDED__ ++#pragma pack(push, 4) + -+/* VideoCore Shared Memory - user interface library. -+** -+** This library provides all the necessary abstraction for any application to -+** make use of the shared memory service which is distributed accross a kernel -+** driver and a videocore service. -+** -+** It is an application design decision to choose or not to use this service. -+** -+** The logical flow of operations that a user application needs to follow when -+** using this service is: -+** -+** 1) Initialize the service. -+** 2) Allocate shared memory blocks. -+** 3) Start using the allocated blocks. -+** - In order to gain ownership on a block, lock the allocated block, -+** locking a block returns a valid address that the user application -+** can access. -+** - When finished with using the block for the current execution cycle -+** or function, and so when giving up the ownership, unlock the block. -+** 4) A block can be locked/unlocked as many times required - within or outside -+** of - a specific execution context. -+** 5) To completely release an allocated block, free it. -+** 6) If the service is no longer required, terminate it. 
-+** -+** -+** Some generic considerations: ++typedef struct qpu_mc_pred_c_s { ++ uint32_t next_fn; ++ int16_t next_src_y; ++ int16_t next_src_x; ++ uint32_t next_src_base_c; ++ union { ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ } p; ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t weight_u; ++ uint32_t weight_v; ++ uint32_t dummy0; ++ } b0; ++ struct { ++ uint32_t dummy0; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ } b1; ++ struct { ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t dummy0; ++ } s0; ++ struct { ++ uint32_t dummy0; ++ uint32_t dummy1; ++ uint32_t dummy2; ++ uint32_t dummy3; ++ uint32_t dummy4; ++ uint32_t dummy5; ++ } s1; ++ }; ++} qpu_mc_pred_c_t; + -+** Allocating memory blocks. -+** -+** Memory blocks can be allocated in different manners depending on the cache -+** behavior desired. A given block can either be: ++typedef struct qpu_mc_pred_y_s { ++ int16_t next_src1_x; ++ int16_t next_src1_y; ++ uint32_t next_src1_base; ++ int16_t next_src2_x; ++ int16_t next_src2_y; ++ uint32_t next_src2_base; ++ union { ++ struct { ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ uint32_t dst_addr; ++ } p; ++ struct { ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t dummy0; ++ } s; ++ }; ++ uint32_t next_fn; ++} qpu_mc_pred_y_t; + -+** - Allocated in a non cached fashion all the way through host and videocore. -+** - Allocated in a cached fashion on host OR videocore. -+** - Allocated in a cached fashion on host AND videocore. -+** -+** It is an application decision to determine how to allocate a block. Evidently -+** if the application will be doing substantial read/write accesses to a given block, -+** it is recommended to allocate the block at least in a 'host cached' fashion for -+** better results. -+** -+** -+** Locking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, locking the -+** memory block (and so taking ownership of it) will trigger a cache invalidation. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** It is possible to dynamically change the host cache behavior (ie cached or non -+** cached) of a given allocation without needing to free and re-allocate the block. -+** This feature can be useful for such application which requires access to the block -+** only at certain times and not otherwise. By changing the cache behavior dynamically -+** the application can optimize performances for a given duration of use. -+** Such dynamic cache behavior remapping only applies to host cache and not videocore -+** cache. If one requires to change the videocore cache behavior, then a new block -+** must be created to replace the old one. -+** -+** On successful locking, a valid pointer is returned that the application can use -+** to access to data inside the block. There is no guarantee that the pointer will -+** stay valid following the unlock action corresponding to this lock. 
-+** -+** -+** Unocking memory blocks. -+** -+** When the memory block has been allocated in a host cached fashion, unlocking the -+** memory block (and so forgiving its ownership) will trigger a cache flush unless -+** explicitely asked not to flush the cache for performances reasons. -+** -+** For the above reason and when using host cached allocation, it is important that -+** an application properly implements the lock/unlock mechanism to ensure cache will -+** stay coherent, otherwise there is no guarantee it will at all be. -+** -+** -+** A complete API is defined below. -+*/ ++#pragma pack(pop) + -+#ifdef __cplusplus -+extern "C" -+{ +#endif + -+/* Different status that can be dumped. -+*/ -+typedef enum -+{ -+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore. -+ // Result of the walk is seen in the videocore -+ // log. -+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host -+ // driver (ie for all processes). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host -+ // driver (for current process). Result of -+ // the walk is seen in the kernel log. -+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and -+ // VCSM_STATUS_HOST_WALK_MAP. -+ // -+ VCSM_STATUS_NONE, // Must be last - invalid. -+ -+} VCSM_STATUS_T; -+ -+/* Different kind of cache behavior. -+*/ -+typedef enum -+{ -+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies. -+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space). -+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore. -+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore. -+ -+} VCSM_CACHE_TYPE_T; -+ -+/* Initialize the vcsm processing. -+** -+** Must be called once before attempting to do anything else. -+** -+** Returns 0 on success, -1 on error. -+*/ -+int vcsm_init( void ); -+ -+ -+/* Terminates the vcsm processing. -+** -+** Must be called vcsm services are no longer needed, it will -+** take care of removing any allocation under the current process -+** control if deemed necessary. -+*/ -+void vcsm_exit( void ); -+ -+ -+/* Queries the status of the the vcsm. -+** -+** Triggers dump of various kind of information, see the -+** different variants specified in VCSM_STATUS_T. -+** -+** Pid is optional. -+*/ -+void vcsm_status( VCSM_STATUS_T status, int pid ); -+ -+ -+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory -+** allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc( unsigned int size, char *name ); -+ -+ -+/* Allocates a cached block of memory of size 'size' via the vcsm memory -+** allocator, the type of caching requested is passed as argument of the -+** function call. 
-+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name ); -+ -+ -+/* Shares an allocated block of memory via the vcsm memory allocator. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** On success, the user must invoke vcsm_lock with the returned opaque -+** handle to gain access to the memory associated with the opaque handle. -+** When finished using the memory, the user calls vcsm_unlock_xx (see those -+** function definition for more details on the one that can be used). -+** -+** A well behaved application should make every attempt to lock/unlock -+** only for the duration it needs to access the memory data associated with -+** the opaque handle. -+*/ -+unsigned int vcsm_malloc_share( unsigned int handle ); -+ -+ -+/* Resizes a block of memory allocated previously by vcsm_alloc. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** The handle must be unlocked by user prior to attempting any -+** resize action. -+** -+** On error, the original size allocated against the handle -+** remains available the same way it would be following a -+** successful vcsm_malloc. -+*/ -+int vcsm_resize( unsigned int handle, unsigned int new_size ); -+ -+ -+/* Frees a block of memory that was successfully allocated by -+** a prior call the vcms_alloc. -+** -+** The handle should be considered invalid upon return from this -+** call. -+** -+** Whether any memory is actually freed up or not as the result of -+** this call will depends on many factors, if all goes well it will -+** be freed. If something goes wrong, the memory will likely end up -+** being freed up as part of the vcsm_exit process. In the end the -+** memory is guaranteed to be freed one way or another. -+*/ -+void vcsm_free( unsigned int handle ); -+ -+ -+/* Retrieves a videocore opaque handle from a mapped user address -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr ); -+ -+ -+/* Retrieves a videocore opaque handle from a opaque handle -+** pointer. The videocore handle will correspond to the actual -+** memory mapped in videocore. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. 
-+** -+** Note: the videocore opaque handle is distinct from the user -+** opaque handle (allocated via vcsm_malloc) and it is only -+** significant for such application which knows what to do -+** with it, for the others it is just a number with little -+** use since nothing can be done with it (in particular -+** for safety reason it cannot be used to map anything). -+*/ -+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle ); -+ -+ -+/* Retrieves a user opaque handle from a mapped user address -+** pointer. -+** -+** Returns: 0 on error -+** a non-zero opaque handle on success. -+*/ -+unsigned int vcsm_usr_handle( void *usr_ptr ); -+ -+ -+/* Retrieves a mapped user address from an opaque user -+** handle. -+** -+** Returns: 0 on error -+** a non-zero address on success. -+** -+** On success, the address corresponds to the pointer -+** which can access the data allocated via the vcsm_malloc -+** call. -+*/ -+void *vcsm_usr_address( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. -+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock( unsigned int handle ); -+ -+ -+/* Locks the memory associated with this opaque handle. The lock -+** also gives a chance to update the *host* cache behavior of the -+** allocated buffer if so desired. The *videocore* cache behavior -+** of the allocated buffer cannot be changed by this call and such -+** attempt will be ignored. -+** -+** The system will attempt to honour the cache_update mode request, -+** the cache_result mode will provide the final answer on which cache -+** mode is really in use. Failing to change the cache mode will not -+** result in a failure to lock the buffer as it is an application -+** decision to choose what to do if (cache_result != cache_update) -+** -+** The value returned in cache_result can only be considered valid if -+** the returned pointer is non NULL. The cache_result pointer may be -+** NULL if the application does not care about the actual outcome of -+** its action with regards to the cache behavior change. -+** -+** Returns: NULL on error -+** a valid pointer on success. -+** -+** A user MUST lock the handle received from vcsm_malloc -+** in order to be able to use the memory associated with it. -+** -+** On success, the pointer returned is only valid within -+** the lock content (ie until a corresponding vcsm_unlock_xx -+** is invoked). -+*/ -+void *vcsm_lock_cache( unsigned int handle, -+ VCSM_CACHE_TYPE_T cache_update, -+ VCSM_CACHE_TYPE_T *cache_result ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr( void *usr_ptr ); -+ -+ -+/* Unlocks the memory associated with this user mapped address. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. 
-+** -+** After unlocking a mapped address, the user should no longer -+** attempt to reference it. -+*/ -+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. -+*/ -+int vcsm_unlock_hdl( unsigned int handle ); -+ -+ -+/* Unlocks the memory associated with this user opaque handle. -+** Apply special processing that would override the otherwise -+** default behavior. -+** -+** If 'cache_no_flush' is specified: -+** Do not flush cache as the result of the unlock (if cache -+** flush was otherwise applicable in this case). -+** -+** Returns: 0 on success -+** -errno on error. -+** -+** After unlocking an opaque handle, the user should no longer -+** attempt to reference the mapped addressed once associated -+** with it. -+*/ -+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush ); -+ -+/* Clean and/or invalidate the memory associated with this user opaque handle -+** -+** Returns: non-zero on error -+** -+** structure contains a list of flush/invalidate commands. Commands are: -+** 0: nop -+** 1: invalidate given virtual range in L1/L2 -+** 2: clean given virtual range in L1/L2 -+** 3: clean+invalidate given virtual range in L1/L2 -+** 4: flush all L1/L2 -+*/ -+struct vcsm_user_clean_invalid_s { -+ struct { -+ unsigned int cmd; -+ unsigned int handle; -+ unsigned int addr; -+ unsigned int size; -+ } s[8]; -+}; -+ -+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s ); -+ -+#ifdef __cplusplus -+} -+#endif -+ -+#endif /* __USER_VCSM__H__INCLUDED__ */ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..9580165 +index 0000000..b061fe0 --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,406 @@ +@@ -0,0 +1,581 @@ +#include "config.h" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_mailbox.h" +#include "rpi_zc.h" ++#include "libavutil/avassert.h" ++#include + +#include "libavutil/buffer_internal.h" ++#include ++ ++#define TRACE_ALLOC 0 + +struct ZcPoolEnt; + +typedef struct ZcPool +{ + int numbytes; ++ unsigned int n; + struct ZcPoolEnt * head; + pthread_mutex_t lock; +} ZcPool; @@ -14993,27 +17836,56 @@ index 0000000..9580165 +{ + // It is important that we start with gmem as other bits of code will expect to see that + GPU_MEM_PTR_T gmem; ++ unsigned int n; + struct ZcPoolEnt * next; + struct ZcPool * pool; +} ZcPoolEnt; + -+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size) ++#if 1 ++//#define ALLOC_PAD 0x1000 ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++//#define ALLOC_N_OFFSET 0x100 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 0x80 ++#define STRIDE_OR 0x80 ++#else ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++#define ALLOC_N_OFFSET 0 ++#define STRIDE_ROUND 32 ++#define STRIDE_OR 0 ++#endif ++ ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++ ++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) +{ + ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt)); + ++ // Round up to 4k & add 4k ++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); ++ + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); + goto fail0; + } + -+ if (gpu_malloc_cached(size, &zp->gmem) != 0) ++ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0) + { -+ 
av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size); ++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); + goto fail1; + } + ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ ++ pool->numbytes = zp->gmem.numbytes; + zp->next = NULL; + zp->pool = pool; ++ zp->n = pool->n++; + return zp; + +fail1: @@ -15024,6 +17896,10 @@ index 0000000..9580165 + +static void zc_pool_ent_free(ZcPoolEnt * const zp) +{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ + gpu_free(&zp->gmem); + av_free(zp); +} @@ -15032,6 +17908,8 @@ index 0000000..9580165 +{ + ZcPoolEnt * p = pool->head; + pool->head = NULL; ++ pool->numbytes = -1; ++ + while (p != NULL) + { + ZcPoolEnt * const zp = p; @@ -15040,15 +17918,21 @@ index 0000000..9580165 + } +} + -+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes) ++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes) +{ + ZcPoolEnt * zp; ++ int numbytes; ++ + pthread_mutex_lock(&pool->lock); + -+ if (numbytes != pool->numbytes) ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) + { + zc_pool_flush(pool); -+ pool->numbytes = numbytes; ++ numbytes = req_bytes; + } + + if (pool->head != NULL) @@ -15062,6 +17946,10 @@ index 0000000..9580165 + } + + pthread_mutex_unlock(&pool->lock); ++ ++ // Start with our buffer empty of preconceptions ++// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ + return zp; +} + @@ -15071,6 +17959,10 @@ index 0000000..9580165 + if (zp != NULL) + { + pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes); ++#endif ++ + if (pool->numbytes == zp->gmem.numbytes) + { + zp->next = pool->head; @@ -15101,10 +17993,18 @@ index 0000000..9580165 + pthread_mutex_destroy(&pool->lock); +} + ++typedef struct ZcOldCtxVals ++{ ++ int thread_safe_callbacks; ++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); ++ void * get_buffer_context; ++} ZcOldCtxVals; + +typedef struct AVZcEnv +{ ++ unsigned int refcount; + ZcPool pool; ++ ZcOldCtxVals old; +} ZcEnv; + +// Callback when buffer unrefed to zero @@ -15124,28 +18024,94 @@ index 0000000..9580165 +} + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const unsigned int video_width, const unsigned int video_height) ++ const int format, const unsigned int video_width, const unsigned int video_height) +{ + AVRpiZcFrameGeometry geo; -+ geo.stride_y = (video_width + 32 + 31) & ~31; -+ geo.stride_c = geo.stride_y / 2; -+// geo.height_y = (video_height + 15) & ~15; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ // geo.stride_y = ((video_width + 32 + 31) & ~31); ++ geo.stride_c = geo.stride_y / 2; ++ // geo.height_y = (video_height + 15) & ~15; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the 
overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ default: ++ memset(&geo, 0, sizeof(geo)); ++ break; ++ } + return geo; +} + ++ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size) +{ + ZcPoolEnt *const zp = zc_pool_alloc(pool, size); + AVBufferRef * buf; ++ intptr_t idata = (intptr_t)zp->gmem.arm; ++#if ALLOC_N_OFFSET != 0 ++ intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1); ++#endif + + if (zp == NULL) { + av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); + goto fail0; + } + -+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) ++#if ALLOC_N_OFFSET != 0 ++ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0); ++#endif ++ ++#if DEBUG_ZAP0_BUFFERS ++ memset((void*)idata, 0, size); ++#endif ++ ++ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); + goto fail2; @@ -15159,13 +18125,12 @@ index 0000000..9580165 + return NULL; +} + -+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame) ++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame) +{ -+ ZcEnv *const zc = s->get_buffer_context; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); + const unsigned int size_y = geo.stride_y * geo.height_y; + const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = size_y + size_c * 2; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; + AVBufferRef * buf; + unsigned int i; + @@ -15173,7 +18138,7 @@ index 0000000..9580165 + + if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL) + { -+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + return AVERROR(ENOMEM); + } + @@ -15184,19 +18149,24 @@ index 0000000..9580165 + } + + frame->buf[0] = buf; ++ + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; -+ frame->data[2] = frame->data[1] + size_c; ++ if (geo.planes_c > 1) ++ frame->data[2] = 
frame->data[1] + size_c; ++ + frame->extended_data = frame->data; + // Leave extended buf alone + + return 0; +} + -+ +#define RPI_GET_BUFFER2 1 + +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) @@ -15206,21 +18176,25 @@ index 0000000..9580165 +#else + int rv; + -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 || -+ frame->format != AV_PIX_FMT_YUV420P) ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) + { +// printf("Do default alloc: format=%#x\n", frame->format); + rv = avcodec_default_get_buffer2(s, frame, flags); + } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ frame->format == AV_PIX_FMT_SAND128) ++ { ++ rv = rpi_get_display_buffer(s->get_buffer_context, frame); ++ } + else + { -+ rv = rpi_get_display_buffer(s, frame); ++ rv = avcodec_default_get_buffer2(s, frame, flags); + } + +#if 0 -+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, -+ frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], ++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], + frame->data[0], frame->data[1], frame->data[2], + frame->buf[0], frame->buf[1], frame->buf[2], + av_buffer_get_opaque(frame->buf[0])); @@ -15241,7 +18215,7 @@ index 0000000..9580165 + dest->width = src->width; + dest->height = src->height; + -+ if (rpi_get_display_buffer(s, dest) != 0) ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) + { + return NULL; + } @@ -15274,14 +18248,16 @@ index 0000000..9580165 +{ + assert(s != NULL); + -+ if (frame->format != AV_PIX_FMT_YUV420P) ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_SAND128) + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format); ++ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + + if (frame->buf[1] != NULL) + { ++ av_assert0(frame->format == AV_PIX_FMT_YUV420P); + if (maycopy) + { + av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); @@ -15317,6 +18293,18 @@ index 0000000..9580165 + return p == NULL ? -1 : p->vc_handle; +} + ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : fr_ref->data - p->arm; ++} ++ ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) ++{ ++ return fr_ref == NULL ? 
0 : fr_ref->size; ++} ++ ++ +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) +{ + const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); @@ -15353,27 +18341,50 @@ index 0000000..9580165 + } +} + ++int av_rpi_zc_in_use(const struct AVCodecContext * const s) ++{ ++ return s->get_buffer2 == av_rpi_zc_get_buffer2; ++} ++ +int av_rpi_zc_init(struct AVCodecContext * const s) +{ -+ ZcEnv * const zc = av_rpi_zc_env_alloc(); -+ if (zc == NULL) ++ if (av_rpi_zc_in_use(s)) + { -+ return AVERROR(ENOMEM); ++ ZcEnv * const zc = s->get_buffer_context; ++ ++zc->refcount; + } ++ else ++ { ++ ZcEnv *const zc = av_rpi_zc_env_alloc(); ++ if (zc == NULL) ++ { ++ return AVERROR(ENOMEM); ++ } + -+ s->get_buffer_context = zc; -+ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ zc->refcount = 1; ++ zc->old.get_buffer_context = s->get_buffer_context; ++ zc->old.get_buffer2 = s->get_buffer2; ++ zc->old.thread_safe_callbacks = s->thread_safe_callbacks; ++ ++ s->get_buffer_context = zc; ++ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ s->thread_safe_callbacks = 1; ++ } + return 0; +} + +void av_rpi_zc_uninit(struct AVCodecContext * const s) +{ -+ if (s->get_buffer2 == av_rpi_zc_get_buffer2) ++ if (av_rpi_zc_in_use(s)) + { + ZcEnv * const zc = s->get_buffer_context; -+ s->get_buffer2 = avcodec_default_get_buffer2; -+ s->get_buffer_context = NULL; -+ av_rpi_zc_env_free(zc); ++ if (--zc->refcount == 0) ++ { ++ s->get_buffer2 = zc->old.get_buffer2; ++ s->get_buffer_context = zc->old.get_buffer_context; ++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; ++ av_rpi_zc_env_free(zc); ++ } + } +} + @@ -15381,19 +18392,19 @@ index 0000000..9580165 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f0109f4 +index 0000000..f4aeb78 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,83 @@ +@@ -0,0 +1,137 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + +// Zero-Copy frame code for RPi +// RPi needs Y/U/V planes to be contiguous for display. By default +// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code prodes a method a making ffmpeg allocate a single -+// bit of memory for the frame when can then be refrence counted until -+// display ahs finsihed with it. ++// display. This code provides a method a making ffmpeg allocate a single ++// bit of memory for the frame when can then be reference counted until ++// display has finished with it. + +#include "libavutil/frame.h" +#include "libavcodec/avcodec.h" @@ -15410,10 +18421,13 @@ index 0000000..f0109f4 + unsigned int height_y; + unsigned int stride_c; + unsigned int height_c; ++ unsigned int planes_c; ++ unsigned int stripes; +} AVRpiZcFrameGeometry; + + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, + const unsigned int video_width, const unsigned int video_height); + +// Replacement fn for avctx->get_buffer2 @@ -15422,7 +18436,7 @@ index 0000000..f0109f4 +// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames +// must be set to 1 as otherwise the buffer info is killed before being returned +// by avcodec_decode_video2. Note also that this means that the AVFrame that is -+// return must be manually derefed with av_frame_unref. This should be done ++// returned must be manually derefed with av_frame_unref. This should be done +// after av_rpi_zc_ref has been called. 
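++//
++// Rough usage sketch (illustrative only - render_frame() is a placeholder
++// for whatever hands the VideoCore handle to the display side):
++//
++//   av_rpi_zc_init(avctx);
++//   avctx->refcounted_frames = 1;
++//   avcodec_decode_video2(avctx, frame, &got_frame, &pkt);
++//   if (got_frame) {
++//       AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, 1);
++//       av_frame_unref(frame);   // safe: ref now keeps the buffer alive
++//       render_frame(av_rpi_zc_vc_handle(ref), av_rpi_zc_offset(ref),
++//                    av_rpi_zc_length(ref));
++//       av_rpi_zc_unref(ref);    // only once display has finished
++//   }
++//   av_rpi_zc_uninit(avctx);
++//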
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags); + @@ -15439,6 +18453,11 @@ index 0000000..f0109f4 +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid +int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); ++// Get offset from the start of the memory referenced ++// by the vc_handle to valid data ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); ++// Length of buffer data ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); +// Get the number of bytes allocated from the frame ref +// Returns 0 if ref doesn't look valid +int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); @@ -15455,6 +18474,8 @@ index 0000000..f0109f4 +// Allocate the environment used by the ZC code +void av_rpi_zc_env_free(AVZcEnvPtr); + ++// Test to see if the context is using zc (checks get_buffer2) ++int av_rpi_zc_in_use(const struct AVCodecContext * const s); + +// Init ZC into a context +// There is nothing magic in this fn - it just packages setting @@ -15466,10 +18487,54 @@ index 0000000..f0109f4 +// get_buffer2 & get_buffer_context +void av_rpi_zc_uninit(struct AVCodecContext * const s); + ++ ++ ++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ const unsigned int stride1 = frame->linesize[0]; ++ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = frame->linesize[0]; ++ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); ++ const unsigned int x = x_c * 2; ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); ++} ++ ++static inline int rpi_sliced_frame(const AVFrame * const frame) ++{ ++ return frame->format == AV_PIX_FMT_SAND128; ++} ++ ++ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..708526e 100644 +index f7adb52..3b398a3 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -15559,6 +18624,30 @@ index f7adb52..708526e 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? 
NULL : +@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; + ++#ifdef RPI ++ // This is going to end badly if we let it continue ++ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++#endif ++ + if ((ret = update_frame_pool(avctx, frame)) < 0) + return ret; + +diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +index 21f8d9e..71ce7b9 100644 +--- a/libavfilter/avfilter.c ++++ b/libavfilter/avfilter.c +@@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) + "options, but options were provided: %s.\n", args); + return AVERROR(EINVAL); + } ++ printf("=== args='%s'\n", args); + + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index b31d233..2767306 100644 --- a/libavformat/mpegts.c @@ -15613,6 +18702,88 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 0dffa4d..5644176 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ } + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 0ed01c4..4705e80 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -303,7 +303,10 @@ enum AVPixelFormat { + AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian + AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian + +- AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ ++ AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + + #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A +diff --git a/libswscale/input.c b/libswscale/input.c +index 14ab5ab..e61b67a 100644 +--- a/libswscale/input.c ++++ b/libswscale/input.c +@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV, + } + } + ++ ++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV, ++ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, ++ int width, uint32_t *unused) ++{ ++ // NIF ++} ++ + #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) + + static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, +@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) + case AV_PIX_FMT_P010BE: + c->chrToYV12 = p010BEToUV_c; + break; ++ case AV_PIX_FMT_SAND128: ++ c->chrToYV12 = sand128ToUV_c; ++ break; + } + if (c->chrSrcHSubSample) { + switch (srcFormat) { +diff --git a/libswscale/utils.c b/libswscale/utils.c +index 576d8f0..d7206cc 100644 +--- a/libswscale/utils.c ++++ b/libswscale/utils.c +@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { + [AV_PIX_FMT_AYUV64LE] = { 1, 1}, + [AV_PIX_FMT_P010LE] = { 1, 0 }, + [AV_PIX_FMT_P010BE] = { 1, 0 }, ++#ifdef RPI ++ [AV_PIX_FMT_SAND128] = { 1, 0 }, ++#endif + }; + + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) diff --git a/pi-util/conf.sh b/pi-util/conf.sh new file mode 100755 index 0000000..8b596a2 @@ -15652,21 +18823,61 @@ index 0000000..8b596a2 + +# gcc option for getting asm listing +# -Wa,-ahls +diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +new file mode 100644 +index 0000000..160e149 +--- /dev/null ++++ b/pi-util/conf1.sh +@@ -0,0 +1,34 @@ ++echo "Configure for Pi1" ++ ++RPI_BUILDROOT=`pwd`/build ++RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot ++RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=$RPI_ROOTFS/opt/vc ++#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++#RPI_DEFS="-D__VCCOREVER__=0x04000000" ++RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ --arch=armv\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..61d1399 +index 0000000..fc14f2a --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 @@ -15688,7 +18899,7 @@ index 0000000..61d1399 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 @@ -15728,7 +18939,7 @@ index 0000000..61d1399 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 @@ -15742,10 +18953,10 @@ index 0000000..61d1399 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 @@ -15774,7 +18985,7 @@ index 0000000..61d1399 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 @@ -15783,9 +18994,9 @@ index 0000000..61d1399 +1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 -+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 
+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 @@ -15804,10 +19015,10 @@ index 0000000..61d1399 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py new file mode 100644 -index 0000000..38f942f +index 0000000..c896bc6 --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,146 @@ +@@ -0,0 +1,154 @@ +#!/usr/bin/env python + +import os @@ -15851,16 +19062,18 @@ index 0000000..38f942f + except: + pass + -+ rv = False + if m1 and m2 and m1.group() == m2.group(): + print >> flog, "Match: " + m1.group() -+ rv = True ++ rv = 0 + elif not m1: + print >> flog, "****** Cannot find m1" ++ rv = 3 + elif not m2: + print >> flog, "****** Cannot find m2" ++ rv = 2 + else: + print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ rv = 1 + flog.close() + return rv + @@ -15906,19 +19119,25 @@ index 0000000..38f942f + print "==== ", name, + sys.stdout.flush() + -+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) : -+ if exp_test == 1: -+ failures.append(name) -+ print ": * FAIL *" -+ else: -+ print ": fail" -+ else: ++ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ ++ elif exp_test > 1 and rv == 1: ++ print ": fail" ++ else: ++ failures.append(name) ++ if rv == 1: ++ print ": * FAIL *" ++ elif (rv == 2) : ++ print ": * CRASH *" ++ elif (rv == 3) : ++ print ": * MD5 MISSING *" ++ else : ++ print ": * BANG *" + + if failures or unx_success: + print "Unexpected Failures:", failures @@ -18462,6 +21681,21 @@ index 0000000..1eacc04 + +if __name__ == '__main__': + main() +diff --git a/pi-util/qem.sh b/pi-util/qem.sh +new file mode 100644 +index 0000000..47dd071 +--- /dev/null ++++ b/pi-util/qem.sh +@@ -0,0 +1,9 @@ ++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex ++QASM=python\ pi-util/qasm.py ++SRC_FILE=libavcodec/rpi_shader.qasm ++DST_BASE=shader ++ ++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR ++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c ++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h ++ diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py new file mode 100755 index 0000000..6a9a33f @@ -18554,3 +21788,137 @@ index 0000000..d8bdd91 +pi-util/rebase_liblinks.py $DST + + +diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py +new file mode 100644 +index 0000000..5935a11 +--- /dev/null ++++ b/pi-util/v3dusage.py +@@ -0,0 +1,128 @@ ++#!/usr/bin/env python ++ ++import sys ++import argparse ++import re ++ ++def do_logparse(logname): ++ ++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') ++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') ++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') ++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') ++ ++ ttotal = {'idle':0.0} ++ tstart = {} ++ qctotal = {} ++ qtstotal = {} ++ l2hits = {} ++ l2total = {} ++ time0 = None ++ idle_start = None ++ qpu_op_no = 0 ++ op_count = 0 ++ ++ with open(logname, "rt") as infile: ++ for line in infile: ++ match = rmatch.match(line) ++ if match: ++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", 
match.group(7), ":" ++ time = float(match.group(1)) ++ unit = match.group(3) ++ opstart = not match.group(2) ++ optype = match.group(7) ++ hascb = match.group(8) != "0" ++ ++ if unit == 'qpu1': ++ unit = unit + "." + str(qpu_op_no) ++ if not opstart: ++ if hascb or optype == 'EXECUTE_SYNC': ++ qpu_op_no = 0 ++ else: ++ qpu_op_no += 1 ++ ++ # Ignore sync type ++ if optype == 'EXECUTE_SYNC': ++ continue ++ ++ if not time0: ++ time0 = time ++ ++ if opstart: ++ tstart[unit] = time; ++ elif unit in tstart: ++ op_count += 1 ++ if not unit in ttotal: ++ ttotal[unit] = 0.0 ++ ttotal[unit] += time - tstart[unit] ++ del tstart[unit] ++ ++ if not idle_start and not tstart: ++ idle_start = time ++ elif idle_start and tstart: ++ ttotal['idle'] += time - idle_start ++ idle_start = None ++ ++ match = rqcycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qctotal: ++ qctotal[unit] = 0 ++ qctotal[unit] += int(match.group(2)) ++ ++ match = rqtscycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qtstotal: ++ qtstotal[unit] = 0 ++ qtstotal[unit] += int(match.group(2)) ++ ++ match = rl2hits.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in l2total: ++ l2total[unit] = 0 ++ l2hits[unit] = 0 ++ l2total[unit] += int(match.group(3)) ++ if match.group(2) == "hits": ++ l2hits[unit] += int(match.group(3)) ++ ++ ++ if not time0: ++ print "No v3d profile records found" ++ else: ++ tlogged = time - time0 ++ ++ print "Logged time:", tlogged, " Op count:", op_count ++ for unit in sorted(ttotal): ++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged) ++ print ++ for unit in sorted(qctotal): ++ if not unit in qtstotal: ++ qtstotal[unit] = 0; ++ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit]) ++ if unit in l2total: ++ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit]) ++ ++ ++ ++if __name__ == '__main__': ++ argp = argparse.ArgumentParser( ++ formatter_class=argparse.RawDescriptionHelpFormatter, ++ description="QPU/VPU perf summary from VC logging", ++ epilog = """ ++Will also summarise TMU stalls if logging requests set in qpu noflush param ++in the profiled code. ++ ++Example use: ++ vcgencmd set_logging level=0xc0 ++ ++ sudo vcdbg log msg >& t.log ++ v3dusage.py t.log ++""") ++ ++ argp.add_argument("logfile") ++ args = argp.parse_args() ++ ++ do_logparse(args.logfile) ++ diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch index 721a065449..5240cf58ce 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1008-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch @@ -22,4 +22,3 @@ index 2fd3f2b..7165652 100644 if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) { *poutbuf = NULL; *poutbuf_size = 0; -
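
A footnote on the SAND128 layout the patch introduces (AV_PIX_FMT_SAND128 /
MMAL_ENCODING_YUVUV128): the frame is stored as full-height vertical stripes,
128 bytes wide, with linesize[3] abused to carry the stripe stride. The
standalone C below mirrors the rpi_sliced_frame_off_y() arithmetic from
rpi_zc.h - a sketch for illustration only (sand_off_y, main and the worked
numbers are not part of the patch):

#include <stdio.h>

/* stride1 = stripe width in bytes (128 for SAND128);
 * stride2 = rows per stripe (linesize[3] in the helpers above), so
 * stride2 * 128 is the byte distance from one stripe to the next. */
static unsigned int sand_off_y(unsigned int stride1, unsigned int stride2,
                               unsigned int x, unsigned int y)
{
    unsigned int x1 = x & (stride1 - 1); /* byte column within the stripe */
    unsigned int x2 = x ^ x1;            /* stripe index * stripe width   */
    return x1 + stride1 * y + stride2 * x2;
}

int main(void)
{
    /* A 1920-wide luma plane has 15 stripes; stride2 comes from the
     * mailbox geometry (1088 would be a plausible value for 1080 rows).
     * Pixel (300, 10): x1 = 44, x2 = 256 -> stripe 2, column 44, row 10. */
    printf("offset = %u\n", sand_off_y(128, 1088, 300, 10));
    return 0;
}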