diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch index f9b7f1bd34..2786d22397 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1001-pfcd_hevc_optimisations.patch @@ -1,7 +1,7 @@ -diff --git b/.gitignore a/.gitignore +diff --git a/.gitignore b/.gitignore index 524fb73..305632b 100644 ---- b/.gitignore -+++ a/.gitignore +--- a/.gitignore ++++ b/.gitignore @@ -23,6 +23,7 @@ .\#* /.config @@ -10,81 +10,10 @@ index 524fb73..305632b 100644 /ffmpeg /ffplay /ffprobe -diff --git b/Changelog a/Changelog -index 6f023a9..ad53c9d 100644 ---- b/Changelog -+++ a/Changelog -@@ -1,7 +1,7 @@ - Entries are sorted chronologically from oldest to youngest within each release, - releases are sorted from youngest to oldest. - --version 3.3: -+version : - - CrystalHD decoder moved to new decode API - - add internal ebur128 library, remove external libebur128 dependency - - Pro-MPEG CoP #3-R2 FEC protocol -@@ -22,7 +22,6 @@ version 3.3: - - threshold filter - - midequalizer filter - - Optimal Huffman tables for (M)JPEG encoding --- VAAPI-accelerated MPEG-2 and VP8 encoding - - FM Screen Capture Codec decoder - - native Opus encoder - - ScreenPressor decoder -@@ -33,7 +32,6 @@ version 3.3: - - Removed the legacy X11 screen grabber, use XCB instead - - MPEG-7 Video Signature filter - - Removed asyncts filter (use af_aresample instead) --- Intel QSV-accelerated VP8 video decoding - - - version 3.2: -@@ -121,6 +119,7 @@ version 3.1: - - libutvideo wrapper removed - - YUY2 Lossless Codec decoder - - VideoToolbox H.264 encoder -+- VAAPI-accelerated MPEG-2 and VP8 encoding - - - version 3.0: -diff --git b/RELEASE_NOTES a/RELEASE_NOTES -new file mode 100644 -index 0000000..c3ec010 ---- /dev/null -+++ a/RELEASE_NOTES -@@ -0,0 +1,15 @@ -+ -+ ┌────────────────────────────────────────┐ -+ │ RELEASE NOTES for FFmpeg 3.2 "Hypatia" │ -+ └────────────────────────────────────────┘ -+ -+ The FFmpeg Project proudly presents FFmpeg 3.2 "Hypatia", about 4 -+ months after the release of FFmpeg 3.1. -+ -+ A complete Changelog is available at the root of the project, and the -+ complete Git history on http://source.ffmpeg.org. -+ -+ We hope you will like this release as much as we enjoyed working on it, and -+ as usual, if you have any questions about it, or any FFmpeg related topic, -+ feel free to join us on the #ffmpeg IRC channel (on irc.freenode.net) or ask -+ on the mailing-lists. -diff --git b/doc/Doxyfile a/doc/Doxyfile -index 0891899..8f855f8 100644 ---- b/doc/Doxyfile -+++ a/doc/Doxyfile -@@ -38,7 +38,7 @@ PROJECT_NAME = FFmpeg - # could be handy for archiving the generated documentation or if some version - # control system is used. 
- --PROJECT_NUMBER = -+PROJECT_NUMBER = 3.2 - - # Using the PROJECT_BRIEF tag one can provide an optional one line description - # for a project that appears at the top of each page and should give viewer a -diff --git b/ffmpeg.c a/ffmpeg.c -index 11faf0d..494c23d 100644 ---- b/ffmpeg.c -+++ a/ffmpeg.c +diff --git a/ffmpeg.c b/ffmpeg.c +index 4b4dae4..9a7c29c 100644 +--- a/ffmpeg.c ++++ b/ffmpeg.c @@ -23,6 +23,11 @@ * multimedia converter based on the FFmpeg libraries */ @@ -97,7 +26,7 @@ index 11faf0d..494c23d 100644 #include "config.h" #include #include -@@ -68,6 +73,25 @@ +@@ -69,6 +74,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -123,7 +52,7 @@ index 11faf0d..494c23d 100644 #if HAVE_SYS_RESOURCE_H #include #include -@@ -164,6 +188,174 @@ static int restore_tty; +@@ -165,6 +189,182 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -171,7 +100,7 @@ index 11faf0d..494c23d 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(size_t x, size_t y, size_t w, size_t h) ++static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +{ + MMAL_COMPONENT_T* display; + MMAL_DISPLAYREGION_T region = @@ -182,7 +111,7 @@ index 11faf0d..494c23d 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(w, h); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); + + bcm_host_init(); // TODO is this needed? + mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); @@ -192,7 +121,7 @@ index 11faf0d..494c23d 100644 + + { + MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = MMAL_ENCODING_I420; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; + format->es->video.height = geo.height_y; + format->es->video.crop.x = 0; @@ -209,7 +138,7 @@ index 11faf0d..494c23d 100644 + mmal_port_enable(display->input[0],display_cb_input); + mmal_port_enable(display->control,display_cb_control); + -+ printf("Allocated display %dx%d in %dx%d\n", w, h, geo.stride_y, geo.height_y); ++ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + return display; +} @@ -249,7 +178,14 @@ index 11faf0d..494c23d 100644 + buf->offset = av_rpi_zc_offset(fr_buf); + buf->length = av_rpi_zc_length(fr_buf); + buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+ ++#if 0 ++ { ++ unsigned int n; ++ for (n = 0; n < fr->width; n += 128) { ++ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ } ++ } ++#endif + ++rpi_display_count; +} +#else @@ -284,6 +220,7 @@ index 11faf0d..494c23d 100644 + +static void display_exit(MMAL_COMPONENT_T* display) +{ ++// sleep(120); + if (display) { + mmal_component_destroy(display); + } @@ -298,7 +235,7 @@ index 11faf0d..494c23d 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
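
The preview path added above is driven from do_video_out() further on. As a
rough sketch of the intended calling pattern (illustrative only, reusing the
names introduced in this hunk; not itself part of the patch):

    /* Hand one decoded frame to the MMAL renderer. display_frame() wraps
     * the frame's GPU buffer in an MMAL buffer header via the av_rpi_zc_*
     * helpers, so the display is assumed to hold its own reference on the
     * buffer; display_cb_input() drops that reference when the VPU hands
     * the header back. */
    static void preview_frame(InputStream *ist, AVFrame *frame)
    {
        if (!rpi_display)   /* created lazily, sized from the first frame */
            rpi_display = display_init(frame->format, 0, 0,
                                       frame->width, frame->height);
        display_frame(ist->dec_ctx, rpi_display, frame);
    }
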
-@@ -575,6 +767,11 @@ static void ffmpeg_cleanup(int ret) +@@ -576,6 +776,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } @@ -310,7 +247,7 @@ index 11faf0d..494c23d 100644 for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -587,6 +784,9 @@ static void ffmpeg_cleanup(int ret) +@@ -588,6 +793,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->hwaccel_device); av_freep(&ist->dts_buffer); @@ -320,7 +257,7 @@ index 11faf0d..494c23d 100644 avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -617,6 +817,7 @@ static void ffmpeg_cleanup(int ret) +@@ -618,6 +826,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -328,7 +265,7 @@ index 11faf0d..494c23d 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -1050,6 +1251,15 @@ static void do_video_out(OutputFile *of, +@@ -1053,6 +1262,15 @@ static void do_video_out(OutputFile *of, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; @@ -336,7 +273,7 @@ index 11faf0d..494c23d 100644 + if (next_picture && ist != NULL) + { + if (!rpi_display) -+ rpi_display = display_init(0,0,next_picture->width,next_picture->height); ++ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); + display_frame(ist->dec_ctx, rpi_display, next_picture); + } +#endif @@ -344,7 +281,7 @@ index 11faf0d..494c23d 100644 frame_rate = av_buffersink_get_frame_rate(filter); if (frame_rate.num > 0 && frame_rate.den > 0) duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base)); -@@ -2873,6 +3083,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2884,6 +3102,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; @@ -357,23 +294,24 @@ index 11faf0d..494c23d 100644 ist->dec_ctx->thread_safe_callbacks = 1; av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); -diff --git b/libavcodec/Makefile a/libavcodec/Makefile -index 0dd0c7b..d2eb014 100644 ---- b/libavcodec/Makefile -+++ a/libavcodec/Makefile -@@ -5,6 +5,11 @@ NAME = avcodec +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 0dd0c7b..b9732c5 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -5,6 +5,12 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ + rpi_qpu.h \ + rpi_shader.h \ ++ rpi_shader_cmd.h \ + rpi_mailbox.h \ + rpi_hevc_transform.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -47,6 +52,10 @@ OBJS = allcodecs.o \ +@@ -47,6 +53,10 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ @@ -384,36 +322,26 @@ index 0dd0c7b..d2eb014 100644 vorbis_parser.o \ xiph.o \ -@@ -973,8 +982,7 @@ OBJS-$(CONFIG_AAC_ADTSTOASC_BSF) += aac_adtstoasc_bsf.o aacadtsdec.o \ - OBJS-$(CONFIG_CHOMP_BSF) += chomp_bsf.o - OBJS-$(CONFIG_DUMP_EXTRADATA_BSF) += dump_extradata_bsf.o - OBJS-$(CONFIG_DCA_CORE_BSF) += dca_core_bsf.o --OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o \ -- h2645_parse.o -+OBJS-$(CONFIG_EXTRACT_EXTRADATA_BSF) += extract_extradata_bsf.o - OBJS-$(CONFIG_H264_MP4TOANNEXB_BSF) += h264_mp4toannexb_bsf.o - OBJS-$(CONFIG_HEVC_MP4TOANNEXB_BSF) += hevc_mp4toannexb_bsf.o - OBJS-$(CONFIG_IMX_DUMP_HEADER_BSF) += imx_dump_header_bsf.o -@@ -1103,3 +1111,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1103,3 +1113,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: 
$(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + -+QASM := $(SUBDIR)../pi-util/qasm.py ++QASM_PY := ../local/bin/qasm.py + -+ifneq ("$(wildcard $(QASM))","") ++ifneq ("$(wildcard $(QASM_PY))","") +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h -diff --git b/libavcodec/allcodecs.c a/libavcodec/allcodecs.c ++$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c index 4df4772..ca05158 100644 ---- b/libavcodec/allcodecs.c -+++ a/libavcodec/allcodecs.c +--- a/libavcodec/allcodecs.c ++++ b/libavcodec/allcodecs.c @@ -696,6 +696,7 @@ static void register_all(void) REGISTER_PARSER(H261, h261); REGISTER_PARSER(H263, h263); @@ -422,26 +350,29 @@ index 4df4772..ca05158 100644 REGISTER_PARSER(HEVC, hevc); REGISTER_PARSER(MJPEG, mjpeg); REGISTER_PARSER(MLP, mlp); -diff --git b/libavcodec/arm/Makefile a/libavcodec/arm/Makefile -index 1eeac54..f96f93b 100644 ---- b/libavcodec/arm/Makefile -+++ a/libavcodec/arm/Makefile -@@ -135,8 +135,10 @@ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index 1eeac54..a94a240 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -134,9 +134,13 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevc_misc_neon.o \ arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o ++ arm/hevcdsp_cres_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o -diff --git b/libavcodec/arm/cabac.h a/libavcodec/arm/cabac.h +diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h index fdbf86b..0a3980a 100644 ---- b/libavcodec/arm/cabac.h -+++ a/libavcodec/arm/cabac.h +--- a/libavcodec/arm/cabac.h ++++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ #include "libavutil/internal.h" #include "libavcodec/cabac.h" @@ -620,11 +551,11 @@ index fdbf86b..0a3980a 100644 #endif /* HAVE_ARMV6T2_INLINE */ #endif /* AVCODEC_ARM_CABAC_H */ -diff --git b/libavcodec/arm/hevc_cabac.h a/libavcodec/arm/hevc_cabac.h +diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 index 0000000..31d3c59 --- /dev/null -+++ a/libavcodec/arm/hevc_cabac.h ++++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ +/* + * This file is part of FFmpeg. 
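
Before the assembler below, one reference point: the hot operation these
headers specialise is CABAC bin decoding, whose bypass case looks roughly
like this in portable C (cf. get_cabac_bypass() in
libavcodec/cabac_functions.h; a sketch for orientation, not code from this
patch):

    static inline int get_cabac_bypass_ref(CABACContext *c)
    {
        int range;
        c->low += c->low;            /* shift one bit into the offset  */
        if (!(c->low & CABAC_MASK))  /* bit reservoir exhausted ...    */
            refill(c);               /* ... pull in CABAC_BITS more    */
        range = c->range << (CABAC_BITS + 1);
        if (c->low < range)
            return 0;
        c->low -= range;
        return 1;
    }

The inline-asm variants aim to trim the per-bin cost of exactly this kind of
shift/compare/renormalise loop.
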
@@ -1117,18 +1048,873 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ -diff --git b/libavcodec/arm/hevcdsp_deblock_neon.S a/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..a088cc3 100644 ---- b/libavcodec/arm/hevcdsp_deblock_neon.S -+++ a/libavcodec/arm/hevcdsp_deblock_neon.S -@@ -383,3 +383,127 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 +diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S +new file mode 100644 +index 0000000..373576b +--- /dev/null ++++ b/libavcodec/arm/hevc_misc_neon.S +@@ -0,0 +1,62 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ vmov.i64 q8, #0 ++ adr r12, zc_tab ++ vmov.i64 q9, #0 ++ tst r0, #63 ++ vmov.i64 q10, #0 ++ add r0, #63 ++ vmov.i64 q11, #0 ++ and r0, #~63 ++ ldr pc, [r12, r1, lsl #2] ++ ++zc_tab: ++ .word zc_lc2 ++ .word zc_lc3 ++ .word zc_lc4 ++ .word zc_lc5 ++ ++@ 4*4*2: "32 bytes" 64 or 0 depending on dst address ++zc_lc2: ++ it eq ++ vstmeq r0, {q8-q11} ++ bx lr ++ ++@ 16*16*2 = 512 = 64 * 8 ++zc_lc4: ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++ vstm r0!, {q8-q11} ++@ 8*8*2 = 128 ++zc_lc3: ++ vstm r0!, {q8-q11} ++ vstm r0, {q8-q11} ++ bx lr ++ ++@ 32*32*2 = 2048 = 128 * 16 ++zc_lc5: ++ vmov.i64 q12, #0 ++ vmov.i64 q13, #0 ++ vmov.i64 q14, #0 ++ vmov.i64 q15, #0 ++ mov r2, #4 ++1: ++ vstm r0!, {q8-q15} ++ subs r2, #1 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++ ++endfunc ++ +diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +new file mode 100644 +index 0000000..880b26e +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_cres_neon.S +@@ -0,0 +1,275 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ General notes: ++@ ++@ Residual is only guaranteed to be cliped to 16 bits ++@ This means that we do need to do movul, qadd, qmovun ++@ rather than addw, qmovun (if we were clipped to 15 then we could get away ++@ with this) ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q0, q1}, [r1] ++ vmov.i64 q2, #0 ++ vmov.i64 q3, #0 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++ mov r12, #4 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! 
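++@ vld2.8 above has de-interleaved the CbCr rows: U bytes landed in
++@ d16/d18, V bytes in d17/d19. Only the U halves are widened and
++@ offset below; d17/d19 are stored back untouched by the vst2.8.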
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ sub r0, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d18, q1 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++ mov r12, #16 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q2, q3}, [r1] ++ vmov.i64 q0, #0 ++ vmov.i64 q1, #0 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++ mov r12, #4 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d17 ++ sub r0, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d17, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++ mov r12, #16 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! 
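++@ After vld2.8, q8 carries the U lanes and q9 the V lanes; only q9
++@ (d18/d19) is widened and offset below, U passes through unchanged.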
++ subs r12, #1 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d18, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++ mov r12, #8 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.8 {d16, d17}, [r0, :128] ++ vld1.16 {q0}, [r1, :128]! ++ vld1.16 {q1}, [r3, :128]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst2.8 {d0, d1}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++ mov r12, #16 ++ add r3, r1, #(16*16*2) @ Offset to V ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ vld1.16 {q2, q3}, [r3, :256]! 
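++@ Interleaved variant: U residuals stream from r1, V residuals from
++@ r3 (set to res + 16*16*2 above), so both planes of the CbCr row
++@ are updated in one pass.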
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst2.8 {q0, q1}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ ++ ++ +diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S +index 166bddb..9bd0a42 100644 +--- a/libavcodec/arm/hevcdsp_deblock_neon.S ++++ b/libavcodec/arm/hevcdsp_deblock_neon.S +@@ -15,7 +15,7 @@ + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1 + */ + + +@@ -31,6 +31,9 @@ + bxeq lr + .endm + ++@ Uses: d2, d4, d18, d19 ++@ Returns: d2, d4 ++@ Modifies: d0-d7, d22-d25 + .macro hevc_loop_filter_chroma_body + vsubl.u8 q3, d4, d2 + vsubl.u8 q11, d18, d19 +@@ -49,6 +52,33 @@ + vqmovun.s16 d4, q2 + .endm + ++ ++@ Uses r2[0:7], r2[8:15] ++@ Modifies: d0-d7, d22-d25 ++.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 ++ vsubl.u8 q3, \Q0, \P0 ++ vsubl.u8 q11, \P1, \Q1 ++ vshl.i16 q3, #2 ++ vadd.i16 q11, q3 ++ ++ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) ++ vdup.16 d0, r2 ++ vmovl.u8 q0, d0 ++ vuzp.16 d0, d1 ++ ++ vrshr.s16 q11, q11, #3 ++ vneg.s16 q12, q0 ++ vmovl.u8 q2, \Q0 ++ vmin.s16 q11, q11, q0 ++ vmax.s16 q11, q11, q12 ++ vaddw.u8 q1, q11, \P0 ++ vsub.i16 q2, q11 ++ vqmovun.s16 \P0, q1 ++ vqmovun.s16 \Q0, q2 ++.endm ++ ++ ++ + .macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] +@@ -60,15 +90,17 @@ + lsr r3, #16 + .endm + +-.macro hevc_loop_filter_luma_body ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++function hevc_loop_filter_luma_body ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 + vmovl.u8 q8, d16 +- vmovl.u8 q9, d18 +- vmovl.u8 q10, d20 +- vmovl.u8 q11, d22 +- vmovl.u8 q12, d24 +- vmovl.u8 q13, d26 +- vmovl.u8 q14, d28 +- vmovl.u8 q15, d30 + + vadd.i16 q7, q9, q11 + vadd.i16 q6, q14, q12 +@@ -77,7 +109,6 @@ + vabd.s16 q7, q7, q10 + vabd.s16 q6, q6, q13 + +- + vdup.16 q0, r2 + vmov q4, q7 + vmov q5, q6 +@@ -152,7 +183,7 @@ + + and r9, r8, r7 + cmp r9, #0 +- beq weakfilter_\@ ++ beq weakfilter_ + + vadd.i16 q2, q11, q12 + vadd.i16 q4, q9, q8 +@@ -210,11 +241,11 @@ + vbit q13, q3, q5 + vbit q14, q2, q5 + +-weakfilter_\@: ++weakfilter_: + mvn r8, r8 + and r9, r8, r7 + cmp r9, #0 +- beq ready_\@ ++ beq ready_ + + vdup.16 q4, r2 + +@@ -275,75 +306,345 @@ weakfilter_\@: + vbit q11, q0, q5 + vbit q12, q4, q5 + +-ready_\@: ++ready_: + vqmovun.s16 d16, q8 +- vqmovun.s16 d18, q9 +- vqmovun.s16 d20, q10 +- vqmovun.s16 d22, q11 +- vqmovun.s16 d24, q12 +- vqmovun.s16 d26, q13 +- vqmovun.s16 d28, q14 +- vqmovun.s16 d30, q15 +-.endm ++ vqmovun.s16 d17, q9 ++ vqmovun.s16 d18, q10 ++ vqmovun.s16 d19, q11 ++ vqmovun.s16 d20, q12 ++ vqmovun.s16 d21, q13 ++ vqmovun.s16 d22, q14 ++ vqmovun.s16 d23, q15 ++ mov pc, lr ++endfunc ++ ++@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) ++function ff_hevc_v_loop_filter_luma2_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ push 
{r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common ++endfunc ++ + + function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ ++ sub r4, r0, #4 ++v_loop_luma_common: ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] ++ + vpush {d8-d15} +- sub r0, #4 +- vld1.8 {d16}, [r0], r1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- hevc_loop_filter_luma_body +- transpose_8x8 d16, d18, d20, d22, d24, d26, d28, d30 +- vst1.8 {d16}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0], r1 +- vst1.8 {d30}, [r0] ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32], r1 ++ vld4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32], r1 ++ ++ vld4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vld4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ ++ vld4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vld4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ ++ vld4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vld4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ ++ vld4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ vld4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++ vld4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vld4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ ++ vld4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vld4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ ++ vld4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32] ++ vld4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32] ++ ++ bl hevc_loop_filter_luma_body ++ ++ neg r1, r1 ++ ++ @ no_p[1] ++ tst r10, #0xff00 ++ itt ne ++ addne r4, r4, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 ++ ++1: ++ @ no_q[1] ++ tst r10, #0xff000000 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 ++ ++2: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 3f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] ++ ++3: ++ @ no_q[0] ++ tst r10, #0xff0000 ++ bne 4f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] ++ ++4: ++bypasswrite: + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ pop {r4-r10,pc} + endfunc + ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t 
stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); sp[4] ++@ ++@ Src should always be on 8 byte boundry & all in the same slice ++ + function ff_hevc_h_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start +- push {r5-r11} ++ push {r4-r10,lr} ++ + vpush {d8-d15} + sub r0, r0, r1, lsl #2 ++ + vld1.8 {d16}, [r0], r1 ++ vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 ++ vld1.8 {d19}, [r0], r1 + vld1.8 {d20}, [r0], r1 ++ vld1.8 {d21}, [r0], r1 + vld1.8 {d22}, [r0], r1 +- vld1.8 {d24}, [r0], r1 +- vld1.8 {d26}, [r0], r1 +- vld1.8 {d28}, [r0], r1 +- vld1.8 {d30}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- add r0, r1 +- hevc_loop_filter_luma_body +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d22}, [r0], r1 +- vst1.8 {d24}, [r0], r1 +- vst1.8 {d26}, [r0], r1 +- vst1.8 {d28}, [r0] +-bypasswrite: ++ vld1.8 {d23}, [r0] ++ ++ bl hevc_loop_filter_luma_body ++ + vpop {d8-d15} +- pop {r5-r11} +- bx lr ++ ++ neg r1, r1 ++ add r0, r0, r1 ++ ++ @ Why this isn't a bitmask to start with I have no idea... ++ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 ++ ldr r5, [sp, #32] ++ ldrh r10, [r5] ++ ldr r5, [sp, #36] ++ ldrh r5, [r5] ++ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ bne 1f ++ ++ vst1.8 {d22}, [r0], r1 ++ vst1.8 {d21}, [r0], r1 ++ vst1.8 {d20}, [r0], r1 ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r0], r1 ++ vst1.8 {d17}, [r0] ++ ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ vmov r2, r3, d22 ++ vmov r4, r5, d21 ++ vmov r6, r7, d20 ++ ++ tst r10, #0xff0000 ++ ittt eq ++ streq r2, [r0] ++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ add r0, r0, #4 ++ tst r10, #0xff000000 ++ ittt eq ++ streq r3, [r0] ++ streq r5, [r0, r1] ++ streq r7, [r0, r1, lsl # 1] ++ ++ vmov r2, r3, d19 ++ vmov r4, r5, d18 ++ vmov r6, r7, d17 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ ++ tst r10, #0xff00 ++ ittt eq ++ streq r3, [r0] ++ streq r5, [r0, r1] ++ streq r7, [r0, r1, lsl # 1] ++ ++ tst r10, #0xff ++ ittt eq ++ streq r2, [r0, #-4]! 
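++@ The !-writeback above stepped r0 back 4 bytes to the left half of
++@ the row; the two conditional stores below finish the remaining
++@ p-side rows at that same column offset.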
++ streq r4, [r0, r1] ++ streq r6, [r0, r1, lsl # 1] ++ ++ pop {r4-r10,pc} ++ + endfunc + ++@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_h_loop_filter_uv_neon_8, export=1 ++ sub r0, r0, r1, lsl #1 ++ vld2.8 {d16,d17}, [r0], r1 ++ vld2.8 {d18,d19}, [r0], r1 ++ vld2.8 {d26,d27}, [r0], r1 ++ vld2.8 {d28,d29}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ cmp r3, #0 ++ bne 1f ++ vst2.8 {d18,d19}, [r0], r1 ++ vst2.8 {d26,d27}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.8 d18, d19 ++ vzip.8 d26, d27 ++ sub r1, r1, #8 ++ ++ tst r3, #1 ++ bne 1f ++ vst1.8 {d18}, [r0] ++1: add r0, r0, #8 ++ tst r3, #2 ++ bne 2f ++ vst1.8 {d19}, [r0] ++2: add r0, r0, r1 ++ ++ tst r3, #4 ++ bne 1f ++ vst1.8 {d26}, [r0] ++1: add r0, r0, #8 ++ tst r3, #8 ++ it ne ++ bxne lr ++ vst1.8 {d27}, [r0] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++function ff_hevc_v_loop_filter_uv2_neon_8, export=1 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 ++ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] ++ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] ++ ++ hevc_loop_filter_uv_body d16, d18, d26, d28 ++ lsr r2, r2, #16 ++ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ ++ neg r1, r1 ++ ++ ldr r2, [sp, #0] ++ ++ @ p[1] ++ tst r2, #2 ++ itt ne ++ addne r3, r3, r1, lsl #2 ++ bne 1f ++ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 ++ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 ++ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 ++ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 ++ ++1: ++ @ q[1] ++ tst r2, #8 ++ itt ne ++ addne r0, r0, r1, lsl #2 ++ bne 2f ++ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 ++ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ++2: ++ @ p[0] ++ tst r2, #1 ++ bne 3f ++ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 ++ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 ++ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 ++ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] ++ ++3: ++ @ q[0] ++ tst r2, #4 ++ it ne ++ bxne lr ++ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vst4.8 {d26[0], d27[0], d28[0], 
d29[0]}, [r0] ++ ++ bx lr ++endfunc ++ ++ + function ff_hevc_v_loop_filter_chroma_neon, export=1 + hevc_loop_filter_chroma_start + sub r0, #4 +@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 vst1.8 {d4}, [r0] bx lr endfunc + -+/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ * int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ * MvField *curr, MvField *neigh, uint8_t *bs) ++/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i ++ * int *curr_rpl0, int *curr_ ++ * MvField *curr, MvField *ne + */ +function ff_hevc_deblocking_boundary_strengths_neon, export=1 + add ip, sp, #4*4 @@ -1249,11 +2035,12 @@ index 166bddb..a088cc3 100644 +90: mov a3, #1 + b 11b +endfunc -diff --git b/libavcodec/arm/hevcdsp_epel_neon.S a/libavcodec/arm/hevcdsp_epel_neon.S ++ +diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 index 0000000..00eab9e --- /dev/null -+++ a/libavcodec/arm/hevcdsp_epel_neon.S ++++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi @@ -1592,11 +2379,11 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 -diff --git b/libavcodec/arm/hevcdsp_init_neon.c a/libavcodec/arm/hevcdsp_init_neon.c -index 1a3912c..5c72e1d 100644 ---- b/libavcodec/arm/hevcdsp_init_neon.c -+++ a/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,6 +22,8 @@ +diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c +index 1a3912c..c87e9d3 100644 +--- a/libavcodec/arm/hevcdsp_init_neon.c ++++ b/libavcodec/arm/hevcdsp_init_neon.c +@@ -22,11 +22,26 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -1605,10 +2392,49 @@ index 1a3912c..5c72e1d 100644 void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); -@@ -43,6 +45,21 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, + void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++#ifdef RPI ++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++#endif ++ + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); + void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); +@@ -43,6 +58,52 @@ void ff_hevc_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, void ff_hevc_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride); ++#if RPI_HEVC_SAND ++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t 
*_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++#endif ++ +void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); @@ -1623,11 +2449,21 @@ index 1a3912c..5c72e1d 100644 +void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++ ++void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, ++ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++ ++void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ ptrdiff_t srcstride, int height, \ -@@ -58,6 +75,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +119,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -1643,7 +2479,7 @@ index 1a3912c..5c72e1d 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,6 +168,132 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,14 +212,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } @@ -1689,6 +2525,50 @@ index 1a3912c..5c72e1d 100644 + } +} + ++static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ // Width 32 already dealt with ++ // width 16 code works in double lines ++ if (width == 16 && (height & 1) == 0) { ++ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, ++ sao_offset_val_u, sao_left_class_u, ++ sao_offset_val_v, sao_left_class_v, ++ width, 
height); ++ } ++ else ++ { ++ const int shift = 3; // BIT_DEPTH - 5 ++ int k, y, x; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int8_t offset_table_u[32] = { 0 }; ++ int8_t offset_table_v[32] = { 0 }; ++ ++ stride_src /= sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ for (k = 0; k < 4; k++) ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ ++ } ++ } ++} ++ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1)) +static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *_sao_offset_val, int eo, int width, int height) @@ -1767,6 +2647,54 @@ index 1a3912c..5c72e1d 100644 + } + } +} ++ ++ ++static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ ++ if (width == 32 && (height & 7) == 0) { ++ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); ++ } ++ else ++ { ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ int8_t sao_offset_val_u[8]; // padding of 3 for vld ++ int8_t sao_offset_val_v[8]; // padding of 3 for vld ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ++ for (x = 0; x < 5; x++) { ++ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; ++ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; ++ } ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width * 2; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++ } ++} +#undef CMP + +void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, @@ -1776,18 +2704,48 @@ index 1a3912c..5c72e1d 100644 av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { if (bit_depth == 8) { -@@ -161,6 +313,10 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + int x; + c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; 
+ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_8; ++#endif + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +@@ -160,7 +455,25 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->add_residual[1] = ff_hevc_add_residual_8x8_neon_8; c->add_residual[2] = ff_hevc_add_residual_16x16_neon_8; c->add_residual[3] = ff_hevc_add_residual_32x32_neon_8; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++#endif c->transform_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; + for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { + c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; ++ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; + c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; ++ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; + } ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +357,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +514,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -1809,7 +2767,7 @@ index 1a3912c..5c72e1d 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +391,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +548,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } @@ -1819,12 +2777,12 @@ index 1a3912c..5c72e1d 100644 + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } -diff --git b/libavcodec/arm/hevcdsp_sao_neon.S a/libavcodec/arm/hevcdsp_sao_neon.S +diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..9c7808d +index 0000000..08a021d --- /dev/null -+++ a/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,510 @@ ++++ b/libavcodec/arm/hevcdsp_sao_neon.S +@@ -0,0 +1,862 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -1950,24 +2908,186 @@ index 0000000..9c7808d + +function ff_hevc_sao_band_w64_neon_8, export=1 + 
init_sao_band -+1: subs r12, #1 -+ pld [r1, r3] -+ vld1.8 {q8-q9}, [r1, :128]! -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sub r1, #32 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128]! -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 -+ bne 1b + -+ bx lr ++ push {r4, lr} ++ subs r12, #1 ++ mov r4, r1 ++ it ne ++ addne r4, r3 ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ pld [r4] ++ vshr.u8 q12, q8, #3 ++ vshr.u8 q13, q9, #3 ++ add r1, r3 ++ vshr.u8 q14, q10, #3 ++ vshr.u8 q15, q11, #3 ++ sao_band_64 ++ it ne ++ addne r4, r3 ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} +endfunc + ++ ++@ ff_hevc_sao_band_c_w64_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++@ As this is often done in-place on the frame buffer it is worth preloading ++@ the pixel values but we want to beware of loading ouside our buffer to avoid ++@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++ ++function ff_hevc_sao_band_c_neon_8, export=1 ++ mov r12, sp ++ push {r4-r8, lr} // 24 bytes ++ ++ ldm r12, {r4-r7} ++ ++ add r4, #2 ++ add r6, #2 ++ vld1.16 {d16}, [r4] @ Unaligned ++ lsl r5, r5, #3 ++ vld1.16 {d18}, [r6] ++ pld [r1] ++ vmov.i8 d17, #0 ++ mov r4, r1 ++ vmov.i8 d19, #0 ++ lsl r7, r7, #3 ++ vdup.8 q1, r5 ++ ldr r5, [r12, #16] @ width ++ vdup.8 q2, r7 ++ ldr r12, [r12, #20] ++ vqmovn.s16 d0, q8 ++ cmp r5, #16 @ At some point we may want a table lookup ++ vqmovn.s16 d1, q9 ++ vmov.i8 q3, #128 ++ beq 16f ++ ++ @ d0 U lookup ++ @ d1 V lookup ++ @ q1 U raw offset ++ @ q2 V raw offset ++ @ q3 #128 ++ ++ @ r4 = r1 = src - Inteded for preload pointer ++ @ r12 = height ++ ++ @ Might (unlikely) be called with height == 1 ++ subs r12, #1 ++ it ne ++ addne r4, r3 ++ ++1: ++ subs r12, #1 ++ vld2.8 {q8-q9}, [r1, :128]! ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ sub r1, #32 ++ vsub.u8 q15, q11, q2 ++ pld [r4] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r3 @ Do not inc on final pass ++ vst2.8 {q8-q9}, [r0, :128]! 
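++@ The vadd.s8/vqadd.s8/vsub.s8 sequence above biases the unsigned
++@ pixels by 0x80 so the signed saturating add can apply the band
++@ offset without wrapping; the bias is removed again before the
++@ stores.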
++ vst2.8 {q10-q11}, [r0, :128], r2 ++ sub r0, #32 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++@ -- width 16 (UV pairs) -- ++16: ++ subs r12, #2 ++ it ne ++ addne r4, r4, r3, lsl #1 ++ ++1: ++ subs r12, #2 ++ vld2.8 {q8-q9}, [r1, :128], r3 ++ vsub.u8 q12, q8, q1 ++ vld2.8 {q10-q11}, [r1, :128], r3 ++ vsub.u8 q14, q10, q1 ++ vsub.u8 q13, q9, q2 ++ pld [r4] ++ vsub.u8 q15, q11, q2 ++ pld [r4, r3] ++ vshr.u8 q12, #3 ++ vadd.s8 q8, q3 ++ vshr.u8 q13, #3 ++ vadd.s8 q9, q3 ++ ++ vtbl.8 d24, {d0}, d24 ++ vshr.u8 q14, #3 ++ vtbl.8 d25, {d0}, d25 ++ vshr.u8 q15, #3 ++ vtbl.8 d26, {d1}, d26 ++ vadd.s8 q10, q3 ++ vtbl.8 d27, {d1}, d27 ++ vadd.s8 q11, q3 ++ vtbl.8 d28, {d0}, d28 ++ vqadd.s8 q8, q12 ++ vtbl.8 d29, {d0}, d29 ++ vqadd.s8 q9, q13 ++ vtbl.8 d30, {d1}, d30 ++ vqadd.s8 q10, q14 ++ vtbl.8 d31, {d1}, d31 ++ vsub.s8 q8, q3 ++ vqadd.s8 q11, q15 ++ vsub.s8 q9, q3 ++ vsub.s8 q10, q3 ++ vsub.s8 q11, q3 ++ ++ it ne ++ addne r4, r4, r3, lsl #1 ++ vst2.8 {q8-q9}, [r0, :128], r2 ++ vst2.8 {q10-q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ +.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 + vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 + vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 @@ -1977,71 +3097,120 @@ index 0000000..9c7808d + vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +.endm + -+.macro table64 -+ vmov.s8 q13, #2 // 2 to all elements -+ vmov.32 d24[0], r4 // load offset table from general registers -+ vmov.32 d24[1], r5 // load rest of offset table -+ -+ vadd.s8 q0, q13 -+ vadd.s8 q1, q13 -+ vadd.s8 q2, q13 -+ vadd.s8 q3, q13 -+ -+ vmov.u8 q15, #128 // s8 #-128 -+ vtbl.8 d0, {d24}, d0 -+ vadd.s8 q13, q4, q15 -+ vtbl.8 d1, {d24}, d1 -+ vadd.s8 q14, q5, q15 -+ vtbl.8 d2, {d24}, d2 -+ vqadd.s8 q0, q13 -+ vtbl.8 d3, {d24}, d3 -+ vqadd.s8 q1, q14 -+ vtbl.8 d4, {d24}, d4 -+ vadd.s8 q13, q6, q15 -+ vtbl.8 d5, {d24}, d5 -+ vadd.s8 q14, q7, q15 -+ vtbl.8 d6, {d24}, d6 -+ vqadd.s8 q2, q13 -+ vtbl.8 d7, {d24}, d7 -+ vqadd.s8 q3, q14 -+ vsub.s8 q0, q15 -+ vsub.s8 q1, q15 -+ vsub.s8 q2, q15 -+ vsub.s8 q3, q15 -+ vst1.8 {q0-q1}, [r0, :128]! 
-+ vst1.8 {q2-q3}, [r0, :128], r2 -+ sub r0, #32 -+.endm + +// input +// a in q0 - q3 +// c in q4 - q7 +// b in q8 - q11 -+// offset table in r7 and r5 ++// offset table r4,r5 and r6,r7 ++// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +// output in q0 - q3 +// clobbers q12 - q15 -+.macro edge_w64_body -+ diff32 q12, q13, q0, q1, q0, q1, q4, q5 -+ diff32 q0, q1, q14, q15, q8, q9, q4, q5 + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 ++@ a <- c <- b ++@ ++@ It appears that Neon can stall if you try and use results too soon so we try to ++@ spread our instruction out + -+ diff32 q14, q15, q2, q3, q2, q3, q6, q7 -+ diff32 q2, q3, q12, q13, q10, q11, q6, q7 ++.macro edgeidx64 ++ ++ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q1, q5 ++ vcgt.u8 q2, q2, q6 ++ vcgt.u8 q3, q3, q7 ++ ++ vsub.s8 q0, q0, q12 // a = sign(c-a) ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q0, q12 ++ vsub.s8 q1, q1, q13 ++ vsub.s8 q2, q2, q14 ++ vsub.s8 q3, q3, q15 ++ ++ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q2, q14 ++ vadd.s8 q3, q3, q15 ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ @ whilst vmov dn, rm, rn exists it is a vfp instruction ++ @ and causes a stall till neon pipe empty - so don't do that! ++ vmov d26[0], r4 ++ vmov d26[1], r5 ++ vmov d27[0], r6 ++ vmov d27[1], r7 ++ vadd.s8 q2, q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q0, q12 ++ vqadd.s8 q1, q1, q14 ++ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q0, q15 ++ vqadd.s8 q2, q2, q12 ++ vqadd.s8 q3, q3, q14 ++ vsub.s8 q1, q1, q15 ++ vsub.s8 q2, q2, q15 ++ vsub.s8 q3, q3, q15 + -+ vadd.s8 q2, q14 -+ vadd.s8 q3, q15 -+ table64 +.endm + ++function edge_w64_body ++ edgeidx64 ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bx lr ++endfunc ++ +.macro init_edge_64 -+ push {r4-r5} -+ ldr r12, [sp, #8] // height -+ ldr r5, [sp, #12] // sao_offset_val_table -+ ldr r4, [r5] -+ add r5, #4 -+ ldr r5, [r5] ++ push {r4-r8,lr} ++ ldr r12, [sp, #24] // height ++ ldr r5, [sp, #28] // sao_offset_val_table ++ ldrd r4, r5, [r5] ++ mov r6, r4 ++ mov r7, r5 +.endm + +function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 @@ -2064,11 +3233,10 @@ index 0000000..9c7808d + vext.8 q9, q5, q6, #1 + vext.8 q10, q6, q7, #1 + vext.8 q11, q7, q12, #1 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 @@ -2088,7 +3256,7 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1, :128]! 
+ vld1.8 {q10-q11}, [r1, :128], r3 + sub r1, #32 -+ edge_w64_body ++ bl edge_w64_body + // copy c to a + vmov.64 q0, q4 + vmov.64 q1, q5 @@ -2101,8 +3269,7 @@ index 0000000..9c7808d + vmov.64 q7, q11 + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 @@ -2126,11 +3293,10 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #33 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + +function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 @@ -2154,13 +3320,157 @@ index 0000000..9c7808d + vld1.8 {q8-q9}, [r1]! + vld1.8 {q10-q11}, [r1] + sub r1, #31 -+ edge_w64_body ++ bl edge_w64_body + bne 1b + vpop {d8-d15} -+ pop {r4-r5} -+ bx lr ++ pop {r4-r8,pc} +endfunc + ++ ++@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( ++@ uint8_t *_dst, r0 ++@ uint8_t *_src, r1 ++@ ptrdiff_t stride_dst, r2 ++@ ptrdiff_t stride_src, r3 ++@ int height, sp[0] ++@ int16_t *sao_offset_table_u, sp[4] ++@ int16_t *sao_offset_table_v); sp[8] ++@ int eo sp[12] ++ ++function ff_hevc_sao_edge_c_w64_neon_8, export=1 ++ push {r4-r8,lr} // 6 reg = 24 ++ ldr r5, [sp, #28] // sao_offset_val_table_u ++ ldr r7, [sp, #32] // sao_offset_val_table_v ++ ++ @ Load and rearrange offsets ++ @ Also "convert" from 16bit to 8bit ++ ldrb r4, [r5, #2] ++ ldrb r8, [r5, #4] ++ ldrb r6, [r7, #2] ++ ldrb r12, [r7, #4] ++ orr r4, r4, r8, lsl #8 ++ orr r6, r6, r12, lsl #8 ++ ldrb r8, [r5, #6] ++ ldrb r12, [r7, #6] ++ orr r4, r4, r8, lsl #24 ++ orr r6, r6, r12, lsl #24 ++ ldrb r5, [r5, #8] ++ ldrb r7, [r7, #8] ++ ++ ldr r12, [sp, #36] // e0 ++ adr r8, edge_c_tbl_w64 ++ ldr r8, [r8, r12, lsl #2] ++ ++ ldr r12, [sp, #24] // height ++ vpush {d8-d15} ++ mov pc, r8 ++ ++edge_c_tbl_w64: ++ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 ++ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 ++ ++ff_hevc_sao_edge_c_eo0_w64_neon_8: ++ sub r1, #8 ++1: subs r12, #1 ++ vld1.64 {d7}, [r1, :64]! ++ vld1.64 {q4-q5}, [r1, :128]! // load c ++ vld1.64 {q6-q7}, [r1, :128]! ++ vld1.64 {d24}, [r1, :64], r3 ++ sub r1, #72 ++ // load a ++ vext.8 q0, q3, q4, #14 ++ vext.8 q1, q4, q5, #14 ++ vext.8 q2, q5, q6, #14 ++ vext.8 q3, q6, q7, #14 ++ // load b ++ vext.8 q8, q4, q5, #2 ++ vext.8 q9, q5, q6, #2 ++ vext.8 q10, q6, q7, #2 ++ vext.8 q11, q7, q12, #2 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo1_w64_neon_8: ++ sub r1, r3 ++ // load a ++ vldm r1, {q0-q3} ++ add r1, r3 ++ // load c ++ vldm r1, {q4-q7} ++ add r1, r3 ++1: subs r12, #1 ++ // load b ++ vldm r1, {q8-q11} ++ add r1, r3 ++ bl edge_w64_body ++ // copy c to a ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo2_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ sub r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #30 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ add r1, #2 ++ vld1.8 {q8-q9}, [r1]! 
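++@ Chroma rows are CbCr-interleaved, so one horizontal sample step is
++@ 2 bytes: the +/-2 adjustments around these unaligned loads pick the
++@ diagonal (up-left / down-right) neighbours for this EO class.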
++ vld1.8 {q10-q11}, [r1] ++ sub r1, #34 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++ ++ff_hevc_sao_edge_c_eo3_w64_neon_8: ++1: sub r1, r3 ++ // load a ++ // TODO: fix unaligned load ++ // don't reload a like in eo1 ++ add r1, #2 ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ sub r1, #34 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++ // load b ++ sub r1, #2 ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #30 ++ bl edge_w64_body ++ bne 1b ++ vpop {d8-d15} ++ pop {r4-r8,pc} ++endfunc ++ ++ +.macro init_edge_32 + ldr r12, [sp, #4] // sao_offset_val_table + vld1.32 {d31}, [r12] @@ -2277,7 +3587,7 @@ index 0000000..9c7808d + vext.8 q7, q11, q12, #8 + vext.8 q5, q10, q11, #7 + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 + table32 @@ -2317,7 +3627,7 @@ index 0000000..9c7808d + vext.8 q14, q12, q10, #7 + + diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++ diff32 q0, q1, q10, q11, q8, q9, q2, q3 + + vadd.s8 q0, q12 //diff0 + diff1 + vadd.s8 q1, q13 @@ -2335,10 +3645,10 @@ index 0000000..9c7808d + bx lr +endfunc + -diff --git b/libavcodec/avcodec.h a/libavcodec/avcodec.h -index d780477..5807e1b 100644 ---- b/libavcodec/avcodec.h -+++ a/libavcodec/avcodec.h +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index 57334df..7648294 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h @@ -443,6 +443,8 @@ enum AVCodecID { AV_CODEC_ID_XPM, AV_CODEC_ID_AV1, @@ -2348,7 +3658,7 @@ index d780477..5807e1b 100644 /* various PCM "codecs" */ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2925,6 +2927,7 @@ typedef struct AVCodecContext { +@@ -2935,6 +2937,7 @@ typedef struct AVCodecContext { #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. #define FF_BUG_TRUNCATED 16384 #define FF_BUG_IEDGE 32768 @@ -2356,7 +3666,7 @@ index d780477..5807e1b 100644 /** * strictly follow the standard (MPEG-4, ...). -@@ -3276,6 +3279,9 @@ typedef struct AVCodecContext { +@@ -3286,6 +3289,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -2366,7 +3676,7 @@ index d780477..5807e1b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3586,7 +3592,13 @@ typedef struct AVCodecContext { +@@ -3596,7 +3602,13 @@ typedef struct AVCodecContext { #endif /** @@ -2381,10 +3691,10 @@ index d780477..5807e1b 100644 * the end of the audio. I.e. this number of decoded samples must be * discarded by the caller from the end of the stream to get the original * audio without any trailing padding. 
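A note on the CABACContext change below: the fields added to cabac.h hold
state for the "by22" bypass decoder that the HEVC code later in this patch
relies on (see the BY22 comments in hevc_cabac.c). The idea, as a rough
sketch with hypothetical names rather than the patch's own code: decoding
one bypass bin doubles the offset and conditionally subtracts the range,
which is one step of a binary long division, so a run of k bins is just the
next k quotient bits of offset/range, and a single divide (or a 32x32->64
multiply by a precomputed reciprocal) can produce a whole run at once:

    #include <stdint.h>

    extern int next_stream_bit(void); /* hypothetical bitstream reader */

    static inline int bypass_bin(uint32_t *offset, uint32_t range)
    {
        *offset = (*offset << 1) | next_stream_bit(); /* renormalise */
        if (*offset >= range) { /* quotient bit is 1 */
            *offset -= range;
            return 1;
        }
        return 0; /* quotient bit is 0 */
    }
    /* Iterating this k times yields the top k bits of offset/range; the
     * by22 code buffers 22 stream bits so it can take them in one go. */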
-diff --git b/libavcodec/cabac.h a/libavcodec/cabac.h +diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h index 1bf1c62..ccfa991 100644 ---- b/libavcodec/cabac.h -+++ a/libavcodec/cabac.h +--- a/libavcodec/cabac.h ++++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; typedef struct CABACContext{ int low; @@ -2401,10 +3711,10 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git b/libavcodec/codec_desc.c a/libavcodec/codec_desc.c +diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c index 9711019..9f99a2c 100644 ---- b/libavcodec/codec_desc.c -+++ a/libavcodec/codec_desc.c +--- a/libavcodec/codec_desc.c ++++ b/libavcodec/codec_desc.c @@ -1622,6 +1622,48 @@ static const AVCodecDescriptor codec_descriptors[] = { .props = AV_CODEC_PROP_LOSSLESS, .mime_types= MT("image/png"), @@ -2454,29 +3764,10 @@ index 9711019..9f99a2c 100644 /* various PCM "codecs" */ { -diff --git b/libavcodec/dvdsubdec.c a/libavcodec/dvdsubdec.c -index 4e9c058..22ce728 100644 ---- b/libavcodec/dvdsubdec.c -+++ a/libavcodec/dvdsubdec.c -@@ -189,12 +189,12 @@ static void guess_palette(DVDSubContext* ctx, - r = (((subtitle_color >> 16) & 0xff) * level) >> 8; - g = (((subtitle_color >> 8) & 0xff) * level) >> 8; - b = (((subtitle_color >> 0) & 0xff) * level) >> 8; -- rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17U) << 24); -+ rgba_palette[i] = b | (g << 8) | (r << 16) | ((alpha[i] * 17) << 24); - color_used[colormap[i]] = (i + 1); - j++; - } else { - rgba_palette[i] = (rgba_palette[color_used[colormap[i]] - 1] & 0x00ffffff) | -- ((alpha[i] * 17U) << 24); -+ ((alpha[i] * 17) << 24); - } - } - } -diff --git b/libavcodec/h264.h a/libavcodec/h264.h +diff --git a/libavcodec/h264.h b/libavcodec/h264.h index 86df5eb..22c4f1d 100644 ---- b/libavcodec/h264.h -+++ a/libavcodec/h264.h +--- a/libavcodec/h264.h ++++ b/libavcodec/h264.h @@ -41,7 +41,9 @@ enum { H264_NAL_END_STREAM = 11, H264_NAL_FILLER_DATA = 12, @@ -2487,44 +3778,10 @@ index 86df5eb..22c4f1d 100644 }; #endif /* AVCODEC_H264_H */ -diff --git b/libavcodec/h264_parse.c a/libavcodec/h264_parse.c -index ea202e7..0c87319 100644 ---- b/libavcodec/h264_parse.c -+++ a/libavcodec/h264_parse.c -@@ -59,9 +59,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - if (luma_weight_flag) { - pwt->luma_weight[i][list][0] = get_se_golomb(gb); - pwt->luma_weight[i][list][1] = get_se_golomb(gb); -- if ((int8_t)pwt->luma_weight[i][list][0] != pwt->luma_weight[i][list][0] || -- (int8_t)pwt->luma_weight[i][list][1] != pwt->luma_weight[i][list][1]) -- goto out_range_weight; - if (pwt->luma_weight[i][list][0] != luma_def || - pwt->luma_weight[i][list][1] != 0) { - pwt->use_weight = 1; -@@ -79,9 +76,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - for (j = 0; j < 2; j++) { - pwt->chroma_weight[i][list][j][0] = get_se_golomb(gb); - pwt->chroma_weight[i][list][j][1] = get_se_golomb(gb); -- if ((int8_t)pwt->chroma_weight[i][list][j][0] != pwt->chroma_weight[i][list][j][0] || -- (int8_t)pwt->chroma_weight[i][list][j][1] != pwt->chroma_weight[i][list][j][1]) -- goto out_range_weight; - if (pwt->chroma_weight[i][list][j][0] != chroma_def || - pwt->chroma_weight[i][list][j][1] != 0) { - pwt->use_weight_chroma = 1; -@@ -110,9 +104,6 @@ int ff_h264_pred_weight_table(GetBitContext *gb, const SPS *sps, - } - pwt->use_weight = pwt->use_weight || pwt->use_weight_chroma; - return 0; --out_range_weight: -- 
avpriv_request_sample(logctx, "Out of range weight\n"); -- return AVERROR_INVALIDDATA; - } - - /** -diff --git b/libavcodec/h264_parser.c a/libavcodec/h264_parser.c +diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c index bc35a61..055828c 100644 ---- b/libavcodec/h264_parser.c -+++ a/libavcodec/h264_parser.c +--- a/libavcodec/h264_parser.c ++++ b/libavcodec/h264_parser.c @@ -60,6 +60,8 @@ typedef struct H264ParseContext { uint8_t parse_history[6]; int parse_history_count; @@ -2618,30 +3875,11 @@ index bc35a61..055828c 100644 + .parser_close = h264_close, + .split = h264_split, +}; -diff --git b/libavcodec/h264_slice.c a/libavcodec/h264_slice.c -index 44a0b9f..fa1e9ae 100644 ---- b/libavcodec/h264_slice.c -+++ a/libavcodec/h264_slice.c -@@ -1778,12 +1778,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, - } - if ((pps->weighted_pred && sl->slice_type_nos == AV_PICTURE_TYPE_P) || - (pps->weighted_bipred_idc == 1 && -- sl->slice_type_nos == AV_PICTURE_TYPE_B)) { -- ret = ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, -+ sl->slice_type_nos == AV_PICTURE_TYPE_B)) -+ ff_h264_pred_weight_table(&sl->gb, sps, sl->ref_count, - sl->slice_type_nos, &sl->pwt, h->avctx); -- if (ret < 0) -- return ret; -- } - - sl->explicit_ref_marking = 0; - if (nal->ref_idc) { -diff --git b/libavcodec/hevc.h a/libavcodec/hevc.h -index de77d2a..494ca48 100644 ---- b/libavcodec/hevc.h -+++ a/libavcodec/hevc.h -@@ -21,6 +21,34 @@ +diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h +index de77d2a..a63db2b 100644 +--- a/libavcodec/hevc.h ++++ b/libavcodec/hevc.h +@@ -21,6 +21,45 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H @@ -2649,6 +3887,8 @@ index de77d2a..494ca48 100644 +#ifndef RPI + + #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 + +#else + @@ -2671,15 +3911,24 @@ index de77d2a..494ca48 100644 +// #define RPI_DEBLOCK_VPU + + #define RPI_VPU_DEBLOCK_CACHED 1 ++ ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ #define RPI_TSTATS 0 +#endif + /** * Table 7-3: NAL unit type codes */ -diff --git b/libavcodec/hevc_cabac.c a/libavcodec/hevc_cabac.c -index e27c54e..1dbbb16 100644 ---- b/libavcodec/hevc_cabac.c -+++ a/libavcodec/hevc_cabac.c +diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c +index e27c54e..09727d9 100644 +--- a/libavcodec/hevc_cabac.c ++++ b/libavcodec/hevc_cabac.c @@ -21,6 +21,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -2689,10 +3938,14 @@ index e27c54e..1dbbb16 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" -@@ -29,8 +31,64 @@ +@@ -29,8 +31,68 @@ #include "hevc.h" #include "hevcdec.h" ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ +// BY22 is probably faster than simple bypass if the processor has +// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +// x86 has fast int divide @@ -2754,7 +4007,7 @@ index e27c54e..1dbbb16 100644 /** * number of bin by SyntaxElement. 
*/ -@@ -447,6 +505,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +@@ -447,6 +509,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { { 28, 36, 43, 49, 54, 58, 61, 63, }, }; @@ -2966,7 +4219,7 @@ index e27c54e..1dbbb16 100644 void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) { if (s->ps.pps->entropy_coding_sync_enabled_flag && -@@ -865,19 +1128,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +@@ -865,19 +1132,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); } @@ -2992,7 +4245,7 @@ index e27c54e..1dbbb16 100644 } int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { -@@ -893,14 +1156,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +@@ -893,14 +1160,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); } @@ -3009,7 +4262,7 @@ index e27c54e..1dbbb16 100644 ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ctx_shift = (log2_size + 1) >> 2; } else { -@@ -931,22 +1194,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +@@ -931,22 +1198,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, return value; } @@ -3035,7 +4288,7 @@ index e27c54e..1dbbb16 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -968,90 +1225,337 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -968,90 +1229,395 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -3048,7 +4301,7 @@ index e27c54e..1dbbb16 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -3089,7 +4342,7 @@ index e27c54e..1dbbb16 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -3235,7 +4488,7 @@ index e27c54e..1dbbb16 100644 +static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +{ + return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); - } ++} +#endif + + @@ -3249,7 +4502,7 @@ index e27c54e..1dbbb16 100644 + (*stat_coeff)++; + else if (x == 0 && *stat_coeff > 0) + (*stat_coeff)--; -+} + } +#endif + + @@ -3330,6 +4583,62 @@ index e27c54e..1dbbb16 100644 + return i; +} + ++#ifdef RPI ++static void rpi_add_residual(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ unsigned int stride = frame->linesize[c_idx]; ++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = rpi_sliced_frame(frame); ++ uint8_t * dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ rpi_sliced_frame_pos_y(frame, x, y) : ++ rpi_sliced_frame_pos_c(frame, x, y); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->num_pred_cmds[s->pass0_job]; ++ HEVCPredCmd * const pc = s->univ_pred_cmds[s->pass0_job] + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert0(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.buf + (1 << (log2_trafo_size * 2)) && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->num_pred_cmds[s->pass0_job] = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ } ++ } ++ else if (!is_sliced || c_idx == 0) { ++ s->hevcdsp.add_residual[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++#if RPI_HEVC_SAND ++ else if (c_idx == 1) { ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++ else { ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ } ++#endif ++} ++#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int log2_trafo_size, enum ScanType scan_idx, @@ -3359,13 +4668,16 @@ index e27c54e..1dbbb16 100644 + const uint8_t *scan_x_cg, *scan_y_cg; + const xy_off_t * scan_xy_off; ++#ifndef RPI ptrdiff_t stride = s->frame->linesize[c_idx]; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; - uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ((x0 >> hshift) << s->ps.sps->pixel_shift)]; - int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); - uint8_t significant_coeff_group_flag[8][8] = {{0}}; ++#endif +#ifdef RPI + int use_vpu; +#endif @@ -3398,7 +4710,7 @@ index e27c54e..1dbbb16 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1067,9 +1571,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1067,9 +1633,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -3419,7 +4731,7 @@ index e27c54e..1dbbb16 100644 } if (c_idx == 0) { -@@ -1102,39 +1616,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1102,39 +1678,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -3510,7 +4822,7 @@ index e27c54e..1dbbb16 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1162,119 +1713,133 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1162,119 +1775,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -3578,8 +4890,9 @@ index e27c54e..1dbbb16 100644 + if (s->enable_rpi) { + use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; + coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#ifndef RPI_PRECLEAR -+ // We now do the memset after transform_add while we know the data is cached. 
++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else + memset(coeffs, 0, ccount * sizeof(int16_t)); +#endif + } @@ -3708,7 +5021,7 @@ index e27c54e..1dbbb16 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1288,34 +1853,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1288,34 +1916,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -3757,12 +5070,11 @@ index e27c54e..1dbbb16 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1325,141 +1886,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1325,141 +1949,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; -- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -3810,6 +5122,9 @@ index e27c54e..1dbbb16 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -3820,9 +5135,6 @@ index e27c54e..1dbbb16 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -3833,13 +5145,18 @@ index e27c54e..1dbbb16 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -3847,11 +5164,7 @@ index e27c54e..1dbbb16 100644 - sb_type = 2 * (c_idx == 0 ? 
1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - +- - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -3936,10 +5249,6 @@ index e27c54e..1dbbb16 100644 + + sum_abs += last_coeff_abs_level_remaining + 1; + *level = trans_coeff_level; -+ -+ if (stat_coeff != NULL) -+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); -+ stat_coeff = NULL; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -3960,6 +5269,10 @@ index e27c54e..1dbbb16 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ + if (trans_coeff_level > (3 << c_rice_param) && + (c_rice_param < 4 || rice_adaptation_enabled)) + ++c_rice_param; @@ -4060,7 +5373,7 @@ index e27c54e..1dbbb16 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1469,7 +2074,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1469,7 +2137,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -4069,61 +5382,37 @@ index e27c54e..1dbbb16 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1490,6 +2095,24 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1489,7 +2157,13 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, + } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { s->hevcdsp.transform_4x4_luma(coeffs); - } else { +- } else { ++ } +#ifdef RPI -+ if (!use_vpu) { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) { -+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -+ } else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ -+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); -+ } -+ } ++ else if (!use_vpu) +#else ++ else ++#endif ++ { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); -@@ -1503,6 +2126,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); - } -+#endif - } - } - if (lc->tu.cross_pf) { -@@ -1512,6 +2136,17 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1512,7 +2186,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ if (s->enable_rpi) { -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_TRANSFORM_ADD; -+ cmd->size = log2_trafo_size; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; -+ return; -+ } -+#endif ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++#else 
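++ // Plain build: add the residual into the frame here. The RPI build's
++ // rpi_add_residual() above either queues an RPI_PRED_ADD_RESIDUAL*
++ // command for the worker to apply later or, when s->enable_rpi is off,
++ // performs the equivalent add_residual[_u/_v] call immediately.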
s->hevcdsp.add_residual[log2_trafo_size-2](dst, coeffs, stride); ++#endif } -diff --git b/libavcodec/hevc_filter.c a/libavcodec/hevc_filter.c -index 14e7c8d..e4ffd87 100644 ---- b/libavcodec/hevc_filter.c -+++ a/libavcodec/hevc_filter.c + void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) +diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c +index 14e7c8d..0256b01 100644 +--- a/libavcodec/hevc_filter.c ++++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -4137,18 +5426,83 @@ index 14e7c8d..e4ffd87 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -30,6 +36,10 @@ +@@ -30,6 +36,11 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_zc.h" +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -272,6 +282,10 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -138,6 +149,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) + return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; + } + ++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) ++{ ++#ifdef RPI ++ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#else ++ return s->ps.sps->pixel_shift; ++#endif ++} ++ + static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, + ptrdiff_t stride_dst, ptrdiff_t stride_src) + { +@@ -192,7 +212,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, + ptrdiff_t stride_src, int x, int y, int width, int height, + int c_idx, int x_ctb, int y_ctb) + { +- int sh = s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +@@ -223,13 +243,14 @@ static void restore_tqb_pixels(HEVCContext *s, + int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); + int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); + int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; ++ const unsigned int sh = pixel_shift(s, c_idx); ++ int len = (min_pu_size >> hshift) << sh; + for (y = y_min; y < y_max; y++) { + for (x = x_min; x < x_max; x++) { + if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { + int n; +- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); + for (n = 0; n < (min_pu_size >> vshift); n++) { + memcpy(src, dst, len); + src += stride_src; +@@ -245,7 +266,7 @@ static void restore_tqb_pixels(HEVCContext *s, + + static void sao_filter_CTB(HEVCContext *s, int x, int y) + { +- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + HEVCLocalContext *lc = s->HEVClc; + int c_idx; + int edges[4]; // 0 
left 1 top 2 right 3 bottom +@@ -266,12 +287,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + uint8_t right_tile_edge = 0; + uint8_t up_tile_edge = 0; + uint8_t bottom_tile_edge = 0; ++#ifdef RPI ++ const int sliced = rpi_sliced_frame(s->frame); ++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); ++#else ++ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); ++#endif + + edges[0] = x_ctb == 0; + edges[1] = y_ctb == 0; edges[2] = x_ctb == s->ps.sps->ctb_width - 1; edges[3] = y_ctb == s->ps.sps->ctb_height - 1; @@ -4159,7 +5513,300 @@ index 14e7c8d..e4ffd87 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -495,6 +509,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -303,7 +334,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + +- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { + int x0 = x >> s->ps.sps->hshift[c_idx]; + int y0 = y >> s->ps.sps->vshift[c_idx]; + ptrdiff_t stride_src = s->frame->linesize[c_idx]; +@@ -312,28 +343,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); + int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); + int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; + ptrdiff_t stride_dst; + uint8_t *dst; + ++#ifdef RPI ++ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? ++ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : ++ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++#else ++ const unsigned int sh = s->ps.sps->pixel_shift; ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); ++#endif ++ + switch (sao->type_idx[c_idx]) { + case SAO_BAND: + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); + if (s->ps.pps->transquant_bypass_enable_flag || + (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +- dst = lc->edge_emu_buffer; +- stride_dst = 2*MAX_PB_SIZE; +- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); ++ dst = lc->edge_emu_buffer; ++ stride_dst = 2*MAX_PB_SIZE; ++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); + } else { +- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +- sao->offset_val[c_idx], sao->band_position[c_idx], +- width, height); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } + } + sao->type_idx[c_idx] = SAO_APPLIED; + break; +@@ -341,108 +426,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + { + int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; + int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +- int left_edge = edges[0]; + int top_edge = edges[1]; +- int right_edge = edges[2]; + int bottom_edge = edges[3]; +- int sh = s->ps.sps->pixel_shift; +- int left_pixels, right_pixels; + + stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; + dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + + if (!top_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; + uint8_t *dst1; +- int src_idx, pos; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +- dst1 = dst - stride_dst - (left << sh); +- src1[0] = src - stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? 
src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); + } + } + if (!bottom_edge) { +- int left = 1 - left_edge; +- int right = 1 - right_edge; +- const uint8_t *src1[2]; +- uint8_t *dst1; +- int src_idx, pos; ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; + +- dst1 = dst + height * stride_dst - (left << sh); +- src1[0] = src + height * stride_src - (left << sh); +- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); +- pos = 0; +- if (left) { ++ if (src_l != NULL) { + src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1, src1[src_idx], sh); +- pos += (1 << sh); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); + } ++ + src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +- if (right) { +- pos += width << sh; ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { + src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == + SAO_APPLIED); +- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); + } + } +- left_pixels = 0; +- if (!left_edge) { ++ if (src_l != NULL) { + if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst - (1 << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- left_pixels = 1; ++ copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); + } + } +- right_pixels = 0; +- if (!right_edge) { ++ if (src_r != NULL) { + if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { + copy_vert(dst + (width << sh), + s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), + sh, height, stride_dst, 1 << sh); + } else { +- right_pixels = 1; ++ copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); + } + } + +- copy_CTB(dst - (left_pixels << sh), +- src - (left_pixels << sh), +- (width + left_pixels + right_pixels) << sh, ++ copy_CTB(dst, ++ src, ++ width << sh, + height, stride_dst, stride_src); + + copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, + x_ctb, y_ctb); +- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +- sao->eo_class[c_idx], width, height); +- s->hevcdsp.sao_edge_restore[restore](src, dst, +- stride_src, stride_dst, +- sao, +- edges, width, +- height, c_idx, +- vert_edge, +- horiz_edge, +- diag_edge); ++#ifdef RPI ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++#endif ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, 
height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } + restore_tqb_pixels(s, src, dst, stride_src, stride_dst, + x, y, width, height, c_idx); + sao->type_idx[c_idx] = SAO_APPLIED; +@@ -452,6 +546,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } + ++// Returns 2 or 0. + static int get_pcm(HEVCContext *s, int x, int y) + { + int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +@@ -478,7 +573,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + uint8_t *src; + int x, y; + int chroma, beta; +- int32_t c_tc[2], tc[2]; ++ int32_t c_tc[4], tc[2]; + uint8_t no_p[2] = { 0 }; + uint8_t no_q[2] = { 0 }; + +@@ -495,6 +590,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -4175,27 +5822,81 @@ index 14e7c8d..e4ffd87 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -538,6 +561,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); - } else -+#ifdef RPI_DEBLOCK_VPU -+ if (s->enable_rpi_deblock) { -+ uint8_t (*setup)[2][2][4]; -+ int num16 = (y>>4)*s->setup_width + (x>>4); -+ int a = ((y>>3) & 1) << 1; -+ int b = (x>>3) & 1; -+ setup = s->dvq->y_setup_arm[num16]; -+ setup[0][b][0][a] = beta; -+ setup[0][b][0][a + 1] = beta; -+ setup[0][b][1][a] = tc[0]; -+ setup[0][b][1][a + 1] = tc[1]; -+ } else +@@ -528,19 +632,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + 4); + no_q[0] = get_pcm(s, x, y); + no_q[1] = get_pcm(s, x, y + 4); +- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +- s->hevcdsp.hevc_v_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); ++ } ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ ++ // This copes properly with no_p/no_q ++ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q, ++ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ } ++ else +#endif - s->hevcdsp.hevc_v_loop_filter_luma(src, - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); -@@ -570,6 +606,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ { ++ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ // Standard DSP code is broken if no_p / no_q is set ++ s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ else ++#ifdef RPI_DEBLOCK_VPU ++ if (s->enable_rpi_deblock) { ++ uint8_t (*setup)[2][2][4]; ++ int num16 = (y>>4)*s->setup_width + (x>>4); ++ int a = ((y>>3) & 1) << 1; ++ int b = (x>>3) & 1; ++ setup = s->dvq->y_setup_arm[num16]; ++ setup[0][b][0][a] = beta; ++ setup[0][b][0][a + 1] = beta; ++ setup[0][b][1][a] = tc[0]; ++ setup[0][b][1][a + 1] = tc[1]; ++ } else ++#endif ++ { ++ s->hevcdsp.hevc_v_loop_filter_luma(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } ++ } + } + } + +@@ -560,7 +696,12 @@ static void 
deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; + tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; + tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#endif ++ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + 4, y - 1); +@@ -570,6 +711,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -4215,7 +5916,113 @@ index 14e7c8d..e4ffd87 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -604,9 +653,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -578,6 +732,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + } + + if (s->ps.sps->chroma_format_idc) { ++#ifdef RPI ++ if (rpi_sliced_frame(s->frame)) { ++ const int v = 2; ++ const int h = 2; ++ ++ // vertical filtering chroma ++ for (y = y0; y < y_end; y += 8 * v) { ++ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { ++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; ++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; ++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; ++ unsigned int no_f = 0; ++ ++ // tc_offset here should be set to cur_tc_offset I think ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x - 1, y) ? 1 : 0) | ++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, ++ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ no_f); ++ } ++ } ++ ++ if (y == 0) ++ continue; ++ ++ // horizontal filtering chroma ++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; ++ x_end2 = x_end; ++ if (x_end != s->ps.sps->width) ++ x_end2 = x_end - 8 * h; ++ ++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; ++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; ++ if ((bs0 == 2) || (bs1 == 2)) { ++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; ++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; ++ const uint32_t tc4 = ++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | ++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); ++ unsigned int no_f = 0; ++ ++ if (tc4 == 0) ++ continue; ++ ++ if (pcmf) { ++ no_f = ++ (get_pcm(s, x, y - 1) ? 1 : 0) | ++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | ++ (get_pcm(s, x, y) ? 4 : 0) | ++ (get_pcm(s, x + 4 * h, y) ? 
8 : 0); ++ ++ if (no_f == 0xf) ++ continue; ++ } ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->frame->linesize[1], ++ tc4, no_f); ++ } ++ } ++ } ++ } ++ else ++#endif + for (chroma = 1; chroma <= 2; chroma++) { + int h = 1 << s->ps.sps->hshift[chroma]; + int v = 1 << s->ps.sps->vshift[chroma]; +@@ -594,7 +833,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? ++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x - 1, y); + no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +@@ -604,9 +848,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -4239,7 +6046,21 @@ index 14e7c8d..e4ffd87 100644 } } -@@ -637,6 +700,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -627,7 +885,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) + + c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; + c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ src = ++#ifdef RPI ++ rpi_sliced_frame(s->frame) ? 
++ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#endif ++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + if (pcmf) { + no_p[0] = get_pcm(s, x, y - 1); + no_p[1] = get_pcm(s, x + (4 * h), y - 1); +@@ -637,6 +900,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -4259,7 +6080,7 @@ index 14e7c8d..e4ffd87 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -647,69 +723,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -647,69 +923,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -4329,7 +6150,7 @@ index 14e7c8d..e4ffd87 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -720,10 +733,21 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -720,10 +933,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -4339,8 +6160,9 @@ index 14e7c8d..e4ffd87 100644 - int i, j, bs; + int i, j; + RefPicList *rpl = s->ref->refPicList; -+ int min_pu_in_4pix = (1 << log2_min_pu_size) >> 2; -+ int trafo_in_min_pus = (1 << log2_trafo_size) >> log2_min_pu_size; ++ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); ++ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup ++ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep + int y_pu = y0 >> log2_min_pu_size; + int x_pu = x0 >> log2_min_pu_size; + MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; @@ -4354,7 +6176,7 @@ index 14e7c8d..e4ffd87 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -735,34 +759,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -735,34 +960,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -4431,7 +6253,7 @@ index 14e7c8d..e4ffd87 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -773,64 +819,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -773,64 +1020,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -4467,22 +6289,22 @@ index 14e7c8d..e4ffd87 100644 - - if (log2_trafo_size > log2_min_pu_size && !is_intra) { - RefPicList *rpl = s->ref->refPicList; -- ++ rpl; ++ MvField *left = curr - 1; + - // bs for TU internal horizontal PU boundaries - for (j = 8; j < (1 << log2_trafo_size); j += 8) { - int yp_pu = (y0 + j - 1) >> log2_min_pu_size; - int yq_pu = (y0 + j) >> log2_min_pu_size; -+ rpl; -+ MvField *left = curr - 1; ++ if (is_intra) { ++ for (j = 0; j < (1 << log2_trafo_size); j += 4) ++ bs[j * s->bs_width >> 2] = 2; - for (i = 0; i < (1 << log2_trafo_size); i += 4) { - int x_pu = (x0 + i) >> log2_min_pu_size; - MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; - MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; -+ if (is_intra) { -+ for (j = 0; j < (1 << log2_trafo_size); j += 4) -+ bs[j * s->bs_width >> 2] = 2; - +- - bs = 
boundary_strength(s, curr, top, rpl); - s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; + } else { @@ -4534,7 +6356,7 @@ index 14e7c8d..e4ffd87 100644 } } } -@@ -839,11 +875,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -839,11 +1076,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -4544,8 +6366,8 @@ index 14e7c8d..e4ffd87 100644 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} +#endif @@ -4559,10 +6381,11 @@ index 14e7c8d..e4ffd87 100644 + const int d0 = ((int *)f->progress->data)[0]; + const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+ if (curr_y < (unsigned int)f->f->height) { ++ if (curr_y < (unsigned int)s->ps.sps->height) { + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, ++ s->ps.sps->vshift[1], 1, 1); + rpi_cache_flush_finish(rfe); + } + } @@ -4639,7 +6462,7 @@ index 14e7c8d..e4ffd87 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -852,16 +981,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -852,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); @@ -4660,8 +6483,7 @@ index 14e7c8d..e4ffd87 100644 +#endif ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + } - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ } + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; @@ -4677,7 +6499,8 @@ index 14e7c8d..e4ffd87 100644 + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif + ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); @@ -4689,11 +6512,24 @@ index 14e7c8d..e4ffd87 100644 } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) -diff --git b/libavcodec/hevc_ps.c a/libavcodec/hevc_ps.c -index acd55cc..0a465d4 100644 ---- b/libavcodec/hevc_ps.c -+++ a/libavcodec/hevc_ps.c -@@ -1001,6 +1001,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c +index acd55cc..c1716c2 100644 +--- a/libavcodec/hevc_ps.c ++++ b/libavcodec/hevc_ps.c +@@ -780,7 +780,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + switch (sps->bit_depth) { + case 8: + if (sps->chroma_format_idc == 0) sps->pix_fmt = 
AV_PIX_FMT_GRAY8; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; + break; +@@ -1001,6 +1006,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, sps->amp_enabled_flag = get_bits1(gb); sps->sao_enabled = get_bits1(gb); @@ -4702,17 +6538,69 @@ index acd55cc..0a465d4 100644 sps->pcm_enabled_flag = get_bits1(gb); if (sps->pcm_enabled_flag) { sps->pcm.bit_depth = get_bits(gb, 4) + 1; -diff --git b/libavcodec/hevcdec.c a/libavcodec/hevcdec.c -index ef21595..b36e840 100644 ---- b/libavcodec/hevcdec.c -+++ a/libavcodec/hevcdec.c -@@ -42,8 +42,233 @@ +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index 9103c84..eb26e7d 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -206,7 +206,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + HEVCFrame *frame = &s->DPB[min_idx]; + AVFrame *dst = out; + AVFrame *src = frame->frame; +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format); ++ const int fmt = src->format; ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + int pixel_shift = !!(desc->comp[0].depth > 8); + + ret = av_frame_ref(out, src); +@@ -217,12 +218,29 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + if (ret < 0) + return ret; + +- for (i = 0; i < 3; i++) { +- int hshift = (i > 0) ? desc->log2_chroma_w : 0; +- int vshift = (i > 0) ? desc->log2_chroma_h : 0; +- int off = ((frame->window.left_offset >> hshift) << pixel_shift) + +- (frame->window.top_offset >> vshift) * dst->linesize[i]; +- dst->data[i] += off; ++ if (fmt == AV_PIX_FMT_SAND128) ++ { ++ // Sand cannot be windowed by offset so add side data if we have an offset ++ const HEVCWindow * const window = &frame->window; ++ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0) ++ { ++ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan)); ++ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ si->left_offset = window->left_offset; ++ si->top_offset = window->top_offset; ++ si->pic_width = s->ps.sps->width; ++ si->pic_height = s->ps.sps->height; ++ } ++ } ++ else ++ { ++ for (i = 0; i < 3; i++) { ++ int hshift = (i > 0) ? desc->log2_chroma_w : 0; ++ int vshift = (i > 0) ? 
desc->log2_chroma_h : 0; ++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++ (frame->window.top_offset >> vshift) * dst->linesize[i]; ++ dst->data[i] += off; ++ } + } + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index f9e8ff0..8a3d874 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -42,8 +42,207 @@ #include "hevcdec.h" #include "profiles.h" +#ifdef RPI + #include "rpi_qpu.h" + #include "rpi_shader.h" ++ #include "rpi_shader_cmd.h" ++ #include "rpi_zc.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 @@ -4726,10 +6614,15 @@ index ef21595..b36e840 100644 + #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); ++ ++ // We can pred any block height, but caching may make some heights better than others ++ // Currently it doesn't seem to make a lot of difference ++ // 0 => any height ++ #define Y_P_MAX_H 0 ++ #define Y_B_MAX_H 0 +#endif + -+// #define DISABLE_MC -+ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + @@ -4740,78 +6633,31 @@ index ef21595..b36e840 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif -+ -+#define Y_B_ONLY 1 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; + +#if RPI_INTER + -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV still has min 4x4 pred ++// Allow for even spread +1 for setup, +1 for rounding ++// If we have load sharingw e will want different (bigger) numbers and/or a non-constant chunk size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) ++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) ++ ++#define UV_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 / QPU_N_UV + 2) ++#define Y_COMMANDS_PER_QPU (((RPI_MAX_WIDTH * 64) / (4 * 4)) / QPU_N_Y + 2) + -+#define RPI_CHROMA_COMMAND_WORDS 11 -+#define UV_COMMANDS_PER_QPU ((1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) * RPI_CHROMA_COMMAND_WORDS) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + -+typedef struct qpu_mc_pred_c_s { -+ uint32_t next_fn; -+ int16_t next_src_y; -+ int16_t next_src_x; -+ uint32_t next_src_base_u; -+ uint32_t next_src_base_v; -+ union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_u; -+ uint32_t dst_addr_v; -+ } p; -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t weight_u; -+ uint32_t weight_v; -+ uint32_t dummy0; -+ uint32_t dummy1; -+ } b0; -+ struct { -+ uint32_t dummy0; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_u; -+ uint32_t dst_addr_v; -+ } b1; -+ struct { -+ uint32_t pic_w; -+ uint32_t pic_h; -+ uint32_t src_stride; -+ uint32_t dst_stride; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ uint32_t dummy1; -+ } s; -+ }; -+} qpu_mc_pred_c_t; -+ -+ -+static const char static_assert_qpu_mc_pred[sizeof(qpu_mc_pred_c_t) != 
RPI_CHROMA_COMMAND_WORDS * 4 ? -1 : 1] = {0}; -+ +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + -+// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { @@ -4825,30 +6671,44 @@ index ef21595..b36e840 100644 + ENCODE_COEFFS( 2, 10, 58, 2) +}; + -+#define RPI_LUMA_COMMAND_WORDS 10 -+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4)) * RPI_LUMA_COMMAND_WORDS) ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_interrupt_exit12c, mc_exit_c, mc_exit_c, mc_exit_c, ++ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c, ++ mc_exit_c, mc_exit_c, mc_exit_c, mc_exit_c ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_interrupt_exit12, mc_exit, mc_exit, mc_exit, ++ mc_exit, mc_exit, mc_exit, mc_exit, ++ mc_exit, mc_exit, mc_exit, mc_exit ++}; ++ ++ +#endif + + +#ifdef RPI_WORKER + -+typedef struct worker_global_env_s -+{ -+ volatile int arm_load; -+ pthread_mutex_t lock; -+ -+ unsigned int arm_y; -+ unsigned int arm_c; -+ unsigned int gpu_y; -+ unsigned int gpu_c; -+} worker_global_env_t; -+ -+static worker_global_env_t worker_global_env = -+{ -+ .lock = PTHREAD_MUTEX_INITIALIZER -+}; -+ -+ +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); + @@ -4940,7 +6800,7 @@ index ef21595..b36e840 100644 /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). 
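
The rpi_filter_coefs[] table above packs the four chroma tap magnitudes for each fractional position into a single uint32_t with ENCODE_COEFFS; as the comment notes, the QPU kernel applies them with a fixed -ve, +ve, +ve, -ve sign pattern, so only magnitudes are stored. A minimal host-side sketch of that convention (the coeff() and filter4() helpers and the rounding term are illustrative assumptions, not code from this patch):

#include <stdint.h>
#include <stdio.h>

#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)

/* Tap i of a packed coefficient word; taps 0 and 3 are the -ve ones. */
static int coeff(const uint32_t packed, const unsigned int i)
{
    const int c = (packed >> (8 * i)) & 0xff;
    return (i == 0 || i == 3) ? -c : c;
}

/* One packed 4-tap filter applied horizontally; >> 6 because each tap set sums to 64. */
static int filter4(const uint8_t * const src, const uint32_t packed)
{
    int sum = 0;
    for (unsigned int i = 0; i != 4; ++i)
        sum += coeff(packed, i) * src[i];
    return (sum + 32) >> 6;   /* rounding term assumed for illustration */
}

int main(void)
{
    static const uint8_t px[4] = { 100, 100, 100, 100 };
    /* Taps -2, 58, 10, -2 sum to 64, so a flat input comes back unchanged. */
    printf("%d\n", filter4(px, ENCODE_COEFFS(2, 58, 10, 2)));
    return 0;
}

Because every tap set sums to 64, the >> 6 normalisation maps a flat input back to itself, which is a convenient sanity check on the packing.
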
-@@ -56,6 +281,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -56,6 +255,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { @@ -4973,7 +6833,7 @@ index ef21595..b36e840 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -92,6 +343,88 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -92,6 +317,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -4993,19 +6853,20 @@ index ef21595..b36e840 100644 + s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; + + for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } ++ for(job=0;jobcoeffs_buf_default[job]); ++ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; ++ if (!s->coeffs_buf_arm[job][0]) ++ goto fail; ++ ++ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data ++ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; ++ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; ++ if (!s->coeffs_buf_arm[job][2]) ++ goto fail; ++ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. 
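++ // NB: arm[] is the CPU-side view and vc[] the VideoCore bus address of
++ // the same cached allocation; both [3] entries mark the same
++ // one-past-the-end position (in int16_t units and in bytes respectively).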
++ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; ++ } + } +#endif +#ifdef RPI_DEBLOCK_VPU @@ -5062,7 +6923,7 @@ index ef21595..b36e840 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -138,6 +471,29 @@ fail: +@@ -138,6 +446,29 @@ fail: return AVERROR(ENOMEM); } @@ -5092,7 +6953,52 @@ index ef21595..b36e840 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -678,6 +1034,11 @@ static int hls_slice_header(HEVCContext *s) +@@ -332,7 +663,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, + static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) + { + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; + int ret, i; + + pic_arrays_free(s); +@@ -351,6 +682,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND128; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -384,6 +721,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ret = ff_thread_get_format(s->avctx, pix_fmts); + if (ret < 0) + goto fail; ++ + s->avctx->pix_fmt = ret; + } + else { +@@ -406,11 +744,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + for(c_idx = 0; c_idx < c_count; c_idx++) { + int w = sps->width >> sps->hshift[c_idx]; + int h = sps->height >> sps->vshift[c_idx]; ++ // ******** Very very nasty allocation kludge for plaited Chroma + s->sao_pixel_buffer_h[c_idx] = +- av_malloc((w * 2 * sps->ctb_height) << ++ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << + sps->pixel_shift); + s->sao_pixel_buffer_v[c_idx] = +- av_malloc((h * 2 * sps->ctb_width) << ++ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << + sps->pixel_shift); + } + } +@@ -678,6 +1017,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) { pred_weight_table(s, gb); } @@ -5104,13 +7010,17 @@ index ef21595..b36e840 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -933,6 +1294,25 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -933,6 +1277,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ ++ // U & V done on U call in the case of sliced frames ++ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ return; ++ + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; + HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; @@ -5121,16 +7031,21 @@ index ef21595..b36e840 100644 + cmd->i_pred.x = x0; + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; -+ } else { ++ } ++ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); ++ } ++ else { + s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); + } ++ +} +#endif + static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -945,8 +1325,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -945,8 +1317,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -5143,7 +7058,7 @@ index ef21595..b36e840 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1032,7 +1415,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1032,7 +1407,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -5155,7 +7070,7 @@ index ef21595..b36e840 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1061,7 +1448,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1061,7 +1440,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -5167,7 +7082,7 @@ index ef21595..b36e840 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1090,7 +1481,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1090,7 +1473,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -5179,7 +7094,7 @@ index ef21595..b36e840 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1100,7 +1495,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1100,7 +1487,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -5191,7 +7106,7 @@ index ef21595..b36e840 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1112,26 +1511,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1112,26 +1503,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -5238,7 +7153,7 @@ index ef21595..b36e840 100644 } } } -@@ -1277,33 +1696,23 @@ do { +@@ -1277,47 +1688,120 @@ do { return 0; } @@ -5249,12 +7164,12 @@ index ef21595..b36e840 100644 - HEVCLocalContext *lc = s->HEVClc; GetBitContext gb; - int cb_size = 1 << log2_cb_size; - ptrdiff_t stride0 = s->frame->linesize[0]; - ptrdiff_t stride1 = s->frame->linesize[1]; - ptrdiff_t stride2 = 
s->frame->linesize[2]; - uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; - uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; - uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +- ptrdiff_t stride0 = s->frame->linesize[0]; +- ptrdiff_t stride1 = s->frame->linesize[1]; +- ptrdiff_t stride2 = s->frame->linesize[2]; +- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; - - int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + - (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + @@ -5271,11 +7186,47 @@ index ef21595..b36e840 100644 return ret; - s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); -+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, +- if (s->ps.sps->chroma_format_idc) { +- s->hevcdsp.put_pcm(dst1, stride1, ++#if RPI_HEVC_SAND ++ if (rpi_sliced_frame(s->frame)) { ++ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++ s->frame->linesize[0], ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], -@@ -1318,6 +1727,59 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) + cb_size >> s->ps.sps->vshift[1], + &gb, s->ps.sps->pcm.bit_depth_chroma); +- s->hevcdsp.put_pcm(dst2, stride2, +- cb_size >> s->ps.sps->hshift[2], +- cb_size >> s->ps.sps->vshift[2], +- &gb, s->ps.sps->pcm.bit_depth_chroma); + } ++ else ++#endif ++ { ++ const int stride0 = s->frame->linesize[0]; ++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++ const int stride1 = s->frame->linesize[1]; ++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ const int stride2 = s->frame->linesize[2]; ++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; ++ ++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ if (s->ps.sps->chroma_format_idc) { ++ s->hevcdsp.put_pcm(dst1, stride1, ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ s->hevcdsp.put_pcm(dst2, stride2, ++ cb_size >> s->ps.sps->hshift[2], ++ cb_size >> s->ps.sps->vshift[2], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } + ++ } return 0; } @@ -5312,9 +7263,22 @@ index ef21595..b36e840 100644 + if (s->enable_rpi) { + // Copy coeffs + const int blen = (length + 7) >> 3; -+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, (blen + 1) >> 1); ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the 
total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); + memcpy(coeffs, pcm, blen); + ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. ++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ + // Add command + { + HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; @@ -5335,99 +7299,7 @@ index ef21595..b36e840 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1334,6 +1796,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#if RPI_INTER -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, -+ const struct MvField * const current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, -+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ 
cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1349,6 +1896,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1349,6 +1833,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == HEVC_SLICE_B && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -5438,7 +7310,7 @@ index ef21595..b36e840 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1395,7 +1946,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1395,7 +1883,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -5447,7 +7319,7 @@ index ef21595..b36e840 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1419,6 +1970,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1419,6 +1907,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -5458,7 +7330,7 @@ index ef21595..b36e840 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1504,6 +2059,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1504,6 +1996,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -5469,7 +7341,7 @@ index ef21595..b36e840 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1568,6 +2127,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1568,6 +2064,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -5480,7 +7352,7 @@ index ef21595..b36e840 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1695,14 +2258,312 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1695,14 +2195,582 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } @@ -5489,6 +7361,106 @@ index ef21595..b36e840 100644 - int log2_cb_size, int partIdx, int idx) + +#if RPI_INTER ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { ++ if (ypt->load < yp->load) ++ yp 
= ypt; ++ } ++ ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ ((uint32_t *)yp->qpu_mc_curr)[-1] = fn; // Link is always last el of previous cmd ++ ++ return yp; ++} ++ ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ ((uint32_t *)q->qpu_mc_curr)[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)((uint32_t *)q->qpu_mc_curr + 1); ++ q->load = 0; ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_alloc_inter_pred(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n, const unsigned int n_grp, ++ const unsigned int q1_size, const unsigned int min_gap, ++ const int * const * const setup_fns, ++ const int * const * const sync_fns, ++ const int * const * const exit_fns) ++{ ++ unsigned int i; ++ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n * sizeof(*ipe->q))) != NULL); ++ ipe->n = n; ++ ipe->n_grp = n_grp; ++ ipe->q1_size = q1_size; ++ ipe->max_fill = ipe->q1_size - min_gap; ++ ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(n * q1_size, &ipe->gptr); ++#else ++ gpu_malloc_uncached(n * q1_size, &ipe->gptr); ++#endif ++ ++ for(i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(setup_fns[i]); ++ q->code_sync = qpu_fn(sync_fns[i]); ++ q->code_exit = qpu_fn(exit_fns[i]); ++ } ++} ++ ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -5497,69 +7469,175 @@ index ef21595..b36e840 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; -+ -+ rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+ mv, x0, y0, nPbW, nPbH, -+ weight_mul, weight_offset); ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); ++ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; + ++ if (my_mx == 0) ++ { ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ ++#if Y_P_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += 
s->frame->linesize[0] * Y_P_MAX_H) ++ { ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++#endif ++ ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ *(qpu_mc_pred_y_p00_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_P_MAX_H != 0 ++ } ++#endif ++ } ++ else + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; + const int x1_m3 = x0 + (mv->x >> 2) - 3; + const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t *y = s->curr_y_mvs; -+ uint32_t dst_base = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo_0 = PACK2(weight_offset * 2 + 1, weight_mul); + -+ // Potentially we could change the assembly code to support taller sizes in one go -+ for (int start_y = 0; start_y < nPbH; start_y += 16, dst_base += s->frame->linesize[0] * 16) { ++#if Y_P_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * Y_P_MAX_H) ++ { ++ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++#endif + const uint32_t src_yx_y = y1_m3 + start_y; + int start_x = 0; -+ const int bh = FFMIN(nPbH - start_y, 16); -+ uint32_t *const py = y - RPI_LUMA_COMMAND_WORDS; -+ uint32_t *const ppy = y - RPI_LUMA_COMMAND_WORDS * 2; + ++#if 1 + // As Y-pred operates on two independant 8-wide src blocks we can merge + // this pred with the previous one if it the previous one is 8 pel wide, + // the same height as the current block, immediately to the left of our + // current dest block and mono-pred. -+ // -+ // In the init (1st) block w/h is pic width height so given -+ // that no pic will ever be 8 pixels wide the first test here -+ // should fail if this is the first pred (i.e. 
after that test -+ // ppy is valid) -+ if (py[4] == ((8 << 16) | bh) && py[8] + 8 == dst_base && ppy[9] == s->qpu_filter) { ++ ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + 8 == dst_addr) ++ { + const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; + -+ ppy[2] = PACK2(src_yx_y, x1_m3); -+ ppy[3] = src_vc_address_y; -+ py[4] += bw << 16; -+ py[5] = PACK2(my2_mx2_my_mx, py[5]); -+ // py[6] stays the same -+ py[7] = wo_0; ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = src_yx_y; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; + ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; + start_x = bw; ++#if RPI_TSTATS ++ ++s->tstats.y_pred1_y8_merge; ++#endif + } ++#endif + -+ for (; start_x < nPbW; start_x += 16) { -+ const int bw = FFMIN(nPbW - start_x, 16);; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(src_yx_y, x1_m3 + 8 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = src_vc_address_y; -+ *y++ = PACK2(bw, bh); -+ *y++ = my2_mx2_my_mx; -+ *y++ = wo_0; -+ *y++ = wo_0; -+ *y++ = dst_base + start_x; -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter; ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = src_yx_y; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++ src2->base = s->qpu_dummy_frame; ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = src_yx_y; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; ++ } + } ++#if Y_P_MAX_H != 0 + } -+ s->curr_y_mvs = y; ++#endif + } +} + @@ -5571,58 +7649,146 @@ index ef21595..b36e840 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = x0 + y0 * s->frame->linesize[0]; ++ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+ mv, x0, y0, nPbW, nPbH, -+ src_frame2, mv2, mv_field); -+#if !Y_B_ONLY ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; 
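++ // Both fractional parts are packed into one uniform: the low halfword
++ // is (my << 8) | mx for L0 and the high halfword the same for L1; this
++ // becomes the mymx21 field consumed by the QPU luma filter.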
++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + ++ s->sh.luma_offset_l1[ref_idx1] + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ uint32_t dst = get_vc_address_y(s->frame) + y_off; ++ const uint32_t src1_base = get_vc_address_y(src_frame); ++ const uint32_t src2_base = get_vc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].luma_ip; ++ ++ if (my2_mx2_my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ ++#if Y_B_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y = 0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); ++#endif ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu_filter_y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2 + start_y; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_P_MAX_H != 0 ++ } ++#endif ++ } ++ else ++ { ++ // Filter requires a run-up of 3 + const int x1 = x0 + (mv->x >> 2) - 3; + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; -+ const uint32_t wo_0 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo_1 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); + -+ uint32_t * y = s->curr_y_mvs; -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ -+ for(int start_y=0;start_y < nPbH;start_y+=16) { // Potentially we could change the assembly code to support taller sizes in one go -+ for(int start_x=0;start_x < nPbW;start_x+=8) { // B blocks work 8 at a time -+ int bw = nPbW-start_x; -+ int bh = nPbH-start_y; -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y1 + start_y, x1 + start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame); -+ y++[-RPI_LUMA_COMMAND_WORDS] = PACK2(y2 + start_y, x2 + 
start_x); -+ y++[-RPI_LUMA_COMMAND_WORDS] = get_vc_address_y(src_frame2); -+ *y++ = PACK2(bw<8 ? bw : 8, bh<16 ? bh : 16); -+ *y++ = my2_mx2_my_mx; -+ -+ *y++ = wo_0; -+ *y++ = wo_1; -+ -+ *y++ = dst + start_x; -+ y++[-RPI_LUMA_COMMAND_WORDS] = s->qpu_filter_b; -+ } -+ dst += s->frame->linesize[0] * 16; -+ } -+ s->curr_y_mvs = y; -+ } ++#if Y_B_MAX_H == 0 ++ const int bh = nPbH; ++ const int start_y = 0; ++#else ++ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H, dst += s->frame->linesize[0] * Y_B_MAX_H) ++ { ++ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); +#endif ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu_filter_b); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1 + start_y; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2 + start_y; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + start_x; ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ *(qpu_mc_pred_y_p_t **)&yp->qpu_mc_curr = cmd_y + 1; ++ } ++#if Y_B_MAX_H != 0 ++ } ++#endif ++ } +} + + @@ -5634,65 +7800,48 @@ index ef21595..b36e840 100644 + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; + -+ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[0], c_offsets[0]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[1], c_offsets[1]); ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const uint32_t src_base_u = get_vc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; + ++ for(int start_y=0;start_y < nPbH_c;start_y+=16) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ const int bh = FFMIN(nPbH_c-start_y, 16); + -+ const int x1_c = x0_c + 
(mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); -+ const uint32_t src_base_v = get_vc_address_v(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; -+ -+ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; -+ -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int bh = FFMIN(nPbH_c-start_y, 16); -+ // We are allowed 3/4 powers of two as well as powers of 2 -+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, s->qpu_filter_uv); ++ qpu_mc_pred_c_p_t * const u = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t * const last_l0 = cp->last_l0; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH, ++u) -+ { -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ u[-1].next_fn = s->qpu_filter_uv; -+ u[-1].next_src_x = x1_c + start_x; -+ u[-1].next_src_y = y1_c + start_y; -+ u[-1].next_src_base_u = src_base_u; -+ u[-1].next_src_base_v = src_base_v; -+ u[0].p.h = bh; -+ u[0].p.w = bw; -+ u[0].p.coeffs_x = x_coeffs; -+ u[0].p.coeffs_y = y_coeffs; -+ u[0].p.wo_u = wo_u; -+ u[0].p.wo_v = wo_v; -+ u[0].p.dst_addr_u = dst_base_u + start_x; -+ u[0].p.dst_addr_v = dst_base_v + start_x; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ dst_base_v += s->frame->linesize[2] * 16; ++ last_l0->x = x1_c + start_x; ++ last_l0->y = y1_c + start_y; ++ last_l0->base = src_base_u; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x = x_coeffs; ++ u[0].coeffs_y = y_coeffs; ++ u[0].wo_u = wo_u; ++ u[0].wo_v = wo_v; ++ u[0].dst_addr_c = dst_base_u + start_x * 2; ++ cp->last_l0 = &u->next_src; ++ *(qpu_mc_pred_c_p_t **)&cp->qpu_mc_curr = u + 1; + } -+ s->curr_u_mvs = (uint32_t *)u; ++ ++ dst_base_u += s->frame->linesize[1] * 16; + } -+ return; ++ return; +} + +static void @@ -5706,81 +7855,74 @@ index ef21595..b36e840 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = x0_c + y0_c * s->frame->linesize[1]; -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = s->ps.sps->hshift[1]; ++ const int vshift = s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, 
s->frame->linesize[2], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; ++ const uint32_t src1_base = get_vc_address_u(src_frame); ++ const uint32_t src2_base = get_vc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jobs[s->pass0_job].chroma_ip; ++ ++ for (int start_y = 0; start_y < nPbH_c; start_y += 16) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu_filter_uv_b0); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c + start_y; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c + start_y; ++ src_l1->base = src2_base; + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ uint32_t dst_base_v = get_vc_address_v(s->frame) + c_off; -+ qpu_mc_pred_c_t * u = (qpu_mc_pred_c_t *)s->curr_u_mvs; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); ++ u[0].wo_v2 = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); ++ u[0].dst_addr_c = dst_base_u + start_x * 2; + -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH, u += 2) { -+ int bw = nPbW_c-start_x; -+ int bh = nPbH_c-start_y; -+ u[-1].next_fn = s->qpu_filter_uv_b0; // In fact ignored -+ u[-1].next_src_x = x1_c + start_x; -+ u[-1].next_src_y = y1_c + start_y; -+ u[-1].next_src_base_u = get_vc_address_u(src_frame); -+ u[-1].next_src_base_v = get_vc_address_v(src_frame); -+ -+ u[0].next_fn = 
s->qpu_filter_uv_b; -+ u[0].next_src_x = x2_c + start_x; -+ u[0].next_src_y = y2_c + start_y; -+ u[0].next_src_base_u = get_vc_address_u(src_frame2); -+ u[0].next_src_base_v = get_vc_address_v(src_frame2); -+ -+ u[0].b0.h = (bh<16 ? bh : 16); -+ u[0].b0.w = (bwframe->linesize[1] * 16; -+ dst_base_v += s->frame->linesize[2] * 16; ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ *(qpu_mc_pred_c_b_t **)&cp->qpu_mc_curr = u + 1; + } + -+ s->curr_u_mvs = (uint32_t *)u; ++ dst_base_u += s->frame->linesize[1] * 16; + } +} ++ ++ +#endif + + @@ -5797,7 +7939,7 @@ index ef21595..b36e840 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1720,8 +2581,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1720,8 +2788,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -5807,7 +7949,7 @@ index ef21595..b36e840 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1765,12 +2625,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1765,12 +2832,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5838,7 +7980,7 @@ index ef21595..b36e840 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1784,12 +2661,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1784,12 +2868,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5869,7 +8011,7 @@ index ef21595..b36e840 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1804,11 +2698,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1804,11 +2905,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -5902,7 +8044,7 @@ index ef21595..b36e840 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2083,7 +2997,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2083,7 +3204,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -5912,7 +8054,7 @@ index ef21595..b36e840 100644 if (ret < 0) return ret; -@@ -2306,6 +3222,741 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2306,6 +3429,373 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } @@ -5958,9 +8100,16 @@ index ef21595..b36e840 100644 +} +#endif + ++ +// I-pred, transform_and_add for all blocks types done here +// All ARM ++#define 
RPI_OPT_SEP_PRED 0 ++ ++#if RPI_OPT_SEP_PRED ++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) ++#else +static void rpi_execute_pred_cmds(HEVCContext * const s) ++#endif +{ + int i; + int job = s->pass1_job; @@ -5972,7 +8121,12 @@ index ef21595..b36e840 100644 +#endif + + for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { -+ //printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++#if RPI_OPT_SEP_PRED ++ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { ++ continue; ++ } ++#endif + + switch (cmd->type) + { @@ -5983,16 +8137,26 @@ index ef21595..b36e840 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); ++ else ++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + break; + -+ case RPI_PRED_TRANSFORM_ADD: ++ case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache -+#endif + break; -+ ++#if RPI_HEVC_SAND ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++#endif + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); + break; @@ -6002,463 +8166,141 @@ index ef21595..b36e840 100644 + abort(); + } + } -+ s->num_pred_cmds[job] = 0; ++#if RPI_OPT_SEP_PRED ++ if (do_luma) ++#endif ++ { ++ s->num_pred_cmds[job] = 0; ++ } +} + -+// Do any inter-pred that we want to do in software -+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here -+// All ARM -+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) -+{ -+ unsigned int cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ -+ for(; n>0 ; n--, cmd++) { -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ if (b_only) -+ break; -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ break; -+ case RPI_CMD_CHROMA_UNI: -+ if (b_only) -+ break; -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, 
cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) -+{ -+ const int job = s->pass1_job; -+ -+ if (!qpu_luma || luma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); -+ s->num_mv_cmds_y[job] = 0; -+ if (!qpu_chroma || chroma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); -+ s->num_mv_cmds_c[job] = 0; -+} + +#endif + +#ifdef RPI ++ +// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER + int job = s->pass0_job; + int i; ++ HEVCRpiJob * const jb = s->jobs + job; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + -+ int pic_width = s->ps.sps->width >> s->ps.sps->hshift[1]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ const uint16_t pic_width_y = s->ps.sps->width; ++ const uint16_t pic_height_y = s->ps.sps->height; + ++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; ++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; ++ ++ rpi_inter_pred_reset(cipe); + for(i=0; i < QPU_N_UV;i++) { -+ qpu_mc_pred_c_t * const u = (qpu_mc_pred_c_t *)s->mvs_base[job][i]; ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = rpi_sliced_frame_stride2(s->frame); ++ u->stride1 = s->frame->linesize[1]; ++ u->wdenom = s->sh.chroma_log2_weight_denom + 6; ++ cp->last_l0 = &u->next_src1; + + u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_u = 0; -+ u->next_src_base_v = 0; -+ u->s.pic_w = pic_width; -+ u->s.pic_h = pic_height; -+ u->s.src_stride = s->frame->linesize[1]; -+ u->s.dst_stride = s->frame->linesize[1]; -+ u->s.wdenom = s->sh.chroma_log2_weight_denom + 6; -+ u->s.dummy0 = 0; -+ u->s.dummy1 = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; + -+ s->u_mvs[job][i] = (uint32_t *)(u + 1); ++ *(qpu_mc_pred_c_s_t **)&cp->qpu_mc_curr = u + 1; + } -+ s->curr_u_mvs = s->u_mvs[job][0]; + ++ rpi_inter_pred_reset(yipe); + for(i=0;i < QPU_N_Y;i++) { -+ // This needs to have a generally similar structure to the -+ // actual filter code as various pipelined bits need to land correctly -+ // when inserted by the filter requests -+ s->y_mvs[job][i] = s->y_mvs_base[job][i]; -+ *s->y_mvs[job][i]++ = 0; // y_x -+ *s->y_mvs[job][i]++ = 0; // ref_y_base -+ *s->y_mvs[job][i]++ = 0; // y2_x2 -+ *s->y_mvs[job][i]++ = 0; // ref_y2_base -+ *s->y_mvs[job][i]++ = (s->ps.sps->width << 16) + s->ps.sps->height; -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // pitch -+ *s->y_mvs[job][i]++ = s->frame->linesize[0]; // dst_pitch -+ *s->y_mvs[job][i]++ = 
s->sh.luma_log2_weight_denom + 6; // weight demon + 6 -+ *s->y_mvs[job][i]++ = 0; // Unused - alignment with per-block -+ *s->y_mvs[job][i]++ = 0; // Next kernel ++ HEVCRpiInterPredQ * const yp = s->jobs[job].luma_ip.q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = rpi_sliced_frame_stride2(s->frame); ++ y->stride1 = s->frame->linesize[0]; ++ y->wdenom = s->sh.luma_log2_weight_denom + 6; ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ *(qpu_mc_pred_y_s_t **)&yp->qpu_mc_curr = y + 1; + } -+ s->curr_y_mvs = s->y_mvs[job][0]; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; +#endif + s->ctu_count = 0; +} +#endif + -+#ifdef RPI_SIMULATE_QPUS -+#error Rotted -+ -+static int32_t clipx(int x,int FRAME_WIDTH) -+{ -+ if (x<=0) return 0; -+ if (x>=FRAME_WIDTH) return FRAME_WIDTH-1; -+ return x; -+} -+ -+static int32_t clipy(int y,int FRAME_HEIGHT) -+{ -+ if (y<=0) return 0; -+ if (y>=FRAME_HEIGHT) return FRAME_HEIGHT-1; -+ return y; -+} -+ -+/*static int32_t filter8(uint8_t *data, int x0, int y0, int pitch, int mx, int my,int round,int denom,int weight,int offset) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[mx][x]*data[clipx(x + x0) + clipy(y + y0) * pitch]; -+ -+ vsum += lumaFilter[my][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+round)>>denom)+offset; -+ -+ return av_clip_uint8( vsum ); -+}*/ -+ -+static int32_t filter8_chroma(uint8_t *data, int x0, int y0, int pitch, int hcoeffs, int vcoeffs,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int chromaFilterH[4]; -+ int chromaFilterV[4]; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ for(i=0;i<4;i++) { -+ chromaFilterH[i] = ((hcoeffs>>(8*i))<<24)>>24; -+ chromaFilterV[i] = ((vcoeffs>>(8*i))<<24)>>24; -+ } -+ -+ for (y = 0; y < 4; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 4; x++) -+ hsum += chromaFilterH[x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += chromaFilterV[y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+int lumaFilter[4][8]={ {0,0,0,64,0,0,0,0},{-1,4,-10,58,17,-5,1,0},{-1,4,-11,40,40,-11,4,-1},{0,1,-5,17,58,-10,4,-1} }; -+ -+static int32_t filter8_luma(uint8_t *data, int x0, int y0, int pitch, int my_mx,int offset_weight,int offset_before,int denom,int pic_width, int pic_height) -+{ -+ int32_t vsum = 0; -+ int x, y; -+ int i; -+ int offset_after = offset_weight>>16; -+ int weight = (offset_weight<<16)>>16; -+ -+ for (y = 0; y < 8; y++) { -+ int32_t hsum = 0; -+ -+ for (x = 0; x < 8; x++) -+ hsum += lumaFilter[my_mx&3][x]*data[clipx(x + x0,pic_width) + clipy(y + y0,pic_height) * pitch]; -+ -+ vsum += lumaFilter[(my_mx>>8)&3][y]*hsum; -+ } -+ vsum >>= 6; -+ vsum = (((vsum*weight)+offset_before)>>denom)+offset_after; -+ -+ return vsum; -+} -+ -+static uint8_t *test_frame(HEVCContext *s,uint32_t p, AVFrame *frame, const int cIdx) -+{ -+ //int pic_width = s->ps.sps->width >> s->ps.sps->hshift[cIdx]; -+ int pic_height = s->ps.sps->height >> s->ps.sps->vshift[cIdx]; -+ int pitch = frame->linesize[cIdx]; -+ uint32_t base = cIdx == 0 ? 
get_vc_address_y(frame) :
-+ cIdx == 1 ? get_vc_address_u(frame) : get_vc_address_v(frame);
-+ if (p>=base && p<base+pic_height*pitch) {
-+ return frame->data[cIdx] + (p-base);
-+ }
-+ return NULL;
-+}
-+
-+static uint8_t *compute_arm_addr(HEVCContext *s,uint32_t p, int cIdx)
-+{
-+ SliceHeader *sh = &s->sh;
-+ uint8_t *arm = test_frame(s,p,s->frame,cIdx);
-+ int i;
-+ if (arm) return arm;
-+ if (sh->slice_type == P_SLICE || sh->slice_type == B_SLICE)
-+ {
-+ for(i=0;i<sh->nb_refs[L0];i++) {
-+ arm = test_frame(s,p,s->ref->refPicList[0].ref[i]->frame,cIdx);
-+ if (arm) return arm;
-+ }
-+ }
-+ if (sh->slice_type == B_SLICE) {
-+ for(i=0;i<sh->nb_refs[L1];i++) {
-+ arm = test_frame(s,p,s->ref->refPicList[1].ref[i]->frame,cIdx);
-+ if (arm) return arm;
-+ }
-+ }
-+ printf("Frame 0x%x not found! Exit=%x\n",p,qpu_get_fn(QPU_MC_EXIT));
-+ exit(-1);
-+ return NULL;
-+}
-+
-+static void rpi_simulate_inter_chroma(HEVCContext *s,uint32_t *p)
-+{
-+ uint32_t next_kernel;
-+ uint32_t x0;
-+ uint32_t y0;
-+ uint8_t *ref_u_base;
-+ uint8_t *ref_v_base;
-+ uint32_t frame_width = p[5];
-+ uint32_t frame_height = p[6];
-+ uint32_t pitch = p[7];
-+ uint32_t dst_pitch = p[8];
-+ int32_t offset_before = p[9];
-+ int32_t denom = p[10];
-+ uint32_t vpm_id = p[11];
-+ uint32_t tmp_u_dst[256];
-+ uint32_t tmp_v_dst[256];
-+ while(1) {
-+ p += 12;
-+ next_kernel = p[0-12];
-+ x0 = p[1-12];
-+ y0 = p[2-12];
-+ if (next_kernel==s->qpu_filter_uv || next_kernel==s->qpu_filter_uv_b0 || next_kernel==s->qpu_filter_uv_b) {
-+ int x,y;
-+ uint32_t width_height = p[5];
-+ uint32_t hcoeffs = p[6];
-+ uint32_t vcoeffs = p[7];
-+ uint32_t offset_weight_u = p[8];
-+ uint32_t offset_weight_v = p[9];
-+ uint8_t *this_u_dst;
-+ uint8_t *this_v_dst;
-+ uint32_t width = width_height >> 16;
-+ uint32_t height = (width_height << 16) >> 16;
-+ ref_u_base = compute_arm_addr(s,p[3-12],1);
-+ ref_v_base = compute_arm_addr(s,p[4-12],2);
-+ if (next_kernel!=s->qpu_filter_uv_b0)
-+ {
-+ this_u_dst = compute_arm_addr(s,p[10],1);
-+ this_v_dst = compute_arm_addr(s,p[11],2);
-+ }
-+ for (y=0; y<height; y++) {
-+ for (x=0; x<width; x++) {
-+ if (next_kernel==s->qpu_filter_uv) {
-+ int32_t refa = filter8_chroma(ref_u_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_u,offset_before,denom,frame_width,frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base,x+x0, y+y0, pitch, hcoeffs, vcoeffs, offset_weight_v,offset_before,denom,frame_width,frame_height);
-+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ } else if (next_kernel==s->qpu_filter_uv_b0) {
-+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1,0,0,frame_width,frame_height);
-+ tmp_u_dst[x+y*16] = refa;
-+ tmp_v_dst[x+y*16] = refb;
-+ } else {
-+ int32_t refa = filter8_chroma(ref_u_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_u_dst[x+y*16], 7, frame_width, frame_height);
-+ int32_t refb = filter8_chroma(ref_v_base, x+x0, y+y0, pitch, hcoeffs, vcoeffs, 1, 64 + tmp_v_dst[x+y*16], 7, frame_width, frame_height);
-+ this_u_dst[x+y*dst_pitch] = av_clip_uint8(refa);
-+ this_v_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ }
-+ }
-+ }
-+ } else {
-+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT8) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+ break;
-+ }
-+ }
-+}
-+
-+// mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, next_kernel)
-+static void rpi_simulate_inter_luma(HEVCContext *s,uint32_t *p,int chan)
-+{
-+ uint32_t next_kernel;
-+ int 
y_x,y2_x2;
-+ int x0;
-+ int y0;
-+ int x2;
-+ int y2;
-+ uint32_t *p0 = p;
-+ uint8_t *ref_y_base;
-+ uint8_t *ref_y2_base;
-+ uint32_t frame_width_height = p[4];
-+ uint32_t frame_width = frame_width_height>>16;
-+ uint32_t frame_height = (frame_width_height<<16)>>16;
-+ uint32_t pitch = p[5];
-+ uint32_t dst_pitch = p[6];
-+ int offset_shift = p[7];
-+ int32_t offset_before = offset_shift>>16;
-+ int32_t denom = (offset_shift<<16)>>16;
-+ while(1) {
-+ p += 9;
-+ next_kernel = p[8-9];
-+ y_x = p[0-9];
-+ x0 = (y_x<<16)>>16;
-+ y0 = y_x>>16;
-+ y2_x2 = p[2-9];
-+ x2 = (y2_x2<<16)>>16;
-+ y2 = y2_x2>>16;
-+
-+ if (next_kernel==s->qpu_filter || next_kernel==s->qpu_filter_b) {
-+ // y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+ int x,y;
-+ uint32_t width_height = p[4];
-+ uint32_t my2_mx2_my_mx = p[5];
-+ uint32_t offset_weight = p[6];
-+ uint8_t *this_dst = compute_arm_addr(s,p[7],0);
-+ uint32_t width = width_height >> 16;
-+ uint32_t height = (width_height << 16) >> 16;
-+ uint8_t *dst_base = s->frame->data[0];
-+ ref_y_base = compute_arm_addr(s,p[1-9],0);
-+ ref_y2_base = compute_arm_addr(s,p[3-9],0);
-+ for (y=0; y<height; y++) {
-+ for (x=0; x<width; x++) {
-+ if (next_kernel==s->qpu_filter) {
-+ int32_t refa = filter8_luma(ref_y_base,x+x0, y+y0, pitch, my2_mx2_my_mx, offset_weight,offset_before,denom,frame_width,frame_height);
-+ refa = av_clip_uint8(refa);
-+ this_dst[x+y*dst_pitch] = refa;
-+ }
-+ else {
-+ int32_t refa = filter8_luma(ref_y_base, x+x0, y+y0, pitch, my2_mx2_my_mx, 1, 0, 0, frame_width, frame_height);
-+ int32_t refb = filter8_luma(ref_y2_base, x+x2, y+y2, pitch, my2_mx2_my_mx>>16, 1, 64 + refa, 7, frame_width, frame_height);
-+ this_dst[x+y*dst_pitch] = av_clip_uint8(refb);
-+ }
-+ }
-+ }
-+ } else {
-+ av_assert0(next_kernel==qpu_get_fn(QPU_MC_INTERRUPT_EXIT12) || next_kernel==qpu_get_fn(QPU_MC_EXIT) );
-+ break;
-+ }
-+ }
-+}
-+
-+static void rpi_simulate_inter_qpu(HEVCContext *s)
-+{
-+ // First run the transform as normal
-+ int i;
-+ rpi_execute_transform(s);
-+ for(i=0;i<8;i++)
-+ {
-+ rpi_simulate_inter_chroma(s,s->mvs_base[i]);
-+ }
-+ for(i=0;i<12;i++)
-+ {
-+ rpi_simulate_inter_luma(s,s->y_mvs_base[i],i);
-+ }
-+}
-+
-+#endif
-+
+
+#if RPI_INTER
-+static unsigned int mc_terminate_y(HEVCContext * const s, const int job)
++static unsigned int mc_terminate_add(HEVCContext * const s,
++ const vpu_qpu_job_h vqj,
++ rpi_cache_flush_env_t * const rfe,
++ HEVCRpiInterPredEnv * const ipe)
+{
+ unsigned int i;
-+ const uint32_t exit_fn = qpu_fn(mc_exit);
-+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12);
-+ const uint32_t dummy_texture = qpu_fn(mc_setup_uv);
-+ unsigned int tc = 0;
++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
+
-+ // Add final commands to Q
-+ for(i = 0; i != QPU_N_Y; ++i) {
-+ uint32_t * const pu = s->y_mvs[job][i] - RPI_LUMA_COMMAND_WORDS;
-+ const int cmd_count = pu - s->y_mvs_base[job][i];
-+ tc += cmd_count;
-+
-+ av_assert0(cmd_count < Y_COMMANDS_PER_QPU - 1);
-+
-+ // We use this code as a dummy texture - safe?
-+ pu[0] = 0; // x,y
-+ pu[1] = dummy_texture;
-+ pu[2] = 0;
-+ pu[3] = dummy_texture;
-+ pu[RPI_LUMA_COMMAND_WORDS - 1] = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; -+} ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } + -+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) -+{ -+ unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit_c); -+#if QPU_N_UV == 8 -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit8c); -+#elif QPU_N_UV == 12 -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12c); -+#else -+#error Need appropriate exit code ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ ((uint32_t *)yp->qpu_mc_curr)[-1] = yp->code_exit; ++ ++ av_assert0((char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base <= ipe->q1_size); ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++#if RPI_CACHE_UNIF_MVS ++ rpi_cache_flush_add_gm_ptr(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +#endif -+ const uint32_t dummy_texture = qpu_fn(mc_setup_uv); -+ unsigned int tc = 0; ++ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, (uint32_t *)mail); + -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_UV; ++i) { -+ qpu_mc_pred_c_t * const pu = (qpu_mc_pred_c_t *)s->u_mvs[job][i] - 1; -+ const int cmd_count = (uint32_t *)pu - s->mvs_base[job][i]; -+ tc += cmd_count; -+ -+ pu->next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2; // Actual fn ptr -+ // Need to set the src to something that can be (pointlessly) prefetched -+ pu->next_src_x = 0; -+ pu->next_src_y = 0; -+ // We use this code as a dummy texture - safe? 
-+ pu->next_src_base_u = dummy_texture; -+ pu->next_src_base_v = dummy_texture; -+ } -+ -+ return tc; ++ return 1; +} ++ +#endif + +#ifdef RPI @@ -6475,17 +8317,10 @@ index ef21595..b36e840 100644 +// Core execution tasks +static void worker_core(HEVCContext * const s) +{ -+ worker_global_env_t * const wg = &worker_global_env; -+ int arm_cost = 0; -+// vpu_qpu_wait_h sync_c; ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_wait_h sync_c; ++#endif + vpu_qpu_wait_h sync_y; -+ int qpu_luma = 0; -+ int qpu_chroma = 0; -+ int gpu_load; -+ int arm_load; -+ static const int arm_const_cost = 2; -+ -+// static int z = 0; + + const int job = s->pass1_job; + unsigned int flush_start = 0; @@ -6509,36 +8344,6 @@ index ef21595..b36e840 100644 + + +#if RPI_INTER -+ pthread_mutex_lock(&wg->lock); -+ -+// ++z; -+ gpu_load = vpu_qpu_current_load(); -+ arm_load = avpriv_atomic_int_get(&wg->arm_load); -+#if !Y_B_ONLY -+ qpu_luma = gpu_load + 2 < arm_load; -+ qpu_chroma = gpu_load < arm_load + 8; -+#elif 1 -+ qpu_luma = gpu_load < arm_load + 2; -+ qpu_chroma = gpu_load < arm_load + 8; -+#else -+ qpu_chroma = 1; -+ qpu_luma = 1; -+#endif -+ -+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); -+ -+ wg->gpu_c += qpu_chroma; -+ wg->gpu_y += qpu_luma; -+ wg->arm_c += !qpu_chroma; -+ wg->arm_y += !qpu_luma; -+ -+ -+// if ((z & 511) == 0) { -+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); -+// } -+ -+ + { + int (*d)[2] = s->dblk_cmds[job]; + unsigned int high=(*d)[1]; @@ -6550,60 +8355,26 @@ index ef21595..b36e840 100644 + flush_start = FFMIN(flush_start, y); + high=FFMAX(high,y); + } -+ // Avoid flushing past end of frame -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; ++ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->ps.sps->height) - flush_start; + } + -+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) ++ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].chroma_ip) != 0) + { -+ uint32_t * const unif_vc = (uint32_t *)s->unif_mvs_ptr[job].vc; -+ const uint32_t code = qpu_fn(mc_setup_uv); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = (uint32_t)(unif_vc + (s->mvs_base[job][i] - (uint32_t*)s->unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, s->unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 0, 1); + } + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity -+// vpu_qpu_job_add_sync_this(vqj, &sync_c); -+ -+ if (qpu_luma && mc_terminate_y(s, job) != 0) -+ { -+ uint32_t * const y_unif_vc = (uint32_t *)s->y_unif_mvs_ptr[job].vc; -+ const uint32_t code = qpu_fn(mc_setup); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = (uint32_t)(y_unif_vc + (s->y_mvs_base[job][i] - 
(uint32_t*)s->y_unif_mvs_ptr[job].arm)); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, s->y_unif_mvs_ptr + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); ++ ++ if (mc_terminate_add(s, vqj, rfe, &s->jobs[job].luma_ip) != 0) ++ { ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, flush_start, s->ps.sps->width, flush_count, s->ps.sps->vshift[1], 1, 0); + } -+ -+ pthread_mutex_unlock(&wg->lock); -+ +#endif + + vpu_qpu_job_add_sync_this(vqj, &sync_y); @@ -6612,31 +8383,35 @@ index ef21595..b36e840 100644 + rpi_cache_flush_finish(rfe); + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); + -+#if Y_B_ONLY -+ if (qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#endif -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); ++ // We would do ARM inter prediction here but no longer ++ // Look back in git if you find you want it back - As we have ++ // no arm/neon sand pred code there doesn't seem a lot of point ++ // keeping it around + ++#if RPI_OPT_SEP_PRED + // Wait for transform completion ++ vpu_qpu_wait(&sync_c); + + // Perform intra prediction and residual reconstruction -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); -+#if Y_B_ONLY -+ if (!qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#else ++ rpi_execute_pred_cmds(s, 0, 1); ++ ++ // Wait for transform completion + vpu_qpu_wait(&sync_y); -+#endif ++ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, 1, 0); ++#else ++ // Wait for transform completion ++ vpu_qpu_wait(&sync_y); ++ ++ // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s); ++#endif + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s); -+ -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} + +static void rpi_do_all_passes(HEVCContext *s) @@ -6648,19 +8423,19 @@ index ef21595..b36e840 100644 +} + + -+ +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2315,6 +3966,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2315,6 +3805,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 -+ && !s->ps.pps->cross_component_prediction_enabled_flag; ++ s->enable_rpi = s->ps.sps->bit_depth == 8 && ++ s->frame->format == AV_PIX_FMT_SAND128 && ++ !s->ps.pps->cross_component_prediction_enabled_flag; + + if (!s->enable_rpi) { + if (s->ps.pps->cross_component_prediction_enabled_flag) @@ -6672,7 +8447,7 @@ index ef21595..b36e840 100644 if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2328,6 +3990,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2328,6 +3830,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } @@ -6687,7 +8462,7 @@ index 
ef21595..b36e840 100644 while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2335,6 +4005,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2335,6 +3845,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); @@ -6695,56 +8470,48 @@ index ef21595..b36e840 100644 ff_hevc_cabac_init(s, ctb_addr_ts); hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2343,7 +4014,57 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +@@ -2344,6 +3855,49 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#if RPI_INTER -+ s->curr_u_mvs = s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]; -+ s->curr_y_mvs = s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y]; -+#endif -+ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI -+#if RPI_INTER -+ s->u_mvs[s->pass0_job][s->ctu_count % QPU_N_UV]= s->curr_u_mvs; -+ s->y_mvs[s->pass0_job][s->ctu_count % QPU_N_Y] = s->curr_y_mvs; -+#endif -+ + if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ int q_full = (s->ctu_count >= s->max_ctu_count); + -+ if ( s->ctu_count >= s->max_ctu_count ) { ++ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].luma_ip) != 0) ++ q_full = 1; ++ if (rpi_inter_pred_next_ctu(&s->jobs[s->pass0_job].chroma_ip) != 0) ++ q_full = 1; ++ ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; ++ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; ++ s->ctu_count++; ++ ++ if (q_full) { +#ifdef RPI_WORKER -+ if (s->used_for_ref) -+ { -+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); + -+// worker_wait(s); -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); + -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s); ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); + -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); -+ } ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } +#else -+ rpi_do_all_passes(s); ++ rpi_do_all_passes(s); +#endif -+ } ++ } + + } +#endif @@ -6753,7 +8520,7 @@ index ef21595..b36e840 100644 if 
(more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2352,9 +4073,29 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2352,9 +3906,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -6778,12 +8545,25 @@ index ef21595..b36e840 100644 + rpi_do_all_passes(s); + } + ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ +#endif + if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2389,6 +4130,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2389,6 +3976,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -6795,7 +8575,7 @@ index ef21595..b36e840 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2771,6 +4517,20 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2771,6 +4363,33 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; @@ -6809,14 +8589,27 @@ index ef21595..b36e840 100644 + s->nal_unit_type == HEVC_NAL_RADL_N || + s->nal_unit_type == HEVC_NAL_RASL_N); + ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif + if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { + s->is_decoded = 0; + break; + } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2894,10 +4654,18 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ + if (s->sh.first_slice_in_pic_flag) { + if (s->max_ra == INT_MAX) { + if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { +@@ -2894,10 +4513,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } @@ -6829,16 +8622,17 @@ index ef21595..b36e840 100644 +#endif ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } else if (s->ref) { ++ } +#if RPI_INTER ++ else if (s->ref && s->enable_rpi) { + // When running single threaded we need to flush the whole frame + flush_frame(s,s->frame); -+#endif + } ++#endif return ret; } -@@ -3150,6 +4918,41 @@ fail: +@@ -3150,6 +4778,48 @@ fail: return AVERROR(ENOMEM); } @@ -6875,12 +8669,19 @@ index ef21595..b36e840 100644 + s->worker_head=0; + s->kill_worker=0; +} ++ ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3161,6 +4964,33 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3161,6 +4831,27 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); @@ -6891,44 +8692,27 @@ index ef21595..b36e840 100644 +#endif + + 
for(i=0;iunif_mv_cmds_y[i]); -+ av_freep(&s->unif_mv_cmds_c[i]); -+ av_freep(&s->univ_pred_cmds[i]); ++ ++ av_freep(&s->univ_pred_cmds[i]); + +#if RPI_INTER -+ if (s->unif_mvs[i]) { -+ gpu_free( &s->unif_mvs_ptr[i] ); -+ s->unif_mvs[i] = 0; -+ } -+ if (s->y_unif_mvs[i]) { -+ gpu_free( &s->y_unif_mvs_ptr[i] ); -+ s->y_unif_mvs[i] = 0; -+ } ++ rpi_free_inter_pred(&s->jobs[i].chroma_ip); ++ rpi_free_inter_pred(&s->jobs[i].luma_ip); +#endif + } + + vpu_qpu_term(); + ++ av_rpi_zc_uninit(avctx); +#endif + for (i = 0; i < 3; i++) { av_freep(&s->sao_pixel_buffer_h[i]); av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3202,10 +5032,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3202,10 +4893,14 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;iavctx = avctx; -@@ -3215,6 +5060,82 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3215,6 +4910,59 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -6949,65 +8733,42 @@ index ef21595..b36e840 100644 + // many times as we have threads (init_thread_copy is called for the + // threads). So to match init & term put the init here where it will be + // called by both init & copy ++ av_rpi_zc_init(avctx); ++ + if (vpu_qpu_init() != 0) + goto fail; + + for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); -+ if (!s->unif_mv_cmds_y[job]) -+ goto fail; -+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); -+ if (!s->unif_mv_cmds_c[job]) -+ goto fail; + s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); + if (!s->univ_pred_cmds[job]) + goto fail; + } + +#if RPI_INTER -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. 
+ + for (job = 0; job < RPI_MAX_JOBS; job++) { -+ uint32_t *p; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(uint32_t), &s->unif_mvs_ptr[job] ); -+#endif -+ s->unif_mvs[job] = (uint32_t *) s->unif_mvs_ptr[job].arm; ++ HEVCRpiJob * const jb = s->jobs + job; ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too samll) + -+ // Set up initial locations for uniform streams -+ p = s->unif_mvs[job]; -+ for(i = 0; i < QPU_N_UV; i++) { -+ s->mvs_base[job][i] = p; -+ p += UV_COMMANDS_PER_QPU; -+ } ++ rpi_alloc_inter_pred(&jb->chroma_ip, ++ QPU_N_UV, QPU_N_GRP_UV, ++ UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t), ++ inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu); ++ rpi_alloc_inter_pred(&jb->luma_ip, ++ QPU_N_Y, QPU_N_GRP_Y, ++ Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t), ++ inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu); + } ++ + s->qpu_filter_uv = qpu_fn(mc_filter_uv); + s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_filter_uv_b = qpu_fn(mc_filter_uv_b); -+ -+ for (job=0; job < RPI_MAX_JOBS; job++) -+ { -+ uint32_t *p; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#else -+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(uint32_t), &s->y_unif_mvs_ptr[job] ); -+#endif -+ s->y_unif_mvs[job] = (uint32_t *) s->y_unif_mvs_ptr[job].arm; -+ -+ // Set up initial locations for uniform streams -+ p = s->y_unif_mvs[job]; -+ for(i = 0; i < QPU_N_Y; i++) { -+ s->y_mvs_base[job][i] = p; -+ p += Y_COMMANDS_PER_QPU; -+ } -+ } ++ s->qpu_dummy_frame = qpu_fn(mc_start); // Use our code as a dummy frame + s->qpu_filter = qpu_fn(mc_filter); ++ s->qpu_filter_y_p00 = qpu_fn(mc_filter_y_p00); ++ s->qpu_filter_y_b00 = qpu_fn(mc_filter_y_b00); + s->qpu_filter_b = qpu_fn(mc_filter_b); +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); @@ -7023,7 +8784,7 @@ index ef21595..b36e840 100644 s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3357,9 +5278,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3357,9 +5105,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) @@ -7036,7 +8797,7 @@ index ef21595..b36e840 100644 return 0; } -@@ -3418,6 +5339,8 @@ AVCodec ff_hevc_decoder = { +@@ -3418,6 +5166,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -7045,10 +8806,10 @@ index ef21595..b36e840 100644 AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), -diff --git b/libavcodec/hevcdec.h a/libavcodec/hevcdec.h -index 0c78812..e068936 100644 ---- b/libavcodec/hevcdec.h -+++ a/libavcodec/hevcdec.h +diff --git a/libavcodec/hevcdec.h b/libavcodec/hevcdec.h +index 0c78812..c268d39 100644 +--- a/libavcodec/hevcdec.h ++++ b/libavcodec/hevcdec.h @@ -334,17 +334,6 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; @@ -7102,15 +8863,13 @@ index 0c78812..e068936 100644 #define 
BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -464,6 +460,89 @@ typedef struct HEVCLocalContext { +@@ -464,6 +460,149 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; +#ifdef RPI + +// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 @@ -7133,9 +8892,6 @@ index 0c78812..e068936 100644 +#define RPI_CMD_CHROMA_BI 3 +#define RPI_CMD_V_BI 4 + -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ +// Command for inter prediction +typedef struct HEVCMvCmd { + uint8_t cmd; @@ -7158,9 +8914,16 @@ index 0c78812..e068936 100644 + + +// Command for intra prediction and transform_add of predictions to coefficients -+#define RPI_PRED_TRANSFORM_ADD 0 -+#define RPI_PRED_INTRA 1 -+#define RPI_PRED_I_PCM 2 ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_INTRA, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; + +typedef struct HEVCPredCmd { + uint8_t type; @@ -7188,11 +8951,69 @@ index 0c78812..e068936 100644 +} HEVCPredCmd; + +#endif ++ ++#ifdef RPI ++ ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ unsigned int n; // Number of Qs ++ unsigned int n_grp; // Number of Q in a group ++ unsigned int curr; // Current Q number (0..n-1) ++ int used; // 0 if nothing in any Q, 1 otherwise ++ int used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ GPU_MEM_PTR_T gptr; ++ unsigned int q1_size; // size of 1 uniform Q ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiJob { ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++} HEVCRpiJob; ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++#endif + typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -472,6 +551,9 @@ typedef struct HEVCContext { +@@ -472,6 +611,9 @@ typedef struct HEVCContext { HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; HEVCLocalContext *HEVClc; @@ -7202,15 +9023,13 @@ index 0c78812..e068936 100644 uint8_t threads_type; uint8_t threads_number; -@@ -479,6 +561,98 @@ typedef struct HEVCContext { +@@ -479,6 +621,90 @@ typedef struct HEVCContext { int width; int height; + int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; -+ HEVCMvCmd 
*unif_mv_cmds_c[RPI_MAX_JOBS]; + HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; + int buf_width; + GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; @@ -7231,28 +9050,22 @@ index 0c78812..e068936 100644 + int ctu_per_y_chan; // Number of CTUs per luma QPU + int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob jobs[RPI_MAX_JOBS]; ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif +#if RPI_INTER -+ GPU_MEM_PTR_T unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; + -+ // _base pointers are to the start of the row -+ uint32_t *mvs_base[RPI_MAX_JOBS][QPU_N_UV]; -+ // these pointers are to the next free space -+ uint32_t *u_mvs[RPI_MAX_JOBS][QPU_N_UV]; -+ uint32_t *curr_u_mvs; // Current uniform stream to use for chroma + // Function pointers + uint32_t qpu_filter_uv; + uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_filter_uv_b; -+ -+ GPU_MEM_PTR_T y_unif_mvs_ptr[RPI_MAX_JOBS]; -+ uint32_t *y_unif_mvs[RPI_MAX_JOBS]; // Base of memory for motion vector commands -+ uint32_t *y_mvs_base[RPI_MAX_JOBS][QPU_N_Y]; -+ uint32_t *y_mvs[RPI_MAX_JOBS][QPU_N_Y]; -+ uint32_t *curr_y_mvs; // Current uniform stream for luma -+ // Function pointers ++ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory + uint32_t qpu_filter; + uint32_t qpu_filter_b; ++ uint32_t qpu_filter_y_p00; ++ uint32_t qpu_filter_y_b00; +#endif + +#ifdef RPI_WORKER @@ -7301,7 +9114,7 @@ index 0c78812..e068936 100644 uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -596,6 +770,9 @@ typedef struct HEVCContext { +@@ -596,6 +822,9 @@ typedef struct HEVCContext { uint32_t max_mastering_luminance; uint32_t min_mastering_luminance; @@ -7311,7 +9124,7 @@ index 0c78812..e068936 100644 } HEVCContext; int ff_hevc_decode_nal_sei(HEVCContext *s); -@@ -703,6 +880,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -703,6 +932,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); @@ -7323,19 +9136,26 @@ index 0c78812..e068936 100644 /** * Reset SEI values that are stored on the Context. * e.g. 
Caption data that was extracted during NAL -@@ -716,4 +898,8 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; +@@ -716,4 +950,15 @@ extern const uint8_t ff_hevc_qpel_extra_before[4]; extern const uint8_t ff_hevc_qpel_extra_after[4]; extern const uint8_t ff_hevc_qpel_extra[4]; +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ +#endif + #endif /* AVCODEC_HEVCDEC_H */ -diff --git b/libavcodec/hevcdsp.c a/libavcodec/hevcdsp.c -index 23e923f..a985f02 100644 ---- b/libavcodec/hevcdsp.c -+++ a/libavcodec/hevcdsp.c +diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c +index 23e923f..c4f1a6c 100644 +--- a/libavcodec/hevcdsp.c ++++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { #include "hevcdsp_template.c" #undef BIT_DEPTH @@ -7457,7 +9277,74 @@ index 23e923f..a985f02 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -257,6 +371,8 @@ int i = 0; +@@ -193,12 +307,38 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ + PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) + ++#if !RPI_HEVC_SAND ++#define SLICED_LOOP_FILTERS(depth) ++#define SLICED_ADD_RESIDUAL(depth) ++#else ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#endif ++ ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ + hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ + hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ + hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ + hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ + hevcdsp->dequant = FUNC(dequant, depth); \ + hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ + hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ +@@ -225,6 +365,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ + hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ + \ ++ hevcdsp->sao_band_filter_c[0] = \ ++ hevcdsp->sao_band_filter_c[1] = \ ++ hevcdsp->sao_band_filter_c[2] = \ ++ hevcdsp->sao_band_filter_c[3] 
= \ ++ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[0] = \ ++ hevcdsp->sao_edge_filter_c[1] = \ ++ hevcdsp->sao_edge_filter_c[2] = \ ++ hevcdsp->sao_edge_filter_c[3] = \ ++ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ ++ \ + QPEL_FUNCS(depth); \ + QPEL_UNI_FUNCS(depth); \ + QPEL_BI_FUNCS(depth); \ +@@ -232,6 +385,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + EPEL_UNI_FUNCS(depth); \ + EPEL_BI_FUNCS(depth); \ + \ ++ SLICED_LOOP_FILTERS(depth); \ + hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ + hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ + hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +@@ -257,6 +411,8 @@ int i = 0; break; } @@ -7466,11 +9353,19 @@ index 23e923f..a985f02 100644 if (ARCH_X86) ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) -diff --git b/libavcodec/hevcdsp.h a/libavcodec/hevcdsp.h -index eefb3cd..a41aa09 100644 ---- b/libavcodec/hevcdsp.h -+++ a/libavcodec/hevcdsp.h -@@ -42,6 +42,17 @@ typedef struct SAOParams { +diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h +index eefb3cd..9e44e7f 100644 +--- a/libavcodec/hevcdsp.h ++++ b/libavcodec/hevcdsp.h +@@ -25,6 +25,7 @@ + #ifndef AVCODEC_HEVCDSP_H + #define AVCODEC_HEVCDSP_H + ++#include "hevc.h" + #include "get_bits.h" + + #define MAX_PB_SIZE 64 +@@ -42,11 +43,30 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -7488,21 +9383,795 @@ index eefb3cd..a41aa09 100644 typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -@@ -120,6 +131,9 @@ typedef struct HEVCDSPContext { + + void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++#if RPI_HEVC_SAND ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ ++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++#endif + + void (*dequant)(int16_t *coeffs, int16_t log2_size); + +@@ -60,14 +80,23 @@ typedef struct HEVCDSPContext { + + void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); + + /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ + void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + int16_t *sao_offset_val, int sao_eo_class, int width, int height); ++ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); + + void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t 
_stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, ++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); + + void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, + int height, intptr_t mx, intptr_t my, int width); +@@ -120,6 +149,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); ++#ifdef RPI ++ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++#endif ++ + void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, + int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, + MvField *curr, MvField *neigh, uint8_t *bs); } HEVCDSPContext; void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); -diff --git b/libavcodec/hevcpred_template.c a/libavcodec/hevcpred_template.c -index 6ae87cc..28d2653 100644 ---- b/libavcodec/hevcpred_template.c -+++ a/libavcodec/hevcpred_template.c -@@ -20,6 +20,8 @@ +diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +index 25f1a81..d475b3d 100644 +--- a/libavcodec/hevcdsp_template.c ++++ b/libavcodec/hevcdsp_template.c +@@ -26,6 +26,10 @@ + #include "bit_depth_template.c" + #include "hevcdsp.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ + static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) + { +@@ -41,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height + } + } + ++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, ++ GetBitContext *gb, int pcm_bit_depth) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++ ++ dst = (pixel *)_dst + 1; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); ++ dst += stride; ++ } ++} ++ ++ + static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride, int size) + { +@@ -58,6 +85,44 @@ static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res, + } + } + ++#if RPI_HEVC_SAND ++static av_always_inline void FUNC(add_residual_u_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + *res); ++ res++; ++ } ++ dst += stride; ++ } ++} ++ ++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, unsigned int size) ++{ ++ unsigned int x, y; ++ pixel *dst = 
(pixel *)_dst; ++ const int16_t * ru = res; ++ const int16_t * rv = res + size * size; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); ++ } ++ dst += stride; ++ } ++} ++#endif ++ + static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, + ptrdiff_t stride) + { +@@ -82,6 +147,90 @@ static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, + FUNC(add_residual)(_dst, res, stride, 32); + } + ++#if RPI_HEVC_SAND ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_u_v)(_dst + 1, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++#endif ++ ++ + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) + { + int16_t *coeffs = (int16_t *) _coeffs; +@@ -361,7 +510,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, width = _width, height = _height; + +@@ -370,33 +518,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width 
- 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -411,7 +555,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + int x, y; + pixel *dst = (pixel *)_dst; + pixel *src = (pixel *)_src; +- int16_t *sao_offset_val = sao->offset_val[c_idx]; + int sao_eo_class = sao->eo_class[c_idx]; + int init_x = 0, init_y = 0, width = _width, height = _height; + +@@ -420,34 +563,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + + if (sao_eo_class != SAO_EO_VERT) { + if (borders[0]) { +- int offset_val = sao_offset_val[0]; + for (y = 0; y < height; y++) { +- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); ++ dst[y * stride_dst] = src[y * stride_src]; + } + init_x = 1; + } + if (borders[2]) { +- int offset_val = sao_offset_val[0]; + int offset = width - 1; + for (x = 0; x < height; x++) { +- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; + } + width--; + } + } + if (sao_eo_class != SAO_EO_HORIZ) { + if (borders[1]) { +- int offset_val = sao_offset_val[0]; + for (x = init_x; x < width; x++) +- dst[x] = av_clip_pixel(src[x] + offset_val); ++ dst[x] = src[x]; + init_y = 1; + } + if (borders[3]) { +- int offset_val = sao_offset_val[0]; + ptrdiff_t y_stride_dst = stride_dst * (height - 1); + ptrdiff_t y_stride_src = stride_src * (height - 1); + for (x = init_x; x < width; x++) +- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); ++ dst[x + y_stride_dst] = src[x + y_stride_src]; + height--; + } + } +@@ -488,6 +627,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } + } + ++ ++// --- Plaited chroma versions ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; ++ } ++ for (y 
= 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++#endif ++ ++#if BIT_DEPTH != 8 ++static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#else ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#endif ++ ++ + #undef CMP + + //////////////////////////////////////////////////////////////////////////////// +@@ -1690,3 +1950,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, + #undef TQ1 + #undef TQ2 + #undef TQ3 ++ ++#ifdef RPI ++ ++// line zero ++#define P3 pix_l[0 * xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. 
used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], ++ const uint8_t _no_p[2], const uint8_t _no_q[2], ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++#undef TP3 ++#undef TP2 
++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++ ++ ++#endif ++ +diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +index 7a86ed3..7d32c4a 100644 +--- a/libavcodec/hevcpred.c ++++ b/libavcodec/hevcpred.c +@@ -24,6 +24,7 @@ + + #include "hevcpred.h" + ++#define PRED_C 0 + #define BIT_DEPTH 8 + #include "hevcpred_template.c" + #undef BIT_DEPTH +@@ -39,13 +40,37 @@ + #define BIT_DEPTH 12 + #include "hevcpred_template.c" + #undef BIT_DEPTH ++#undef PRED_C ++ ++#ifdef RPI ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++#endif + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + { + #undef FUNC + #define FUNC(a, depth) a ## _ ## depth + +-#define HEVC_PRED(depth) \ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ + hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ + hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ + hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +@@ -60,6 +85,30 @@ void 
ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) + hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ + hpc->pred_angular[3] = FUNC(pred_angular_3, depth); + ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ ++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ ++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ ++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); ++ ++#ifdef RPI ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++#else ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); ++#endif ++ + switch (bit_depth) { + case 9: + HEVC_PRED(9); +diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +index eb17663..00ba3f9 100644 +--- a/libavcodec/hevcpred.h ++++ b/libavcodec/hevcpred.h +@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { + void (*pred_angular[4])(uint8_t *src, const uint8_t *top, + const uint8_t *left, ptrdiff_t stride, + int c_idx, int mode); ++#ifdef RPI ++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); ++ ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride, int log2_size, int c_idx); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); ++#endif + } HEVCPredContext; + + void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); +diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c +index 6ae87cc..c14dddd 100644 +--- a/libavcodec/hevcpred_template.c ++++ b/libavcodec/hevcpred_template.c +@@ -20,13 +20,55 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -7511,7 +10180,54 @@ index 6ae87cc..28d2653 100644 #include "libavutil/pixdesc.h" #include "bit_depth_template.c" -@@ -69,8 +71,11 @@ do { \ + #include "hevcpred.h" + ++#ifdef RPI ++#include "rpi_zc.h" ++#endif ++ ++#define DUMP_PRED 0 ++ + #define POS(x, y) src[(x) + stride * (y)] + ++#if PRED_C ++ ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++ ++#if BIT_DEPTH == 8 ++#undef BIT_DEPTH ++#define BIT_DEPTH 16 ++#include "bit_depth_template.c" ++#undef FUNC ++#define FUNC(a) FUNC3(a, 8, _c) ++#else ++#undef FUNC ++#define FUNC FUNCC ++#endif ++ ++#endif ++ ++#if DUMP_PRED ++#ifndef DEBUG_ONCE ++#define DEBUG_ONCE ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++#endif ++ + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, + int log2_size, int c_idx) + { +@@ -69,8 +111,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) @@ -7524,54 +10240,403 @@ index 6ae87cc..28d2653 100644 int 
i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -114,6 +119,10 @@ do { \ +@@ -79,15 +124,23 @@ do { \ + int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; + int size_in_luma_v = size << vshift; + int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +- int x = x0 >> hshift; +- int y = y0 >> vshift; ++ const int x = x0 >> hshift; ++ const int y = y0 >> vshift; + int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; + + int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); + +- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); ++#if defined(RPI) ++ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ (pixel*)s->frame->data[c_idx] + x + y * stride : ++ c_idx == 0 ? ++ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : ++ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++#else + pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; ++#endif + + int min_pu_width = s->ps.sps->min_pu_width; + +@@ -95,14 +148,20 @@ do { \ + lc->tu.intra_pred_mode; + pixel4 a; + pixel left_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; ++#endif + pixel top_array[2 * MAX_TB_SIZE + 1]; ++#if !PRED_C + pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; ++#endif + + pixel *left = left_array + 1; + pixel *top = top_array + 1; ++#if !PRED_C + pixel *filtered_left = filtered_left_array + 1; + pixel *filtered_top = filtered_top_array + 1; ++#endif + int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); + int cand_left = lc->na.cand_left; + int cand_up_left = lc->na.cand_up_left; +@@ -114,6 +173,26 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; ++ pixel * src_l = src - 1; ++ pixel * src_u = src - stride; ++ pixel * src_ur = src_u + size; ++ +#ifdef DISABLE_INTRA + return; +#endif ++ ++#if defined(RPI) ++ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++#endif + if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -diff --git b/libavcodec/mjpegenc_common.c a/libavcodec/mjpegenc_common.c -index 6d9c982..83a9e95 100644 ---- b/libavcodec/mjpegenc_common.c -+++ a/libavcodec/mjpegenc_common.c -@@ -91,17 +91,13 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, - { - int i, j, size; - uint8_t *ptr; -- MpegEncContext *s = NULL; +@@ -163,23 +242,24 @@ do { \ + top[-1] = 128; + } + if (cand_up_left) { +- left[-1] = POS(-1, -1); ++ left[-1] = src_l[-stride]; + top[-1] = left[-1]; + } + if (cand_up) +- memcpy(top, src - stride, size * sizeof(pixel)); ++ // Always good - even with sand ++ memcpy(top, src_u, size * sizeof(pixel)); + if (cand_up_right) { +- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), ++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + 
top_right_size, top[size + top_right_size - 1], + size - top_right_size); + } + if (cand_left) + for (i = 0; i < size; i++) +- left[i] = POS(-1, i); ++ left[i] = src_l[stride * i]; + if (cand_bottom_left) { + for (i = size; i < size + bottom_left_size; i++) +- left[i] = POS(-1, i); +- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), ++ left[i] = src_l[stride * i]; ++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], + size - bottom_left_size); + } + +@@ -268,7 +348,11 @@ do { \ + cand_up_left = 1; + cand_left = 1; + } else { // No samples available ++#if PRED_C && BIT_DEPTH == 16 ++ left[-1] = 0x8080; ++#else + left[-1] = (1 << (BIT_DEPTH - 1)); ++#endif + EXTEND(top, left[-1], 2 * size); + EXTEND(left, left[-1], 2 * size); + } +@@ -287,6 +371,9 @@ do { \ + top[-1] = left[-1]; + + // Filtering process ++ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // worry about chroma smoothing for that case ++#if !PRED_C + if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { + if (mode != INTRA_DC && size != 4){ + int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +@@ -342,13 +429,46 @@ do { \ + mode); + break; + } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, log2_size, c_idx); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, ++ (uint8_t *)left, stride, c_idx, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif + } + ++#if !PRED_C || BIT_DEPTH == 16 + #define INTRA_PRED(size) \ + static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ + { \ + FUNC(intra_pred)(s, x0, y0, size, c_idx); \ + } ++#else ++#define INTRA_PRED(size) \ ++static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ ++ abort(); \ ++} ++#endif + + INTRA_PRED(2) + INTRA_PRED(3) +@@ -357,6 +477,7 @@ INTRA_PRED(5) + + #undef INTRA_PRED + ++#if !PRED_C + static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, ptrdiff_t stride, + int trafo_size) +@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to + POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); + } ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y 
+ 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif + ++#if !PRED_C || BIT_DEPTH == 16 + #define PRED_PLANAR(size)\ + static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ + const uint8_t *left, ptrdiff_t stride) \ + { \ + FUNC(pred_planar)(src, top, left, stride, size + 2); \ + } ++#else ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ ++ abort(); \ ++} ++#endif + + PRED_PLANAR(0) + PRED_PLANAR(1) +@@ -386,6 +540,7 @@ PRED_PLANAR(3) + + #undef PRED_PLANAR + ++#if !PRED_C + static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + const uint8_t *_left, + ptrdiff_t stride, int log2_size, int c_idx) +@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, + POS(0, y) = (left[y] + 3 * dc + 2) >> 2; + } + } ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ const c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; + ++ } ++ } ++} ++#endif ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; ++#endif ++ ++#if !PRED_C + static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const uint8_t *_top, + const uint8_t *_left, +@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + const pixel *top = (const pixel *)_top; + const pixel *left = (const pixel *)_left; + +- static const int intra_pred_angle[] = { +- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +- }; +- static const int inv_angle[] = { +- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +- -630, -910, -1638, -4096 +- }; - -- /* Since avctx->priv_data will point to LJpegEncContext in this case */ -- if (avctx->codec_id != AV_CODEC_ID_LJPEG) -- s = avctx->priv_data; -+ MpegEncContext *s = avctx->priv_data; + int angle = intra_pred_angle[mode - 2]; + pixel ref_array[3 * MAX_TB_SIZE + 4]; + pixel *ref_tmp = ref_array + size; +@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, + } + } + } ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int c_idx, ++ int mode, int size) ++{ ++ int x, y; ++ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; ++ c8_src_ptr_t top = (c8_src_ptr_t)_top; ++ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c8_dst_ptr_t ref_tmp = ref_array + 
size; ++ c8_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c8_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif - if (avctx->codec_id != AV_CODEC_ID_LJPEG) { - int matrix_count = 1 + !!memcmp(luma_intra_matrix, - chroma_intra_matrix, - sizeof(luma_intra_matrix[0]) * 64); -- if (s && s->force_duplicated_matrix) -+ if (s->force_duplicated_matrix) - matrix_count = 2; - /* quant matrixes */ - put_marker(p, DQT); -@@ -138,7 +134,7 @@ static void jpeg_table_header(AVCodecContext *avctx, PutBitContext *p, - - // Only MJPEG can have a variable Huffman variable. All other - // formats use the default Huffman table. 
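/* NB (annotation, not patch content): the mjpegenc_common.c hunks in this
 * region are being dropped from the downstream patch. The guard they had
 * replaced exists because avctx->priv_data is only an MpegEncContext for the
 * MPEG-based encoders; for AV_CODEC_ID_LJPEG it points at an LJpegEncContext,
 * so upstream leaves s as NULL in that case and every later use of s must be
 * NULL-checked, while the downstream variant keyed the optimal-Huffman test
 * off s->out_format instead. A sketch of the upstream pattern, reconstructed
 * from the hunk itself:
 *
 *     MpegEncContext *s = NULL;
 *     if (avctx->codec_id != AV_CODEC_ID_LJPEG)
 *         s = avctx->priv_data;            // safe: not an LJpegEncContext
 *     if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL)
 *         ...                              // MJPEG-only table emission
 */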
-- if (s && s->huffman == HUFFMAN_TABLE_OPTIMAL) { -+ if (s->out_format == FMT_MJPEG && s->huffman == HUFFMAN_TABLE_OPTIMAL) { - size += put_huffman_table(p, 0, 0, s->mjpeg_ctx->bits_dc_luminance, - s->mjpeg_ctx->val_dc_luminance); - size += put_huffman_table(p, 0, 1, s->mjpeg_ctx->bits_dc_chrominance, -diff --git b/libavcodec/mmaldec.c a/libavcodec/mmaldec.c + static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, + const uint8_t *left, +diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c index 81fcebc..7858478 100644 ---- b/libavcodec/mmaldec.c -+++ a/libavcodec/mmaldec.c +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c @@ -24,6 +24,9 @@ * MMAL Video Decoder */ @@ -7590,11 +10655,11 @@ index 81fcebc..7858478 100644 #include #include "avcodec.h" -diff --git b/libavcodec/mpeg4videodec.c a/libavcodec/mpeg4videodec.c -index 791a07b..502c21f 100644 ---- b/libavcodec/mpeg4videodec.c -+++ a/libavcodec/mpeg4videodec.c -@@ -2249,6 +2249,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c +index 54b7be1..894dcdc 100644 +--- a/libavcodec/mpeg4videodec.c ++++ b/libavcodec/mpeg4videodec.c +@@ -2247,6 +2247,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) if (ctx->divx_version >= 0) s->workaround_bugs |= FF_BUG_HPEL_CHROMA; @@ -7604,7 +10669,7 @@ index 791a07b..502c21f 100644 } if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2273,6 +2276,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +@@ -2271,6 +2274,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : ""); @@ -7612,25 +10677,154 @@ index 791a07b..502c21f 100644 if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && s->codec_id == AV_CODEC_ID_MPEG4 && avctx->idct_algo == FF_IDCT_AUTO) { -diff --git b/libavcodec/mpegvideo_enc.c a/libavcodec/mpegvideo_enc.c -index 882cf09..71a858f 100644 ---- b/libavcodec/mpegvideo_enc.c -+++ a/libavcodec/mpegvideo_enc.c -@@ -399,9 +399,6 @@ FF_ENABLE_DEPRECATION_WARNINGS - return AVERROR(EINVAL); - } +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index 7146e3a..240b274 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -273,6 +273,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, -- if (s->huffman && avctx->codec_id == AV_CODEC_ID_AMV) -- s->huffman = 0; -- - if (s->intra_dc_precision > (avctx->codec_id == AV_CODEC_ID_MPEG2VIDEO ? 
3 : 0)) { - av_log(avctx, AV_LOG_ERROR, "intra dc precision too large\n"); - return AVERROR(EINVAL); -diff --git b/libavcodec/rpi_hevc_transform.h a/libavcodec/rpi_hevc_transform.h ++ /* RPI */ ++#ifdef RPI ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++#endif ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74..84f8e8c 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -31,6 +31,7 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,6 +50,101 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++// x0 & width in luma units (so chroma * 2) ++// x0 odd for v ++static uint8_t * sand_copy_line_u(uint8_t * dst, const uint8_t * src, ++ unsigned int x0, const unsigned int width, ++ const unsigned int stride1, const unsigned int stride2) ++{ ++ unsigned int xend; ++ ++ // Skip any empty slices ++ src += (x0 & ~(stride1 - 1)) * stride2; ++ x0 &= (stride1 - 1); ++ ++ xend = x0 + width; ++ for (unsigned int x = 0; x < xend; x += stride1) ++ { ++ const unsigned int w = FFMIN(stride1, xend - x) - x0; ++ for (unsigned int i = 0; i < w; i += 2) ++ *dst++ = src[x0 + i]; ++ src += stride1 * stride2; ++ x0 &= 1; ++ } ++ ++ return dst; ++} ++ ++static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int width, const unsigned int height) ++{ ++ for (unsigned int y = y0; y < height + y0; ++y) { ++ dst = sand_copy_line_u(dst, frame->data[1] + y * frame->linesize[1], x0, width, frame->linesize[1], frame->linesize[3]); ++ } ++ return dst; ++} ++ ++static uint8_t * sand_copy_line_y(uint8_t * dst, const uint8_t * src, ++ unsigned int x0, const unsigned int width, ++ const unsigned int stride1, const unsigned int stride2) ++{ ++ unsigned int xend; ++ ++ // Skip any empty slices ++ src += (x0 & ~(stride1 - 1)) * stride2; ++ x0 &= (stride1 - 1); ++ ++ xend = x0 + width; ++ for (unsigned int x = 0; x < xend; x += stride1) ++ { ++ const unsigned int w = FFMIN(stride1, xend - x) - x0; ++ memcpy(dst, src + x0, w); ++ dst += w; ++ src += stride1 * stride2; ++ x0 = 0; ++ } ++ return dst; ++} ++ ++static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++// printf("PScan: h/w=%d/%d, off=%d,%d\n", pscan->height, pscan->width, pscan->position[0][0], pscan->position[0][0]); ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3 / 2; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ // Luma is "easy" ++ for (int y = y0; y < height + y0; ++y) { ++ dst = sand_copy_line_y(dst, frame->data[0] + y * frame->linesize[0], x0, width, frame->linesize[0], frame->linesize[3]); ++ } ++ ++ // Chroma is dull ++ dst = cpy_sand_c(dst, frame, x0 & ~1, y0 / 2, width, height / 2); ++ dst = cpy_sand_c(dst, frame, x0 | 1, y0 / 2, width, height / 2); 
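/* NB (annotation, not patch content): the two cpy_sand_c() calls above rely
 * on SAND chroma being stored as interleaved U/V byte pairs (NV12-style)
 * within vertical stripes: frame->linesize[1] is the stripe width in bytes
 * and frame->linesize[3] the stripe height in lines, matching their use in
 * sand_copy_line_u(). Passing an even x (x0 & ~1) therefore gathers the U
 * bytes and an odd x (x0 | 1) the V bytes, each call stepping two bytes at a
 * time to de-interleave one plane of the packed pairs into the planar YUV420
 * packet. The NV12-style pairing is inferred from this even/odd split rather
 * than stated explicitly anywhere in the patch. */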
++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +@@ -58,6 +154,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + if (ret < 0) + return ret; + ++ if (frame->format == AV_PIX_FMT_SAND128) { ++ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ *got_packet = (ret == 0); ++ return ret; ++ } ++ + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) + return ret; + if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, +diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h new file mode 100644 index 0000000..4309f1c --- /dev/null -+++ a/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.h @@ -0,0 +1,3070 @@ +unsigned char rpi_hevc_transform [] = { +21, @@ -10702,11 +13896,11 @@ index 0000000..4309f1c +33, +3, +}; -diff --git b/libavcodec/rpi_hevc_transform.s a/libavcodec/rpi_hevc_transform.s +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 index 0000000..5543093 --- /dev/null -+++ a/libavcodec/rpi_hevc_transform.s ++++ b/libavcodec/rpi_hevc_transform.s @@ -0,0 +1,917 @@ +# ****************************************************************************** +# Argon Design Ltd. @@ -11625,12 +14819,12 @@ index 0000000..5543093 + bgt loop_cmds + + pop r6-r7, pc -diff --git b/libavcodec/rpi_mailbox.c a/libavcodec/rpi_mailbox.c +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..8d8a20d +index 0000000..0255f5d --- /dev/null -+++ a/libavcodec/rpi_mailbox.c -@@ -0,0 +1,118 @@ ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,149 @@ +/* +Copyright (c) 2012, Broadcom Europe Ltd. +All rights reserved. @@ -11658,6 +14852,8 @@ index 0000000..8d8a20d +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + ++#ifdef RPI ++ +#include +#include +#include @@ -11674,6 +14870,7 @@ index 0000000..8d8a20d +#define DEVICE_FILE_NAME "/dev/vcio" + +#include "rpi_mailbox.h" ++//#include + +/* + * use ioctl to send mbox property message @@ -11733,6 +14930,31 @@ index 0000000..8d8a20d + return p[5]; +} + ++#define GET_VCIMAGE_PARAMS 0x30044 ++ ++int mbox_get_image_params(int fd, VC_IMAGE_T * img) ++{ ++ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32]; ++ uint32_t * p = buf; ++ void * rimg; ++ int rv; ++ ++ *p++ = 0; // size ++ *p++ = 0; // process request ++ *p++ = GET_VCIMAGE_PARAMS; ++ *p++ = sizeof(*img); ++ *p++ = sizeof(*img); ++ rimg = p; ++ memcpy(p, img, sizeof(*img)); ++ p += sizeof(*img) / sizeof(*p); ++ *p++ = 0; // End tag ++ buf[0] = (p - buf) * sizeof(*p); ++ ++ rv = mbox_property(fd, buf); ++ memcpy(img, rimg, sizeof(*img)); ++ ++ return rv; ++} + +int mbox_open() { + int file_desc; @@ -11749,28 +14971,79 @@ index 0000000..8d8a20d +void mbox_close(int file_desc) { + close(file_desc); +} -diff --git b/libavcodec/rpi_mailbox.h a/libavcodec/rpi_mailbox.h ++ ++#endif ++ +diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b51303b +index 0000000..b316878 --- /dev/null -+++ a/libavcodec/rpi_mailbox.h -@@ -0,0 +1,10 @@ ++++ b/libavcodec/rpi_mailbox.h +@@ -0,0 +1,58 @@ +#ifndef RPI_MAILBOX_H +#define RPI_MAILBOX_H + ++/* The image structure. 
*/ ++typedef struct vc_image_extra_uv_s { ++ void *u, *v; ++ int vpitch; ++} VC_IMAGE_EXTRA_UV_T; ++ ++typedef union { ++ VC_IMAGE_EXTRA_UV_T uv; ++// VC_IMAGE_EXTRA_RGBA_T rgba; ++// VC_IMAGE_EXTRA_PAL_T pal; ++// VC_IMAGE_EXTRA_TF_T tf; ++// VC_IMAGE_EXTRA_BAYER_T bayer; ++// VC_IMAGE_EXTRA_MSBAYER_T msbayer; ++// VC_IMAGE_EXTRA_CODEC_T codec; ++// VC_IMAGE_EXTRA_OPENGL_T opengl; ++} VC_IMAGE_EXTRA_T; ++ ++ ++typedef struct VC_IMAGE_T { ++ unsigned short type; /* should restrict to 16 bits */ ++ unsigned short info; /* format-specific info; zero for VC02 behaviour */ ++ unsigned short width; /* width in pixels */ ++ unsigned short height; /* height in pixels */ ++ int pitch; /* pitch of image_data array in bytes */ ++ int size; /* number of bytes available in image_data array */ ++ void *image_data; /* pixel data */ ++ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */ ++ void *metadata; /* metadata header for the image */ ++ void *pool_object; /* nonNULL if image was allocated from a vc_pool */ ++ int mem_handle; /* the mem handle for relocatable memory storage */ ++ int metadata_size; /* size of metadata of each channel in bytes */ ++ int channel_offset; /* offset of consecutive channels in bytes */ ++ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */ ++ uint8_t num_channels; /* number of channels (2 for stereo) */ ++ uint8_t current_channel;/* the channel this header is currently pointing to */ ++ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/ ++ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header ++ into a linked-mulitchannel image */ ++ uint8_t channel_index; /* index of the channel this header represents while ++ it is being linked. 
*/ ++ uint8_t _dummy[3]; /* pad struct to 64 bytes */ ++} VC_IMAGE_T; ++ ++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; ++ ++ +extern int mbox_open(void); +extern void mbox_close(int file_desc); + +extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); + ++int mbox_get_image_params(int fd, VC_IMAGE_T * img); ++ +#endif -diff --git b/libavcodec/rpi_qpu.c a/libavcodec/rpi_qpu.c +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..be58458 +index 0000000..36c8ab6 --- /dev/null -+++ a/libavcodec/rpi_qpu.c -@@ -0,0 +1,827 @@ ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,878 @@ +#ifdef RPI +#include +#include @@ -11784,10 +15057,13 @@ index 0000000..be58458 +#include +#include + ++#include ++ +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" +#include "rpi_hevc_transform.h" ++#include "rpi_zc.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -11798,6 +15074,10 @@ index 0000000..be58458 +// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) +#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 + ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 ++ +// QPU "noflush" flags +// a mixture of flushing & profiling + @@ -11807,26 +15087,13 @@ index 0000000..be58458 +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+// On Pi2 there is no way to access the VPU L2 cache -+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) -+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly -+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug. 
-+#define GPU_MEM_FLG 0x4 -+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) -+#define GPU_MEM_MAP 0x0 -+ +#define vcos_verify_ge0(x) ((x)>=0) + -+/*static const unsigned code[] = -+{ -+ #include "rpi_shader.hex" -+};*/ -+ +// Size in 32bit words +#define QPU_CODE_SIZE 2048 +#define VPU_CODE_SIZE 2048 + -+const short rpi_transMatrix2even[32][16] = { // Even rows first ++static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, @@ -11870,6 +15137,17 @@ index 0000000..be58458 + short transMatrix2even[16*16*2]; +}; + ++#define CFE_ENTS_PER_A 8 ++// If we have a sliced frame 2048 wide @ 64 per slice then there are 32 slices ++// in a line & we want to flush luma + chroma + a couple of bits so ents ~ 70 ++// allow 128 ++#define CFE_ENT_COUNT 128 ++#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) ++ ++struct rpi_cache_flush_env_s { ++ unsigned int n; ++ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++}; + +#define WAIT_COUNT_MAX 16 + @@ -11892,7 +15170,6 @@ index 0000000..be58458 +typedef struct vq_wait_s +{ + sem_t sem; -+ unsigned int cost; + struct vq_wait_s * next; +} vq_wait_t; + @@ -11911,7 +15188,6 @@ index 0000000..be58458 + int open_count; + int init_count; + int mb; -+ unsigned int current_load; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -12183,6 +15459,18 @@ index 0000000..be58458 + return gpu->mb; +} + ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ +// ---------------------------------------------------------------------------- +// +// Cache flush functions @@ -12190,10 +15478,11 @@ index 0000000..be58458 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = calloc(1, sizeof(rpi_cache_flush_env_t)); ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); + if (rfe == NULL) + return NULL; + ++ rfe->n = 0; + return rfe; +} + @@ -12205,7 +15494,19 @@ index 0000000..be58458 + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = (rfe->n == 0) ? 
0 : vcsm_clean_invalid(&rfe->a); ++ int rc = 0; ++ unsigned int na; ++ unsigned int nr; ++ ++ // Clear any reamaining ents in the final block ++ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) ++ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ ++ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) ++ { ++ if (vcsm_clean_invalid(rfe->a + na) != 0) ++ rc = -1; ++ } + + free(rfe); + @@ -12218,17 +15519,22 @@ index 0000000..be58458 + +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +{ -+ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); -+ + // Deal with empty pointer trivially + if (gm == NULL || gm->numbytes == 0) + return; + -+ rfe->a.s[rfe->n].cmd = mode; -+ rfe->a.s[rfe->n].handle = gm->vcsm_handle; -+ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm; -+ rfe->a.s[rfe->n].size = gm->numbytes; -+ ++rfe->n; ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm; ++ a->s[n].size = gm->numbytes; ++ ++rfe->n; ++ } +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, @@ -12238,16 +15544,24 @@ index 0000000..be58458 + if (gm == NULL || size == 0) + return; + -+ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); ++// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ + av_assert0(offset <= gm->numbytes); + av_assert0(size <= gm->numbytes); + av_assert0(offset + size <= gm->numbytes); + -+ rfe->a.s[rfe->n].cmd = mode; -+ rfe->a.s[rfe->n].handle = gm->vcsm_handle; -+ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm + offset; -+ rfe->a.s[rfe->n].size = size; -+ ++rfe->n; ++ { ++ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); ++ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ ++ av_assert0(rfe->n < CFE_ENT_COUNT); ++ ++ a->s[n].cmd = mode; ++ a->s[n].handle = gm->vcsm_handle; ++ a->s[n].addr = (unsigned int)gm->arm + offset; ++ a->s[n].size = size; ++ ++rfe->n; ++ } +} + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) @@ -12266,23 +15580,38 @@ index 0000000..be58458 + } +} + -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) +{ -+ const unsigned int y_offset = frame->linesize[0] * start_line; -+ const unsigned int y_size = frame->linesize[0] * n; ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; + // Round UV up/down to get everything + const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - 
uv_offset; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good + // As all unsigned they will also reject -ve + // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + av_assert0(n <= (unsigned int)frame->height); + av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif + -+ if (gpu_is_buf1(frame)) { ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!rpi_sliced_frame(frame)) ++ { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { + rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); @@ -12294,12 +15623,17 @@ index 0000000..be58458 + } + else + { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); -+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); ++ // **** Use x0! ++ for (int x = 0; x < x0 + width; x += frame->linesize[0]) { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, y0), y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, ++ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, y0 >> 1), uv_size); ++ } + } + } +} @@ -12340,13 +15674,11 @@ index 0000000..be58458 + + +// If sem_init actually takes time then maybe we want a pool... 
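/* NB (annotation, not patch content): despite the musing above, the code
 * below does keep a pool: wait objects with pre-initialised semaphores sit
 * on a singly linked free list inside the gpu env, so vq_wait_new() is a pop
 * taken under the gpu lock, and releasing the object after the wait is the
 * corresponding push, keeping sem_init()/sem_destroy() off the per-job path.
 * A minimal free-standing sketch of the same pattern, with hypothetical
 * names (POOL_SIZE, wait_get, wait_put): */
#if 0 /* illustrative only */
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>

#define POOL_SIZE 16

typedef struct wait_s {
    sem_t sem;                 /* initialised once, reused across jobs */
    struct wait_s *next;
} wait_t;

static wait_t pool_mem[POOL_SIZE];
static wait_t *pool_head;
static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;

static void pool_init(void)    /* once, at start-up */
{
    for (int i = 0; i != POOL_SIZE; ++i) {
        sem_init(&pool_mem[i].sem, 0, 0);
        pool_mem[i].next = pool_head;
        pool_head = pool_mem + i;
    }
}

static wait_t *wait_get(void)  /* pop; NULL when exhausted */
{
    pthread_mutex_lock(&pool_lock);
    wait_t *const w = pool_head;
    if (w != NULL)
        pool_head = w->next;
    pthread_mutex_unlock(&pool_lock);
    return w;
}

static void wait_put(wait_t *const w) /* push back after sem_wait/sem_post */
{
    pthread_mutex_lock(&pool_lock);
    w->next = pool_head;
    pool_head = w;
    pthread_mutex_unlock(&pool_lock);
}
#endif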
-+static vq_wait_t * vq_wait_new(const unsigned int cost) ++static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; -+ ge->current_load += cost; -+ wait->cost = cost; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -12402,17 +15734,13 @@ index 0000000..be58458 + +static void vq_wait_post(vq_wait_t * const wait) +{ -+#if !RPI_TRACE_TIME_VPU_QPU_WAIT -+ if (wait->cost != 0) -+#endif ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); -+ ge->current_load -= wait->cost; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_end(&ge->ttw.active, ns_time()); -+#endif + gpu_unlock(); + } ++#endif + + sem_post(&wait->sem); +} @@ -12428,7 +15756,6 @@ index 0000000..be58458 +{ + unsigned int n; + unsigned int mask; -+ unsigned int cost; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; + @@ -12472,16 +15799,19 @@ index 0000000..be58458 +} + +// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) +{ + if (n != 0) { + struct gpu_job_s *const j = new_job(vqj); + vqj->mask |= VPU_QPU_MASK_QPU; -+ vqj->cost += cost; + + j->command = EXECUTE_QPU; + j->u.q.jobs = n; ++#if RPI_TRACE_QPU_PROFILE_ALL ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; ++#else + j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; ++#endif + j->u.q.timeout = 5000; + memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); + } @@ -12503,7 +15833,7 @@ index 0000000..be58458 + } + + // We are going to want a sync object -+ wait = vq_wait_new(vqj->cost); ++ wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs @@ -12525,7 +15855,6 @@ index 0000000..be58458 + j->callback.cookie = wait; + } + -+ vqj->cost = 0; + vqj->mask = 0; + *wait_h = wait; +} @@ -12544,11 +15873,6 @@ index 0000000..be58458 + return rv; +} + -+unsigned int vpu_qpu_current_load(void) -+{ -+ return gpu_ptr()->current_load; -+} -+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) @@ -12598,17 +15922,15 @@ index 0000000..be58458 +} + +#endif // RPI -diff --git b/libavcodec/rpi_qpu.h a/libavcodec/rpi_qpu.h +diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..bcde316 +index 0000000..636e420 --- /dev/null -+++ a/libavcodec/rpi_qpu.h -@@ -0,0 +1,204 @@ ++++ b/libavcodec/rpi_qpu.h +@@ -0,0 +1,201 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + -+#include -+ +#define RPI_ONE_BUF 1 + +typedef struct gpu_mem_ptr_s { @@ -12731,10 +16053,8 @@ index 0000000..bcde316 + +// Cache flush stuff + -+typedef struct rpi_flush_envss { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a; -+} rpi_cache_flush_env_t; ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; + +rpi_cache_flush_env_t * rpi_cache_flush_init(void); +// Free env without flushing @@ -12753,8 +16073,9 @@ index 0000000..bcde316 +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void 
rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + +// init, add, finish for one gm ptr +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); @@ -12763,14 +16084,13 @@ index 0000000..bcde316 +// QPU specific functions +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_UV 12 -+#define QPU_N_Y 12 -+#define QPU_N_MAX 16 ++#define QPU_N_GRP_UV 4 ++#define QPU_N_UV 12 ++#define QPU_N_GRP_Y 4 // 4 QPUs per TMU ++#define QPU_N_Y 12 ++#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 -+#define QPU_MAIL_EL_SIZE (QPU_MAIL_EL_VALS * sizeof(uint32_t)) -+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS) -+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t)) + +struct vpu_qpu_wait_s; +typedef struct vq_wait_s * vpu_qpu_wait_h; @@ -12784,7 +16104,7 @@ index 0000000..bcde316 +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); @@ -12795,7 +16115,6 @@ index 0000000..bcde316 + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+unsigned int vpu_qpu_current_load(void); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + @@ -12806,14 +16125,16 @@ index 0000000..bcde316 +extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); + +extern int gpu_get_mailbox(void); ++void gpu_ref(void); ++void gpu_unref(void); + +#endif -diff --git b/libavcodec/rpi_shader.c a/libavcodec/rpi_shader.c +diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..627cda9 +index 0000000..f2842b6 --- /dev/null -+++ a/libavcodec/rpi_shader.c -@@ -0,0 +1,624 @@ ++++ b/libavcodec/rpi_shader.c +@@ -0,0 +1,734 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -12837,744 +16158,947 @@ index 0000000..627cda9 +__attribute__((aligned(8))) +#endif +unsigned int rpi_shader[] = { -+// ::mc_setup_uv -+/* [0x00000000] */ 0x95801ff6, 0xd002591e, // mov tmurs, 1 ; mov ra_link, unif -+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000010] */ 0x159a7d80, 0x10020827, // mov r0, elem_num -+/* [0x00000018] */ 0x0c027c00, 0x14020427, // add ra_x, ra0.16b, r0 -+/* [0x00000020] */ 0x15027d80, 0x12020767, // mov ra_y, ra0.16a -+/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif -+/* [0x00000030] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* 
[0x00000038] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base -+/* [0x00000040] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1 -+/* [0x00000048] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1 -+/* [0x00000050] */ 0x15827d80, 0x10021427, // mov rb16, unif -+/* [0x00000058] */ 0x0c827380, 0x10021627, // add rb24, r1, unif -+/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000070] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000078] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000080] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000088] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000090] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+/* [0x00000098] */ 0x00000000, 0xe0020327, // mov ra12, 0 -+/* [0x000000a0] */ 0x00000000, 0xe0020367, // mov ra13, 0 -+/* [0x000000a8] */ 0x00000000, 0xe00203a7, // mov ra14, 0 -+/* [0x000000b0] */ 0x00000000, 0xe00203e7, // mov ra15, 0 -+/* [0x000000b8] */ 0x00000000, 0xe0020267, // mov ra9, 0 -+/* [0x000000c0] */ 0x15427d80, 0x10020827, // mov r0, ra_x -+/* [0x000000c8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0 ; mov r1, ra_y -+/* [0x000000d0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base -+/* [0x000000d8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset -+/* [0x000000e0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1 -+/* [0x000000e8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3 -+/* [0x000000f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000000f8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0 -+/* [0x00000100] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000108] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+/* [0x00000110] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2 -+/* [0x00000118] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1 -+/* [0x00000120] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif -+/* [0x00000128] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000130] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000140] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000150] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000158] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000160] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000168] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000170] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000178] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000180] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000188] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000190] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000198] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x000001a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000001a8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x000001b0] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x -+/* [0x000001b8] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* 
[0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x0c9e7000, 0x10021667, // add rb_max_x, r0, r0 ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000040] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000048] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000050] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 ++/* [0x00000058] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 ++/* [0x00000060] */ 0x00000000, 0xe00059ce, // nop ; mov ra14, 0 ++/* [0x00000068] */ 0x8c5103f6, 0x1802560f, // add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 ++/* [0x00000070] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000078] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000080] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000088] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b ++/* [0x00000090] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000a8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000b0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000b8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000000c0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000c8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000d0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000000d8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000e0] */ 0x0c809f80, 0xd0021367, // add rb_wt_den_p15, 9, unif ++/* [0x000000e8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x000000f8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000100] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000108] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000110] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000118] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000120] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000128] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000130] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000138] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000140] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00000148] */ 0x0c027d80, 0x14020827, // add r0, ra0.16b, ra0.16b ++/* [0x00000150] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000158] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000160] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000168] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000170] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000178] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000180] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000188] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000190] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* 
[0x00000198] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x000001a0] */ 0x95442ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :c_preload ++/* [0x000001a8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001b0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001c0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001c8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001d0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001d8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:c_preload ++/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001e8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001f0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000001f8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000200] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000208] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000210] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000218] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000220] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +// ::mc_filter_uv -+/* [0x000001c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000001c8] */ 0x15827d80, 0x100200a7, // mov ra2, unif -+/* [0x000001d0] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif -+/* [0x000001d8] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 -+/* [0x000001e0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000001e8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x000001f0] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x000001f8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000200] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000208] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000210] */ 0x9509cdbf, 0x12024731, // mov ra_y_next, ra2.16a ; mov vw_setup, rb28 -+/* [0x00000218] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000220] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000228] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000230] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3 -+/* [0x00000238] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000240] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x00000248] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:filter_uv_1 -+/* [0x00000250] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000258] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000260] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif -+/* [0x00000268] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x00000270] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x00000278] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+// :filter_uv_1 -+/* [0x00000280] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000288] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x00000290] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x00000298] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c -+/* [0x000002a0] */ 0x950c0ff6, 0xde0248cb, 
// mov r3, 0 ; mov rb11, ra3.8d -+/* [0x000002a8] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x000002b0] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x000002b8] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 ++/* [0x00000228] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000230] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 ++/* [0x00000238] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 ++/* [0x00000240] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00000248] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000250] */ 0x935401f6, 0xd4024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++/* [0x00000258] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000260] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000268] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif ++/* [0x00000270] */ 0x800a7036, 0x122059d3, // nop ; mov ra_y_next, ra2.16a ++/* [0x00000278] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x00000280] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000288] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x00000290] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 ++/* [0x00000298] */ 0x119c73c0, 0xd0020827, // shl r0, r1, 7 ++/* [0x000002a0] */ 0x8d818eb6, 0x10025743, // sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif ++/* [0x000002a8] */ 0x8c8013f6, 0xd0025456, // add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif ++/* [0x000002b0] */ 0x8c8033f6, 0xd002d496, // add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif ++/* [0x000002b8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a ++/* [0x000002c0] */ 0x910d01f6, 0xda024809, // shl r0, r0, i_shift16 ; mov rb9, ra3.8b ++/* [0x000002c8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002d0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000002d8] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c ++/* [0x000002e0] */ 0x950c0ff6, 0xde02494b, // mov r5quad, 0 ; mov rb11, ra3.8d ++/* [0x000002e8] */ 0x8f8013f6, 0xd002531e, // asr rb_wt_off, r1, 1 ; mov ra_link, unif ++/* [0x000002f0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++/* [0x000002f8] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +// :uvloop -+/* [0x000002c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000002c8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000002d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000002d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000002e0] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x000002e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x000002f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x000002f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000300] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000308] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000310] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 
1] -+/* [0x00000318] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000320] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000328] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000330] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000338] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000340] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000348] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000350] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000358] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000360] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000368] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000370] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000378] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x00000380] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000388] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x00000390] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x00000398] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x000003a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003b0] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x000003b8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000003c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x000003c8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003d0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x000003d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000003e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000003e8] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 -+/* [0x000003f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x000003f8] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 -+/* [0x00000400] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x00000408] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000300] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000308] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++/* [0x00000310] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, 8 ; mov.ifnz r3, ra_y ++/* [0x00000318] */ 0x8c6817f6, 0xd0029818, // add r0, r3, 1 ; mov.ifz ra_base, ra_base_next ++/* [0x00000320] */ 0x94981f80, 0xd02279d1, // and.setf -, 1, elem_num ; mov ra_y, r0 ++/* [0x00000328] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000330] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000338] */ 0x559d049f, 0x10044822, // mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch ++/* [0x00000340] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000348] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 ++/* [0x00000350] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000358] */ 0x4003e030, 
0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000360] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000368] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000370] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000378] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 ++/* [0x00000380] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x00000388] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000390] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000398] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x000003a0] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x000003a8] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000003b0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003b8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003c8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 ++/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x000003e0] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:uvloop ++/* [0x000003e8] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb_wt_den_p15 ++/* [0x000003f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000003f8] */ 0x15067d80, 0x18020c27, // mov vpm, ra1.8a ++/* [0x00000400] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000408] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 ++/* [0x00000410] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 ++/* [0x00000418] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +// ::mc_filter_uv_b0 -+/* [0x00000410] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000418] */ 0x15827d80, 0x100200a7, // mov ra2, unif -+/* [0x00000420] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif -+/* [0x00000428] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0 -+/* [0x00000430] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000438] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x00000440] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+/* [0x00000448] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000450] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif -+/* [0x00000458] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000460] */ 0x150a7d80, 0x12020727, // mov ra_y_next, ra2.16a -+/* [0x00000468] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2 -+/* [0x00000470] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000478] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1 -+/* [0x00000480] */ 0x0c043dc0, 0xd20207e7, // add ra31, ra1.16a, 3 -+/* [0x00000488] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000490] */ 0x8c0601bf, 0x14025803, // add r0, r0, ra1.16b ; mov ra3, unif -+/* [0x00000498] */ 0x918101f6, 0xd002480e, // shl r0, r0, i_shift16 ; mov rb14, unif -+/* [0x000004a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x000004a8] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a -+/* [0x000004b0] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b -+/* [0x000004b8] */ 
0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c -+/* [0x000004c0] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d -+/* [0x000004c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000004d0] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0 -+// :uvloop_b0 -+/* [0x000004d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x000004e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x000004e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x000004f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x000004f8] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x00000500] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000508] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000510] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000518] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000520] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x00000528] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000530] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000538] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000540] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000548] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000550] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000558] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000560] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000568] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000570] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000578] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x00000580] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x00000588] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000590] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000598] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005a0] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x000005a8] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x000005b0] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 -+/* [0x000005b8] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 -+/* [0x000005c0] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 -+/* [0x000005c8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005d0] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 -+/* [0x000005d8] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 -+/* [0x000005e0] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 -+/* [0x000005e8] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 -+/* [0x000005f0] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin -+/* [0x000005f8] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif -+/* 
[0x00000600] */ 0x95810ff6, 0xd0020827, // mov r0, i_shift16 ; mov -, unif -+/* [0x00000608] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 -+/* [0x00000610] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 -+/* [0x00000618] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+/* [0x00000620] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+/* [0x00000628] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+/* [0x00000630] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+/* [0x00000638] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 -+/* [0x00000640] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+/* [0x00000648] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+/* [0x00000650] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin -+/* [0x00000658] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+/* [0x00000660] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+/* [0x00000668] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+// :uv_b0_post12 -+/* [0x00000670] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 -+/* [0x00000678] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+/* [0x00000680] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 -+/* [0x00000688] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 -+/* [0x00000690] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+/* [0x00000698] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 -+// ::mc_filter_uv_b -+// :uv_b0_post_fin -+/* [0x000006a0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x000006a8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000006b0] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:uv_filter_b_1 -+/* [0x000006b8] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num -+/* [0x000006c0] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 -+/* [0x000006c8] */ 0x0c027c00, 0x14020827, // add r0, ra0.16b, r0 -+/* [0x000006d0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000006d8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x000006e0] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+// :uv_filter_b_1 -+/* [0x000006e8] */ 0x930001f6, 0xd202581c, // max r0, r0, 0 ; mov ra_y_next, ra0.16a -+/* [0x000006f0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif -+/* [0x000006f8] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 -+/* [0x00000700] */ 0x8c8270f6, 0x10020827, // add r0, r0, r3 ; mov -, unif -+/* [0x00000708] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif -+/* [0x00000710] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000720] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a -+/* [0x00000728] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b -+/* [0x00000730] */ 0x8c0d3eb6, 0x1c02468a, // add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c -+/* [0x00000738] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000740] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13 -+/* [0x00000748] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 ++/* [0x00000420] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov 
ra2, unif ++/* [0x00000428] */ 0x14981dc0, 0xd00229e7, // and.setf -, elem_num, 1 ++/* [0x00000430] */ 0xec0a7d89, 0x14024821, // add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 ++/* [0x00000438] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000440] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x00000448] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++/* [0x00000450] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x00000458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000460] */ 0x9481c1f6, 0xd0025800, // and r0, r0, -4 ; mov ra0, unif ++/* [0x00000468] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 ++/* [0x00000470] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000478] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a ++/* [0x00000480] */ 0x0c9e7600, 0x100206a7, // add ra_base_next, r3, r0 ++/* [0x00000488] */ 0x918073f6, 0xd0025802, // shl r0, r1, 7 ; mov ra2, unif ++/* [0x00000490] */ 0x0d9d8e80, 0x10021767, // sub rb_dma1, rb_dma1_base, r2 ++/* [0x00000498] */ 0x0c9c13c0, 0xd0021467, // add rb_i_tmu, r1, 3 - PREREAD ++/* [0x000004a0] */ 0x0c9c33c0, 0xd00214a7, // add rb_lcount, r1, 3 ++/* [0x000004a8] */ 0x8c8270b6, 0x10125816, // add r0, r0, r2 ; mov ra_wt_mul_l0, unif ++/* [0x000004b0] */ 0x915201bf, 0x1c12d816, // shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif ++/* [0x000004b8] */ 0x8c81b1f6, 0x10025683, // add rb_dma0, r0, rb_dma0_base ; mov ra3, unif ++/* [0x000004c0] */ 0x159defc0, 0x10020267, // mov ra9, rb_max_y ++/* [0x000004c8] */ 0xec0e7d89, 0x14024821, // add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 ++/* [0x000004d0] */ 0x8c0c21f6, 0x12125813, // add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a ++/* [0x000004d8] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif ++/* [0x000004e0] */ 0x935011bf, 0x18024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000004e8] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif ++/* [0x000004f0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000004f8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000500] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif ++/* [0x00000508] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000510] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a ++/* [0x00000518] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000520] */ 0x950e0ff6, 0x1a024489, // mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b ++/* [0x00000528] */ 0x950e0ff6, 0x1c06448a, // mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c ++/* [0x00000530] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif ++/* [0x00000538] */ 0x950c0ff6, 0xde02494b, // mov r5quad,0 ; mov rb11, ra3.8d ++/* [0x00000540] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00000548] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++/* [0x00000550] */ 0x0000ff00, 0xe20210e7, // mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +// :uvloop_b -+/* [0x00000750] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000758] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+/* [0x00000760] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000768] */ 0x95710dbf, 0x10044763, 
// mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000770] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 -+/* [0x00000778] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000780] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000788] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000790] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000798] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2 -+/* [0x000007a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000007a8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x000007b0] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000007b8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000007c0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000007c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007d0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000007d8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000007e0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000007e8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000007f0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000007f8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000800] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x00000808] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000810] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000818] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x00000820] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x00000828] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+/* [0x00000830] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+/* [0x00000838] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 -+/* [0x00000840] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+/* [0x00000848] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 -+/* [0x00000850] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 -+/* [0x00000858] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 -+/* [0x00000860] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 -+/* [0x00000868] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x00000870] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 -+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 -+/* [0x00000880] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a -+/* [0x00000888] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26 -+/* [0x00000890] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000898] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29 -+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+/* [0x000008a8] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+// ::mc_exit_c -+/* [0x000008b0] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000008b8] */ 
0x00000020, 0xf02809e7, // brr.anyz -, r:exit_c_1 ++/* [0x00000558] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000560] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000568] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000570] */ 0x95685ff6, 0x10029118, // mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++/* [0x00000578] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000580] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num ++/* [0x00000588] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000590] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000598] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x000005a0] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 ++/* [0x000005a8] */ 0x95143ff6, 0x100279c4, // mov.setf -, rb3 ; mov ra4, ra5 ++/* [0x000005b0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x000005b8] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000005c0] */ 0x40034031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000005c8] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000005d0] */ 0x40032031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000005d8] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x000005e0] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x000005e8] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x000005f0] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, 8 ; mov r3, ra_y2 ++/* [0x000005f8] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x00000600] */ 0x14981f80, 0xd00229e7, // and.setf -, 1, elem_num ++/* [0x00000608] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000610] */ 0x9227f792, 0xd00288e1, // min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++/* [0x00000618] */ 0x559d049f, 0x10044823, // mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000620] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 ++/* [0x00000628] */ 0x950c3ff6, 0x100269c7, // mov.setf -, rb3 ; mov rb7, ra3 ++/* [0x00000630] */ 0x540563f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0 ++/* [0x00000638] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000640] */ 0x40074031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000648] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000650] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000658] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00000660] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x00000668] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00000670] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00000678] */ 0x4d08443e, 0x180241e0, // sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00000680] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000688] */ 0x4c08723e, 0x1e024860, // 
add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00000690] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000698] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x000006a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000006a8] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x000006b0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x000006b8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x000006c0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000006c8] */ 0x0c9e7280, 0x10020867, // add r1, r1, r2 ++/* [0x000006d0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000006d8] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:uvloop_b ++/* [0x000006e0] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x000006e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000006f0] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a ++/* [0x000006f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000700] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb_dma0 ++/* [0x00000708] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb_dma1 ++/* [0x00000710] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest ++// ::mc_sync_q0 ++/* [0x00000718] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000720] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000728] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000730] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000738] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000740] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000748] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000750] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000758] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000760] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000768] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000770] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000778] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000780] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000788] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x00000790] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000798] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000007a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000007a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000007b0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000007b8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000007c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000007c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000007d0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000007d8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000007e0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000007e8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x000007f0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000007f8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000800] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000808] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000810] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000818] */ 0x00000000, 
0xf0f7c9e7, // bra -, ra_link ++/* [0x00000820] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000828] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000830] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000840] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000850] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000858] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000860] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000868] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000878] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000880] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000888] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000890] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000898] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000008a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000008a8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000008b0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000008b8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000008d8] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000008e0] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x000008e8] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+/* [0x000008f0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x000008c8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000008d0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000008d8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008e0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008e8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000008f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000008f8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000900] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000908] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q9 ++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000920] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000928] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000930] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000938] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000940] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000948] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000950] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000958] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000960] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000968] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000970] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000978] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000980] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000988] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000990] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* 
[0x00000998] */ 0x009e7000, 0x100009e7, // nop +// ::mc_exit -+// :exit_c_1 -+/* [0x000008f8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000900] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000908] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000910] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000918] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000920] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x00000930] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* [0x00000938] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif -+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000958] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000960] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000968] */ 0x0d0c1dc0, 0xd4021667, // sub rb_frame_width_minus_1, ra3.16b, 1 -+/* [0x00000970] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_frame_height_minus_1, ra3.16a, 1 -+/* [0x00000978] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000980] */ 0x15827380, 0x10021627, // or rb24, r1, unif -+/* [0x00000988] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000990] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 -+/* [0x00000998] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x000009a8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x000009b0] */ 0x0c201dc0, 0xd4020767, // add ra_y, ra8.16b, 1 -+/* [0x000009b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x000009c0] */ 0x0c267c00, 0x100208a7, // add r2, ra9, r0 -+/* [0x000009c8] */ 0x13200dc0, 0xd4020867, // max r1, ra8.16b, 0 -+/* [0x000009d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x000009d8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x000009e0] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2 -+/* [0x000009e8] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 -+/* [0x000009f0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x000009f8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1 -+/* [0x00000a00] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000a08] */ 0x0c281dc0, 0xd4120567, // add ra_y2, ra10.16b, 1 -+/* [0x00000a10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3 -+/* [0x00000a18] */ 0x0c2e7c00, 0x100208a7, // add r2, ra11, r0 -+/* [0x00000a20] */ 0x13280dc0, 0xd4020867, // max r1, ra10.16b, 0 -+/* [0x00000a28] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000a30] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000a38] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2 -+/* [0x00000a40] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000a48] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000a50] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000a58] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000a60] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000a68] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000a70] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+/* [0x00000a78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000a80] */ 0x0f9c25c0, 0xd0020867, // asr 
r1, r2, 2 -+/* [0x00000a88] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000a90] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000a98] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000aa0] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000aa8] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000ab0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000ab8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000ac0] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000ac8] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000ad0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0 -+/* [0x00000ad8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000ae0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1 -+/* [0x00000ae8] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch -+/* [0x00000af0] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base -+/* [0x00000af8] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000b00] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1 -+/* [0x00000b08] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000b10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b18] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2 -+// :per_block_setup -+/* [0x00000b20] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000b28] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000b30] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num -+/* [0x00000b38] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next -+/* [0x00000b40] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next -+/* [0x00000b48] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000b50] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b58] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000b60] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000b68] */ 0x15067d80, 0x14020727, // mov ra_y_next, ra1.16b -+/* [0x00000b70] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000b78] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0 -+/* [0x00000b80] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1 -+/* [0x00000b88] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif -+/* [0x00000b98] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3 -+/* [0x00000ba0] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b -+/* [0x00000ba8] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif -+/* [0x00000bb0] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0 -+/* [0x00000bb8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00000bc0] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b -+/* [0x00000bc8] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5 -+/* [0x00000bd0] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7 -+/* [0x00000bd8] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7 -+/* [0x00000be0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b -+/* [0x00000be8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000bf0] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000bf8] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif -+/* [0x00000c00] */ 
0x01040400, 0xe00208a7, // mov r2, 0x01040400 -+/* [0x00000c08] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a -+/* [0x00000c10] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000c18] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000c20] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000c28] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d -+/* [0x00000c30] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c -+/* [0x00000c38] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000c40] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000c48] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000c50] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000c58] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000c60] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000c68] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000c70] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000c78] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000c80] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000c88] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000c90] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000c98] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000ca0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000ca8] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000cb0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000cb8] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d -+/* [0x00000cc0] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c -+/* [0x00000cc8] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif -+/* [0x00000cd0] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b -+/* [0x00000cd8] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c -+/* [0x00000ce0] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 -+/* [0x00000ce8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000cf0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 -+/* [0x00000cf8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 -+/* [0x00000d00] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+// ::mc_filter -+/* [0x00000d08] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 -+// :yloop -+/* [0x00000d10] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000d18] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000d20] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000d28] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000d30] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000d38] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000d40] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000d48] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000d50] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000d58] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000d60] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000d68] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000d70] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, 
rb_k255 -+/* [0x00000d78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000d80] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000d88] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000d90] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000d98] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000da0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000da8] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000db0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000db8] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000dc0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000dc8] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000dd0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000dd8] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000de0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000de8] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000df0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000df8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000e00] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000e08] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000e10] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000e18] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000e20] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000e28] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000e30] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000e38] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000e40] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000e48] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00000e50] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00000e58] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00000e60] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00000e68] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00000e70] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00000e78] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00000e80] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00000e88] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00000e90] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000e98] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000ea0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000ea8] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000eb0] */ 
0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00000eb8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00000ec0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00000ec8] */ 0xfffffc38, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x00000ed0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000ed8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000ee0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_b -+// :yloopb -+/* [0x00000ee8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000ef0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+/* [0x00000ef8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+/* [0x00000f00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000f08] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000f10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0 -+/* [0x00000f18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000f20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000f28] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000f30] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000f38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1 -+/* [0x00000f40] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000f48] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000f50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000f58] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000f60] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000f68] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000f70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000f78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000f80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000f88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000f90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000f98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000fa0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000fa8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000fb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000fb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000fc0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000fc8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000fd0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000fd8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov 
r3, rb31 -+/* [0x00000fe0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000fe8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000ff0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00000ff8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00001000] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001008] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00001010] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00001018] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00001020] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001028] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001030] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001038] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001040] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001048] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001050] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x00001058] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001060] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001068] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x00001070] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 -+/* [0x00001078] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x00001080] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001088] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001090] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001098] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000010a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000010a8] */ 0xfffffa58, 0xf0f809e7, // brr -, r:per_block_setup -+/* [0x000010b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x000010b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000010c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_interrupt_exit12c -+/* [0x000010c8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait -+/* [0x000010d0] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit12_c_1 -+/* [0x000010d8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010e8] */ 0x009e7000, 0x100009e7, // nop -+/* [0x000010f0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16 -+/* [0x000010f8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10 -+/* [0x00001100] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11 -+/* [0x00001108] */ 0x00000000, 0xe0020267, // mov ra9, 0 ++// ::mc_exit_c ++/* [0x000009a0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009a8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000009b0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009b8] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x000009c0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000009c8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000009d0] */ 0x009e7000, 0x100009e7, // nop +// ::mc_interrupt_exit12 -+// :exit12_c_1 -+/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001118] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00001120] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00001128] */ 0x159f2fc0, 
0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001178] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001180] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00001188] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00001190] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00001198] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit1 -+/* [0x000011a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000011b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000011b8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000011c0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000011c8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000011d0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000011d8] */ 0x009e7000, 0x100009e7, // nop ; nop ++// ::mc_interrupt_exit12c ++/* [0x000009d8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009e0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000009e8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000009f0] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 ++/* [0x000009f8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a00] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000a08] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000a10] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000a18] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000a20] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000a28] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000a30] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000a38] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000a40] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000a48] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 ++/* [0x00000a50] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000a58] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000a60] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000a68] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000a70] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000a78] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000a80] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000a88] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000a90] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000a98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000aa0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000aa8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000ab0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000ab8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000ac0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000ac8] */ 
0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ad0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000ad8] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000ae0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000ae8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000af0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000af8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000b00] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000b08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000b10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000b18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000b20] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000b28] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000b30] */ 0x95042ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :y_preload ++/* [0x00000b38] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000b40] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000b48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b50] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000b58] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000b60] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000b68] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:y_preload ++/* [0x00000b70] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000b78] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000b80] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000b88] */ 0x0c809dc0, 0xd0021367, // add rb_wt_den_p15, unif, 9 ++/* [0x00000b90] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000b98] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000ba0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000ba8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000bb0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000bb8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000bc0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000bc8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000bd0] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000bd8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000be0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000be8] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000bf0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000bf8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000c00] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000c08] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup ++/* [0x00000c10] */ 0x935401f6, 0xd4125815, // max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++/* [0x00000c18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000c20] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000c28] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000c30] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00000c38] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000c40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000c48] */ 
0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000c50] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000c58] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000c60] */ 0x930401f6, 0xd2125813, // max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++/* [0x00000c68] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000c70] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000c78] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000c80] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000c88] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000c90] */ 0x8c9dc07f, 0x10024831, // add r0, r0, r1 ; mov vw_setup, rb_vpm_init ++/* [0x00000c98] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ca0] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width ++/* [0x00000ca8] */ 0x8c405df6, 0xd2025460, // add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height ++/* [0x00000cb0] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16 ++/* [0x00000cb8] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x00000cc0] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 ++/* [0x00000cc8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x00000cd0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x00000cd8] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000ce0] */ 0x918101f6, 0xd0045816, // shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000ce8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3 ++/* [0x00000cf0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000cf8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000d00] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000d08] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000d10] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000d18] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000d20] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000d28] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000d30] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000d38] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000d40] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000d48] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000d50] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000d58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000d60] */ 0x90216387, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 ++/* [0x00000d68] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000d70] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000d78] */ 0x90216387, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 ++/* [0x00000d80] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000d88] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000d90] */ 0x90216387, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++/* [0x00000d98] */ 0x954a0dbf, 0x10064597, // mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x00000da0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000da8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000db0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000db8] */ 
0x90216387, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 ++/* [0x00000dc0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x00000dc8] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter ++/* [0x00000dd0] */ 0xfffffe20, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x00000dd8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00000de0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000de8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000df0] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :yloop ++/* [0x00000df8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00000e00] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00000e08] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00000e10] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00000e18] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00000e20] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00000e28] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000e30] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00000e38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00000e40] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00000e48] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00000e50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00000e58] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00000e60] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00000e68] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00000e70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00000e78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000e80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00000e88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00000e90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00000e98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000ea0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000ea8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00000eb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00000eb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00000ec0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000ec8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00000ed0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ 
"mul_used", 0 ++/* [0x00000ed8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8 ++/* [0x00000ee0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00000ee8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000ef0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00000ef8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00000f00] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1 ++/* [0x00000f08] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00000f10] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00000f18] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00000f20] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00000f28] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00000f30] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00000f38] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00000f40] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00000f48] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000f50] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00000f58] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00000f60] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00000f68] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x00000f70] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height ++/* [0x00000f78] */ 0xfffffe60, 0xf06809e7, // brr.anyn -, r:yloop ++/* [0x00000f80] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00000f88] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00000f90] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00000f98] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00000fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000fa8] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x00000fb0] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x00000fb8] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x00000fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000fc8] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x00000fd0] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00000fd8] */ 0xfffffe00, 0xf0f809e7, // brr -, r:yloop ++/* [0x00000fe0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00000fe8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00000ff0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_b ++/* [0x00000ff8] */ 0xfffffbf8, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x00001000] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001008] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001010] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :yloopb ++/* [0x00001018] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001020] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001028] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001030] */ 
0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001038] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001040] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001048] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001050] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001058] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001060] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001068] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001070] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x00001078] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++/* [0x00001080] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001088] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00001090] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001098] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000010a0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000010a8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010b0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010b8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000010e0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000010e8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000010f0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000010f8] */ 0x8d208bf6, 0xd00269e1, // sub.setf -, r5, 8 ; mov r1, ra8 ++/* [0x00001100] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 ++/* [0x00001108] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x00001110] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 ++/* [0x00001118] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001120] */ 0x8d9e74c9, 0x100242cb, // sub ra11, r2, r3 ; mov rb11, r1 ++/* [0x00001128] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a ++/* [0x00001130] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b ++/* [0x00001138] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001140] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001148] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001150] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001158] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001160] */ 
0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001168] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001170] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001178] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001180] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001188] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00001190] */ 0x0c9e7200, 0x10020867, // add r1, r1, r0 ++/* [0x00001198] */ 0x914083f6, 0xd2024860, // shl r1, r1, 8 ; mov r0, ra_height ++/* [0x000011a0] */ 0xfffffe58, 0xf06809e7, // brr.anyn -, r:yloopb ++/* [0x000011a8] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x000011b0] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x000011b8] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x000011c0] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x000011c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011d0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x000011d8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x000011e0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x000011e8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000011f0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x000011f8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00001200] */ 0xfffffdf8, 0xf0f809e7, // brr -, r:yloopb ++/* [0x00001208] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001210] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001218] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x00001220] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001228] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00001230] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00001238] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001240] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001248] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001250] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00001258] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00001260] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00001268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001270] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00001278] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001280] */ 0x0d418f80, 0x14021767, // sub rb_dma1, rb_dma1_base, ra_width ++/* [0x00001288] */ 0x8d402df6, 0xd2025460, // sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height ++/* [0x00001290] */ 0x12527180, 0x1c020827, // min r0, r0, ra_k16 ++/* [0x00001298] */ 0x8c8001f6, 0xd0025496, // add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif ++/* [0x000012a0] */ 0x918071f6, 0xd0024817, // shl r0, r0, 7 ; mov rb_dest, unif ++/* [0x000012a8] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width ++/* [0x000012b0] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 ++/* [0x000012b8] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000012c0] */ 
0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000012c8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :yloop_p00 ++/* [0x000012d0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000012d8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000012e0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000012e8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000012f0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000012f8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001300] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001308] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001310] */ 0x9140f3f6, 0xd2024860, // shl r1, r1, 15 ; mov r0, ra_height ++/* [0x00001318] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb_wt_off ++/* [0x00001320] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:yloop_p00 ++/* [0x00001328] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00001330] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00001338] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00001340] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00001348] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001350] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x00001358] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x00001360] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x00001368] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001370] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x00001378] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x00001380] */ 0xffffff30, 0xf0f809e7, // brr -, r:yloop_p00 ++/* [0x00001388] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x00001390] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x00001398] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x000013a0] */ 0xfffff850, 0xf0f807a7, // brr ra_link, r:per_block_setup ++/* [0x000013a8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000013b0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++/* [0x000013b8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000013c0] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x000013c8] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x000013d0] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x000013d8] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x000013e0] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x000013e8] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :yloop_b00 ++/* [0x000013f0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x000013f8] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001400] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001408] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001410] 
*/ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001418] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001420] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001428] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001430] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001438] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001440] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 ++/* [0x00001448] */ 0x545963c6, 0x12024860, // and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00001450] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001458] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00001460] */ 0x119ce3c0, 0xd0020867, // shl r1, r1, 14 ++/* [0x00001468] */ 0x8c40c3f6, 0x12024860, // add r1, r1, rb_wt_off ; mov r0, ra_height ++/* [0x00001470] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:yloop_b00 ++/* [0x00001478] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb_wt_den_p15 ++/* [0x00001480] */ 0x95532dbf, 0x1c020867, // mov r1, ra_k16 ; mov -, vw_wait ++/* [0x00001488] */ 0x8d0e7076, 0x18024830, // sub r0, r0, r1 ; mov vpm, ra3.8a ++/* [0x00001490] */ 0x939c01c0, 0xd01279d0, // max.setf -, r0, 0 ; mov ra_height, r0 ++/* [0x00001498] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000014a0] */ 0x929da07f, 0x10024831, // min r0, r0, r1 ; mov vw_setup, rb_dma0 ++/* [0x000014a8] */ 0x8d9dd07f, 0x100248b1, // sub r2, r0, r1 ; mov vw_setup, rb_dma1 ++/* [0x000014b0] */ 0x809d703f, 0x100049f2, // nop ; mov vw_addr, rb_dest ++/* [0x000014b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000014c0] */ 0x119d75c0, 0xd0020827, // shl r0, r2, i_shift23 ++/* [0x000014c8] */ 0x0c9dae00, 0x100216a7, // add rb_dma0, rb_dma0, r0 ++/* [0x000014d0] */ 0xffffff00, 0xf0f809e7, // brr -, r:yloop_b00 ++/* [0x000014d8] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch ++/* [0x000014e0] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 ++/* [0x000014e8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ +#pragma Align_to(8, rpi_shader) +#endif -diff --git b/libavcodec/rpi_shader.h a/libavcodec/rpi_shader.h +diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..3b1229e +index 0000000..a44bce9 --- /dev/null -+++ a/libavcodec/rpi_shader.h -@@ -0,0 +1,20 @@ ++++ b/libavcodec/rpi_shader.h +@@ -0,0 +1,35 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + +extern unsigned int rpi_shader[]; + -+#define mc_setup_uv (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 112) -+#define mc_filter_uv_b0 (rpi_shader + 260) -+#define mc_filter_uv_b (rpi_shader + 424) -+#define mc_exit_c (rpi_shader + 556) -+#define mc_exit (rpi_shader + 574) -+#define mc_setup (rpi_shader + 590) -+#define mc_filter (rpi_shader + 834) -+#define mc_filter_b (rpi_shader + 954) -+#define mc_interrupt_exit12c (rpi_shader + 1074) -+#define mc_interrupt_exit12 (rpi_shader + 1092) -+#define mc_exit1 (rpi_shader + 1128) -+#define mc_end (rpi_shader + 1144) ++#define mc_setup_c_q0 (rpi_shader + 0) ++#define mc_start (rpi_shader + 0) ++#define mc_setup_c_qn (rpi_shader + 2) ++#define mc_filter_uv (rpi_shader + 138) ++#define mc_filter_uv_b0 (rpi_shader + 264) ++#define mc_sync_q0 (rpi_shader + 454) ++#define mc_sync_q1 (rpi_shader + 472) ++#define mc_sync_q2 (rpi_shader 
+ 484)
++#define mc_sync_q3 (rpi_shader + 496)
++#define mc_sync_q4 (rpi_shader + 508)
++#define mc_sync_q5 (rpi_shader + 526)
++#define mc_sync_q6 (rpi_shader + 538)
++#define mc_sync_q7 (rpi_shader + 550)
++#define mc_sync_q8 (rpi_shader + 562)
++#define mc_sync_q9 (rpi_shader + 580)
++#define mc_sync_q10 (rpi_shader + 592)
++#define mc_sync_q11 (rpi_shader + 604)
++#define mc_exit (rpi_shader + 616)
++#define mc_exit_c (rpi_shader + 616)
++#define mc_interrupt_exit12 (rpi_shader + 630)
++#define mc_interrupt_exit12c (rpi_shader + 630)
++#define mc_setup_y_q0 (rpi_shader + 646)
++#define mc_setup_y_qn (rpi_shader + 648)
++#define mc_filter (rpi_shader + 884)
++#define mc_filter_b (rpi_shader + 1022)
++#define mc_filter_y_p00 (rpi_shader + 1160)
++#define mc_filter_y_b00 (rpi_shader + 1256)
++#define mc_end (rpi_shader + 1340)
+
+#endif
-diff --git b/libavcodec/rpi_shader.qasm a/libavcodec/rpi_shader.qasm
+diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
-index 0000000..6fd6af5
+index 0000000..58fd911
--- /dev/null
-+++ a/libavcodec/rpi_shader.qasm
++++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1150 @@
+@@ -0,0 +1,1349 @@
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems ra regs can only be routated through their
++# rotated through all 16 elems; ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here as we do not want the
+# constants from the other half of the calc.
+
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
++
++.set PREREAD, 2
++
++
+# register allocation
+#
-+# ra0...ra7 eight horizontal filter coefficients
-+#
-+# rb0 rx_shift2
-+# rb1 rb_y2_next
-+#
-+# rb4...rb7
-+#
-+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent)
-+#
-+# (ra15 isn't clamped to zero - this happens during the
-+# copy to ra14, and during its use in the vertical filter)
-+#
-+# rb8...rb11 eight vertical filter coefficients
+
-+# ra4 y: Fiter, UV: part -of b0 -> b stash
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++ ++# ra4-7 ++# C: L0 H filter out FIFO ++# otherwise -- free -- ++ ++# ra8-11 ++# temp in some places - check usage ++# Y: (with rb8-11) horiz out FIFO ++ ++# ra12-15 ++# -- free -- ++ ++# uniform: width:height ++.set ra_width_height, ra16 ++.set ra_width, ra16.16b ++.set ra_height, ra16.16a ++ ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b ++ ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff100100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k16, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# -- free -- ra23 ++ ++# Loop: src frame base (L0) ++.set ra_base, ra24 ++ ++# Loop: src frame base (L1) ++.set ra_base2, ra25 ++ ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# -- free -- ra27 ++# -- free -- ra28 ++# -- free -- ra29 + -+# rb12 offset to add before shift (round + weighting offsets) -+# rb13 shift: denom + 6 + 9 -+# rb14 L0 weight (U on left, V on right) -+# rb15 -- free -- -+# -+# ra16 clipped(row start address+elem_num)&~3 -+# ra17 per-channel shifts -+# ra18 L1 weight (Y) -+# ra19 next ra17 -+# -+# rb16 pitch -+# rb17 height + 1 -+# rb18 height + 3 -+# rb19 next ra16 -+# -+# ra20 1 -+# ra21 ra_21 -+# ra22 ra_k256 256 -+# ra23 ra_y2_next ra_y2_next -+# -+# rb20 -- free -- -+# rb21 -- free -- -+# rb22 rb_k255 255 -+# rb23 -- free -- -+# -+# rb24 vdw_setup_1(dst_pitch) -+# rb25 frame width-1 -+# rb26 height<<23 + width<<16 + vdw_setup_0 -+# rb27 vdw_setup_0 (depends on QPU number) -+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM -+# rb29 vdw_setup_1(dst_pitch-width) -+# rb30 frame height-1 -+# rb31 used as temp to count loop iterations -+# -+# ra24 clipped(row start address+8+elem_num)&~3 -+# ra25 per-channel shifts 2 -+# ra26 next ra24 -+# ra27 next ra25 -+# ra28 next y -+# ra29 y for next texture access -+# +# Use an even numbered register as a link register to avoid corrupting flags -+# ra30 next kernel address -+# ra31 chroma-B height+3; free otherwise -+ -+.set rb_frame_width_minus_1, rb25 -+.set rb_frame_height_minus_1, rb30 -+.set rb_pitch, rb16 -+.set ra_x, ra16 -+.set ra_y2, ra21.16a -+.set ra_y2_next, ra21.16b -+ -+.set rb_x_next, rb19 -+.set rx_frame_base2_next, rb19 -+ -+.set ra_frame_base, ra24 -+.set ra_frame_base_next, ra26 -+.set ra_xshift, ra17 -+ -+.set ra_u2v_ref_offset, ra25 -+.set ra_frame_base2, ra25 -+ -+.set ra_xshift_next, ra19 -+.set rx_xshift2, rb0 -+.set rx_xshift2_next, rb1 -+ -+.set ra_u2v_dst_offset, ra27 -+ -+.set ra_y_next, ra28 -+.set ra_y, ra29 -+ -+.set ra_k1, ra20 -+.set rb_k255, rb22 -+.set ra_k256, ra22 -+ +.set ra_link, ra30 + ++# -- free -- ra31 ++ ++.set rb_xshift2, rb0 ++.set rb_xshift2_next, rb1 ++ ++# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 ++ ++# rb3 ++# C: Temp (U/V flag) ++# Y: free ++ ++# rb4-7 ++# C-B: L1 H filter out FIFO ++# Y: (with ra2.8x) Y vertical filter coeffs ++ ++# rb8-11 ++# C: Vertical filter coeffs ++# Y: (with ra8-11) horiz out FIFO ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# Setup: denom + 6 + 9 ++.set rb_wt_den_p15, rb13 ++ ++# -- free -- rb14 ++# -- free -- rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice ++.set rb_xpitch, rb20 ++ ++# -- free -- rb21 ++ ++# Setup: 255 ++.set rb_k255, rb22 ++ ++# Loop: destination address ++.set rb_dest, rb23 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb24 ++ ++# Setup: pic width - 1 ++# In the case of chroma it is in bytes so 2 * (pic_width_c - 1) ++.set rb_max_x, rb25 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set rb_dma0, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# -- free -- rb31 ++ ++ ++ ++ +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. +.set i_shift16, -16 +.set i_shift21, -11 ++.set i_shift23, -9 +.set i_shift30, -2 + +# Much of the setup code is common between Y & C @@ -13596,714 +17120,719 @@ index 0000000..6fd6af5 + add r_dma, r0, r1 # DMA out +.endm + ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start + +################################################################################ -+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_uv -+ mov tmurs, 1 ; mov ra_link, unif # No swap TMUs ; Next fn ++# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ mov tmurs, 1 # No swap TMUs + +# Load first request location -+mov ra0, unif -+mov r0, elem_num ++ mov ra0, unif # next_x_y + -+add ra_x, ra0.16b, r0 # Store x -+mov ra_y, ra0.16a # Store y -+mov ra_frame_base, unif # Store frame u base -+mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_frame_base -+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame ++ mov ra_base, unif # Store frame c base + +# Read image dimensions -+sub rb25,unif,1 -+sub rb30,unif,1 -+ -+# get source pitch -+mov rb16, unif -+ -+# get destination vdw setup -+add rb24, r1, unif # dst_stride ++ sub r0, unif, 1 # pic c width ++ add rb_max_x, r0, r0 ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_k1, 1 -+ mov ra_k256, 256 ++ mov ra_kff100100, 0xff100100 + mov rb_k255, 255 + -+# touch registers to keep simulator happy + ++ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base ++ ++# touch registers to keep simulator happy ++# ; ra12..15: vertical scroll registers ++# get source pitch ++ mov rb_xpitch, unif ; mov ra12, 0 # stride2 ++ mov rb_pitch, unif ; mov 
ra13, 0 # stride1 ++ nop ; mov ra14, 0 ++# get destination vdw setup ++ add rb_dma1_base, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++ add rb_elem_x, r0, elem_num ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ add r0, ra0.16b, ra0.16b # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++ shl ra_xshift_next, r0, 3 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -4 ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ++ add ra_base, ra_base, r0 ++ ++ add rb_wt_den_p15, 9, unif # denominator ++ ++# Compute part of VPM to use for DMA output ++ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ ++# And again for L1, but only worrying about frame2 stuff ++ ++# Load first request location ++ mov ra0, unif # next_x_y ++ ++ mov ra_base2, unif # [ra0 delay] Store frame c base ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# ra_base2 ends up with t1s base ++ ++ add r0, ra0.16b, ra0.16b # Load x ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++# Get shift ++ shl rb_xshift2_next, r0, 3 ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++ and r0, r0, -4 ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add ra_base2, ra_base2, r0 ++ ++# Do preloads ++# r0 = ra_y, r2 = ra_y2 ++ mov r3, PREREAD ; mov r0, ra_y ++ ++:c_preload ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:c_preload ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz c_preload ++ ++ mov ra_link, unif # link ++# touch registers to keep simulator happy + # ra/b4..7: B0 -> B stash registers + mov ra4, 0 ; mov rb4, 0 ++ bra -, ra_link + mov ra5, 0 ; mov rb5, 0 + mov ra6, 0 ; mov rb6, 0 + mov ra7, 0 ; mov rb7, 0 -+ -+ # ra12..15: vertical scroll registers -+ mov ra12, 0 -+ mov ra13, 0 -+ mov ra14, 0 -+ mov ra15, 0 -+ -+ # ra9 - delayed setup - must be 0 initially -+ mov ra9, 0 -+ -+# Compute base address for first and second access -+mov r0, ra_x # Load x -+max r0, r0, 0 ; mov r1, ra_y # Load y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base -+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset -+add ra_y, r1, 1 -+add r0, r0, r3 -+and r0, r0, ~3 -+max r1, r1, 0 ; mov ra_x, r0 # y -+min r1, r1, rb_frame_height_minus_1 -+# submit texture requests for first line -+add r2, r2, r0 ; mul24 r1, r1, rb_pitch -+add t0s, r0, r1 ; mov ra_frame_base, r2 -+add t1s, r2, r1 -+ -+add rb13, 9, unif # denominator -+mov -, unif # Unused -+ -+mov -, unif # ??? 
same as (register) qpu_num -+ -+# Compute part of VPM to use for DMA output -+m_calc_dma_regs rb28, rb27 -+ -+# submit texture requests for second line -+max r1, ra_y, 0 -+min r1, r1, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 -+bra -, ra_link -+nop ; mul24 r1, r1, rb_pitch -+add t0s, r1, ra_x -+add t1s, r1, ra_frame_base -+ -+ ++# >>> ra_link + +################################################################################ + -+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv -+mov ra_link, unif -+ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+mov ra2, unif # x_y -+mov r0, elem_num ; mov r3, unif # frame_base ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+add r0, ra2.16b, r0 # x -+max r0, r0, 0 -+min r0, r0, rb_frame_width_minus_1 -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, ra2.16a ; mov vw_setup, rb28 ++ and.setf -, elem_num, 1 # [ra2 delay] + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ add r0, r0, rb_elem_x ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ ++ shl ra_xshift_next, r0, 3 ++ ++ and r0, r0, -4 ; mov ra0, unif # H filter coeffs ++ nop ; mov ra_y_next, ra2.16a ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
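++# (A rough note on the sand address split done here and in the xor/add just
++# below, following the rb_pitch/rb_xpitch register notes above: r1 keeps the
++# x & ~(rb_pitch-1) stripe bits, the xor leaves x % rb_pitch within the
++# stripe, and r1 * rb_xpitch turns the stripe bits into the byte offset of
++# that slice - "(x&mask)*xpitch" as described for rb_xpitch.)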
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ++ shl r0, r1, 7 + +# set up VPM write -+# get width,height of block + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add rb18, ra1.16a, 3 -+shl r0, ra1.16a, 7 ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs ++ add rb_i_tmu, r1, 3 - PREREAD ; mov ra_wt_off_mul_l0, unif # ; U offset/weight ++ add rb_lcount, r1, 3 ; mov.ifnz ra_wt_off_mul_l0, unif # ; V offset/weight + -+ mov.setf -, ra9 ; mov -, vw_wait -+ brr.anyz -, r:filter_uv_1 ++# ; unpack filter coefficients + -+add r0, r0, ra1.16b # Combine width and height of destination area -+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs -+# >>> (skip V DMA if never requested) ++ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, i_shift16 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+:filter_uv_1 ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ shl r1, r1, rb_wt_den_p15 ; mov rb10, ra3.8c ++ mov r5quad, 0 ; mov rb11, ra3.8d # Loop count (r5rep is B, r5quad is A) + -+# unpack filter coefficients ++ asr rb_wt_off, r1, 1 ; mov ra_link, unif # Link ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 # weight*2 + -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight -+nop ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop count ++# ra9 alias for rb_max_y ++# ra_wt_mul_l0 - weight L0 * 2 ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_off = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 -+shl rb14, ra1.16a, 1 # b14 = weight*2 -+ -+# rb14 - weight L0 * 2 -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+ -+# r2 is elem_num +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# r3 = 0 ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
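++#
++# A rough sketch of the per-pel arithmetic that falls out of the weight
++# constants above (assuming 8-bit samples; pel, v_filt, wt and offset are
++# just shorthand for the values held in the loop registers). The mul24 by
++# ra_k256 followed by asr 14 in the loop below is a net >>6, dropping the
++# 6-bit filter gain:
++#   pel = v_filt >> 6
++#   vpm = sat8((((pel * wt*2) << 8) + rb_wt_off) >> rb_wt_den_p15)
++#       ~= ((pel * wt) >> (denom + 6)) + offset
++# i.e. roughly the standard HEVC weighted prediction with log2WD = denom + 6.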
++ ++ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] ++ ++# r5 = 0 (loop counter) +:uvloop +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, rb_xshift2 ; mov.ifz r3, ra_y_next ++ shr r1, r2, 8 ; mov.ifnz r3, ra_y ++ add r0, r3, 1 ; mov.ifz ra_base, ra_base_next + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+add t1s, ra_frame_base, r2 ++ and.setf -, 1, elem_num ; mov ra_y, r0 ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 + -+# generate seven shifted versions -+# interleave with scroll of vertical context ++ mov.ifz r0, r2 ; mul24 r2, r3, rb_pitch ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++# ra4 not really needed; this could be a mul24 rather than a mov but current ++# register usage means this wouldn't help ++ mov.setf -, rb3 ; mov ra4, ra5 + +# apply horizontal filter -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are vaild for all QPUs ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d , r1 ++ brr.anyn -, r:uvloop ++ add r2, r2, r3 ; mov ra5, ra6 ++# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +# >>> .anyn uvloop + -+# apply vertical filter and write to VPM ++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ shl r1, r1, 8 + -+sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+add r1, r1, r0 ; mul24 r0, ra15, rb11 -+sub r1, r1, r0 ; mov -, vw_wait -+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+asr r1, r1, 14 -+nop ; mul24 r1, r1, rb14 -+shl r1, r1, 8 -+ -+add r1, r1, rb12 -+brr.anyn -, r:uvloop -+asr 
r1, r1, rb13 -+min r1, r1, rb_k255 # Delay 2 -+max vpm, r1, 0 # Delay 3 -+# >>> ++ add r1, r1, rb_wt_off ++ brr.anyn -, r:uvloop ++ asr ra1.8as, r1, rb_wt_den_p15 ++ mov -, vw_wait ++ mov vpm, ra1.8a ++# >>> .anyn uvloop + +# DMA out for U & stash for V -+ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 + bra -, ra_link -+ mov vw_setup, rb29 ; mov ra10, rb29 # Stride -+ mov vw_addr, unif # u_dst_addr -+ mov ra11, unif # v_dst_addr -+# >>> ++ mov vw_setup, rb_dma0 ++ mov vw_setup, rb_dma1 ++ mov vw_addr, rb_dest # u_dst_addr ++# >>> ra_link + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_u_base, frame_v_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block +::mc_filter_uv_b0 -+mov -, unif # Ignore chain address - always "b" -+ +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+mov ra2, unif # x_y -+mov r0, elem_num ; mov r3, unif # frame_base ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+add r0, ra2.16b, r0 # x -+max r0, r0, 0 -+min r0, r0, rb_frame_width_minus_1 -+# compute offset from frame base u to frame base v -+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next -+shl ra_xshift_next, r0, 3 -+add r0, r0, r3 ; mov ra1, unif # ; width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs -+mov ra_y_next, ra2.16a ++ and.setf -, elem_num, 1 # Also acts as delay slot for ra2 + -+add ra_frame_base_next, rb_x_next, r2 ++ add r0, ra2.16b, ra2.16b ; v8subs r1, r1, r1 # x ; r1=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height + -+# Need to have unsigned coeffs to so we can just unpack in the filter -+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the -+# filter code. 
Unpack into b regs for V ++ shl ra_xshift_next, r0, 3 + -+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+add rb17, ra1.16a, 1 -+add ra31, ra1.16a, 3 -+shl r0, ra1.16a, 7 -+add r0, r0, ra1.16b ; mov ra3, unif # Combine width and height of destination area ; V filter coeffs -+shl r0, r0, i_shift16 ; mov rb14, unif # U weight L0 -+add rb26, r0, rb27 -+ -+mov rb8, ra3.8a -+mov rb9, ra3.8b -+mov rb10, ra3.8c -+mov rb11, ra3.8d -+ -+# r2 is elem_num -+# r3 is loop counter -+ -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter -+ -+# rb14 unused in b0 but will hang around till the second pass -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# r3 = 0 -+:uvloop_b0 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+ max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+ add t1s, ra_frame_base, r2 -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b0 -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b0 -+ -+# apply vertical filter and write to B-FIFO -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap -+ sub r1, r1, r0 ; mov ra7, rb6 -+ -+# FIFO goes: -+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b -+# This arrangement optimizes the inner loop FIFOs at the expense of making the -+# bulk shift between loops quite a bit nastier -+# a8 used as temp -+ -+ sub.setf -, r3, ra31 -+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad -+ brr.anyn -, r:uvloop_b0 -+ mov ra5, rb4 ; mov rb4, ra4 -+ mov ra4, rb5 ; mov rb5, ra6 -+ mov ra6, rb7 ; mov rb7, ra8 -+# >>> -+ -+# 1st half done all results now in the a/b4..7 fifo -+ -+# Need to bulk rotate FIFO for heights other than 16 -+# plausible heights are 16, 12, 8, 6, 4, 3, 2 and that is all we deal with -+# we are allowed 3/4 cb_size w/h :-( -+ -+# Destination uniforms discarded -+# At the end drop through to _b - we will always do b after b0 -+ -+ sub.setf -, 15, r3 # 12 + 3 of preroll -+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) -+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr -+ mov r0, i_shift16 ; mov -, unif # ; Discard v_dst_addr -+ mov r1, 0x10000 -+# >>> -+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially -+# If h != 16 && h != 12 then h <= 8 so -+# shift 8 with discard (.16b = .16a on all regs) -+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+# >>> -+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+ -+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N -+# Shift 4 -+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+ # If we shifted by 4 here then the max length remaining is 4 -+ # so that is it -+ -+ brr -, r:uv_b0_post_fin -+# Shift 2 -+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+ # 6 / 2 so need 6 outputs -+# >>> -+ -+:uv_b0_post12 -+# this one is annoying as we need to swap halves of things that don't -+# really want to be swapped -+ -+# b7a, a6a, b5a, a4a -+# b4a, a5a, b6a, a7a -+# b7b, a6b, b5b, a4b -+# b4b, a5b, b6b, a7b -+ -+ mov r2, ra4 ; mov r3, rb5 -+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+ mov ra7, r2 ; mov rb6, r3 -+ -+ mov r2, ra6 ; mov r3, rb7 -+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+ mov ra5, r2 ; mov rb4, r3 -+ -+:uv_b0_post_fin -+ # drop through -+ -+################################################################################ -+ -+::mc_filter_uv_b -+ -+ mov ra_link, unif -+ mov.setf -, ra9 ; mov -, vw_wait # Delayed V DMA -+ brr.anyz -, r:uv_filter_b_1 -+ -+ mov ra0, unif ; mov r0, elem_num -+ -+# per-channel shifts were calculated on the *previous* invocation ++ and r0, r0, -4 ; mov ra0, unif # L0 H filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ++ shl r0, r1, 7 ; mov ra2, unif # ; L0 V filter coeffs + +# set up VPM write -+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28 + -+# get base addresses and per-channel shifts for *next* invocation -+add r0, ra0.16b, r0 # x -+# >>> -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+:uv_filter_b_1 ++ sub rb_dma1, rb_dma1_base, r2 # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r1, 3 - PREREAD ++ add rb_lcount, r1, 3 + -+max r0, r0, 0 ; mov ra_y_next, ra0.16a # y -+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base -+# compute 
offset from frame base u to frame base v -+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base -+add r0, r0, r3 ; mov -, unif # discard width_height -+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs ++ add r0, r0, r2 ; mov ra_wt_mul_l0, unif # ; U weight ++ shl r0, r0, ra_k16 ; mov.ifnz ra_wt_mul_l0, unif # Shift into bits 16 upwards of the vdw_setup0 register ; V weight ++ add rb_dma0, r0, rb_dma0_base ; mov ra3, unif # ; x2_y2 + -+# rb17, rb26, rb29, ra31 inherited from B0 as w/h must be the same ++# L1 - uniform layout could possibly be optimized + -+mov ra3, unif # V filter coeffs ++ mov ra9, rb_max_y # [ra3 delay] + -+# get filter coefficients ++ add r0, ra3.16b, ra3.16b ; v8subs r1, r1, r1 # r0=x*2 ; r1=0 ++ add r0, r0, rb_elem_x ; mov ra_y2_next, ra3.16a ++ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base ++ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ min r0, r0, rb_max_x ; mov ra1, unif # H filter coeffs + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ shl rb_xshift2_next, r0, 3 + -+# Get offset & weight stuff ++ and r0, r0, -4 ++ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add rb_base2_next, r3, r0 + -+# The unif read occurs unconditionally, only the write is conditional -+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ; -+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ; -+add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c -+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ; ++ mov ra_wt_off_mul_l1, unif ; mov rb9, ra3.8b # U offset/weight ++ mov.ifnz ra_wt_off_mul_l1, unif ; mov rb10, ra3.8c # V offset/weight + -+shl r1, ra1.16b, rb13 -+asr rb12, r1, 1 ++ mov rb_dest, unif # dst_addr ++ mov r5quad,0 ; mov rb11, ra3.8d ++ shl r1, ra_wt_off_l1, rb_wt_den_p15 ++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + -+# ra1.16a used directly in the loop ++# r5 loop counter ++# ra0 H coeffs L0 ++# ra1 H coeffs L1 ++# ra2 V coeffs L0 ++# ra3 temp ++# ra4-7 L0 H FIFO ++# rb4-7 L1 H FIFO ++# rb8-rb11 V coeffs L1 ++# ra9 rb_max_y alias + -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ mov rb3, [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] + -+# r3 = 0 +:uvloop_b +# retrieve texture results and pick out bytes +# then submit two more texture requests ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++ shr r1, r2, 8 ; mov.ifz ra_y_y2, ra_y_y2_next ++ mov rb4, rb5 ; mov.ifz ra_base, ra_base_next ++ add ra_y, 1, ra_y ; mov r3, ra_y + -+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment -+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1 -+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ and.setf -, 1, elem_num ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 + -+max r2, ra_y, 0 # y -+min r2, r2, rb_frame_height_minus_1 -+add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+add t0s, ra_x, r2 ; v8min r1, r1, rb_k255 -+add t1s, ra_frame_base, r2 ++ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of 
vertical context ++# L0 H-filter ++# H FIFO scrolls are spread all over this loop ++ mov.setf -, rb3 ; mov ra4, ra5 + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 + -+nop ; mul24 r3, ra0.8a, r0 -+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+sub r0, r2, r3 ; mov r3, rb31 -+sub.setf -, r3, 4 ; mov ra12, ra13 -+brr.anyn -, r:uvloop_b -+mov ra13, ra14 ; mul24 r1, ra14, rb9 -+mov ra14, ra15 ; mul24 r2, ra15, rb10 -+mov ra15, r0 ; mul24 r0, ra12, rb8 ++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++ shr r1, r2, 8 ; mov r3, ra_y2 ++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++ ++ and.setf -, 1, elem_num ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifz r1, r2 << 1 ++ ++ mov.ifz r0, r2 ; mul24 r3, r3, rb_pitch ++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ ++# L1 H-filter ++ mov.setf -, rb3 ; mov rb7, ra3 ++ ++ and r1, r1, rb_k255 ; mul24 r3, ra1.8a, r0 ++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++ brr.anyn -, r:uvloop_b ++# V filters - start in branch delay slots of H ++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra6, ra7 ; mul24 r3, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, rb4, ra2.8a ++# >>> .anyn uvloop_b0 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++ ++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++ ++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) ++ add r1, r1, r2 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ ++ brr.anyn -, r:uvloop_b ++ asr ra3.8as, r1, rb_wt_den_p15 ++ mov -, vw_wait ++ mov vpm, ra3.8a +# >>> .anyn uvloop_b + -+# apply vertical filter and write to VPM -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 -+ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 -+ -+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+ add r1, r1, r0 ; mov rb4, ra4 -+ -+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend -+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) -+ -+ sub.setf -, r3, ra31 ; mov ra6, rb7 -+ brr.anyn -, r:uvloop_b -+ asr ra3.8as, r1, rb13 -+ mov -, vw_wait ; mov rb7, ra8 # 
vw_wait is B-reg (annoyingly) ; Final FIFO mov -+ mov vpm, ra3.8a -+# >>> -+ -+# DMA out for U & stash for V -+ -+ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0 ++# DMA out + bra -, ra_link -+ mov vw_setup, rb29 ; mov ra10, rb29 # Stride -+ mov vw_addr, unif # u_dst_addr -+ mov ra11, unif # v_dst_addr -+ -+ ++ mov vw_setup, rb_dma0 ++ mov vw_setup, rb_dma1 ++ mov vw_addr, rb_dest ++# >>> ra_link + +################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ # >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu ++ mov ra_link, unif ++ mov -, vw_wait ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % 3) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endm ++ ++::mc_sync_q0 ++ m_sync_q 0 ++::mc_sync_q1 ++ m_sync_q 1 ++::mc_sync_q2 ++ m_sync_q 2 ++::mc_sync_q3 ++ m_sync_q 3 ++::mc_sync_q4 ++ m_sync_q 4 ++::mc_sync_q5 ++ m_sync_q 5 ++::mc_sync_q6 ++ m_sync_q 6 ++::mc_sync_q7 ++ m_sync_q 7 ++::mc_sync_q8 ++ m_sync_q 8 ++::mc_sync_q9 ++ m_sync_q 9 ++::mc_sync_q10 ++ m_sync_q 10 ++::mc_sync_q11 ++ m_sync_q 11 + +# mc_exit() -+ ++# Chroma & Luma the same now +::mc_exit_c -+ mov.setf -, ra9 ; mov -, vw_wait -+# Annoyingly it looks iike condition codes don't work on writes to special -+# registers so we have to branch around the writes -+ brr.anyz -, r:exit_c_1 -+ nop -+ nop -+ nop -+# >>> -+ -+ sub vw_setup, ra9, -16 -+ mov vw_setup, ra10 -+ mov vw_addr, ra11 -+ nop -+:exit_c_1 -+ +::mc_exit -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW -+ -+ mov -,srel(0) -+ -+ nop ; nop ; thrend -+ nop ; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+# mc_interrupt_exit8() -+#::mc_interrupt_exit8 -+#mov -, vw_wait # wait on the VDW -+# -+#ldtmu0 -+#ldtmu1 -+#ldtmu0 -+#ldtmu1 -+# -+#mov -,sacq(0) # 1 -+#mov -,sacq(0) # 2 -+#mov -,sacq(0) # 3 -+#mov -,sacq(0) # 4 -+#mov -,sacq(0) # 5 -+#mov -,sacq(0) # 6 -+#mov -,sacq(0) # 7 -+# -+#nop ; nop ; thrend -+#mov interrupt, 
1; nop # delay slot 1 -+#nop ; nop # delay slot 2 -+# -+ -+ ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop + ++# mc_interrupt_exit12() ++::mc_interrupt_exit12c ++::mc_interrupt_exit12 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. +# For P frames we make the second x,y coordinates offset by +8 + ++ +################################################################################ -+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) -+::mc_setup ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base -+ -+# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_pitch, unif # src_pitch [ra3 delay] -+ sub rb_frame_width_minus_1, ra3.16b, 1 -+ sub rb_frame_height_minus_1, ra3.16a, 1 -+ -+# get destination pitch -+ mov r1, vdw_setup_1(0) -+ or rb24, r1, unif # dst_pitch -+ -+# Compute base address for first and second access -+ mov r3, elem_num -+ add r0, ra8.16a, r3 # Load x + elem_num -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ add ra_y, ra8.16b, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, ra9, r0 # ra9 is address for frame0 (not including y offset) -+ max r1, ra8.16b, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t0s, r2, r1 ; mov ra_frame_base, r2 -+ -+ # r3 still contains elem_num -+ add r0, ra10.16a, r3 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ add ra_y2, ra10.16b, 1 -+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate -+ add r2, ra11, r0 # r2 is address for frame1 (not including y offset) -+ max r1, ra10.16b, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0 -+ add t1s, r2, r1 ; mov ra_frame_base2, r2 ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ mov ra11, unif # ref_y2_base + +# load constants + -+ mov ra_k1, 1 -+ mov ra_k256, 256 ++ mov ra_kff100100, 0xff100100 + mov rb_k255, 255 + -+# touch vertical context to keep simulator happy ++# Compute part of VPM to use + ++# Read image dimensions ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++ sub rb_max_x, ra3.16b, 1 ++ sub rb_max_y, ra3.16a, 1 ++ mov rb_pitch, unif # stride1 ++ ++# get destination pitch ++ mov r1, vdw_setup_1(0) ++ or rb_dma1_base, r1, rb_pitch ++ ++# Compute base address for first and second access ++ mov r3, elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ ++# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++ ++ and r0, r0, -4 ; 
v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ # r3 still contains elem_num ++ add r0, ra1.16b, r3 # Load x ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base2, ra11, r0 ++ ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:y_preload ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:y_preload ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz y_preload ++ ++ add rb_wt_den_p15, unif, 9 # weight denom + 6 ++ ++ m_calc_dma_regs rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn ++ ++# touch vertical context to keep simulator happy + mov ra8, 0 ; mov rb8, 0 ++ bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link + -+# Compute part of VPM to use -+ m_calc_dma_regs rb28, rb27 -+ -+# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+# submit texture requests for second line -+ max r1, ra_y, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 -+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; -+ add t0s, r1, ra_frame_base -+ -+ max r1, ra_y2, 0 -+ min r1, r1, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, r1, ra_frame_base2 -+ -+# FALL THROUGHT TO PER-BLOCK SETUP -+ ++################################################################################ ++# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space -+:per_block_setup -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra_link, unif + -+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack?? 
-+ -+# per-channel shifts were calculated on the *previous* invocation -+ mov ra_xshift, ra_xshift_next -+ mov rx_xshift2, rx_xshift2_next ++# luma_setup_delay3 done in delay slots of branch that got us here + +# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation + -+ add r0, ra1.16a, r1 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base -+ shl ra_xshift_next, r0, 3 # Compute shifts -+ mov ra_y_next, ra1.16b -+ and r0, r0, ~3 ; mov ra1, unif # y2_x2 -+ add ra_frame_base_next, r2, r0 ++# 1st 3 instructions of per_block-setup in branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# + -+ add r0, ra1.16a, r1 # Load x -+ max r0, r0, 0 -+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base -+ shl rx_xshift2_next, r0, 3 # Compute shifts -+ mov ra_y2_next, ra1.16b -+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate -+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset) ++.macro luma_setup ++ brr ra_link, r:per_block_setup ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # [ra0 delay] ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm + -+# set up VPM write -+ mov vw_setup, rb28 ++:per_block_setup ++ max r0, r0, 0 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++ ++ add r0, ra1.16b, r3 # Load x2 ++ max r0, r0, 0 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov vw_setup, rb_vpm_init # Add stripe offsets ; set up VPM write ++ add rb_base2_next, rb_base2_next, r0 + +# get width,height of block (unif load above) -+ sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width) -+ add rb17, ra1.16a, 5 -+ add rb18, ra1.16a, 7 -+ shl r0, ra1.16a, 7 -+ add r0, r0, ra1.16b # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets ++ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, ra_height, 7 - PREREAD ; mov r0, ra_height ++ min r0, r0, ra_k16 ++ add rb_lcount, r0, 7 ++ shl r0, r0, 7 ++ add r0, r0, ra_width # Combine width and height of destination area ++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight -+ mov r2, 0x01040400 # [ra5 delay] -+ shl ra8, r0, 3 ; mov 
rb14, ra5.16a ++ shl.ifz r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 + +# Pack the 1st 4 filter coefs for H & V tightly ++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve ++ mov r1,0x00010100 # -ve [ra8 delay] + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ ror ra2.8b, r2, ra8.8d -+ ror ra0.8b, r2, ra8.8c ++ mov r1, 0x01040400 ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -14313,87 +17842,73 @@ index 0000000..6fd6af5 + ror ra2.8d, r1, ra8.8d + ror ra0.8d, r1, ra8.8c + -+# In the 2nd vertical half we use b registers due to -+# using a-side fifo regs. The easiest way to achieve this to pack it -+# and then unpack! ++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror ra3.8a, r1, ra8.8d -+ ror ra1.8a, r1, ra8.8c ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, rb_k255 + + mov r1,0x0a0b0500 # -ve -+ ror ra3.8b, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, rb_k255 + + mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, rb_k255 ++ ++ mov.ifnz ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d -+ ror ra1.8d, r1, ra8.8c -+ -+# Extract weighted prediction information in parallel -+# We are annoyingly A src limited here -+ -+ mov rb4, ra3.8a ; mov ra18, unif -+ mov rb5, ra3.8b -+ mov rb6, ra3.8c -+ mov.ifnz ra5, ra18 -+ ++ ror r0, r1, ra8.8d + bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, rb_k255 + -+ shl r0, ra5.16b, rb13 # Offset calc -+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ mov r3, 0 ; mov rb7, ra3.8d ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link -+# ++ +# r3 = 0 -+# ra18.16a = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+# rb13 = weight denom + 6 + 9 -+# rb14 = weight L0 ++# ra_wt_mul_l1 = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb_wt_off = (((is P) ? 
offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_mul_l0 = weight L0 + + +################################################################################ -+# mc_filter(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# mc_filter(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + +::mc_filter -+# ra5.16a = weight << 16; We want weight * 2 in rb14 ++ luma_setup + -+ shl rb14, ra5.16a, 1 ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + -+# r3 = 0 ++# r5 = 0 (loop count) + +:yloop +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? -+ +# N.B. Whilst y == y2 as far as this loop is concerned we will start +# the grab for the next block before we finish with this block and that +# might be B where y != y2 so we must do full processing on both y and y2 + -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14401,72 +17916,86 @@ index 0000000..6fd6af5 + mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz 
r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 ++ sub.setf -, r5, 8 ; mov r1, ra8 ++ mov ra8, ra9 ; mov rb8, rb9 + brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 ++ mov ra9, ra10 ; mov rb9, rb10 ++ mov ra10, ra11 ; mov rb10, rb11 ++ sub ra11, r2, r3 ; mov rb11, r1 + # >>> .anyn yloop + + # apply vertical filter and write to VPM + -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov -, vw_wait ++ nop ; mul24 r0, rb8, ra2.8a ++ nop ; mul24 r1, rb9, ra2.8b ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 +# At this point r1 is a 22-bit signed quantity: 8 (original sample), +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) +# The top 8 bits have rubbish in them as mul24 is unsigned +# The low 6 bits need discard before weighting -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ add r1, r1, rb12 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ add r1, r1, rb_wt_off + -+ shl r1, r1, 8 ++ shl r1, r1, 8 ; mov r0, ra_height + brr.anyn -, r:yloop -+ asr r1, r1, rb13 -+# We have a saturating pack unit - I can't help feeling it should be useful here -+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 -+ max vpm, r1, 0 # Delay 3 ++ asr ra3.8as, r1, rb_wt_den_p15 ++ mov r1, ra_k16 ; mov -, vw_wait ++ sub r0, r0, r1 ; mov vpm, ra3.8a +# >>> branch.anyn yloop + ++# If looping again the we consumed 16 height last loop ++ # rb_dma1 (stride) remains constant ++ # rb_i_tmu remains const (based on total height) ++ # recalc rb_dma0, rb_lcount based on new segment height ++ # N.B. 
r5 is loop counter still
 +
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
 +
 +# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
 +
-+ brr -, r:per_block_setup
-+ mov vw_setup, rb26 # VDW setup 0 Delay 1
-+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
-+
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop
 +
 +
 +################################################################################
 +
-+# mc_filter_b(y_x, frame_base, y2_x2, frame_base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
 +# In a P block, only the first half of coefficients contain used information.
 +# At this point we have already issued two pairs of texture requests for the current block
 +# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?)
 +# With that change we can do the frame base address calculation once rather than once per MC.
 +# Also update the transfer base addresses / offsets in the mc_filter code.
 +# With just y/x and base address we can reduce the per-command data to 6 32bit words
 +# From 19->7 32bits per command.
 +
 +::mc_filter_b
-+ # r0 = weightL0 << 16, we want it in rb14
-+# asr rb14, r0, i_shift16
++ luma_setup
 +
 +:yloopb
 +# retrieve texture results and pick out bytes
 +# then submit two more texture requests
 +
 +# If we knew there was no clipping then this code would get simpler.
 +# Perhaps we could add on the pitch and clip using larger values?
+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1 -+ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_frame_height_minus_1 -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte + +# generate seven shifted versions +# interleave with scroll of vertical context @@ -14509,129 +18035,372 @@ index 0000000..6fd6af5 + mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 ++ sub.setf -, r5, 8 ; mov r1, ra8 ++ mov ra8, ra9 ; mov rb8, rb9 + brr.anyn 
-, r:yloopb
++ mov ra9, ra10 ; mov rb9, rb10
++ mov ra10, ra11 ; mov rb10, rb11
++ sub ra11, r2, r3 ; mov rb11, r1
 + # >>> .anyn yloopb
 +
 + # apply vertical filter and write to VPM
++ nop ; mul24 r0, rb8, ra2.8a
++ nop ; mul24 r1, rb9, ra2.8b
++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++ add r1, r1, r0 ; mul24 r0, ra8, rb4
++ add r1, r1, r0 ; mul24 r0, ra9, rb5
++ sub r1, r1, r0 ; mul24 r0, ra10, rb6
++ add r1, r1, r0 ; mul24 r0, ra11, rb7
++ sub r1, r1, r0 ; mov r2, rb_wt_off
 +# As with P-pred r1 is a 22-bit signed quantity in 32-bits
 +# Top 8 bits are bad - low 6 bits should be discarded
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
 +
 + asr r1, r1, 14
++ nop ; mul24 r0, r1, ra_wt_mul_l0
++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
 +
++ add r1, r1, r0
++ shl r1, r1, 8 ; mov r0, ra_height
 + brr.anyn -, r:yloopb
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloopb
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
 +
 +# DMA out
-+ brr -, r:per_block_setup
-+ mov vw_setup, rb26 # VDW setup 0 Delay 1
-+ mov vw_setup, rb29 # Stride Delay 2
-+ mov vw_addr, unif # start the VDW Delay 3
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloopb
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloopb
 +
 +################################################################################
-+::mc_interrupt_exit12c
-+ mov.setf -, ra9 ; mov -, vw_wait
-+ brr.anyz -, r:exit12_c_1
-+ nop
-+ nop
-+ nop
-+# >>>
-+
-+ sub vw_setup, ra9, -16
-+ mov vw_setup, ra10
-+ mov vw_addr, ra11
-+ mov ra9, 0
-+:exit12_c_1
-+
-+# mc_interrupt_exit12()
-+::mc_interrupt_exit12
-+ ldtmu0
-+ ldtmu1
-+ ldtmu0
-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW
-+
-+ mov -,sacq(0) # 1
-+ mov -,sacq(0) # 2
-+ mov -,sacq(0) # 3
-+ mov -,sacq(0) # 4
-+ mov -,sacq(0) # 5
-+ mov -,sacq(0) # 6
-+ mov -,sacq(0) # 7
-+ mov -,sacq(0) # 8
-+ mov -,sacq(0) # 9
-+ mov -,sacq(0) # 10
-+ mov -,sacq(0) # 11
-+
-+ nop ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop ; nop # delay slot 2
-+
-+
-+::mc_exit1
-+ mov -, vw_wait # wait on the VDW
-+
-+ ldtmu0
-+ ldtmu1
-+ ldtmu0
-+ ldtmu1
-+ nop ; nop ; thrend
-+ mov interrupt, 1; nop # delay slot 1
-+ nop ; nop # delay slot 2
-+
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++# qpu_mc_src_t next_src1;
++# uint16_t h;
++# uint16_t w;
++# uint32_t wo1;
++# uint32_t dst_addr;
++# uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++::mc_filter_y_p00
++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
++ mov ra_xshift, ra_xshift_next # [ra0 delay]
++ add r0, ra0.16b, r3
++
++ max r0, r0, 0
++ min r0, r0, rb_max_x
++
++ shl ra_xshift_next, r0, 3 # Compute shifts
++ and r0, r0, -4 ; v8subs r2, r2, r2
++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base
++ and r1, r0, r2 ; mov ra_y_next, ra0.16a
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height
++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # ; set up VPM write
++
++# get width,height of block (unif load above)
++ sub rb_dma1, rb_dma1_base, ra_width # Compute vdw_setup1(dst_pitch-width)
++ sub rb_i_tmu, ra_height, PREREAD ; mov r0, ra_height
++ min r0, r0, ra_k16
++ add rb_lcount, r0, 0 ; mov ra_wt_off_mul_l0, unif
++ shl r0, r0, 7 ; mov rb_dest, unif # Destination address
++ add r0, r0, ra_width # Combine width and height of destination area
++ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
++ add rb_dma0, r0, rb_dma0_base
++
++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0
++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use
++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link
++
++:yloop_p00
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
++ shl r1, r1, 15 ; mov r0, ra_height
++ add r1, r1, rb_wt_off
++
++ brr.anyn -, r:yloop_p00
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloop_p00
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop_p00
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop_p00
++
++################################################################################
++
++::mc_filter_y_b00
++# luma setup does a fair bit more than we need calculating filter coeffs
++# that we will never use but it saves I-cache to use it (also simple!)
++ luma_setup
++
++# Fix up vals that were expecting a filter (somewhat icky)
++ mov r0, 7
++ sub rb_i_tmu, rb_i_tmu, r0
++ sub rb_lcount, rb_lcount, r0
++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0
++ shl rb_wt_off, rb_wt_off, r0
++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++
++:yloop_b00
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++ shr r0, r4, ra_xshift ; mov r3, rb_pitch
++
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
++
++ max r2, ra_y2, 0
++ min r2, r2, rb_max_y
++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte
++ and r1, r1, rb_k255 ; mul24 r0, r0, ra_wt_mul_l0
++
++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
++ add r1, r0, r1
++ shl r1, r1, 14
++ add r1, r1, rb_wt_off ; mov r0, ra_height
++
++ brr.anyn -, r:yloop_b00
++ asr ra3.8as, r1, rb_wt_den_p15
++ mov r1, ra_k16 ; mov -, vw_wait
++ sub r0, r0, r1 ; mov vpm, ra3.8a
++# >>> branch.anyn yloop_b00
++
++# If looping again then we consumed 16 height last loop
++ # rb_dma1 (stride) remains constant
++ # rb_i_tmu remains const (based on total height)
++ # recalc rb_dma0, rb_lcount based on new segment height
++ # N.B. r5 is loop counter still
++
++ max.setf -, r0, 0 ; mov ra_height, r0 # Done if Z now
++
++# DMA out
++ bra.anyz -, ra_link
++ min r0, r0, r1 ; mov vw_setup, rb_dma0 # VDW setup 0
++ sub r2, r0, r1 ; mov vw_setup, rb_dma1 # Stride
++ nop ; mov vw_addr, rb_dest # start the VDW
++# >>> .anyz ra_link
++
++ add rb_lcount, rb_lcount, r0
++ shl r0, r2, i_shift23
++ add rb_dma0, rb_dma0, r0
++ brr -, r:yloop_b00
++ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16
++ add rb_dest, rb_dest, r0
++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer
++# >>> yloop_b00
++
++################################################################################
 +
 +::mc_end
 +# Do not add code here because mc_end must appear after all other code.
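The luma kernels above pull their per-block parameters off the uniform stream in the fixed order described by the qpu_mc_pred_y_* structs declared in rpi_shader_cmd.h below. As a rough host-side sketch only - the fill helper and its callers are hypothetical, though the field names are the real ones from the header - building one luma P-pred command looks something like this:

    #include <stdint.h>
    #include "rpi_shader_cmd.h"

    // Hypothetical helper: fill one luma P-pred command for the QPU
    // command stream.  mc_filter reads these fields as uniforms in
    // declaration order, so the struct layout must not be reordered.
    static void fill_y_p_cmd(qpu_mc_pred_y_p_t *const cmd,
                             const int16_t x, const int16_t y,
                             const uint32_t src_base,
                             const uint16_t w, const uint16_t h,
                             const uint32_t mymx21, const uint32_t wo,
                             const uint32_t dst_addr, const uint32_t next_fn)
    {
        cmd->next_src1.x = x;
        cmd->next_src1.y = y;
        cmd->next_src1.base = src_base;
        // For P blocks the second fetch is just offset by +8 in x
        // ("For P frames we make the second x,y coordinates offset by +8")
        cmd->next_src2.x = x + 8;
        cmd->next_src2.y = y;
        cmd->next_src2.base = src_base;
        cmd->w = w;
        cmd->h = h;
        cmd->mymx21 = mymx21;    // packed fractional mvs - selects filter coeffs
        cmd->wo1 = wo;           // weight/offset L0
        cmd->wo2 = wo;
        cmd->dst_addr = dst_addr;
        cmd->next_fn = next_fn;  // QPU address of the next kernel to chain to
    }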
-diff --git b/libavcodec/rpi_zc.c a/libavcodec/rpi_zc.c +diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..9ac22aa +index 0000000..838b6bd --- /dev/null -+++ a/libavcodec/rpi_zc.c -@@ -0,0 +1,453 @@ ++++ b/libavcodec/rpi_shader_cmd.h +@@ -0,0 +1,112 @@ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ uint32_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ uint32_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ uint32_t weight_u1; ++ uint32_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ uint32_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { ++ union { ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ uint32_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ uint32_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; ++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; ++} qpu_mc_pred_y_t; ++ ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++} qpu_mc_pred_cmd_t; ++ ++#pragma pack(pop) ++ ++#endif ++ +diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c +new file mode 100644 +index 0000000..b061fe0 +--- /dev/null ++++ b/libavcodec/rpi_zc.c +@@ -0,0 +1,581 @@ +#include "config.h" +#ifdef RPI +#include "rpi_qpu.h" ++#include "rpi_mailbox.h" +#include "rpi_zc.h" ++#include "libavutil/avassert.h" ++#include + +#include "libavutil/buffer_internal.h" ++#include ++ ++#define TRACE_ALLOC 0 + +struct ZcPoolEnt; + @@ -14668,6 +18437,9 @@ index 0000000..9ac22aa +#define STRIDE_OR 0 +#endif + ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++ +static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size) +{ + ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt)); @@ -14686,6 +18458,11 @@ index 0000000..9ac22aa + goto fail1; + } + ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ ++ pool->numbytes = zp->gmem.numbytes; + zp->next = NULL; + zp->pool = pool; + zp->n = 
pool->n++; @@ -14699,6 +18476,10 @@ index 0000000..9ac22aa + +static void zc_pool_ent_free(ZcPoolEnt * const zp) +{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm); ++#endif ++ + gpu_free(&zp->gmem); + av_free(zp); +} @@ -14707,6 +18488,8 @@ index 0000000..9ac22aa +{ + ZcPoolEnt * p = pool->head; + pool->head = NULL; ++ pool->numbytes = -1; ++ + while (p != NULL) + { + ZcPoolEnt * const zp = p; @@ -14715,15 +18498,21 @@ index 0000000..9ac22aa + } +} + -+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int numbytes) ++static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes) +{ + ZcPoolEnt * zp; ++ int numbytes; ++ + pthread_mutex_lock(&pool->lock); + -+ if (numbytes != pool->numbytes) ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) + { + zc_pool_flush(pool); -+ pool->numbytes = numbytes; ++ numbytes = req_bytes; + } + + if (pool->head != NULL) @@ -14750,6 +18539,10 @@ index 0000000..9ac22aa + if (zp != NULL) + { + pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes); ++#endif ++ + if (pool->numbytes == zp->gmem.numbytes) + { + zp->next = pool->head; @@ -14780,10 +18573,18 @@ index 0000000..9ac22aa + pthread_mutex_destroy(&pool->lock); +} + ++typedef struct ZcOldCtxVals ++{ ++ int thread_safe_callbacks; ++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); ++ void * get_buffer_context; ++} ZcOldCtxVals; + +typedef struct AVZcEnv +{ ++ unsigned int refcount; + ZcPool pool; ++ ZcOldCtxVals old; +} ZcEnv; + +// Callback when buffer unrefed to zero @@ -14803,18 +18604,71 @@ index 0000000..9ac22aa +} + +AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( -+ const unsigned int video_width, const unsigned int video_height) ++ const int format, const unsigned int video_width, const unsigned int video_height) +{ + AVRpiZcFrameGeometry geo; -+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+// geo.stride_y = ((video_width + 32 + 31) & ~31); -+ geo.stride_c = geo.stride_y / 2; -+// geo.height_y = (video_height + 15) & ~15; -+ geo.height_y = (video_height + 32 + 31) & ~31; -+ geo.height_c = geo.height_y / 2; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ // geo.stride_y = ((video_width + 32 + 31) & ~31); ++ geo.stride_c = geo.stride_y / 2; ++ // geo.height_y = (video_height + 15) & ~15; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ 
geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ default: ++ memset(&geo, 0, sizeof(geo)); ++ break; ++ } + return geo; +} + ++ +static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size) +{ + ZcPoolEnt *const zp = zc_pool_alloc(pool, size); @@ -14833,6 +18687,10 @@ index 0000000..9ac22aa + idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0); +#endif + ++#if DEBUG_ZAP0_BUFFERS ++ memset((void*)idata, 0, size); ++#endif ++ + if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL) + { + av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n"); @@ -14847,13 +18705,12 @@ index 0000000..9ac22aa + return NULL; +} + -+static int rpi_get_display_buffer(struct AVCodecContext * const s, AVFrame * const frame) ++static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame) +{ -+ ZcEnv *const zc = s->get_buffer_context; -+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->width, frame->height); ++ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); + const unsigned int size_y = geo.stride_y * geo.height_y; + const unsigned int size_c = geo.stride_c * geo.height_c; -+ const unsigned int size_pic = size_y + size_c * 2; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; + AVBufferRef * buf; + unsigned int i; + @@ -14861,7 +18718,7 @@ index 0000000..9ac22aa + + if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL) + { -+ av_log(s, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); + return AVERROR(ENOMEM); + } + @@ -14872,19 +18729,24 @@ index 0000000..9ac22aa + } + + frame->buf[0] = buf; ++ + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + frame->linesize[2] = geo.stride_c; ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; -+ frame->data[2] = frame->data[1] + size_c; ++ if (geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + size_c; ++ + frame->extended_data = frame->data; + // Leave extended buf alone + + return 0; +} + -+ +#define RPI_GET_BUFFER2 1 + +int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) @@ -14894,21 +18756,25 @@ index 0000000..9ac22aa +#else + int rv; + -+ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0 || -+ frame->format != AV_PIX_FMT_YUV420P) ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) + { +// printf("Do default alloc: format=%#x\n", frame->format); + rv = avcodec_default_get_buffer2(s, frame, flags); + } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ frame->format == AV_PIX_FMT_SAND128) ++ { ++ rv = rpi_get_display_buffer(s->get_buffer_context, frame); ++ } + else + { -+ rv = rpi_get_display_buffer(s, frame); ++ rv = avcodec_default_get_buffer2(s, frame, flags); + } + +#if 0 -+ printf("%s: %dx%d lsize=%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", 
__func__, -+ frame->width, frame->height, -+ frame->linesize[0], frame->linesize[1], frame->linesize[2], ++ printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], + frame->data[0], frame->data[1], frame->data[2], + frame->buf[0], frame->buf[1], frame->buf[2], + av_buffer_get_opaque(frame->buf[0])); @@ -14929,7 +18795,7 @@ index 0000000..9ac22aa + dest->width = src->width; + dest->height = src->height; + -+ if (rpi_get_display_buffer(s, dest) != 0) ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) + { + return NULL; + } @@ -14962,14 +18828,16 @@ index 0000000..9ac22aa +{ + assert(s != NULL); + -+ if (frame->format != AV_PIX_FMT_YUV420P) ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_SAND128) + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Format not YUV420P: %d\n", __func__, frame->format); ++ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + + if (frame->buf[1] != NULL) + { ++ av_assert0(frame->format == AV_PIX_FMT_YUV420P); + if (maycopy) + { + av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); @@ -15053,47 +18921,70 @@ index 0000000..9ac22aa + } +} + ++int av_rpi_zc_in_use(const struct AVCodecContext * const s) ++{ ++ return s->get_buffer2 == av_rpi_zc_get_buffer2; ++} ++ +int av_rpi_zc_init(struct AVCodecContext * const s) +{ -+ ZcEnv * const zc = av_rpi_zc_env_alloc(); -+ if (zc == NULL) ++ if (av_rpi_zc_in_use(s)) + { -+ return AVERROR(ENOMEM); ++ ZcEnv * const zc = s->get_buffer_context; ++ ++zc->refcount; + } ++ else ++ { ++ ZcEnv *const zc = av_rpi_zc_env_alloc(); ++ if (zc == NULL) ++ { ++ return AVERROR(ENOMEM); ++ } + -+ s->get_buffer_context = zc; -+ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ zc->refcount = 1; ++ zc->old.get_buffer_context = s->get_buffer_context; ++ zc->old.get_buffer2 = s->get_buffer2; ++ zc->old.thread_safe_callbacks = s->thread_safe_callbacks; ++ ++ s->get_buffer_context = zc; ++ s->get_buffer2 = av_rpi_zc_get_buffer2; ++ s->thread_safe_callbacks = 1; ++ } + return 0; +} + +void av_rpi_zc_uninit(struct AVCodecContext * const s) +{ -+ if (s->get_buffer2 == av_rpi_zc_get_buffer2) ++ if (av_rpi_zc_in_use(s)) + { + ZcEnv * const zc = s->get_buffer_context; -+ s->get_buffer2 = avcodec_default_get_buffer2; -+ s->get_buffer_context = NULL; -+ av_rpi_zc_env_free(zc); ++ if (--zc->refcount == 0) ++ { ++ s->get_buffer2 = zc->old.get_buffer2; ++ s->get_buffer_context = zc->old.get_buffer_context; ++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; ++ av_rpi_zc_env_free(zc); ++ } + } +} + +#endif // RPI + -diff --git b/libavcodec/rpi_zc.h a/libavcodec/rpi_zc.h +diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..4dd7a8b +index 0000000..f4aeb78 --- /dev/null -+++ a/libavcodec/rpi_zc.h -@@ -0,0 +1,88 @@ ++++ b/libavcodec/rpi_zc.h +@@ -0,0 +1,137 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + +// Zero-Copy frame code for RPi +// RPi needs Y/U/V planes to be contiguous for display. By default +// ffmpeg will allocate separated planes so a memcpy is needed before -+// display. This code prodes a method a making ffmpeg allocate a single -+// bit of memory for the frame when can then be refrence counted until -+// display ahs finsihed with it. ++// display. 
This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
+
+#include "libavutil/frame.h"
+#include "libavcodec/avcodec.h"
+
+
+// "Opaque" pointer to whatever we are using as a buffer reference
+typedef AVBufferRef * AVRpiZcRefPtr;
+
+struct AVZcEnv;
+typedef struct AVZcEnv * AVZcEnvPtr;
+
+typedef struct AVRpiZcFrameGeometry
+{
+    unsigned int stride_y;
+    unsigned int height_y;
+    unsigned int stride_c;
+    unsigned int height_c;
++    unsigned int planes_c;
++    unsigned int stripes;
+} AVRpiZcFrameGeometry;
+
+
+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format,
+    const unsigned int video_width, const unsigned int video_height);
+
+// Replacement fn for avctx->get_buffer2
+// Should be set before calling avcodec_decode_open2
+//
+// N.B. in addition to to setting avctx->get_buffer2, avctx->refcounted_frames
+// must be set to 1 as otherwise the buffer info is killed before being returned
+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
++// returned must be manually derefed with av_frame_unref. This should be done
+// after av_rpi_zc_ref has been called.
+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
+
+// Generate a ZC reference to the buffer(s) in this frame
+// If the buffer doesn't appear to be one allocated by ZC
+// then the behaviour depends on maycopy:
+// If maycopy=0 then return NULL
+// If maycopy=1 && the src frame is in a form where we can easily copy
+// the data, then allocate a new buffer and copy the data into it
+// Otherwise return NULL
+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+    const AVFrame * const frame, const int maycopy);
+
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
+
+// Get offset from the start of the memory referenced
+// by the vc_handle to valid data
+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
+
+// Length of buffer data
+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
+
+// Unreference the buffer refed/allocated by _zc_ref
+// If fr_ref is NULL then this will NOP
+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
+
+// Allocate an environment for the buffer pool used by the ZC code
+// This should be put in avctx->get_buffer_context so it can be found by
+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
+AVZcEnvPtr av_rpi_zc_env_alloc(void);
+
+// Allocate the environment used by the ZC code
+void av_rpi_zc_env_free(AVZcEnvPtr);
+
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
+
+// Init ZC into a context
+// There is nothing magic in this fn - it just packages setting
+// get_buffer2 & get_buffer_context
+int av_rpi_zc_init(struct AVCodecContext * const s);
+
+// Undo whatever init did
+// get_buffer2 & get_buffer_context
+void av_rpi_zc_uninit(struct AVCodecContext * const s);
+
++
++
++static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame)
++{
++    return frame->linesize[3];
++}
++
++static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++    const unsigned int stride1 = frame->linesize[0];
++    const unsigned int stride2 = rpi_sliced_frame_stride2(frame);
++    const unsigned int x = x_c * 2;
++    const unsigned int x1 = x & (stride1 - 1);
++    const unsigned int x2 = x ^ x1;
++
++    return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y);
++}
++
++static inline int rpi_sliced_frame(const AVFrame * const frame)
++{
++    return frame->format == AV_PIX_FMT_SAND128;
++}
++
++
+#endif
+
diff --git a/libavcodec/utils.c b/libavcodec/utils.c
index 0c68836..b8139f5 100644
--- a/libavcodec/utils.c
+++ b/libavcodec/utils.c
@@ -26,6 +26,12 @@
  */
 
 pool->pools[i] = av_buffer_pool_init(size[i]
+ 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -diff --git b/libavformat/matroskaenc.c a/libavformat/matroskaenc.c -index 9c7a213..af941ce 100644 ---- b/libavformat/matroskaenc.c -+++ a/libavformat/matroskaenc.c -@@ -2223,7 +2223,7 @@ static int mkv_check_new_extra_data(AVFormatContext *s, AVPacket *pkt) +@@ -729,6 +788,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; - switch (par->codec_id) { - case AV_CODEC_ID_FLAC: -- if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL) && !mkv->is_live) { -+ if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL)) { - AVCodecParameters *codecpriv_par; - int64_t curpos; - if (side_data_size != par->extradata_size) { -diff --git b/libavformat/mov.c a/libavformat/mov.c -index f2296f8..4550cf0 100644 ---- b/libavformat/mov.c -+++ a/libavformat/mov.c -@@ -1186,12 +1186,6 @@ static void mov_metadata_creation_time(AVDictionary **metadata, int64_t time) - if (time) { - if(time >= 2082844800) - time -= 2082844800; /* seconds between 1904-01-01 and Epoch */ -- -- if ((int64_t)(time * 1000000ULL) / 1000000 != time) { -- av_log(NULL, AV_LOG_DEBUG, "creation_time is not representable\n"); -- return; -- } -- - avpriv_dict_set_timestamp(metadata, "creation_time", time * 1000000); - } - } -@@ -5794,7 +5788,6 @@ static int mov_read_close(AVFormatContext *s) - av_freep(&mov->fragment_index_data); ++#ifdef RPI ++ // This is going to end badly if we let it continue ++ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++#endif ++ + if (avctx->hw_frames_ctx) + return av_hwframe_get_buffer(avctx->hw_frames_ctx, frame, 0); - av_freep(&mov->aes_decrypt); -- av_freep(&mov->chapter_tracks); +diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +index ecfb872..5fa099f 100644 +--- a/libavfilter/avfilter.c ++++ b/libavfilter/avfilter.c +@@ -969,6 +969,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) + "options, but options were provided: %s.\n", args); + return AVERROR(EINVAL); + } ++ printf("=== args='%s'\n", args); - return 0; - } -diff --git b/libavformat/mpegts.c a/libavformat/mpegts.c + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || +diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c index 3eff152..30dfb14 100644 ---- b/libavformat/mpegts.c -+++ a/libavformat/mpegts.c +--- a/libavformat/mpegts.c ++++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { #endif { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 }, @@ -15315,10 +19241,10 @@ index 3eff152..30dfb14 100644 { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 }, { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, -diff --git b/libavformat/utils.c a/libavformat/utils.c -index a059046..ef70074 100644 ---- b/libavformat/utils.c -+++ a/libavformat/utils.c +diff --git a/libavformat/utils.c b/libavformat/utils.c +index a82bbc7..4bf5574 100644 +--- a/libavformat/utils.c ++++ b/libavformat/utils.c @@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { @@ -15328,10 +19254,10 @@ index a059046..ef70074 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; -diff --git b/libavutil/buffer.c a/libavutil/buffer.c +diff --git 
a/libavutil/buffer.c b/libavutil/buffer.c index 8d1aa5f..649876d 100644 ---- b/libavutil/buffer.c -+++ a/libavutil/buffer.c +--- a/libavutil/buffer.c ++++ b/libavutil/buffer.c @@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) return ret; @@ -15342,10 +19268,10 @@ index 8d1aa5f..649876d 100644 + BufferPoolEntry *buf = av_buffer_get_opaque(ref); + return buf->opaque; +} -diff --git b/libavutil/buffer.h a/libavutil/buffer.h +diff --git a/libavutil/buffer.h b/libavutil/buffer.h index 73b6bd0..d907de3 100644 ---- b/libavutil/buffer.h -+++ a/libavutil/buffer.h +--- a/libavutil/buffer.h ++++ b/libavutil/buffer.h @@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); */ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool); @@ -15356,60 +19282,315 @@ index 73b6bd0..d907de3 100644 /** * @} */ -diff --git b/pi-util/conf.sh a/pi-util/conf.sh -new file mode 100755 -index 0000000..8b596a2 ---- /dev/null -+++ a/pi-util/conf.sh -@@ -0,0 +1,33 @@ -+echo "Configure for Pi2/3" +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 7cb78a1..b94a635 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -127,6 +127,13 @@ enum AVFrameSideDataType { + * libavutil/spherical.h. + */ + AV_FRAME_DATA_SPHERICAL, + -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" ++ /** ++ * Extra data required to deal with a cropped Sand frame ++ * AVFrame holds the cropped size, but we cannot simply offset the start ++ * address to get the picture as we can for planar formats ++ */ ++ AV_FRAME_DATA_SAND_INFO, + }; + + enum AVActiveFormatDescription { +@@ -139,6 +146,13 @@ enum AVActiveFormatDescription { + AV_AFD_SP_4_3 = 15, + }; + ++typedef struct AVFrameDataSandInfo ++{ ++ unsigned int left_offset; ++ unsigned int top_offset; ++ unsigned int pic_width; ++ unsigned int pic_height; ++} AVFrameDataSandInfo; + + /** + * Structure to hold side data for an AVFrame. 
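(Illustrative sketch, not part of the patch: the AV_FRAME_DATA_SAND_INFO value added above travels as ordinary AVFrame side data, so a producer and a consumer would handle it roughly as below using the stock AVFrameSideData helpers. The two helper function names are hypothetical.)

    #include <string.h>
    #include "libavutil/error.h"
    #include "libavutil/frame.h"

    // Hypothetical producer: record the uncropped SAND picture geometry
    // alongside the (cropped) width/height held in the AVFrame itself.
    static int sand_info_attach(AVFrame * const frame,
                                const AVFrameDataSandInfo * const info)
    {
        AVFrameSideData * const sd = av_frame_new_side_data(
            frame, AV_FRAME_DATA_SAND_INFO, sizeof(*info));
        if (sd == NULL)
            return AVERROR(ENOMEM);
        memcpy(sd->data, info, sizeof(*info));  // plain struct, safe to copy
        return 0;
    }

    // Hypothetical consumer: returns NULL when no SAND info was attached.
    static const AVFrameDataSandInfo * sand_info_get(const AVFrame * const frame)
    {
        const AVFrameSideData * const sd =
            av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
        return sd == NULL ? NULL : (const AVFrameDataSandInfo *)sd->data;
    }

Carrying the true geometry out of band like this lets the AVFrame keep the cropped size while SAND consumers still recover the full stripe layout.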
+diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index d4a7a8b..92a01a4 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2158,6 +2158,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ } + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 5dafc34..0895b69 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -314,6 +314,9 @@ enum AVPixelFormat { + AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian + AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian + ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding + -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + +diff --git a/libswscale/input.c b/libswscale/input.c +index 04a5190..837f633 100644 +--- a/libswscale/input.c ++++ b/libswscale/input.c +@@ -741,6 +741,13 @@ static void p016BEToUV_c(uint8_t *dstU, uint8_t *dstV, + } + } + ++static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV, ++ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2, ++ int width, uint32_t *unused) ++{ ++ // NIF ++} + -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git b/pi-util/conf_h265.csv a/pi-util/conf_h265.csv + #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos))
+
+ static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
+@@ -1124,6 +1131,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c)
+     case AV_PIX_FMT_P016BE:
+         c->chrToYV12 = p016BEToUV_c;
+         break;
++    case AV_PIX_FMT_SAND128:
++        c->chrToYV12 = sand128ToUV_c;
++        break;
+     }
+     if (c->chrSrcHSubSample) {
+         switch (srcFormat) {
+diff --git a/libswscale/utils.c b/libswscale/utils.c
+index 4c9b53b..835f3aa 100644
+--- a/libswscale/utils.c
++++ b/libswscale/utils.c
+@@ -254,6 +254,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
+     [AV_PIX_FMT_P010BE]      = { 1, 1 },
+     [AV_PIX_FMT_P016LE]      = { 1, 0 },
+     [AV_PIX_FMT_P016BE]      = { 1, 0 },
++#ifdef RPI
++    [AV_PIX_FMT_SAND128]     = { 1, 0 },
++#endif
+ };
+
+ int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+new file mode 100644
+index 0000000..2d45679
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,24 @@
++Building Pi FFmpeg
++==================
++
++Configuration:
++==============
++
++pi-util/conf_pi2.sh
++
++contains suitable options to build the code for Pi2/3. It expects to find
++git clones of
++
++https://github.com/raspberrypi/tools
++https://github.com/raspberrypi/firmware
++
++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
++lot of history you don't want.
++
++If you have a copy of qasm.py in ../local then the .qasm sources will be
++rebuilt. Otherwise the prebuilt .c & .h files will be used.
++
++pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time
++H265 QPU acceleration is broken on Pi1 and so it is disabled.
++
++
+diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
+new file mode 100644
+index 0000000..6082641
+--- /dev/null
++++ b/pi-util/conf_h265.2016_HEVC_v1.csv
+@@ -0,0 +1,147 @@
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv +new file mode 100644 +index 0000000..fc14f2a +--- /dev/null ++++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ +1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 -+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 +1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 +1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 -+2,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 ++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 +1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 @@ -15431,7 +19612,7 @@ index 0000000..d3db338 +1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 -+2,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 @@ -15471,7 +19652,7 @@ index 0000000..d3db338 +1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 -+2,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 
+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 @@ -15485,10 +19666,10 @@ index 0000000..d3db338 +1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 -+2,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 -+2,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 @@ -15517,7 +19698,7 @@ index 0000000..d3db338 +1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 +1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 +1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 -+2,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 +1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 +1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 @@ -15528,7 +19709,7 @@ index 0000000..d3db338 +1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched +1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 -+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 @@ -15545,12 +19726,85 @@ index 0000000..d3db338 +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 -diff --git b/pi-util/ffconf.py a/pi-util/ffconf.py -new file mode 100644 -index 0000000..c896bc6 +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000..ec25b81 --- /dev/null -+++ a/pi-util/ffconf.py -@@ -0,0 +1,154 @@ ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,31 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ --arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm 
-lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000..f8e5e75 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,30 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +new file mode 100755 +index 0000000..e96bad2 +--- /dev/null ++++ b/pi-util/ffconf.py +@@ -0,0 +1,164 @@ +#!/usr/bin/env python + +import os @@ -15561,7 +19815,6 @@ index 0000000..c896bc6 +import csv +from stat import * + -+conf_root = "/opt/conform/h265" +ffmpeg_exec = "./ffmpeg" + +def testone(fileroot, name, es_file, md5_file): @@ -15611,10 +19864,10 @@ index 0000000..c896bc6 + +def scandir(root): + aconf = [] -+ ents = os.listdir(conf_root) ++ ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: -+ test_path = os.path.join(conf_root, name) ++ test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" 
@@ -15625,7 +19878,7 @@ index 0000000..c896bc6 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5": ++ elif ext == ".md5" or (ext == ".txt" and base[-4:] == "_md5"): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -15641,9 +19894,11 @@ index 0000000..c896bc6 + return True + return False + -+def doconf(csva, tests): -+ failures = [] ++def doconf(csva, tests, test_root): ++ unx_failures = [] + unx_success = [] ++ failures = 0 ++ successes = 0 + for a in csva: + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): @@ -15651,17 +19906,25 @@ index 0000000..c896bc6 + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ + if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ elif exp_test > 1 and rv == 1: ++ elif exp_test == 2 and rv == 1: + print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" + else: -+ failures.append(name) ++ unx_failures.append(name) + if rv == 1: + print ": * FAIL *" + elif (rv == 2) : @@ -15671,11 +19934,11 @@ index 0000000..c896bc6 + else : + print ": * BANG *" + -+ if failures or unx_success: -+ print "Unexpected Failures:", failures ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures + print "Unexpected Success: ", unx_success + else: -+ print "All tests normal" ++ print "All tests normal:", successes, "ok,", failures, "failed" + + +class ConfCSVDialect(csv.Dialect): @@ -15691,2662 +19954,67 @@ index 0000000..c896bc6 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') ++ argp.add_argument("--test_root", default="/opt/conform/h265", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") + argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(conf_root)) ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests) ++ doconf(csva, args.tests, args.test_root) + -diff --git b/pi-util/qasm.py a/pi-util/qasm.py -new file mode 100644 -index 0000000..1eacc04 +diff --git a/pi-util/qem.sh b/pi-util/qem.sh +new file mode 100755 +index 0000000..47dd071 --- /dev/null -+++ a/pi-util/qasm.py -@@ -0,0 +1,2502 @@ -+#!/usr/bin/env python -+ -+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment -+# add r0, r0, 1 # implicit mul nop -+# nop # explicit add nop, implicit mul nop -+# bkpt # implicit add/mul nop -+# mov r0, 0x1234 # hex immediate -+# mov r0, 20 * 40 # expressions... 
-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits -+# mov r0, a:label # put address of label in r0 -+# :label -+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address -+# :1 -+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address -+# :1 # multiple definitions of numeric labels (differentiated using f/b) -+# .set my_val, 3 # introduce alias for 3 -+# .set my_reg, r0 # and for r0 -+# mov my_reg, my_val # then use them -+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 -+# .macro my_add, a, b, c # a, b, c act as if .set on entry -+# .set my_val, 10 -+# add a, b, c -+# mov r0, my_val # 10 -+# .endm # forget all .sets since .macro (including arg .sets) -+# mov r0, my_val # 3 -+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) -+ -+import math -+import optparse -+import os -+import random -+import re -+import struct -+import sys -+import time -+ -+############################################################################### -+# constants -+############################################################################### -+ -+# ops -+###### -+ -+# negatives are internal qasm ops -+ -+AOP_MOV = -3 # two operands -+AOP_BRA = -2 # two operands -+AOP_BRR = -1 # two operands -+AOP_NOP = 0x00 # no operands -+AOP_FADD = 0x01 -+AOP_FSUB = 0x02 -+AOP_FMIN = 0x03 -+AOP_FMAX = 0x04 -+AOP_FMINABS = 0x05 -+AOP_FMAXABS = 0x06 -+AOP_FTOI = 0x07 # two operands -+AOP_ITOF = 0x08 # two operands -+AOP_ADD = 0x0c -+AOP_SUB = 0x0d -+AOP_SHR = 0x0e -+AOP_ASR = 0x0f -+AOP_ROR = 0x10 -+AOP_SHL = 0x11 -+AOP_MIN = 0x12 -+AOP_MAX = 0x13 -+AOP_AND = 0x14 -+AOP_OR = 0x15 -+AOP_XOR = 0x16 -+AOP_NOT = 0x17 # two operands -+AOP_CLZ = 0x18 # two operands -+AOP_V8ADDS = 0x1e -+AOP_V8SUBS = 0x1f -+ -+MOP_MOV = -1 # two operands -+MOP_NOP = 0x0 # no operands -+MOP_FMUL = 0x1 -+MOP_MUL24 = 0x2 -+MOP_V8MULD = 0x3 -+MOP_V8MIN = 0x4 -+MOP_V8MAX = 0x5 -+MOP_V8ADDS = 0x6 -+MOP_V8SUBS = 0x7 -+ -+# ldi modes -+############ -+ -+LDI_32 = 0 -+LDI_EL_SIGNED = 1 -+LDI_EL_UNSIGNED = 3 -+LDI_SEMA = 4 -+ -+# conds -+######## -+ -+COND_NEVER = 0 -+COND_ALWAYS = 1 -+COND_IFZ = 2 -+COND_IFNZ = 3 -+COND_IFN = 4 -+COND_IFNN = 5 -+COND_IFC = 6 -+COND_IFNC = 7 -+ -+BCOND_ALLZ = 0 -+BCOND_ALLNZ = 1 -+BCOND_ANYZ = 2 -+BCOND_ANYNZ = 3 -+BCOND_ALLN = 4 -+BCOND_ALLNN = 5 -+BCOND_ANYN = 6 -+BCOND_ANYNN = 7 -+BCOND_ALLC = 8 -+BCOND_ALLNC = 9 -+BCOND_ANYC = 10 -+BCOND_ANYNC = 11 -+BCOND_ALWAYS = 15 -+ -+# packing/unpacking -+#################### -+ -+# regfile a pack modes -+PACK_A_NOP = 0 -+PACK_A_16A = 1 -+PACK_A_16B = 2 -+PACK_A_8888 = 3 -+PACK_A_8A = 4 -+PACK_A_8B = 5 -+PACK_A_8C = 6 -+PACK_A_8D = 7 -+PACK_A_32S = 8 -+PACK_A_16AS = 9 -+PACK_A_16BS = 10 -+PACK_A_8888S = 11 -+PACK_A_8AS = 12 -+PACK_A_8BS = 13 -+PACK_A_8CS = 14 -+PACK_A_8DS = 15 -+ -+# mul unit pack modes -+PACK_MUL_NOP = 0 -+PACK_MUL_8888 = 3 -+PACK_MUL_8A = 4 -+PACK_MUL_8B = 5 -+PACK_MUL_8C = 6 -+PACK_MUL_8D = 7 -+ -+# regfile a unpack modes -+UNPACK_A_NOP = 0 -+UNPACK_A_16A = 1 -+UNPACK_A_16B = 2 -+UNPACK_A_8R = 3 -+UNPACK_A_8A = 4 -+UNPACK_A_8B = 5 -+UNPACK_A_8C = 6 -+UNPACK_A_8D = 7 -+ -+# r4 unpack modes -+UNPACK_R4_NOP = 0 -+UNPACK_R4_16A = 1 -+UNPACK_R4_16B = 2 -+UNPACK_R4_8R = 3 -+UNPACK_R4_8A = 4 -+UNPACK_R4_8B = 5 -+UNPACK_R4_8C = 6 -+UNPACK_R4_8D = 7 -+ -+PACK_TYPE_INT = 0 -+PACK_TYPE_FLOAT = 1 -+PACK_TYPE_EITHER = -1 -+ -+PACK_MODE_A = 0 # regfile a -+PACK_MODE_M = 1 # mul unit -+PACK_MODE_EITHER = -1 -+ -+UNPACK_LOC_A = 0 # regfile a -+UNPACK_LOC_R4 = 1 # r4 -+UNPACK_LOC_AB = 2 # either 
regfile a or regfile b -+UNPACK_LOC_OTHER = 3 # somewhere else -+ -+# args -+####### -+ -+# loc_t, ie internal -+MUX_AC = 0 -+MUX_ANY = 1 -+MUX_A = 2 -+MUX_B = 3 -+RW_EITHER = 0 -+RW_READ = 1 -+RW_WRITE = 2 -+ -+RADDR_NOP = 39 -+ -+# negatives are for internal use -+RMUX_SEMA = -6 -+RMUX_LABEL = -5 -+RMUX_IMMV = -4 -+RMUX_IMM = -3 -+RMUX_AC = -2 -+RMUX_ANY = -1 -+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 -+RMUX_A = 6 -+RMUX_B = 7 -+ -+WADDR_R0 = 32 # followed by R1, R2, R3 -+WADDR_NOP = 39 -+ -+WMUX_ANY = 0 -+WMUX_A = 1 -+WMUX_B = 2 -+ -+# signals -+########## -+ -+SIG_BKPT = 0 -+SIG_NORMAL = 1 -+SIG_THRSW = 2 -+SIG_THREND = 3 -+SIG_SBWAIT = 4 -+SIG_SBDONE = 5 -+SIG_INT = 6 # on a0 -+SIG_LTHRSW = 6 # on b0 -+SIG_LOADCV = 7 -+SIG_LOADC = 8 -+SIG_LDCEND = 9 -+SIG_LDTMU0 = 10 -+SIG_LDTMU1 = 11 -+SIG_ROTATE = 12 # on a0 -+SIG_LOADAM = 12 # on b0 -+SIG_SMALLIMMED = 13 -+SIG_IMMED = 14 -+SIG_BRANCH = 15 -+ -+# multi-line assembler constructs -+################################## -+ -+CONSTRUCT_MACRO = 0x1 -+CONSTRUCT_IF = 0x2 -+CONSTRUCT_ELSE = 0x4 -+CONSTRUCT_REP = 0x8 -+ -+############################################################################### -+# helpers -+############################################################################### -+ -+def asm_error(message, location = None): -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm ERROR: %s\n' % message) -+ else: -+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) -+ sys.exit(-1) -+ -+def asm_warning(message, location = None): -+ if disable_warnings or (nwarn_level != 0): -+ return -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm WARNING: %s\n' % message) -+ else: -+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) -+ if warnings_are_errors: -+ asm_error('warnings are errors!', location) -+ -+# smart_split('') = [] -+# smart_split('a') = ['a'] -+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] -+def smart_split(s, delim = ',', count = 0): -+ if len(s) == 0: -+ return [] -+ parts = [] -+ depth = 0 -+ i = 0 -+ for j in xrange(len(s)): -+ if s[j] in '([{': -+ depth += 1 -+ elif s[j] in ')]}': -+ depth -= 1 -+ elif (s[j] == delim) and (depth == 0): -+ parts.append(s[i:j]) -+ i = j + 1 -+ if len(parts) == count: -+ break -+ if depth != 0: -+ asm_error('bracket nesting fail') -+ parts.append(s[i:]) -+ return parts -+ -+def is_int(x): -+ return isinstance(x, int) or isinstance(x, long) -+ -+############################################################################### -+# "parsing" stuff -+############################################################################### -+ -+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') -+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') -+re_include = re.compile('\\.include\\s(?P.+)$') -+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') -+re_unset = re.compile('\\.unset\\s+(?P\\w+)$') -+re_eval = re.compile('\\.eval\\s(?P.+)$') -+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') -+re_assert = re.compile('\\.assert\\s(?P.+)$') -+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') -+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') -+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') -+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') 
-+re_label_ref_left = re.compile('\\b([ar]):') -+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') -+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... -+ -+# ops -+###### -+ -+aops = { -+ 'mov': (AOP_MOV, 2), -+ 'bra': (AOP_BRA, 2), -+ 'brr': (AOP_BRR, 2), -+ 'nop': (AOP_NOP, 0), -+ 'fadd': (AOP_FADD, 3), -+ 'fsub': (AOP_FSUB, 3), -+ 'fmin': (AOP_FMIN, 3), -+ 'fmax': (AOP_FMAX, 3), -+ 'fminabs': (AOP_FMINABS, 3), -+ 'fmaxabs': (AOP_FMAXABS, 3), -+ 'ftoi': (AOP_FTOI, 2), -+ 'itof': (AOP_ITOF, 2), -+ 'add': (AOP_ADD, 3), -+ 'sub': (AOP_SUB, 3), -+ 'shr': (AOP_SHR, 3), -+ 'asr': (AOP_ASR, 3), -+ 'ror': (AOP_ROR, 3), -+ 'shl': (AOP_SHL, 3), -+ 'min': (AOP_MIN, 3), -+ 'max': (AOP_MAX, 3), -+ 'and': (AOP_AND, 3), -+ 'or': (AOP_OR, 3), -+ 'xor': (AOP_XOR, 3), -+ 'not': (AOP_NOT, 2), -+ 'clz': (AOP_CLZ, 2), -+ 'v8adds': (AOP_V8ADDS, 3), -+ 'v8subs': (AOP_V8SUBS, 3)} -+ -+def get_aop(aop): -+ if aop not in aops: -+ asm_error('invalid aop') -+ return aops[aop] -+ -+mops = { -+ 'mov': (MOP_MOV, 2), -+ 'nop': (MOP_NOP, 0), -+ 'fmul': (MOP_FMUL, 3), -+ 'mul24': (MOP_MUL24, 3), -+ 'v8muld': (MOP_V8MULD, 3), -+ 'v8min': (MOP_V8MIN, 3), -+ 'v8max': (MOP_V8MAX, 3), -+ 'v8adds': (MOP_V8ADDS, 3), -+ 'v8subs': (MOP_V8SUBS, 3)} -+ -+def get_mop(mop): -+ if mop not in mops: -+ asm_error('invalid mop') -+ return mops[mop] -+ -+# conds -+######## -+ -+conds = { -+ 'ifz': COND_IFZ, -+ 'ifnz': COND_IFNZ, -+ 'ifn': COND_IFN, -+ 'ifnn': COND_IFNN, -+ 'ifc': COND_IFC, -+ 'ifnc': COND_IFNC} -+ -+def get_cond(cond): -+ if not cond: -+ return COND_ALWAYS -+ if cond not in conds: -+ asm_error('invalid cond') -+ return conds[cond] -+ -+bconds = { -+ 'allz': BCOND_ALLZ, -+ 'allnz': BCOND_ALLNZ, -+ 'anyz': BCOND_ANYZ, -+ 'anynz': BCOND_ANYNZ, -+ 'alln': BCOND_ALLN, -+ 'allnn': BCOND_ALLNN, -+ 'anyn': BCOND_ANYN, -+ 'anynn': BCOND_ANYNN, -+ 'allc': BCOND_ALLC, -+ 'allnc': BCOND_ALLNC, -+ 'anyc': BCOND_ANYC, -+ 'anync': BCOND_ANYNC} -+ -+def get_bcond(bcond): -+ if not bcond: -+ return BCOND_ALWAYS -+ if bcond not in bconds: -+ asm_error('invalid bcond') -+ return bconds[bcond] -+ -+def get_setf(setf): -+ if not setf: -+ return False -+ return True -+ -+# packing/unpacking -+#################### -+ -+packs = { -+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), -+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), -+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), -+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} -+ -+def get_pack(pack): -+ if not pack: -+ 
return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) -+ if pack not in packs: -+ asm_error('invalid pack') -+ return packs[pack] -+ -+a_unpacks = { -+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), -+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), -+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), -+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), -+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), -+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), -+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), -+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), -+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), -+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), -+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), -+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), -+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} -+ -+def get_a_unpack(unpack): -+ if not unpack: -+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) -+ if unpack not in a_unpacks: -+ asm_error('invalid ra unpack') -+ return a_unpacks[unpack] + (UNPACK_LOC_A,) -+ -+r4_unpacks = { -+ '16af': UNPACK_R4_16A, -+ '16bf': UNPACK_R4_16B, -+ '8dr': UNPACK_R4_8R, -+ '8ac': UNPACK_R4_8A, -+ '8bc': UNPACK_R4_8B, -+ '8cc': UNPACK_R4_8C, -+ '8dc': UNPACK_R4_8D} -+ -+def get_r4_unpack(unpack): -+ if not unpack: -+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ if unpack not in r4_unpacks: -+ asm_error('invalid r4 unpack') -+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ -+# args -+####### -+ -+class loc_t: -+ def __init__(self, mux, i, rot, r5_rot, pack, rw): -+ self.mux = mux -+ self.i = i -+ self.rot = rot % 16 -+ self.r5_rot = r5_rot % 16 -+ self.pack = pack -+ self.rw = rw -+ -+ def copy(self): -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __add__(self, i): -+ if not is_int(i): -+ raise Exception('can only add integer to loc') -+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __sub__(self, i): -+ if not is_int(i): -+ raise Exception('can only subtract integer from loc') -+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __cmp__(self, other): -+ if is_int(other): -+ return cmp(self.i, other) -+ if not isinstance(other, loc_t): -+ raise Exception('can only compare loc to integer or other loc') -+ if self.mux != other.mux: -+ return cmp(self.mux, other.mux) -+ if self.i != other.i: -+ return cmp(self.i, other.i) -+ if self.rot != other.rot: -+ return cmp(self.rot, other.rot) -+ if self.r5_rot != other.r5_rot: -+ return cmp(self.r5_rot, other.r5_rot) -+ return cmp(self.pack, other.pack) -+ -+ def is_r5(self): -+ return (self.mux == MUX_AC) and (self.i == 5) -+ -+ def shift(self, rot, left): -+ if isinstance(rot, loc_t) and rot.is_r5(): -+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: -+ raise Exception('can\'t rotate by rotated/unpacked r5') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) -+ if not is_int(rot): -+ raise Exception('can only rotate by integer or r5') -+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) -+ -+ def __lshift__(self, rot): -+ return self.shift(rot, True) -+ -+ def __rshift__(self, rot): -+ return self.shift(rot, False) -+ -+ def __getattr__(self, name): -+ # discard the first character if it is an underscore. 
this is a total hack -+ # to allow packs starting with a digit to work -+ if name[0] == '_': -+ name = name[1:] -+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): -+ if self.pack: -+ raise Exception('can\'t specify two packs') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) -+ raise AttributeError() -+ -+ def __str__(self): -+ if self.mux == MUX_AC: -+ return 'r%d' % self.i -+ if self.mux == MUX_ANY: -+ return 'rany%d' % self.i -+ if self.mux == MUX_A: -+ return 'ra%d' % self.i -+ if self.mux == MUX_B: -+ return 'rb%d' % self.i -+ assert 0 -+ -+class sema_t: -+ def __init__(self, acq, i): -+ if not is_int(i): -+ raise Exception('semaphore index must be integer') -+ self.acq = acq -+ self.i = i -+ -+class label_t: -+ def __init__(self, rel, name, offset): -+ self.rel = rel -+ self.name = name -+ self.offset = offset -+ -+ def __add__(self, offset): -+ return label_t(self.rel, self.name, self.offset + offset) -+ -+ def __sub__(self, offset): -+ return label_t(self.rel, self.name, self.offset - offset) -+ -+class label_maker_t: -+ def __init__(self, rel): -+ self.rel = rel -+ -+ def __getattr__(self, name): -+ # we discard the first character. this is a total hack to allow numeric labels to work -+ if not re_label_ref_right.match(name[1:]): -+ raise Exception('invalid label reference') -+ return label_t(self.rel, name[1:], 0) -+ -+def bits(x, n): -+ if (x >> n) != 0: -+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) -+ return x -+ -+def bitsw(x, n): -+ if x == (1 << n): -+ x = 0 -+ return bits(x, n) -+ -+def bitsws(x, n): -+ if x == (1 << (n - 1)): -+ x = 0 -+ if -(1 << (n - 1)) <= x < 0: -+ x += 1 << n -+ return bits(x, n) -+ -+def vpm_setup(n, stride, addr, v2 = False): -+ horiz, laned, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if horiz: -+ if x != 0: -+ raise Exception('horizontal accesses must have x of 0') -+ else: -+ if (y & 0xf) != 0: -+ raise Exception('vertical accesses must be 16 row aligned') -+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) -+ if v2: -+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | -+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) -+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | -+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) -+ -+def vdw_setup_0(n, m, addr): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | -+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) -+ -+def vdr_setup_0(n, m, addr, vpm_stride, stride): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if (stride < 8) or (stride & (stride - 1)): -+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') -+ log2_stride = 3 -+ while (1 << log2_stride) != stride: -+ log2_stride += 1 -+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | -+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | -+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) -+ -+class allocator_t: -+ def __init__(self, *available): -+ self.available = list(available) -+ self.allocated = {} -+ self.reserved = [] -+ -+ def copy(self): -+ a = allocator_t() -+ a.available = self.available[:] -+ 
a.allocated = self.allocated.copy() -+ a.reserved = self.reserved[:] -+ return a -+ -+ def forget(self): -+ self.__init__(self.available + self.allocated.values() + self.reserved) -+ -+ def reserve(self, *rs): -+ for r in rs: -+ self.available.remove(r) -+ self.reserved.append(r) -+ -+ def retire(self, name): -+ r = self.allocated.pop(name) -+ del r.__invert__ -+ del r.retire -+ self.available.append(r) -+ return r -+ -+ def __getattr__(self, name): -+ if name not in self.allocated: -+ r = self.available.pop() -+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax -+ r.__invert__ = r.retire -+ self.allocated[name] = r -+ return self.allocated[name] -+ -+def pragma_allow_xor_0(x): -+ global allow_xor_0 -+ -+ if not isinstance(x, bool): -+ raise Exception('allow_xor_0 must be bool') -+ x, allow_xor_0 = allow_xor_0, x -+ return x -+ -+def pragma_dont_warn_when_mul_rot_inp_r5(x): -+ global dont_warn_when_mul_rot_inp_r5 -+ -+ if not isinstance(x, bool): -+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') -+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x -+ return x -+ -+arg_defs = { -+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) -+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), -+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), -+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), -+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), -+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), -+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), -+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), -+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), -+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), -+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), -+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), -+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), -+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), -+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), -+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), -+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), -+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), -+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), -+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), -+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), -+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), -+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), -+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), -+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), -+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), -+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), -+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), -+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), -+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), -+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), -+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), -+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), -+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), -+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), -+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), -+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), -+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), -+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), -+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), -+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), -+ -+ # semaphore acq/rel -+ 'sacq': 
lambda i: sema_t(True, i), -+ 'srel': lambda i: sema_t(False, i), -+ -+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) -+ 'r_label_maker': label_maker_t(True), -+ 'a_label_maker': label_maker_t(False), -+ -+ # handy functions -+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], -+ 'sqrt': math.sqrt, -+ 'sin': math.sin, -+ 'cos': math.cos, -+ 'atan2': math.atan2, -+ 'pi': math.pi, -+ 'rseed': random.seed, -+ 'rand': lambda: int(random.getrandbits(32)), -+ 'bits': bits, -+ 'bitsw': bitsw, -+ 'bitsws': bitsws, -+ -+ # handy vpm/vdw/vdr stuff -+ 'h32': lambda y: (1, 0, 0, y, 0, 0), -+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), -+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), -+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), -+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), -+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), -+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), -+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), -+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), -+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), -+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), -+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), -+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), -+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), -+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), -+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), -+ 'vpm_setup': vpm_setup, -+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), -+ 'vdw_setup_0': vdw_setup_0, -+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), -+ 'vdr_setup_0': vdr_setup_0, -+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride -+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), -+ -+ # annotations -+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), -+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), -+ 'preserve_cond': ('preserve_cond', 1), -+ -+ # somewhat experimental register allocator -+ 'allocator_t': allocator_t, -+ -+ # pragmas -+ 'pragma_allow_xor_0': pragma_allow_xor_0, -+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} -+ -+# accumulators and regs (regular names -- r0, ra0, etc) -+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) -+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+ -+def arg_eval(arg, sets): -+ s = (arg.strip().split('.', 1) + [None])[:2] -+ if s[0] == '-': -+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) -+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... -+ arg = re_pack.sub('._\\1', arg) -+ try: -+ # todo: i would like to be able to pass both arg_defs and sets in here -+ # (with sets hiding arg_defs in the case of conflicts), but the obvious -+ # dict(arg_defs, **sets) won't permit things such as: -+ # .set f, lambda x: y -+ # .set y, 4 -+ # (the y in the lambda will be looked up in the temporary dict we created -+ # when evaluating the f .set, which doesn't contain y) -+ # -+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the -+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. 
-+ # also, before dumping sets at the end, we strip out the arg_defs stuff -+ # (this isn't entirely correct as we want to dump sets that are hiding -+ # arg_defs) -+ return eval(arg, sets) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while evaluating argument') -+ -+# doesn't check/fixup pack -+def check_and_fixup_loc(loc, read): -+ if (not read) and (loc.rw == RW_READ): -+ asm_error('writing to read-only hardware register') -+ if read and (loc.rw == RW_WRITE): -+ asm_error('reading from write-only hardware register') -+ if not read: -+ # conceptually, we are writing to a location rotated right by -+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by -+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location -+ loc.rot = -loc.rot % 16 -+ loc.r5_rot = -loc.r5_rot % 16 -+ if (loc.rot != 0) and (loc.r5_rot != 0): -+ asm_error('can\'t rotate by both r5 and immediate') -+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): -+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) -+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later -+ if not read: -+ asm_error('target doesn\'t support write rotation') -+ if loc.mux == MUX_ANY: -+ loc.mux = MUX_A # can't do rotated read from regfile b -+ if loc.mux != MUX_A: -+ asm_error('rotation on read only allowed from regfile a') -+ if loc.i >= 32: -+ asm_warning('rotation only works from physical regfile') -+ if loc.mux == MUX_AC: -+ if (loc.i < 0) or (loc.i >= 6): -+ asm_error('reg out of range') -+ if not read: -+ if loc.i == 4: -+ asm_error('not allowed to write to r4') -+ if loc.i == 5: -+ -+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') -+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): -+ if (loc.i < 0) or (loc.i >= 64): -+ asm_error('reg out of range') -+ else: -+ assert 0 -+ -+def get_dst(dst, sets): -+ if not dst: -+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 -+ dst = arg_eval(dst, sets) -+ if not isinstance(dst, loc_t): -+ asm_error('invalid dst') -+ dst = dst.copy() -+ check_and_fixup_loc(dst, False) -+ pack = get_pack(dst.pack) -+ if dst.mux == MUX_AC: -+ if pack[2] == PACK_MODE_A: -+ asm_warning('ra packing only works when writing to physical regfile') -+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation -+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): -+ asm_warning('ra packing only works when writing to physical regfile') -+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_ANY: -+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_B: -+ if pack[2] == PACK_MODE_A: -+ asm_error('this packing operation can only be used for regfile a') -+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot -+ assert 0 -+ -+def get_src(src, sets): -+ if not src: -+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None -+ src = arg_eval(src, sets) -+ if isinstance(src, sema_t): -+ if not have_sema: -+ asm_error('target does not support semaphores') -+ if (src.i < 0) or (src.i >= 16): -+ asm_error('semaphore number must be in [0, 16)') -+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, label_t): -+ return (src.name, src.rel, src.offset), 
RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, list): -+ if len(src) != 16: -+ asm_error('vector immediate must have length 16') -+ src = src[:] -+ for i in xrange(16): -+ if not is_int(src[i]): -+ asm_error('all elements of vector immediate must be integers') -+ src[i] &= (1 << 32) - 1 -+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if is_int(src): -+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if not isinstance(src, loc_t): -+ asm_error('invalid src') -+ src = src.copy() -+ check_and_fixup_loc(src, True) -+ if mulw_rotate: -+ srot, sr5rot = 0, 0 -+ drot, dr5rot = src.rot, src.r5_rot -+ else: -+ srot, sr5rot = src.rot, src.r5_rot -+ drot, dr5rot = 0, 0 -+ if src.mux == MUX_AC: -+ if src.i == 4: -+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b -+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot -+ if src.mux == MUX_ANY: -+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot -+ if src.mux == MUX_B: -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ assert 0 -+ -+# signals -+########## -+ -+sigs = { -+ 'bkpt': SIG_BKPT, -+ 'thrsw': SIG_THRSW, -+ 'thrend': SIG_THREND, -+ 'sbwait': SIG_SBWAIT, -+ 'sbdone': SIG_SBDONE, -+ 'int': SIG_INT, -+ 'loadcv': SIG_LOADCV, -+ 'loadc': SIG_LOADC, -+ 'ldcend': SIG_LDCEND, -+ 'ldtmu0': SIG_LDTMU0, -+ 'ldtmu1': SIG_LDTMU1} -+ -+def get_sig(sig): -+ if sig not in sigs: -+ return SIG_NORMAL -+ return sigs[sig] -+ -+# annotations -+############## -+ -+def get_annots(annot, sets): -+ annots = arg_eval(annot, sets) -+ if isinstance(annots, list): -+ annots = annots[:] -+ else: -+ annots = [annots] -+ for i, annot in enumerate(annots): -+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or -+ (not is_int(annot[1]))): -+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') -+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) -+ return annots -+ -+############################################################################### -+# core -+############################################################################### -+ -+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): -+ needfloat = PACK_TYPE_EITHER -+ havefloata = False -+ havefloatr4 = False -+ unpacka = None -+ unpackr4 = None -+ forcebs = [False, False, False, False] -+ forcerafloat = False -+ -+ pm = PACK_MODE_EITHER -+ for i in (0, 1, 2, 3): -+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): -+ assert rpacks[i][0] == 0 -+ else: -+ if rpacks[i][2] == UNPACK_LOC_A: -+ if unpacka is None: -+ unpacka = rpacks[i][0] -+ elif unpacka != rpacks[i][0]: -+ asm_error('conflicting unpack operations on regfile a') -+ havefloata = havefloata or rfloats[i] -+ elif rpacks[i][2] == UNPACK_LOC_R4: -+ if unpackr4 is None: -+ unpackr4 = rpacks[i][0] -+ elif unpackr4 != rpacks[i][0]: -+ asm_error('conflicting unpack operations on r4') -+ havefloatr4 = havefloatr4 or rfloats[i] -+ else: -+ assert 0 -+ -+ if rpacks[i][1] != PACK_TYPE_EITHER: -+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): 
-+ asm_error('conflicting unpack float requirements') -+ needfloat = rpacks[i][1] -+ for i in (0, 1, 2, 3): -+ if rpacks[i][2] == UNPACK_LOC_AB: -+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): -+ forcebs[i] = True # non-nop unpack from regfile a. must use b -+ -+ if unpacka: -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: -+ havefloata = True -+ forcerafloat = True -+ havefloat = havefloata -+ else: -+ havefloat = havefloatr4 -+ -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): -+ asm_error('float unpack operation used in integer alu operations') -+ if (needfloat == PACK_TYPE_INT) and havefloat: -+ asm_error('integer unpack operation used in float alu operation') -+ -+ unpack = 0 -+ if unpacka and unpackr4: -+ asm_error('cannot specify pack operation for both regfile a and r4') -+ if unpacka: -+ pm = PACK_MODE_A -+ unpack = unpacka -+ elif unpackr4: -+ pm = PACK_MODE_M -+ unpack = unpackr4 -+ -+ pack = 0 -+ if wpacks[0][2] == PACK_MODE_M: -+ asm_error('mul-unit pack operation used on add result') -+ for i in (0, 1): -+ if wpacks[i][2] == PACK_MODE_A: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_A -+ pack = wpacks[i][0] -+ elif wpacks[i][2] == PACK_MODE_M: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_M -+ pack = wpacks[i][0] -+ -+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): -+ asm_error('float pack operation used with integer alu result') -+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: -+ asm_error('integer pack operation used with float alu result') -+ -+ if pm == PACK_MODE_EITHER: -+ pm = PACK_MODE_A -+ return pm, pack, unpack, forcebs, forcerafloat -+ -+# immediates that can be encoded with SIG_SMALLIMMED -+bimms = {} -+bimms.update((i, i) for i in xrange(16)) -+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) -+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) -+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) -+ -+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): -+ if rmux == RMUX_SEMA: -+ asm_error('semaphore op can only be used with mov') -+ if rmux == RMUX_LABEL: -+ asm_error('label not allowed here') -+ if rmux == RMUX_IMMV: -+ asm_error('vector immediate can only be used with mov') -+ if rmux == RMUX_IMM: -+ if raddr not in bimms: -+ asm_error('can\'t encode immediate 0x%08x' % raddr) -+ raddr = bimms[raddr] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and immediates don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if rmux == RMUX_AC: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr -+ if rmux == RMUX_ANY: -+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if (not immb) and (raddr_b == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if raddr_a is None: -+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) -+ raddr_a = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if raddr_b is None: -+ assert not immb -+ raddr_b = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ asm_error('no free read slots') -+ if rmux == RMUX_A: -+ if (not mulw_rotate) and (raddr_a is not None) and ( -+ ((raddr[1] != 0) | ((raddr[2] != 
0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): -+ asm_error('conflicting rotations from regfile a') -+ if raddr_a is None: -+ raddr_a = raddr[0] -+ elif raddr_a != raddr[0]: -+ asm_error('can only read from one location in each regfile') -+ arot_r5 = raddr[2] -+ if raddr[1] == 0: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ raddr = 48 + raddr[1] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and rotation don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if rmux == RMUX_B: -+ if immb: -+ asm_error('regfile b and rotation/immediates don\'t mix') -+ if raddr_b is None: -+ raddr_b = raddr -+ elif raddr_b != raddr: -+ asm_error('can only read from one location in each regfile') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ assert 0 -+ -+# ok if: -+# - accumulator (r0-r3) -+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, -+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it -+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: -+# what about vr_wait/vw_wait/mutex? -+def read_rot_ok(rmux, raddr_a, raddr_b): -+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or -+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy -+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy -+ -+def asm_flush_prog_data(): -+ global prog_data -+ -+ while len(prog_data) & 7: -+ prog_data.append(0) -+ for i in xrange(0, len(prog_data), 8): -+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), -+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) -+ prog_data = [] -+ -+def asm_line(sets, location, line): -+ global current_location, construct, nwarn_level -+ -+ prev_location = current_location -+ current_location = location -+ -+ try: -+ if construct != None: -+ if re_macro.match(line): -+ construct_stack.append(CONSTRUCT_MACRO) -+ elif re_if.match(line): -+ construct_stack.append(CONSTRUCT_IF) -+ elif re_rep.match(line): -+ construct_stack.append(CONSTRUCT_REP) -+ else: -+ else_m = line == '.else' -+ elif_m = re_elif.match(line) -+ if elif_m: -+ end_construct = CONSTRUCT_IF -+ else: -+ end_construct = { -+ '.endm': CONSTRUCT_MACRO, -+ '.else': CONSTRUCT_IF, -+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, -+ '.endr': CONSTRUCT_REP}.get(line) -+ if end_construct is not None: -+ end_construct &= construct_stack.pop() -+ if end_construct == 0: -+ if elif_m: -+ asm_error('unexpected .elif') -+ asm_error('unexpected %s' % line) -+ if len(construct_stack) == 0: -+ lines = construct -+ construct = None -+ if end_construct == CONSTRUCT_MACRO: -+ return -+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): -+ condition_if, condition_else = lines[0] -+ lines = lines[1:] -+ if condition_if: -+ for location, line in lines: -+ asm_line(sets, location, line) -+ if else_m: -+ construct = [(condition_else, False)] -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ if elif_m.group('set'): -+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) -+ else: -+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) -+ condition_else = condition_else and (not condition_if) -+ construct = [(condition_if, condition_else)] 
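-+ # (illustrative) construct[0] now reads: (this .elif branch runs,
-+ # no branch has run yet so a later .elif/.else may still fire)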
-+ construct_stack.append(CONSTRUCT_IF) -+ return -+ if end_construct == CONSTRUCT_REP: -+ name, count = lines[0] -+ lines = lines[1:] -+ for i in xrange(count): -+ sets[name] = i -+ for location, line in lines: -+ asm_line(sets, location, line) -+ return -+ assert 0 -+ if else_m: -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ construct_stack.append(CONSTRUCT_IF) -+ construct.append((current_location, line)) -+ return -+ -+ if line in ('.endm', '.else', '.endif', '.endr'): -+ asm_error('unexpected %s' % line) -+ if re_elif.match(line): -+ asm_error('unexpected .elif') -+ -+ m = re_macro.match(line) -+ if m: -+ construct = [] -+ construct_stack.append(CONSTRUCT_MACRO) -+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) -+ return -+ -+ m = re_if.match(line) -+ if m: -+ if m.group('set'): -+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) -+ else: -+ # not not forces condition to a bool (this matters if condition is -+ # something mutable like a list) -+ condition = not not arg_eval(m.group('condition'), sets) -+ construct = [(condition, not condition)] -+ construct_stack.append(CONSTRUCT_IF) -+ return -+ -+ m = re_rep.match(line) -+ if m: -+ count = arg_eval(m.group('count'), sets) -+ if not is_int(count): -+ asm_error('.rep count must be integer') -+ construct = [(m.group('name'), count)] -+ construct_stack.append(CONSTRUCT_REP) -+ return -+ -+ m = re_include.match(line) -+ if m: -+ filename = arg_eval(m.group('filename'), sets) -+ if not isinstance(filename, str): -+ asm_error('expected string') -+ asm_file(sets, '%s: %s' % (current_location, filename), filename) -+ return -+ -+ m = re_set.match(line) -+ if m: -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) -+ return -+ -+ m = re_unset.match(line) -+ if m: -+ name = m.group('name') -+ if name not in sets: -+ asm_error('%s not set' % name) -+ if name in arg_defs: # todo: see arg_eval -+ sets[name] = arg_defs[name] -+ else: -+ del sets[name] -+ return -+ -+ m = re_eval.match(line) -+ if m: -+ arg_eval(m.group('expr'), sets) -+ return -+ -+ m = re_print_info_warn_error.match(line) -+ if m: -+ def print_fn(message): -+ print message -+ def info_fn(message): -+ sys.stderr.write('%s\n' % message) -+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ -+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) -+ return -+ -+ m = re_assert.match(line) -+ if m: -+ if not arg_eval(m.group('condition'), sets): -+ asm_error('assertion failure: \'%s\'' % m.group('condition')) -+ return -+ -+ m = re_data.match(line) -+ if m: -+ size = int(m.group('size')) -+ for datum in smart_split(m.group('data')): -+ datum = arg_eval(datum, sets) -+ if not is_int(datum): -+ asm_error('datum must be integer') -+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) -+ return -+ -+ m = re_macro_inst.match(line) -+ if m: -+ name = m.group('name') -+ if name in macros: -+ params, lines = macros[name] -+ args = smart_split(m.group('args')) -+ if len(args) > len(params): -+ asm_error('too many arguments to macro') -+ sets = sets.copy() -+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) -+ for param in params[len(args):]: -+ if param in sets: -+ if param in arg_defs: # todo: see arg_eval -+ sets[param] = arg_defs[param] -+ else: -+ del sets[param] -+ for location, line in lines: -+ asm_line(sets, '%s: %s' % (current_location, location), line) -+ return -+ -+ if line == '.pushnwarn': -+ nwarn_level += 1 -+ 
return -+ if line == '.popnwarn': -+ if nwarn_level == 0: -+ asm_error('.popnwarn without .pushnwarn') -+ nwarn_level -= 1 -+ return -+ -+ # everything below assumes prog is up to date -+ asm_flush_prog_data() -+ -+ m = re_label.match(line) -+ if m: -+ name = m.group('name') -+ if name[0].isdigit(): -+ labels.setdefault(name, []).append(len(prog)) -+ else: -+ if name[0] == ':': -+ undecorated_name = name[1:] -+ else: -+ undecorated_name = name -+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): -+ asm_error('named label defined twice') -+ labels[name] = len(prog) -+ return -+ -+ annots = line.split('@') -+ ops = [op.strip() for op in annots[0].split(';')] -+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) -+ sig = get_sig(ops[-1]) -+ if sig != SIG_NORMAL: -+ ops = ops[:-1] -+ if len(ops) > 2: -+ asm_error('too many ops') -+ elif (len(ops) == 1) and (ops[0] == ''): -+ ops = [] -+ ops = (ops + ['nop', 'nop'])[:2] -+ m = re_op.match(ops[0]) -+ if not m: -+ asm_error('invalid syntax') -+ aop, aargs_n = get_aop(m.group('op')) -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ acond = get_bcond(m.group('cond')) -+ else: -+ acond = get_cond(m.group('cond')) -+ asf = get_setf(m.group('sf')) -+ aargs = smart_split(m.group('args')) -+ if len(aargs) != aargs_n: -+ asm_error('wrong operand count') -+ ard, ara, arb = (aargs + [None, None, None])[:3] -+ m = re_op.match(ops[1]) -+ if not m: -+ asm_error('invalid syntax') -+ mop, margs_n = get_mop(m.group('op')) -+ mcond = get_cond(m.group('cond')) -+ msf = get_setf(m.group('sf')) -+ margs = smart_split(m.group('args')) -+ if len(margs) != margs_n: -+ asm_error('wrong operand count') -+ mrd, mra, mrb = (margs + [None, None, None])[:3] -+ # eval srcs first so allocator can retire and reuse registers for dst -+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) -+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) -+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) -+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) -+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) -+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) -+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or -+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): -+ asm_error('cannot have 2 arguments with different rotations') -+ if aarmux is not None: -+ awrot = (awrot + aadrot) % 16 -+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 -+ if (awrot != 0) or awrot_r5: -+ asm_error('rotate not allowed on add write') -+ if marmux is not None: -+ mwrot = (mwrot + madrot) % 16 -+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 -+ -+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) -+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) -+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( -+ [aarpack, abrpack, marpack, mbrpack], -+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], -+ aop == AOP_FTOI, -+ [awpack, mwpack], -+ [afloatw, mop == MOP_FMUL]) -+ if forcebs[0]: -+ aarmux = RMUX_B -+ if forcebs[1]: -+ abrmux = RMUX_B -+ if forcebs[2]: -+ marmux = RMUX_B -+ if forcebs[3]: -+ mbrmux = RMUX_B -+ -+ # extend nops to 3 operands -+ if aop == AOP_NOP: -+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC -+ if mop == MOP_NOP: -+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 
0, RMUX_AC, 0, RMUX_AC -+ -+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) -+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): -+ if forcerafloat: -+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand -+ # instead of duplicating the 2nd operand, take the ra operand from -+ # the mul op thus forcing the ra value to be considered a float for -+ # the purposes of unpacking -+ if marmux == RMUX_A: -+ abraddr, abrmux = maraddr, marmux -+ else: -+ assert mbrmux == RMUX_A -+ abraddr, abrmux = mbraddr, mbrmux -+ else: -+ abraddr, abrmux = aaraddr, aarmux -+ else: -+ assert not forcerafloat # can only forcerafloat if we have an unused operand -+ -+ # handle write addrs -+ if (awmux == mwmux) and (awmux != WMUX_ANY): -+ asm_error('add/mul ops not allowed to write to same regfile') -+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A) -+ -+ # handle branch -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ # check setf -+ if asf: -+ asm_error('setf not allowed on bra/brr') -+ -+ # check pack/unpack -+ if (pack != 0) or (unpack != 0): -+ asm_error('pack/unpack not allowed with bra/brr') -+ -+ # handle read address -+ if aarmux == RMUX_LABEL: -+ if (aop == AOP_BRA) and aaraddr[1]: -+ asm_warning('bra with rel label') -+ if (aop == AOP_BRR) and (not aaraddr[1]): -+ asm_warning('brr with abs label') -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ if aarmux == RMUX_ANY: -+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A -+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): -+ asm_error('branch destination must be either label, immediate, or from regfile a') -+ if aarmux == RMUX_IMM: -+ imm = aaraddr -+ raddr = 0 # can't use RADDR_NOP -+ elif aarmux == RMUX_A: -+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): -+ asm_error('rotation of read from regfile a not allowed with branch') -+ if aop == AOP_BRR: -+ asm_warning('brr with ra') -+ imm = 0 -+ raddr = aaraddr[0] -+ else: -+ assert 0 -+ -+ # check mul op is nop -+ if mop != MOP_NOP: -+ asm_error('mul op not allowed with branch') -+ -+ # check sig -+ if sig != SIG_NORMAL: -+ asm_error('no signal allowed with branch') -+ -+ if raddr >= 32: -+ asm_error('can only branch to register locations in physical regfile') -+ if raddr & 1: -+ asm_warning('branch instruction will destroy flags (see hw-2780)') -+ -+ # construct branch instruction -+ prog.append((imm, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), -+ line, annots)) -+ -+ return -+ -+ # use COND_NEVER when possible (might save power / allow mul setf) -+ if not dict(annots).get('preserve_cond', 0): -+ if (awaddr == WADDR_NOP) and (not asf): -+ acond = COND_NEVER -+ if (mwaddr == WADDR_NOP) and (not msf): -+ mcond = COND_NEVER -+ -+ # attempt to convert movs to ldi -+ if (# no mul setf -+ (not msf) and -+ # ops must either be nop or mov of sema/label/imm/immv -+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ # but we don't want 2 nops -+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and -+ # if both ops are movs, srcs must be identical -+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and -+ # no signal -+ (sig == SIG_NORMAL)): -+ # make sure aarmux/aaraddr contains the value -+ if aop != AOP_MOV: -+ aarmux = marmux 
-+ aaraddr = maraddr -+ -+ # convert immediate -+ if aarmux == RMUX_SEMA: -+ ldi_mode = LDI_SEMA -+ elif aarmux == RMUX_LABEL: -+ ldi_mode = LDI_32 -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ elif aarmux == RMUX_IMMV: -+ signed, unsigned = True, True -+ imm = 0 -+ for i, elem in enumerate(aaraddr): -+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): -+ signed = False -+ if elem not in (0, 1, 2, 3): -+ unsigned = False -+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) -+ if not (signed or unsigned): -+ asm_error('can\'t encode vector immediate') -+ if signed: -+ ldi_mode = LDI_EL_SIGNED -+ else: -+ ldi_mode = LDI_EL_UNSIGNED -+ aaraddr, aarmux = imm, RMUX_IMM -+ elif aarmux == RMUX_IMM: -+ ldi_mode = LDI_32 -+ else: -+ assert 0 -+ -+ # construct ldi instruction -+ prog.append((aaraddr, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), -+ line, annots)) -+ -+ return -+ -+ # convert movs to alu ops -+ if aop == AOP_MOV: -+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): -+ aop = AOP_XOR -+ aaraddr, aarmux = 0, RMUX_AC -+ abraddr, abrmux = 0, RMUX_AC -+ else: -+ aop = AOP_OR -+ abraddr, abrmux = aaraddr, aarmux -+ if mop == MOP_MOV: -+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): -+ mop = MOP_V8SUBS -+ maraddr, marmux = 0, RMUX_AC -+ mbraddr, mbrmux = 0, RMUX_AC -+ else: -+ mop = MOP_V8MIN -+ mbraddr, mbrmux = maraddr, marmux -+ -+ # normal alu instruction... -+ -+ # handle setf -+ if asf and (aop == AOP_NOP): -+ asm_error('nop.setf is not allowed in add pipe') -+ if msf and (mop == MOP_NOP): -+ asm_warning('nop.setf, really?') -+ if (aop == AOP_NOP) or (acond == COND_NEVER): -+ sf = msf -+ else: -+ if msf: -+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') -+ sf = asf -+ -+ # handle read addrs -+ raddr_a = None -+ raddr_b = None -+ immb = False -+ arot_r5 = False -+ muxes = [0, 0, 0, 0] -+ if mwrot != 0: -+ raddr_b = 48 + mwrot -+ immb = True -+ if mwrot_r5 and have_am: -+ raddr_b = 48 -+ immb = True -+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last -+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): -+ if f(rmux): -+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) -+ add_a, add_b, mul_a, mul_b = muxes -+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): -+ # some output elements might not be as expected -+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): -+ bad_elems = 0xffff -+ else: -+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 -+ if mwrot > 12: -+ bad_elems ^= 0xffff -+ bad_elems &= dict(annots).get('mul_used', 0xffff) -+ if not msf: -+ if mwaddr == WADDR_NOP: -+ # not writing anywhere and not setting flags. no elements used -+ bad_elems = 0 -+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or -+ ((not ws) and (mwaddr == 37))): -+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ -+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. -+ # only use element 0 -+ bad_elems &= 0x0001 -+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or -+ ((not ws) and (mwaddr == 42))): -+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting -+ # flags. 
only use elements 0, 4, 8, and 12 -+ bad_elems &= 0x1111 -+ if bad_elems: -+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected') -+ if raddr_a is None: -+ raddr_a = RADDR_NOP -+ if raddr_b is None: -+ raddr_b = RADDR_NOP -+ if immb: -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates and signal don\'t mix') -+ sig = SIG_SMALLIMMED -+ if arot_r5 or (mwrot_r5 and (not have_am)): -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates/signal don\'t mix') -+ sig = SIG_ROTATE -+ -+ # construct instruction -+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), -+ line, annots)) -+ finally: -+ current_location = prev_location -+ -+def preprocess_passthrough(file): -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ yield line_number, line -+ -+def asm_file(sets, location, filename, preprocess = None): -+ global current_dir, current_location -+ -+ if filename is None: -+ location = '' -+ file = sys.stdin -+ -+ prev_dir = current_dir -+ else: -+ filename = os.path.normpath(os.path.join(current_dir, filename)) -+ -+ try: -+ file = open(filename) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while opening file %s' % filename) -+ -+ prev_dir = current_dir -+ current_dir = os.path.dirname(filename) -+ -+ prev_location = current_location -+ current_location = location -+ -+ if preprocess is None: -+ preprocess = preprocess_passthrough -+ -+ try: -+ for line_number, line in preprocess(file): -+ # strip off comments and whitespace -+ line = line.split('#')[0].strip() -+ if line == '': -+ continue -+ -+ asm_line(sets, '%s: %d' % (current_location, line_number), line) -+ finally: -+ current_dir = prev_dir -+ current_location = prev_location -+ -+def asm_end_prog(): -+ # check we aren't in a multi-line construct (eg .macro or .rep) -+ if construct != None: -+ asm_error({ -+ CONSTRUCT_MACRO: '.macro without .endm', -+ CONSTRUCT_IF: '.if/.elif without .endif', -+ CONSTRUCT_ELSE: '.else without .endif', -+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) -+ -+ # check no warnings level back to 0 -+ if nwarn_level != 0: -+ asm_error('.pushnwarn without .popnwarn') -+ -+ # flush queued up data -+ asm_flush_prog_data() -+ -+ # fixup all the label references we can -+ for pc in xrange(len(prog)): -+ if isinstance(prog[pc][0], tuple): -+ location, label, rel, offset = prog[pc][0] -+ if label[0].isdigit(): -+ label_pcs = labels.get(label[:-1], []) -+ if label[-1] == 'b': -+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] -+ else: -+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] -+ if label_pcs == []: -+ asm_error('search for label reached begin/end of file', location = location) -+ imm = label_pcs[0] -+ elif label in labels: -+ imm = labels[label] -+ elif (':' + label) in labels: -+ imm = labels[':' + label] -+ elif external_link: -+ continue # let the external linker deal with it -+ else: -+ asm_error('undefined label', location = location) -+ imm = (imm * 8) + offset -+ if rel: -+ imm -= (pc + 4) * 8 # relative to instruction after delay slots -+ imm &= (1 << 32) - 1 -+ else: -+ if not external_link: -+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) -+ imm = (location, label, rel, offset, imm) -+ prog[pc] = (imm,) + prog[pc][1:] -+ -+def asm_init(): -+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level -+ -+ current_dir = os.getcwd() -+ current_location = '' -+ prog = [] -+ prog_data = [] -+ macros = { -+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), -+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} -+ labels = {} -+ construct = None -+ construct_stack = [] -+ nwarn_level = 0 -+ -+def asm_reset_prog(): -+ global prog, labels -+ -+ prog = [] -+ labels = {} -+ -+############################################################################### -+# dumping -+############################################################################### -+ -+def print_lines(lines): -+ for line in lines: -+ print line -+ -+class dumper_t: -+ def external_link(self): return False -+ def begin(self): pass -+ def label(self, pc, name): pass -+ def line(self, pc, ls, ms, line, annots, first): pass -+ def end(self): pass -+ def sets(self, sets): pass -+ def direct(self, line): pass -+ -+class clif_dumper_t(dumper_t): -+ def __init__(self): -+ self.annot_mode = 0 -+ -+ def external_link(self): -+ return True -+ -+ def parse_annot_mode(self, line): -+ l = line.split(',') -+ self.annot_mode = int(l[0]) -+ if self.annot_mode not in (0, 1, 2): -+ asm_error('bad annot mode') -+ if self.annot_mode == 2: -+ if len(l) != 2: -+ asm_error('expected buffer name') -+ self.annot_name = l[1].strip() -+ self.annot_offset = 0 -+ elif len(l) != 1: -+ asm_error('unexpected comma') -+ -+ def label(self, pc, name): -+ if (self.annot_mode != 1) and (name[0] == ':'): -+ if self.annot_mode == 2: -+ name = name + '_annotations' -+ print '@label %s' % name[1:] -+ else: -+ print '// :%s' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annot_mode == 0: -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog)
-+                else:
-+                    location, label, rel, offset = ls
-+                    if rel:
-+                        asm_error('relative external label references not allowed in this mode', location = location)
-+                    ls = '[%s + %d]' % (label, offset)
-+            else:
-+                ls = '0x%08x' % ls
-+            print '%s 0x%08x // %s' % (ls, ms, line)
-+        elif self.annot_mode == 1:
-+            print '// %s' % line
-+            for annot in annots:
-+                print '0x%08x 0x%08x // %s' % ({
-+                    # todo: would rather not have these hard coded
-+                    'mul_used': 1,
-+                    'preserve_cond': 2,
-+                    'geomd_open': 3,
-+                    'geomd_i': 4,
-+                    'geomd_tris_clear': 5,
-+                    'geomd_verts': 6,
-+                    'geomd_tris_add': 7,
-+                    'geomd_tris_set_center': 8,
-+                    'geomd_region_clear': 9,
-+                    'geomd_region_set': 10,
-+                    'geomd_images_clear': 11,
-+                    'geomd_images_l': 12,
-+                    'geomd_images_b': 13,
-+                    'geomd_images_r': 14,
-+                    'geomd_images_t': 15,
-+                    'geomd_images_add_vpm': 16,
-+                    'trace_4c': 17,
-+                    'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
-+            if len(annots) != 0:
-+                print '0x00000000 // end'
-+        else:
-+            assert self.annot_mode == 2
-+            if len(annots) == 0:
-+                print '0x00000000 // %s' % line
-+            else:
-+                print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
-+                self.annot_offset += (len(annots) * 8) + 4
-+
-+    def direct(self, line):
-+        print line
-+
-+class plain_dumper_t(dumper_t):
-+    def line(self, pc, ls, ms, line, annots, first):
-+        print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
-+
-+class c_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.header_name = header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        self.external_labels = set()
-+        self.lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print ' #include <stdint.h>'
-+        print ' /* cast through uintptr_t to avoid warnings */'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
-+        print '#else'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
-+        print '#endif'
-+        print ''
-+        print '#ifdef __cplusplus'
-+        print 'extern "C" { /* the types are probably wrong... */'
-+        print '#endif'
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if isinstance(ls, tuple):
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
-+            else:
-+                location, label, rel, offset = ls
-+                if rel:
-+                    asm_error('relative external label references not allowed in this mode', location = location)
-+                if label not in self.external_labels:
-+                    self.external_labels.add(label)
-+                    print 'extern uint8_t %s[];' % label
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
-+
-+    def end(self):
-+        print '#ifdef __cplusplus'
-+        print '}'
-+        print '#endif'
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print '__declspec(align(8))'
-+        print '#elif defined(__GNUC__)'
-+        print '__attribute__((aligned(8)))'
-+        print '#endif'
-+        print 'unsigned int %s[] = {' % self.array_name
-+        print_lines(self.lines)
-+        print '};'
-+        print '#ifdef __HIGHC__'
-+        print '#pragma Align_to(8, %s)' % self.array_name
-+        print '#endif'
-+
-+class c_h_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.full_header_name = full_header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        print '#ifndef %s_H' % self.full_header_name
-+        print '#define %s_H' % self.full_header_name
-+        print ''
-+        print 'extern unsigned int %s[];' % self.array_name
-+        print ''
-+
-+    def label(self, pc, name):
-+        if name[0] == ':':
-+            print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
-+
-+    def end(self):
-+        print ''
-+        print '#endif'
-+
-+class ml_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, name, annots):
-+        self.header_name = header_name
-+        self.name = name
-+        self.annots = annots
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        if self.annots:
-+            self.annot_lines = []
-+        self.lines = []
-+        self.external_labels = set()
-+        self.link_lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print '#include <assert.h>'
-+        if self.annots:
-+            print '#ifdef SIMPENROSE'
-+            print '#include <stddef.h>'
-+            print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
-+        print ''
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if self.annots:
-+            if len(annots) == 0:
-+                self.annot_lines.append('NULL,')
-+            else:
-+                print 'static unsigned int const annotations_%d[] = {' % pc
-+                for annot in annots:
-+                    print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
-+                print ' SIMPENROSE_SHADER_ANNOTATION_END};'
-+                print ''
-+                self.annot_lines.append('annotations_%d,' % pc)
-+        if isinstance(ls, tuple):
-+            self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
-+            else:
-+                location, label, rel, offset = ls
-+                self.external_labels.add(label)
-+                if rel:
-+                    self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
-+                else:
-+                    self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
-+            ls = '0xdeadbeef'
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc
* 8, ls, ms, line)) -+ -+ def end(self): -+ if self.annots: -+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name -+ print_lines(self.annot_lines) -+ print '};' -+ print '#endif' -+ print '' -+ print 'static unsigned int const array[] = {' -+ print_lines(self.lines) -+ print '};' -+ print '' -+ print 'void %s_link(void *p_in, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' )' -+ print '{' -+ print ' unsigned int *p = (unsigned int *)p_in;' -+ print ' unsigned int i;' -+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() -+ print ' p[i] = array[i];' -+ print ' }' -+ print_lines(self.link_lines) -+ print '}' -+ -+class ml_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.full_header_name = full_header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines_n = 0 -+ -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name -+ print '#endif' -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) -+ print '#endif' -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple) and (len(ls) != 5): -+ self.external_labels.add(ls[1]) -+ self.lines_n += 1 -+ -+ def end(self): -+ print '' -+ print 'extern void %s_link(void *p, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' );' -+ print '' -+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) -+ print '' -+ print '#endif' -+ -+def print_lines_lc(lines): -+ for line in lines: -+ print '%s \\' % line -+ -+def print_groups_lc(groups): -+ first = True -+ for group in groups: -+ if first: -+ print '{ \\' -+ else: -+ print ', { \\' -+ print_lines_lc(group) -+ print '} \\' -+ first = False -+ -+class inline_c_dumper_t(dumper_t): -+ def __init__(self, annots): -+ self.annots = annots -+ self.iteration = False -+ -+ def begin_iteration(self): -+ assert not self.iteration -+ self.iteration = True -+ self.iteration_lines = [] -+ if self.annots: -+ self.iteration_annot_lines = [] -+ self.annot_arrs = [] -+ -+ def end_iteration(self): -+ assert self.iteration -+ self.iteration = False -+ print '%d, \\' % self.iteration_n -+ if self.annots: -+ print '( \\' -+ print_groups_lc(self.iteration_lines) -+ if self.annots: -+ print '), ( \\' -+ print_groups_lc(self.iteration_annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def begin(self): -+ self.n = 0 -+ self.lines = [] -+ if self.annots: -+ self.annot_lines = [] -+ if not self.iteration: -+ self.annot_arrs = [] -+ -+ def label(self, pc, name): -+ self.lines.append('/* :%s */' % name) -+ if self.annots: -+ self.annot_lines.append('/* :%s */' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ self.n += 1 -+ if first: -+ prefix = '' -+ else: -+ prefix = ', ' -+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) -+ if self.annots: -+ if len(annots) == 
0: -+ a = 'NULL' -+ else: -+ a = 'annotations_%d' % len(self.annot_arrs) -+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] -+ for annot in annots: -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') -+ self.annot_arrs.append(annot_arr) -+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) -+ -+ def end(self): -+ if self.iteration: -+ if len(self.iteration_lines) == 0: -+ self.iteration_n = self.n -+ elif self.iteration_n != self.n: -+ asm_error('number of instructions differs between iterations') -+ self.iteration_lines.append(self.lines) -+ if self.annots: -+ self.iteration_annot_lines.append(self.annot_lines) -+ else: -+ if self.annots: -+ print '( \\' -+ print_lines_lc(self.lines) -+ if self.annots: -+ print '), ( \\' -+ print_lines_lc(self.annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def direct(self, line): -+ print line -+ -+class asvc_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '.align 8' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '%s::' % name[1:] -+ else: -+ print '%s:' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ location, label, rel, offset = ls[:4] -+ if rel: -+ ls = '%s + %d - (. + 32)' % (label, offset) -+ else: -+ ls = '%s + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) -+ -+def is_ra_or_rb(val): -+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) -+ -+class aliases_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef JUST_DQASM_ARGS' -+ -+ def label(self, pc, name): -+ if not name[0].isdigit(): -+ if name[0] == ':': -+ name = name[1:] -+ print '"bs%s", "bs%x",' % (name, pc * 8) -+ print '"bu%s", "bu%x",' % (name, pc * 8) -+ -+ def end(self): -+ print '#endif' -+ -+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm -+ def sets(self, sets): -+ dqasm_args = [] -+ print '#ifndef JUST_DQASM_ARGS' -+ for name in sets: -+ if is_ra_or_rb(sets[name]): -+ dqasm_args.append('-r%s=%s' % (sets[name], name)) -+ print '"%s", "%s",' % (name, sets[name]) -+ elif isinstance(sets[name], list): -+ for i, val in enumerate(sets[name]): -+ if is_ra_or_rb(val): -+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) -+ print '"%s[%d]", "%s",' % (name, i, val) -+ print '#endif' -+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) -+ -+def dump(dumper): -+ if (len(prog) != 0) or (len(labels) != 0): -+ dumper.begin() -+ -+ sorted_labels = [] -+ for name in labels: -+ if name[0].isdigit(): -+ for pc in labels[name]: -+ sorted_labels.append((pc, name)) -+ else: -+ sorted_labels.append((labels[name], name)) -+ sorted_labels.sort(reverse = True) -+ -+ first = True -+ for pc in xrange(len(prog)): -+ ls, ms, line, annots = prog[pc] -+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): -+ dumper.label(*sorted_labels.pop()) -+ dumper.line(pc, ls, ms, line, annots, first) -+ first = False -+ for sorted_label in sorted_labels: -+ assert sorted_label[0] == len(prog) -+ dumper.label(*sorted_label) -+ -+ dumper.end() -+ -+############################################################################### -+# preprocessing -+############################################################################### -+ -+def preprocess_inline_c(dumper): -+ def preprocess(file): -+ ls = None -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ while True: -+ if ls is None: -+ l = line.split('%[', 1) -+ if len(l) == 1: -+ dumper.direct(l[0].rstrip()) -+ break -+ dumper.direct('%s \\' % l[0].rstrip()) -+ line = l[1] -+ ls = [] -+ else: -+ l = line.split('%]', 1) -+ ls.append((line_number, l[0])) -+ if len(l) == 1: -+ break -+ line = l[1] -+ l = ls[-1][1].split('%|', 1) -+ if len(l) == 1: -+ for l_number, l in ls: -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ else: -+ ls[-1] = (ls[-1][0], l[0]) -+ if hasattr(dumper, 'begin_iteration'): -+ dumper.begin_iteration() -+ for repls in l[1].split('%,'): -+ repls = [repl.strip() for repl in repls.split('%/')] -+ for l_number, l in ls: -+ for i, repl in enumerate(repls): -+ l = l.replace('%' + str(i), repl) -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if hasattr(dumper, 'end_iteration'): -+ dumper.end_iteration() -+ ls = None -+ return preprocess -+ -+def preprocess_clif(dumper): -+ def preprocess(file): -+ in_asm = False -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ if in_asm: -+ if line.strip() == '%]': -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ in_asm = False -+ else: -+ yield line_number, line -+ else: -+ if line.strip() == '%[': -+ in_asm = True -+ elif (line[:1] == '%') and (line[:2] != '%@'): -+ yield line_number, line[1:] -+ else: -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if line[:2] == '%@': -+ if hasattr(dumper, 'parse_annot_mode'): -+ dumper.parse_annot_mode(line[2:]) -+ else: -+ dumper.direct(line.rstrip()) -+ return preprocess -+ -+############################################################################### -+# main -+############################################################################### -+ -+def main(): -+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 -+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate -+ -+ asm_init() # do this first so we can use asm_error without having to 
pass a location and so asm_warning will work
-+
-+    # parse command line
-+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
-+    parser.add_option('-m', '--mode', dest = 'mode',
-+        help = '<mode> should be clif, plain, ' +
-+        'c_c:<header name>,<full header name>,<array name>, ' +
-+        'c_h:<header name>,<full header name>,<array name>, ' +
-+        'ml_c:<header name>,<full header name>,<name>[,annots], ' +
-+        'ml_h:<header name>,<full header name>,<name>[,annots], ' +
-+        'inline_c[:annots], asvc, or aliases[:inline_c]', metavar = '<mode>')
-+    parser.add_option('-t', '--target', dest = 'target',
-+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
-+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
-+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
-+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
-+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
-+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
-+    options, args = parser.parse_args()
-+    if len(args) == 0:
-+        filename = None
-+    elif len(args) == 1:
-+        filename = args[0]
-+    else:
-+        parser.print_help()
-+        sys.exit(-1)
-+
-+    # handle mode
-+    mode = options.mode or 'clif' # assume clif if no mode specified
-+    if mode == 'clif':
-+        dumper = clif_dumper_t()
-+        preprocess = preprocess_clif(dumper)
-+    elif mode == 'plain':
-+        dumper = plain_dumper_t()
-+        preprocess = None
-+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
-+        mode_options = mode[4:].split(',')
-+        if len(mode_options) != 3:
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
-+        preprocess = None
-+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
-+        mode_options = mode[5:].split(',')
-+        if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
-+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
-+        preprocess = None
-+    elif mode == 'inline_c':
-+        dumper = inline_c_dumper_t(False)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'inline_c:annots':
-+        dumper = inline_c_dumper_t(True)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'asvc':
-+        dumper = asvc_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases':
-+        dumper = aliases_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases:inline_c':
-+        dumper = aliases_dumper_t()
-+        preprocess = preprocess_inline_c(dumper)
-+    else:
-+        asm_error('invalid mode')
-+    external_link = dumper.external_link()
-+
-+    # handle target
-+    target = options.target or 'b0' # assume b0 if no target specified
-+    if target == 'a0':
-+        have_sema = False
-+        have_am = False
-+        mulw_rotate = False
-+        have_lthrsw = False
-+    elif target == 'b0':
-+        have_sema = True
-+        have_am = True
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    elif target == 'hera':
-+        have_sema = True
-+        have_am = False
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    else:
-+        asm_error('invalid target')
-+    if have_am:
-+        sigs['loadam'] = SIG_LOADAM
-+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
-+    if have_lthrsw:
-+        sigs['lthrsw'] = SIG_LTHRSW
-+        del sigs['int']
-+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
-+
-+    # handle misc options
-+    allow_xor_0 = options.allow_xor_0
-+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
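-+    # these two can also be flipped mid-assembly: pragma_allow_xor_0 and
-+    # pragma_dont_warn_when_mul_rot_inp_r5 are exposed in arg_defs, e.g.
-+    # via a .eval directive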
-+    warnings_are_errors = options.warnings_are_errors
-+    disable_warnings = options.disable_warnings
-+
-+    # make options visible to asm
-+    arg_defs['mode'] = mode
-+    arg_defs['target'] = target
-+
-+    # arg_defs all setup at this point
-+    sets = arg_defs.copy() # todo: see arg_eval
-+
-+    # handle command line sets
-+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
-+    for options_set in options.sets:
-+        m = re_options_set.match(options_set)
-+        if not m:
-+            asm_error('badly formatted set on command line')
-+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
-+
-+    # assemble input file and dump
-+    asm_file(sets, filename, filename, preprocess)
-+    asm_end_prog()
-+    dump(dumper)
-+    for name in arg_defs: # todo: see arg_eval
-+        del sets[name]
-+    dumper.sets(sets)
-+
-+if __name__ == '__main__':
-+    main()
-diff --git b/pi-util/qem.sh a/pi-util/qem.sh
-new file mode 100644
-index 0000000..20ce7ee
---- /dev/null
-+++ a/pi-util/qem.sh
-@@ -0,0 +1,8 @@
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+QASM=python\ pi-util/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
-diff --git b/pi-util/rebase_liblinks.py a/pi-util/rebase_liblinks.py
-new file mode 100755
-index 0000000..6a9a33f
---- /dev/null
-+++ a/pi-util/rebase_liblinks.py
-@@ -0,0 +1,37 @@
-+#!/usr/bin/env python
-+
-+import os, sys
-+from stat import *
-+
-+def walktree(top, callback, n, prefix):
-+    '''recursively descend the directory tree rooted at top,
-+    calling the callback function for each regular file'''
-+
-+    for f in os.listdir(top):
-+        pathname = os.path.join(top, f)
-+        mode = os.lstat(pathname).st_mode
-+        if S_ISDIR(mode):
-+            # It's a directory, recurse into it
-+            walktree(pathname, callback, n+1, prefix)
-+        elif S_ISLNK(mode):
-+            # It's a symlink, call the callback function
-+            callback(pathname, os.readlink(pathname), n, prefix)
-+
-+def visitfile(file, linkname, n, prefix):
-+    if (linkname.startswith(prefix + 'lib/')):
-+        newlink = "../" * n + linkname[len(prefix):]
-+        print 'relinking', file, "->", newlink
-+        os.remove(file)
-+        os.symlink(newlink, file)
-+
-+if __name__ == '__main__':
-+    argc = len(sys.argv)
-+    if argc == 2:
-+        walktree(sys.argv[1], visitfile, 0, "/")
-+    elif argc == 3:
-+        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
-+    else:
-+        print "rebase_liblinks.py <dir> [<prefix>]"
-+
-+
-+
-diff --git b/pi-util/syncroot.sh a/pi-util/syncroot.sh
-new file mode 100755
-index 0000000..d8bdd91
---- /dev/null
-+++ a/pi-util/syncroot.sh
-@@ -0,0 +1,43 @@
-+set -e
-+
-+if [ "$1" == "" ]; then
-+  echo Usage: $0 \<src_dir\> [\<rootname\>]
-+  echo src_dir is a source for rsync so may contain m/c name.
-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing -+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 -+ exit 1 -+fi -+ -+SYSROOT_NAME=$2 -+if [ "$SYSROOT_NAME" == "" ]; then -+ SYSROOT_NAME=raspian_jessie_pi1 -+fi -+ -+DST_ROOT=`pwd` -+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot -+SRC=$1 -+ -+echo Sync src: $SRC -+echo Sync dest: $DST -+ -+mkdir -p $DST/lib -+mkdir -p $DST/opt/vc/include -+mkdir -p $DST/usr/lib/pkgconfig -+mkdir -p $DST/usr/bin -+mkdir -p $DST/usr/share -+ -+#### MUST NOT include /opt/vc/include/*GL* -+# Creates conflicts with GL includes inside Chrome -+ -+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib -+rsync -rl $SRC/opt/vc/lib $DST/opt/vc -+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include -+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib -+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib -+rsync -rl $SRC/usr/include $DST/usr -+ -+pi-util/rebase_liblinks.py $DST -+ -+ -diff --git b/pi-util/v3dusage.py a/pi-util/v3dusage.py -new file mode 100644 -index 0000000..7e336a9 ---- /dev/null -+++ a/pi-util/v3dusage.py -@@ -0,0 +1,75 @@ ++++ b/pi-util/v3dusage.py +@@ -0,0 +1,128 @@ +#!/usr/bin/env python + +import sys +import argparse +import re + -+def main(): -+ argp = argparse.ArgumentParser(description="QPU/VPU perf summary") -+ argp.add_argument("logfile") -+ args = argp.parse_args() -+ ++def do_logparse(logname): + + rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ') ++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$') ++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$') ++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$') + + ttotal = {'idle':0.0} + tstart = {} ++ qctotal = {} ++ qtstotal = {} ++ l2hits = {} ++ l2total = {} + time0 = None + idle_start = None + qpu_op_no = 0 + op_count = 0 + -+ with open(args.logfile, "rt") as infile: ++ with open(logname, "rt") as infile: + for line in infile: + match = rmatch.match(line) + if match: @@ -18387,6 +20055,31 @@ index 0000000..7e336a9 + ttotal['idle'] += time - idle_start + idle_start = None + ++ match = rqcycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qctotal: ++ qctotal[unit] = 0 ++ qctotal[unit] += int(match.group(2)) ++ ++ match = rqtscycle.match(line) ++ if match: ++ unit = "qpu1." + str(qpu_op_no) ++ if not unit in qtstotal: ++ qtstotal[unit] = 0 ++ qtstotal[unit] += int(match.group(2)) ++ ++ match = rl2hits.match(line) ++ if match: ++ unit = "qpu1." 
+ str(qpu_op_no)
++                if not unit in l2total:
++                    l2total[unit] = 0
++                    l2hits[unit] = 0
++                l2total[unit] += int(match.group(3))
++                if match.group(2) == "hits":
++                    l2hits[unit] += int(match.group(3))
++
++
+    if not time0:
+        print "No v3d profile records found"
+    else:
+        tlogged = time - time0
+
+        print "Logged time:", tlogged, " Op count:", op_count
+        for unit in sorted(ttotal):
+            print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++        print
++        for unit in sorted(qctotal):
++            if not unit in qtstotal:
++                qtstotal[unit] = 0
++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++            if unit in l2total:
++                print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
+
+
+if __name__ == '__main__':
-+    main()
++    argp = argparse.ArgumentParser(
++        formatter_class=argparse.RawDescriptionHelpFormatter,
++        description="QPU/VPU perf summary from VC logging",
++        epilog = """
++Will also summarise TMU stalls if logging requests are set in the qpu
++noflush param in the profiled code.
+
++Example use:
++  vcgencmd set_logging level=0xc0
++
++  sudo vcdbg log msg >& t.log
++  v3dusage.py t.log
++""")
++
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++    do_logparse(args.logfile)
++
+
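A quick sanity check for the profile-record regex used by do_logparse() above
(a minimal Python 2 sketch in the tool's own style; the timestamp, opcode name
and cb address below are invented values, not taken from a real vcdbg log):

    import re
    # same pattern as rmatch in do_logparse()
    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
    m = rmatch.match('1234.567: done qpu1 HEVC_XFORM cb:beef0180 ')
    assert m is not None
    print m.group(3), m.group(7), m.group(8) # unit, opcode name, cb address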