From b27b2a95cc368141e9604cf0cbdff755a49cdd5b Mon Sep 17 00:00:00 2001 From: MilhouseVH Date: Mon, 23 Oct 2017 10:07:42 +0100 Subject: [PATCH] ffmpeg: update to ffmpeg-9702d0d --- packages/multimedia/ffmpeg/package.mk | 2 +- ...mpeg-99.1003-pfcd_hevc_optimisations.patch | 25774 ++++++++++------ 2 files changed, 17108 insertions(+), 8668 deletions(-) diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 044bf59c51..43d136a1b0 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,7 +18,7 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.1-xbmc -PKG_VERSION="f58e5b9" +PKG_VERSION="9702d0d" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 96cfa9ae30..5b3fc489a5 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -1,8 +1,16 @@ diff --git a/.gitignore b/.gitignore -index 524fb73..305632b 100644 +index 524fb73c16..bcc983739f 100644 --- a/.gitignore +++ b/.gitignore -@@ -23,6 +23,7 @@ +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -23,6 +24,7 @@ .\#* /.config /.version @@ -11,7 +19,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..e2474e5 100644 +index cdded8673f..5eee7dfd40 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -20,13 +28,21 @@ index 9ffd833..e2474e5 100644 +#ifdef RPI +#define RPI_DISPLAY -+#define RPI_ZERO_COPY ++#define RPI_DISPLAY_ALL 0 +#endif + #include "config.h" #include #include -@@ -66,6 +71,25 @@ +@@ -42,6 +47,7 @@ + #include "libavformat/avformat.h" + #include "libavdevice/avdevice.h" + #include "libswresample/swresample.h" ++#include "libavutil/atomic.h" + #include "libavutil/opt.h" + #include "libavutil/channel_layout.h" + #include "libavutil/parseutils.h" +@@ -66,6 +72,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -38,21 +54,21 @@ index 9ffd833..e2474e5 100644 +#include +#include +#include ++#include +#include +#include +#include +#include +#pragma GCC diagnostic pop -+#ifdef RPI_ZERO_COPY +#include "libavcodec/rpi_qpu.h" -+#endif ++#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" +#endif + #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,182 @@ static int restore_tty; +@@ -158,6 +183,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -60,39 +76,36 @@ index 9ffd833..e2474e5 100644 + +#define NUM_BUFFERS 4 + -+static MMAL_COMPONENT_T* rpi_display = NULL; -+static MMAL_POOL_T *rpi_pool = NULL; -+static volatile int rpi_display_count = 0; + -+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) ++typedef struct rpi_display_env_s ++{ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ enum AVPixelFormat avfmt; ++} rpi_display_env_t; ++ ++static rpi_display_env_t * rpi_display_env = NULL; ++ ++ ++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) +{ + MMAL_POOL_T* pool; -+ size_t i; -+ size_t size = (w*h*3)/2; -+#ifdef RPI_ZERO_COPY + 
mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + assert(pool); -+#else -+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); -+ -+ for (i = 0; i < NUM_BUFFERS; ++i) -+ { -+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; -+ char * bufPtr = buffer->data; -+ memset(bufPtr, i*30, w*h); -+ memset(bufPtr+w*h, 128, (w*h)/2); -+ } -+#endif + + return pool; +} + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+#ifdef RPI_ZERO_COPY ++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + av_rpi_zc_unref(buffer->user_data); -+ --rpi_display_count; -+#endif ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + mmal_buffer_header_release(buffer); +} + @@ -100,9 +113,12 @@ index 9ffd833..e2474e5 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) ++#define DISPLAY_PORT_DEPTH 4 ++ ++static rpi_display_env_t * ++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) +{ -+ MMAL_COMPONENT_T* display; ++ MMAL_STATUS_T err; + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, @@ -111,51 +127,113 @@ index 9ffd833..e2474e5 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; ++#if RPI_ZC_SAND_8_IN_10_BUF ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; ++#else ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; ++#endif + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); ++ rpi_display_env_t * de; ++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); + -+ bcm_host_init(); // TODO is this needed? -+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); -+ assert(display); ++ bcm_host_init(); // Needs to be done by someone... + -+ mmal_port_parameter_set(display->input[0], ®ion.hdr); ++ if ((de = av_mallocz(sizeof(*de))) == NULL) { ++ return NULL; ++ } ++ ++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); ++ av_assert0(de->display); ++ de->port_in = de->display->input[0]; ++ ++ if (isp_req) ++ { ++ mmal_component_create("vc.ril.isp", &de->isp); ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + { -+ MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; ++ MMAL_PORT_T * const port = de->port_in; ++ MMAL_ES_FORMAT_T* const format = port->format; ++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; ++ port->buffer_num = DISPLAY_PORT_DEPTH; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : ++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : ++ MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; -+ format->es->video.height = geo.height_y; ++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? 
++ (h + 15) & ~15 : geo.height_y; // Magic + format->es->video.crop.x = 0; + format->es->video.crop.y = 0; + format->es->video.crop.width = w; + format->es->video.crop.height = h; -+ mmal_port_format_commit(display->input[0]); ++ mmal_port_format_commit(port); + } + -+ mmal_component_enable(display); ++ de->rpi_pool = display_alloc_pool(de->port_in); ++ mmal_port_enable(de->port_in,display_cb_input); + -+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); ++ if (isp_req) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ mmal_log_dump_port(de->port_in); ++ mmal_format_copy(port_out->format, de->port_in->format); ++ if (fmt == AV_PIX_FMT_SAND64_10) { ++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || ++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); ++ } ++ else ++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + -+ mmal_port_enable(display->input[0],display_cb_input); -+ mmal_port_enable(display->control,display_cb_control); ++ } ++ port_out->format->encoding = MMAL_ENCODING_I420; ++ mmal_log_dump_port(port_out); ++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ mmal_component_enable(de->display); ++ mmal_port_enable(de->display->control,display_cb_control); ++ de->avfmt = fmt; + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + -+ return display; ++ return de; ++ ++fail: ++ // **** Free stuff ++ return NULL; +} + -+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) ++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf; + -+ if (!display || !rpi_pool) ++ if (de == NULL) + return; + -+ if (rpi_display_count >= 3) { ++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + -+ buf = mmal_queue_get(rpi_pool->queue); ++ buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame + printf("Q alloc failure\n"); @@ -165,67 +243,64 @@ index 9ffd833..e2474e5 100644 + buf->cmd = 0; + buf->offset = 0; // Offset to valid data + buf->flags = 0; -+#ifdef RPI_ZERO_COPY -+{ -+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); -+ if (fr_buf == NULL) { -+ mmal_buffer_header_release(buf); -+ return; -+ } -+ -+ buf->user_data = fr_buf; -+ buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+#if 0 + { -+ unsigned int n; -+ for (n = 0; n < fr->width; n += 128) { -+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ const AVRpiZcRefPtr 
fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; + } ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); + } -+#endif -+ ++rpi_display_count; -+} -+#else -+{ -+#error YYY -+ int w = fr->width; -+ int h = fr->height; -+ int w2 = (w+31)&~31; -+ int h2 = (h+15)&~15; -+ -+ buf->length = (w2 * h2 * 3)/2; -+ buf->user_data = NULL; -+ -+ //mmal_buffer_header_mem_lock(buf); -+ memcpy(buf->data, fr->data[0], w2 * h); -+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); -+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); -+ //mmal_buffer_header_mem_unlock(buf); -+} -+#endif -+ -+ while (rpi_display_count >= 3) { ++#if RPI_DISPLAY_ALL ++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } ++#endif + -+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { -+ printf("** send failed: depth=%d\n", rpi_display_count); -+ display_cb_input(NULL, buf); ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ display_cb_input(de->port_in, buf); + } +} + -+static void display_exit(MMAL_COMPONENT_T* display) ++static void display_exit(rpi_display_env_t ** const pde) +{ ++ rpi_display_env_t * const de = *pde; ++ *pde = NULL; ++ ++ if (de != NULL) { +// sleep(120); -+ if (display) { -+ mmal_component_destroy(display); -+ } -+ if (rpi_pool) { -+ mmal_port_pool_destroy(display->input[0], rpi_pool); ++ ++ if (de->port_in != NULL) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { ++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); ++ } ++ ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } ++ ++ av_free(de); + } +} + @@ -235,29 +310,29 @@ index 9ffd833..e2474e5 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
-@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) +@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } + +#ifdef RPI_DISPLAY -+ display_exit(rpi_display); ++ display_exit(&rpi_display_env); +#endif + for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) +@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + av_rpi_zc_uninit(ist->dec_ctx); +#endif avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) +@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -265,28 +340,28 @@ index 9ffd833..e2474e5 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, +@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; +#ifdef RPI_DISPLAY + if (next_picture && ist != NULL) + { -+ if (!rpi_display) -+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); -+ display_frame(ist->dec_ctx, rpi_display, next_picture); ++ if (rpi_display_env == NULL) ++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif + if (filter->inputs[0]->frame_rate.num > 0 && filter->inputs[0]->frame_rate.den > 0) duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; + -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + // Overrides the above get_buffer2 + av_rpi_zc_init(ist->dec_ctx); +#endif @@ -295,66 +370,74 @@ index 9ffd833..e2474e5 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..1740768 100644 +index bb28aea1e2..741aa0bdc4 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,12 @@ NAME = avcodec +@@ -5,6 +5,16 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ ++ rpi_opts.h \ + rpi_qpu.h \ + rpi_shader.h \ -+ rpi_shader_cmd.h \ ++ rpi_shader_cmd.h \ ++ rpi_shader_template.h \ ++ rpi_shader_template_fn.h \ + rpi_mailbox.h \ -+ rpi_hevc_transform.h \ ++ rpi_hevc_transform8.h \ ++ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ +@@ -43,6 +53,11 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ + rpi_qpu.o \ + rpi_shader.o \ ++ rpi_shader_template.o \ + rpi_mailbox.o \ + rpi_zc.o \ vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + -+QASM := $(SUBDIR)../pi-util/qasm.py ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std + -+ifneq ("$(wildcard $(QASM))","") ++ifneq ("$(wildcard $(QASM_PY))","") 
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h -diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 54efaad..02a89c3 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -667,6 +667,7 @@ void avcodec_register_all(void) - REGISTER_PARSER(H261, h261); - REGISTER_PARSER(H263, h263); - REGISTER_PARSER(H264, h264); -+ REGISTER_PARSER(H264_MVC, h264_mvc); - REGISTER_PARSER(HEVC, hevc); - REGISTER_PARSER(MJPEG, mjpeg); - REGISTER_PARSER(MLP, mlp); ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++ ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..cafd25d 100644 +index a4ceca7f46..f8229a80e2 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -363,13 +446,15 @@ index a4ceca7..cafd25d 100644 + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o ++ arm/hevcdsp_cres_neon.o \ ++ arm/hevcdsp_res16_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b..0a3980a 100644 +index fdbf86b45e..0a3980a1ef 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ @@ -552,7 +637,7 @@ index fdbf86b..0a3980a 100644 #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 -index 0000000..31d3c59 +index 0000000000..31d3c59205 --- /dev/null +++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ @@ -1047,9 +1132,239 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..380d3c8d3b +--- /dev/null ++++ b/libavcodec/arm/hevc_idct_fn_neon.S +@@ -0,0 +1,224 @@ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ 
add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ mov r3, #16 ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++1: subs r3, #1 ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x00240053 // 36 and 83 ++ vmov.32 d0[0], r3 ++ ++ tr4_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x4a // 74 ++ vmov.32 d0[0], r3 ++ ldr r3, =0x1d // 29 ++ vmov.32 d0[1], r3 ++ ldr r3, =0x37 // 55 ++ vmov.32 d1[0], r3 ++ ++ tr4_luma_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++ push {r4-r8} ++ vpush {d8-d15} ++ mov r5, #16 ++ ++ adrl r3, tr4f ++ vld1.16 {d0, d1}, [r3] ++ ++ // left half ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #128 ++ //skip right half if col_limit in r1 is less than 4 ++ cmp r1, #4 ++ blt 1f ++ //right half ++ add r0, #8 ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #136 
++1: ++ // top half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ vstm r0!, {q1-q4} ++ ++ // bottom half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ //vstm r0, {q1-q4} ++ vst1.16 {q1-q2}, [r0] ++ add r0, #32 ++ vst1.16 {q3-q4}, [r0] ++ sub r0, #32 ++ vpop {d8-d15} ++ pop {r4-r8} ++ bx lr ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S new file mode 100644 -index 0000000..373576b +index 0000000000..373576b4cb --- /dev/null +++ b/libavcodec/arm/hevc_misc_neon.S @@ -0,0 +1,62 @@ @@ -1115,8 +1430,310 @@ index 0000000..373576b + +endfunc + +diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +new file mode 100644 +index 0000000000..bafefd4318 +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_cres_neon.S +@@ -0,0 +1,296 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ General notes: ++@ ++@ Residual is only guaranteed to be cliped to 16 bits ++@ This means that we do need to do movul, qadd, qmovun ++@ rather than addw, qmovun (if we were clipped to 15 then we could get away ++@ with this) ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q0, q1}, [r1] ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ sub r0, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q2 ++ vqmovun.s16 d18, q1 ++ vqmovun.s16 d19, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q1 ++ vqmovun.s16 d18, q2 ++ vqmovun.s16 d19, q3 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q2, q3}, [r1] ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d17 ++ sub r0, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d18 ++ vqmovun.s16 d17, q0 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d19, q1 ++ vqmovun.s16 d18, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q11, d19 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d17, q3 ++ vqmovun.s16 d18, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++ mov r12, #8 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.8 {d16, d17}, [r0, :128] ++ vld1.16 {q0}, [r1, :128]! ++ vld1.16 {q1}, [r3, :128]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst2.8 {d0, d1}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++ mov r12, #16 ++ add r3, r1, #(16*16*2) @ Offset to V ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ vld1.16 {q2, q3}, [r3, :256]! 
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst2.8 {q0, q1}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..9bd0a42 100644 +index 166bddb104..15c4329cdb 100644 --- a/libavcodec/arm/hevcdsp_deblock_neon.S +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -15,7 +15,7 @@ @@ -1128,66 +1745,204 @@ index 166bddb..9bd0a42 100644 */ -@@ -31,6 +31,9 @@ +@@ -24,70 +24,238 @@ + + .macro hevc_loop_filter_chroma_start + ldr r12, [r2] +- ldr r3, [r2, #4] +- add r2, r3, r12 +- cmp r2, #0 ++ ldr r2, [r2, #4] ++ orrs r2, r12, r2, lsl #16 + it eq bxeq lr .endm +-.macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +- vshl.i16 q3, #2 +- vadd.i16 q11, q3 +- vdup.16 d0, r12 +- vdup.16 d1, r3 +- vrshr.s16 q11, q11, #3 +- vneg.s16 q12, q0 +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 -+@ Modifies: d0-d7, d22-d25 - .macro hevc_loop_filter_chroma_body - vsubl.u8 q3, d4, d2 - vsubl.u8 q11, d18, d19 -@@ -49,6 +52,33 @@ - vqmovun.s16 d4, q2 - .endm - ++@ Modifies: d0-d7, d22-d25, r12 + -+@ Uses r2[0:7], r2[8:15] -+@ Modifies: d0-d7, d22-d25 -+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 -+ vsubl.u8 q3, \Q0, \P0 -+ vsubl.u8 q11, \P1, \Q1 -+ vshl.i16 q3, #2 -+ vadd.i16 q11, q3 ++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 ++ vsubl.u8 q0, \Q0, \P0 ++ vsubl.u8 q1, \P1, \Q1 ++ vdup.16 d4, r2 ++ lsr r2, r2, #16 ++ vshl.i16 q0, #2 ++ ldr r12, [sp, #0] @ r12 = &no_q ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] ++ vdup.16 d5, r2 + -+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) -+ vdup.16 d0, r2 -+ vmovl.u8 q0, d0 -+ vuzp.16 d0, d1 -+ -+ vrshr.s16 q11, q11, #3 -+ vneg.s16 q12, q0 ++ vrshr.s16 q0, q0, #3 ++ ldrh r12, [r12] ++ vneg.s16 q3, q2 ++ vmin.s16 q0, q0, q2 + vmovl.u8 q2, \Q0 -+ vmin.s16 q11, q11, q0 -+ vmax.s16 q11, q11, q12 -+ vaddw.u8 q1, q11, \P0 -+ vsub.i16 q2, q11 ++ vmax.s16 q0, q0, q3 ++ vaddw.u8 q1, q0, \P0 ++ vsub.i16 q2, q0 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + vqmovun.s16 \P0, q1 + vqmovun.s16 \Q0, q2 +.endm + ++@ Uses r2 (tc a;b) ++@ Modifies: q0-q3 ++@ On exit ++@ r12 (and flags) contain no_p;no_q ++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth ++ vsub.i16 q0, \Q0, \P0 ++ lsl r12, r2, #(\bit_depth - 8) ++ vsub.i16 q1, \P1, \Q1 ++ vshl.i16 q0, #2 ++ vdup.16 d4, r12 ++ lsr r12, r12, #16 ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] ++ vdup.16 d5, r12 ++ ++ vrshr.s16 q0, q0, #3 ++ vneg.s16 q3, q2 ++ movw r12, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q0, q2 ++ vmax.s16 q0, q0, q3 ++ vdup.i16 q3, r12 ++ ldr r12, [sp, #0] ++ ++ vadd.i16 \P0, q0, \P0 ++ vsub.i16 \Q0, q0 ++ ++ vmov.i64 q2, #0 ++ ldrh r12, [r12] ++ vmin.s16 \P0, q3 ++ vmin.s16 \Q0, q3 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ vmax.s16 \P0, q2 ++ vmax.s16 \Q0, q2 ++.endm ++ ++ ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v ++ vsubl.u8 q0, \Q0u, \P0u ++ vsubl.u8 q1, \Q0v, \P0v ++ vsubl.u8 q2, \P1u, \Q1u ++ vsubl.u8 q3, \P1v, \Q1v ++ vshl.i16 q0, 
#2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 + vmovl.u8 q2, d4 +- vmin.s16 q11, q11, q0 +- vmax.s16 q11, q11, q12 +- vaddw.u8 q1, q11, d2 +- vsub.i16 q2, q11 +- vqmovun.s16 d2, q1 +- vqmovun.s16 d4, q2 ++ vmovl.u8 q3, d6 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vaddw.u8 q2, q0, \P0u ++ vmax.s16 q1, q3 ++ vaddw.u8 q3, q1, \P0v ++ ++ vqmovun.s16 \P0u, q2 ++ vmovl.u8 q2, \Q0u ++ vqmovun.s16 \P0v, q3 ++ vmovl.u8 q3, \Q0v ++ vsub.i16 q2, q0 ++ vsub.i16 q3, q1 ++ ++ vqmovun.s16 \Q0u, q2 ++ vqmovun.s16 \Q0v, q3 + .endm + ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth ++ vsub.i16 q0, \Q0u, \P0u ++ vsub.i16 q1, \Q0v, \P0v ++ vsub.i16 q2, \P1u, \Q1u ++ vsub.i16 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ vshll.u8 q3, d6, #\bit_depth - 8 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ movw r2, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vmov.i64 q2, #0 ++ vmax.s16 q1, q3 ++ vdup.i16 q3, r2 ++ vadd.i16 \P0u, q0 ++ vsub.i16 \Q0u, q0 ++ vadd.i16 \P0v, q1 ++ vsub.i16 \Q0v, q1 ++ ++ vmax.s16 \P0u, q2 ++ vmax.s16 \Q0u, q2 ++ vmax.s16 \P0v, q2 ++ vmax.s16 \Q0v, q2 ++ vmin.s16 \P0u, q3 ++ vmin.s16 \Q0u, q3 ++ vmin.s16 \P0v, q3 ++ vmin.s16 \Q0v, q3 ++.endm ++ + + .macro hevc_loop_filter_luma_start ldr r12, [r3] ldr r3, [r3, #4] -@@ -60,15 +90,17 @@ - lsr r3, #16 +- lsl r3, #16 +- orr r3, r12 +- cmp r3, #0 ++ orrs r3, r12, r3, lsl #16 + it eq + bxeq lr +- lsr r3, #16 .endm -.macro hevc_loop_filter_luma_body -+@ Uses: r2, r3, r12 -+@ Modifies: r5, r6, r7, r8, r9 -+function hevc_loop_filter_luma_body -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ vmovl.u8 q9, d17 - vmovl.u8 q8, d16 +- vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 @@ -1195,46 +1950,103 @@ index 166bddb..9bd0a42 100644 - vmovl.u8 q13, d26 - vmovl.u8 q14, d28 - vmovl.u8 q15, d30 ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ [sp,#96] &no_p[0] ++@ [sp,#100] &no_q[0] ++@ ++@ Input & output ++@ 8-bit: d16-d23 ++@ 16-bit: q8-q15 ++@ ++@ Output ++@ Z r10==0 ++@ r10[ 0:7 ] no_p[0] ++@ r10[ 8:15] no_p[1] ++@ r10[16:23] no_q[0] ++@ r10[24:31] no_q[1] ++ ++.macro m_filter_luma bit_depth ++.if \bit_depth == 8 ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++.endif vadd.i16 q7, q9, q11 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif vadd.i16 q6, q14, q12 -@@ -77,7 +109,6 @@ ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif + vsub.i16 q7, q10 ++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 + vsub.i16 q6, q13 vabd.s16 q7, q7, q10 vabd.s16 q6, q6, q13 - - ++ ldrh r10, [r5] + vdup.16 q0, r2 
vmov q4, q7 vmov q5, q6 -@@ -152,7 +183,7 @@ +- vdup.16 d4, r12 ++ ldr r5, [sp, #100] ++ vdup.16 d4, r3 ++ lsr r3, r3, #16 + vtrn.16 q7, q4 ++ ldrh r5, [r5] + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 ++ orr r10, r10, r5, lsl #16 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 +@@ -152,7 +320,7 @@ and r9, r8, r7 cmp r9, #0 - beq weakfilter_\@ -+ beq weakfilter_ ++ beq 1f vadd.i16 q2, q11, q12 vadd.i16 q4, q9, q8 -@@ -210,11 +241,11 @@ +@@ -210,11 +378,11 @@ vbit q13, q3, q5 vbit q14, q2, q5 -weakfilter_\@: -+weakfilter_: ++1: mvn r8, r8 and r9, r8, r7 cmp r9, #0 - beq ready_\@ -+ beq ready_ ++ beq 2f vdup.16 q4, r2 -@@ -275,75 +306,345 @@ weakfilter_\@: +@@ -275,111 +443,1041 @@ weakfilter_\@: vbit q11, q0, q5 vbit q12, q4, q5 -ready_\@: -+ready_: ++2: ++.if \bit_depth == 8 vqmovun.s16 d16, q8 - vqmovun.s16 d18, q9 - vqmovun.s16 d20, q10 @@ -1243,7 +2055,7 @@ index 166bddb..9bd0a42 100644 - vqmovun.s16 d26, q13 - vqmovun.s16 d28, q14 - vqmovun.s16 d30, q15 --.endm ++ cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 @@ -1251,7 +2063,30 @@ index 166bddb..9bd0a42 100644 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 + vqmovun.s16 d23, q15 ++.else ++ movw r12, #(1 << \bit_depth - 1) ++ vmov.i64 q0, #0 ++ vdup.i16 q1, r12 ++ @ q8 & q15 should be unaltered and so don't require clipping ++ vmax.s16 q9, q0 ++ cmp r10, #0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif + mov pc, lr + .endm + ++function hevc_loop_filter_luma_body ++ m_filter_luma 8 +endfunc + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) @@ -1263,7 +2098,16 @@ index 166bddb..9bd0a42 100644 + b v_loop_luma_common +endfunc + - ++ ++@ void ff_hevc_v_loop_filter_luma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++ function ff_hevc_v_loop_filter_luma_neon, export=1 hevc_loop_filter_luma_start - push {r5-r11} @@ -1271,14 +2115,6 @@ index 166bddb..9bd0a42 100644 + + sub r4, r0, #4 +v_loop_luma_common: -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] -+ vpush {d8-d15} - sub r0, #4 - vld1.8 {d16}, [r0], r1 @@ -1335,44 +2171,38 @@ index 166bddb..9bd0a42 100644 + + @ no_p[1] + tst r10, #0xff00 -+ itt ne -+ addne r4, r4, r1, lsl #2 ++ add r2, r4, r1, lsl #2 + bne 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] ++1: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] +1: + @ no_q[1] + tst r10, #0xff000000 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f ++ add r2, r0, r1, lsl #2 ++ bne 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+2: -+ @ no_p[0] -+ tst r10, #0xff -+ bne 3f -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] -+ -+3: ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] ++1: + @ no_q[0] + tst r10, #0xff0000 -+ bne 4f -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] -+ -+4: ++ bne 1f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: +bypasswrite: vpop {d8-d15} - pop {r5-r11} @@ -1380,6 +2210,81 @@ index 166bddb..9bd0a42 100644 + pop {r4-r10,pc} endfunc ++.macro m_filter_v_luma_common_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ neg r1, r1 ++ ++ @ p[1] ++ tst r10, #0xff00 ++ add r2, r4, r1, lsl #2 ++ bne 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] ++1: ++ @ p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] ++1: ++ @ q[1] ++ tst r10, #0xff000000 ++ add r2, r0, r1, lsl #2 ++ bne 1f ++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] ++1: ++ @ q[0] ++ tst r10, #0xff0000 ++ bne 1f ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] @@ -1429,13 +2334,6 @@ index 166bddb..9bd0a42 100644 + neg r1, r1 + add r0, r0, r1 + -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + bne 1f + + vst1.8 {d22}, [r0], r1 @@ -1486,8 +2384,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + - endfunc - ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 ++ ++ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ vpop {d8-d15} ++ ++ sub r0, r1 ++ neg r1, r1 ++ bne 1f ++ ++ vst1.16 {q14}, [r0], r1 ++ vst1.16 {q13}, [r0], r1 ++ vst1.16 {q12}, [r0], r1 ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r0], r1 ++ vst1.16 { q9}, [r0] ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ tst r10, #0xff0000 ++ mov r2, r0 ++ bne 1f ++ vst1.16 {d28}, [r2], r1 ++ vst1.16 {d26}, [r2], r1 ++ vst1.16 {d24}, [r2] ++ ++1: ++ tst r10, #0xff000000 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d29}, [r2], r1 ++ vst1.16 {d27}, [r2], r1 ++ vst1.16 {d25}, [r2] ++ ++1: ++ tst r10, #0xff ++ @ r0 = r0 + r1 * 3 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d22}, [r0], r1 ++ vst1.16 {d20}, [r0], r1 ++ vst1.16 {d18}, [r0] ++ ++1: ++ tst r10, #0xff00 ++ bne 1f ++ vst1.16 {d23}, [r2], r1 ++ vst1.16 {d21}, [r2], r1 ++ vst1.16 {d19}, [r2] ++ ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 @@ -1501,9 +2472,7 @@ index 166bddb..9bd0a42 100644 + vld2.8 {d26,d27}, [r0], r1 + vld2.8 {d28,d29}, [r0] + sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + cmp r3, #0 + bne 1f + vst2.8 {d18,d19}, [r0], r1 @@ -1513,122 +2482,509 @@ index 166bddb..9bd0a42 100644 + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: vzip.8 d18, d19 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + vzip.8 d26, d27 + sub r1, r1, #8 + -+ tst r3, #1 -+ bne 1f ++ bmi 1f + vst1.8 {d18}, [r0] +1: add r0, r0, #8 -+ tst r3, #2 -+ bne 2f ++ bcs 2f + vst1.8 {d19}, [r0] -+2: add r0, r0, r1 ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 + -+ tst r3, #4 -+ bne 1f ++ bmi 1f + vst1.8 {d26}, [r0] -+1: add r0, r0, #8 -+ tst r3, #8 -+ it ne -+ bxne lr ++1: it cs ++ bxcs lr ++ add r0, r0, #8 + vst1.8 {d27}, [r0] + bx lr + +endfunc + + ++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r0, r0, r1, lsl #1 ++ vld2.16 {q8, q9 }, [r0], r1 ++ vld2.16 {q10, q11}, [r0], r1 ++ vld2.16 {q12, q13}, [r0], r1 ++ vld2.16 {q14, q15}, [r0] ++ sub r0, r0, r1, lsl #1 ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ ++ cmp r3, #0 ++ bne 1f ++ vst2.16 {q10, q11}, [r0], r1 ++ vst2.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.16 
q10, q11 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C ++ vzip.16 q12, q13 ++ sub r1, r1, #16 ++ ++ bmi 1f ++ vst1.16 {q10}, [r0] ++1: add r0, r0, #16 ++ bcs 2f ++ vst1.16 {q11}, [r0] ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 ++ ++ bmi 1f ++ vst1.16 {q12}, [r0] ++1: it cs ++ bxcs lr ++ add r0, r0, #16 ++ vst1.16 {q13}, [r0] ++ bx lr ++.endm ++ ++ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 -+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 ++ sub r12, r0, r3 + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 ++ cmp r12, #4 + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 -+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] -+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] -+ -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] + ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 ++ cmp r12, #0 ++ add r3, #2 + neg r1, r1 -+ -+ ldr r2, [sp, #0] -+ -+ @ p[1] -+ tst r2, #2 -+ itt ne -+ addne r3, r3, r1, lsl #2 + bne 1f -+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 -+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 + ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 ++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 ++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 ++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 ++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 ++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 ++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 ++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] ++ bx lr ++ ++@ Either split or partial +1: -+ @ q[1] -+ tst r2, #8 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f -+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 -+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 -+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 -+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.8 {d20[7], d21[7]}, [r0], r1 
++ vst2.8 {d20[6], d21[6]}, [r0], r1 ++ vst2.8 {d20[5], d21[5]}, [r0], r1 ++ vst2.8 {d20[4], d21[4]}, [r0] ++1: ++ bmi 2f ++ vst2.8 {d20[3], d21[3]}, [r2], r1 ++ vst2.8 {d20[2], d21[2]}, [r2], r1 ++ vst2.8 {d20[1], d21[1]}, [r2], r1 ++ vst2.8 {d20[0], d21[0]}, [r2] + +2: -+ @ p[0] -+ tst r2, #1 -+ bne 3f -+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] -+ ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ bcs 3f ++ vst2.8 {d18[7], d19[7]}, [r3], r1 ++ vst2.8 {d18[6], d19[6]}, [r3], r1 ++ vst2.8 {d18[5], d19[5]}, [r3], r1 ++ vst2.8 {d18[4], d19[4]}, [r3] +3: -+ @ q[0] -+ tst r2, #4 -+ it ne -+ bxne lr -+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 -+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 -+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 -+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] -+ ++ it mi ++ bxmi lr ++ vst2.8 {d18[3], d19[3]}, [r2], r1 ++ vst2.8 {d18[2], d19[2]}, [r2], r1 ++ vst2.8 {d18[1], d19[1]}, [r2], r1 ++ vst2.8 {d18[0], d19[0]}, [r2] + bx lr -+endfunc + endfunc + ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++.macro m_filter_v_uv2_16 bit_depth ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ sub r12, r0, r3 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ cmp r12, #8 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ cmp r12, #0 ++ add r3, #4 ++ neg r1, r1 ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 ++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 ++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 ++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 ++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 ++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 ++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 ++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 ++ bx lr ++ ++@ Either split or partial ++1: ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.16 {d25[3], d27[3]}, [r0], r1 ++ vst2.16 {d25[2], d27[2]}, [r0], r1 ++ vst2.16 {d25[1], d27[1]}, [r0], r1 ++ vst2.16 {d25[0], d27[0]}, [r0] ++1: ++ bmi 2f ++ vst2.16 {d24[3], d26[3]}, [r2], r1 ++ vst2.16 {d24[2], d26[2]}, [r2], r1 ++ vst2.16 {d24[1], d26[1]}, [r2], r1 ++ vst2.16 {d24[0], d26[0]}, [r2] ++ ++2: ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ 
bcs 3f ++ vst2.16 {d21[3], d23[3]}, [r3], r1 ++ vst2.16 {d21[2], d23[2]}, [r3], r1 ++ vst2.16 {d21[1], d23[1]}, [r3], r1 ++ vst2.16 {d21[0], d23[0]}, [r3] ++3: ++ it mi ++ bxmi lr ++ vst2.16 {d20[3], d22[3]}, [r2], r1 ++ vst2.16 {d20[2], d22[2]}, [r2], r1 ++ vst2.16 {d20[1], d22[1]}, [r2], r1 ++ vst2.16 {d20[0], d22[0]}, [r2] ++ bx lr ++.endm ++ + + function ff_hevc_v_loop_filter_chroma_neon, export=1 hevc_loop_filter_chroma_start ++ ++ sub r0, #2 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1 ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f ++ ++ vst2.8 {d17[0], d18[0]}, [r0], r1 ++ vst2.8 {d17[1], d18[1]}, [r0], r1 ++ vst2.8 {d17[2], d18[2]}, [r0], r1 ++ vst2.8 {d17[3], d18[3]}, [r0], r1 ++ vst2.8 {d17[4], d18[4]}, [r0], r1 ++ vst2.8 {d17[5], d18[5]}, [r0], r1 ++ vst2.8 {d17[6], d18[6]}, [r0], r1 ++ vst2.8 {d17[7], d18[7]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.8 {d17[0]}, [r0], r1 ++ vst1.8 {d17[1]}, [r0], r1 ++ vst1.8 {d17[2]}, [r0], r1 ++ vst1.8 {d17[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.8 {d18[0]}, [r0], r1 ++ vst1.8 {d18[1]}, [r0], r1 ++ vst1.8 {d18[2]}, [r0], r1 ++ vst1.8 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.8 {d18[4]}, [r0], r1 ++ vst1.8 {d18[5]}, [r0], r1 ++ vst1.8 {d18[6]}, [r0], r1 ++ vst1.8 {d18[7]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.8 {d17[4]}, [r0], r1 ++ vst1.8 {d17[5]}, [r0], r1 ++ vst1.8 {d17[6]}, [r0], r1 ++ vst1.8 {d17[7]}, [r0], r1 ++ bx lr ++ ++endfunc ++ ++ ++.macro m_filter_v_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, #4 -@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 - vst1.8 {d4}, [r0] ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #2 ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f ++ ++ vst2.16 {d18[0], d20[0]}, [r0], r1 ++ vst2.16 {d18[1], d20[1]}, [r0], r1 ++ vst2.16 {d18[2], d20[2]}, [r0], r1 ++ vst2.16 {d18[3], d20[3]}, [r0], r1 ++ vst2.16 {d19[0], d21[0]}, [r0], r1 ++ vst2.16 {d19[1], d21[1]}, [r0], r1 ++ vst2.16 {d19[2], d21[2]}, [r0], r1 ++ vst2.16 {d19[3], d21[3]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.16 {d18[0]}, [r0], r1 ++ vst1.16 {d18[1]}, [r0], r1 ++ vst1.16 {d18[2]}, [r0], r1 ++ vst1.16 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r0], r1 ++ vst1.16 {d20[2]}, [r0], r1 ++ vst1.16 {d20[3]}, [r0], r1 ++ sub r0, r0, r1, 
lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.16 {d21[0]}, [r0], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.16 {d19[0]}, [r0], r1 ++ vst1.16 {d19[1]}, [r0], r1 ++ vst1.16 {d19[2]}, [r0], r1 ++ vst1.16 {d19[3]}, [r0], r1 ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_h_loop_filter_chroma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int *_tc, [r2] ++@ uint8_t *_no_p, [r3] ++@ uint8_t *_no_q); [sp+0] ++ ++function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d21}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- hevc_loop_filter_chroma_body +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d19}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f @ Partial write + vst1.8 {d17}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0], r1 +- vst1.8 {d19}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d21}, [r0] ++ vst1.8 {d18}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ vmov r2, r3, d17 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff00 ++ it eq ++ streq r3, [r0, #4] ++ ++ add r0, r1 ++ tst r12, #0xff0000 ++ vmov r2, r3, d18 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff000000 ++ it eq ++ streq r3, [r0, #4] ++ bx lr endfunc + +-function ff_hevc_h_loop_filter_chroma_neon, export=1 ++.macro m_filter_h_chroma_16 bit_depth + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0] ++ vld1.16 {q8}, [r0], r1 ++ vld1.16 {q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0] + sub r0, r0, r1, lsl #1 +- hevc_loop_filter_chroma_body +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0] ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f @ Partial write ++ vst1.16 {q9}, [r0], r1 ++ vst1.16 {q10}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ bne 2f ++ vst1.16 {d18}, [r0] ++2: ++ tst r12, #0xff00 ++ bne 3f ++ add r0, #8 ++ vst1.16 {d19}, [r0] ++ sub r0, #8 ++3: ++ tst r12, #0xff0000 ++ add r0, r1 ++ bne 4f ++ vst1.16 {d20}, [r0] ++4: ++ tst r12, #0xff000000 ++ it ne ++ bxne lr ++ add r0, #8 ++ vst1.16 {d21}, [r0] ++ + bx lr ++.endm ++ + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ @@ -1754,9 +3110,54 @@ index 166bddb..9bd0a42 100644 + b 11b +endfunc + ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_luma_neon_10, export=1 ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ sub r4, r0, #8 ++v_loop_luma_common_10: ++ m_filter_v_luma_common_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++ 
m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++ m_filter_h_chroma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++ m_filter_v_chroma_16 10 + endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 -index 0000000..00eab9e +index 0000000000..00eab9eeee --- /dev/null +++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ @@ -2097,11 +3498,399 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 +diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S +index 13d540e5ff..9b6d745556 100644 +--- a/libavcodec/arm/hevcdsp_idct_neon.S ++++ b/libavcodec/arm/hevcdsp_idct_neon.S +@@ -21,82 +21,6 @@ + #include "libavutil/arm/asm.S" + #include "neon.S" + +-function ff_hevc_idct_4x4_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q0, r1 +- vdup.16 q1, r1 +- vst1.16 {q0, q1}, [r0] +- bx lr +-endfunc +- +-function ff_hevc_idct_8x8_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_16x16_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_32x32_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- mov r3, #16 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +-1: subs r3, #1 +- vstm r0!, {q8-q15} +- bne 1b +- bx lr +-endfunc +- + function ff_hevc_transform_add_4x4_neon_8, export=1 + vldm r1, {q0-q1} + vld1.32 d4[0], [r0], r2 +@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1 + bx lr + endfunc + ++ ++@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ ++ vld1.32 d4[0], [r0], r1 ++ vld1.32 d4[1], [r0], r1 ++ vld1.32 d5[0], [r0], r1 ++ vld1.32 d5[1], [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [r0], r1 ++ vst1.32 d1[0], [r0], r1 ++ vst1.32 d1[1], [r0], r1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #4 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++ ++1: subs r3, #1 ++ vld1.8 d16, [r0] ++ vaddw.u8 q0, q15, d16 ++ 
vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #8 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16 ++ ++1: subs r3, #1 ++ vld1.8 {q8}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #16 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++ ++1: subs r3, #1 ++ vld1.8 {q8, q9}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++ + .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.64 \r0, \r4 + vtrn.64 \r1, \r5 +@@ -263,55 +312,6 @@ endfunc + vqrshrn.s32 \r3, q3, \shift + .endm + +-function ff_hevc_transform_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x00240053 // 36 and 83 +- vmov.32 d0[0], r3 +- +- tr4_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- +-function ff_hevc_transform_luma_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x4a // 74 +- vmov.32 d0[0], r3 +- ldr r3, =0x1d // 29 +- vmov.32 d0[1], r3 +- ldr r3, =0x37 // 55 +- vmov.32 d1[0], r3 +- +- tr4_luma_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_luma_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- + .macro tr8_begin in0, in1, in2, in3 + vmull.s16 q7, \in0, d1[1] // 89 * src1 + vmull.s16 q8, \in0, d1[0] // 75 * src1 +@@ -356,100 +356,6 @@ endfunc + vqrshrn.s32 d8, q5, \shift + .endm + +-function ff_hevc_transform_8x8_neon_8, export=1 +- push {r4-r8} +- vpush {d8-d15} +- mov r5, #16 +- +- adr r3, tr4f +- vld1.16 {d0, d1}, [r3] +- +- // left half +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #128 +- //skip right half if col_limit in r1 is less than 4 +- 
cmp r1, #4 +- blt 1f +- //right half +- add r0, #8 +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #136 +-1: +- // top half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- vstm r0!, {q1-q4} +- +- // bottom half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- //vstm r0, {q1-q4} +- vst1.16 {q1-q2}, [r0] +- add r0, #32 +- vst1.16 {q3-q4}, [r0] +- sub r0, #32 +- vpop {d8-d15} +- pop {r4-r8} +- bx lr +-endfunc + + .align 4 + tr4f: +@@ -463,3 +369,11 @@ tr16: + .word 0x00500046 // 80, d2[2] = 70 + .word 0x0039002b // 57, d2[0] = 43 + .word 0x00190009 // 25, d2[2] = 9 ++ ++#define BIT_DEPTH 8 ++#include "hevc_idct_fn_neon.S" ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "hevc_idct_fn_neon.S" ++ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..b6c48ee 100644 +index 55918077e2..e708b7c074 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,26 @@ +@@ -22,11 +22,41 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -2113,6 +3902,11 @@ index 5591807..b6c48ee 100644 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + ++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ +#ifdef RPI +void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, const int32_t tc[2], @@ -2123,44 +3917,201 @@ index 5591807..b6c48ee 100644 +void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); ++ ++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void 
ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); +#endif + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); + void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); ++ + void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - -+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +- ptrdiff_t stride); ++ ptrdiff_t stride); + -+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); + -+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); 
-+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); + -+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, -+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); + -+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++#if RPI_HEVC_SAND ++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, 
int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++#endif ++ ++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++#if RPI_HEVC_SAND ++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + ++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int 
sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++#endif + ++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ - ptrdiff_t srcstride, int height, \ -@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -2176,227 +4127,110 @@ index 5591807..b6c48ee 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } -+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, ++ MvField *curr, MvField *neigh, uint8_t *bs); ++ ++ ++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t 
offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = 3; // BIT_DEPTH - 5 -+ int cwidth = 0; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ switch(cwidth){ -+ case 8: -+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 16: -+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 32: -+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 64: -+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ default: -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } ++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, 
sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++#if RPI_HEVC_SAND ++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ // Width 32 already dealt with -+ // width 16 code works in double lines -+ if (width == 16 && (height & 1) == 0) { -+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, -+ sao_offset_val_u, sao_left_class_u, -+ sao_offset_val_v, sao_left_class_v, -+ width, height); -+ } -+ else -+ { -+ const int shift = 3; // BIT_DEPTH - 5 -+ int k, y, x; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table_u[32] = { 0 }; -+ int8_t offset_table_v[32] = { 0 }; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ for (k = 0; k < 4; k++) -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) -+ { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+ -+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) -+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, -+ int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) +{ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val[8]; // padding of 3 for vld -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ int cwidth = 0; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; -+ } -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ switch (cwidth) { -+ case 32: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ case 64: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ default: -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int idx = diff0 + diff1; -+ if (idx) -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} ++#endif ++#endif + + -+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+ if (width == 32 && (height & 7) == 0) { -+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); -+ } -+ else -+ { -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 
4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val_u[8]; // padding of 3 for vld -+ int8_t sao_offset_val_v[8]; // padding of 3 for vld -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; -+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; -+ } -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } -+} -+#undef CMP -+ -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ MvField *curr, MvField *neigh, uint8_t *bs); ++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { @@ -2407,7 +4241,9 @@ index 5591807..b6c48ee 100644 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; +#ifdef RPI + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; @@ -2416,21 +4252,68 @@ index 5591807..b6c48ee 100644 c->idct[0] = ff_hevc_transform_4x4_neon_8; c->idct[1] = ff_hevc_transform_8x8_neon_8; c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; - c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8; +- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; +- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; +- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; ++ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; ++ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; ++ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = 
ff_hevc_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; ++#endif c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { -+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; -+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; -+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; -+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; -+ } -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; ++#endif ++#endif put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -2452,22 +4335,711 @@ index 5591807..b6c48ee 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } ++ else if 
(bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; ++#endif ++ c->idct[0] = ff_hevc_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; ++ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10; ++ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10; ++ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10; ++ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; ++#endif ++ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ ++#if 
SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; ++#endif ++#endif ++ } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } +diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7cc5cd5e5c +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1] ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vld1.16 {d0}, [r0, :64], r2 ++ vld1.16 {d1}, [r0, :64], r2 ++ vld1.16 {d2}, [r0, :64], r2 ++ vld1.16 {d3}, [r0, :64], r2 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r2 ++ vst1.16 {d1}, [r0, :64], r2 ++ vst1.16 {d2}, [r0, :64], r2 ++ vst1.16 {d3}, [r0, :64], r2 ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vdup.i16 q9, r3 ++ vld1.16 {d0}, [r0, :64], r1 ++ vld1.16 {d1}, [r0, :64], r1 ++ vdup.16 q15, r2 ++ vld1.16 {d2}, [r0, :64], r1 ++ vld1.16 {d3}, [r0, :64], r1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r1 ++ vst1.16 {d1}, [r0, :64], r1 ++ vst1.16 {d2}, [r0, :64], r1 ++ vst1.16 {d3}, [r0, :64], r1 ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #2 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r2 ++ vqadd.s16 q0, q10 ++ vld1.16 {q2}, [r0, :128], r2 ++ vqadd.s16 q1, q11 ++ vld1.16 {q3}, [r0, :128], r2 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {q0}, [r0, :128], r2 ++ vmin.s16 q2, q2, q9 ++ vst1.16 {q1}, [r0, :128], r2 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q2}, [r0, :128], r2 ++ vst1.16 {q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #1 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function 
JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ mov r12, #2 ++ vdup.16 q15, r2 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vld1.16 {q0}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r1 ++ vqadd.s16 q0, q15 ++ vld1.16 {q2}, [r0, :128], r1 ++ vqadd.s16 q1, q15 ++ vld1.16 {q3}, [r0, :128], r1 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q8 ++ vmax.s16 q1, q8 ++ vmax.s16 q2, q8 ++ vmax.s16 q3, q8 ++ vmin.s16 q0, q9 ++ vmin.s16 q1, q9 ++ vst1.16 {q0}, [r0, :128], r1 ++ vmin.s16 q2, q9 ++ vst1.16 {q1}, [r0, :128], r1 ++ vmin.s16 q3, q9 ++ vst1.16 {q2}, [r0, :128], r1 ++ vst1.16 {q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #8 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q0, q1}, [r0, :128], r2 ++ vst1.16 {q2, q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #4 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #8 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r1 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0, :128], r1 ++ vst1.16 {q2, q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #32 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! 
++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
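++ @ vst2 re-interleaves the updated pairs; the first store post-increments
++ @ by 32 bytes and the second adds the stride (reduced by 32 above) so
++ @ that r0 steps to the next row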
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..08a021d +index 0000000000..30113d9c93 --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,862 @@ +@@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -2491,124 +5063,211 @@ index 0000000..08a021d +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro init_sao_band -+ pld [r1] -+ vld1.8 {q0, q1}, [r2] // offset table -+ ldr r2, [sp, #0] // stride_dst -+ ldr r12, [sp, #4] // height -+ vmov.u8 q3, #128 -+.endm ++.set EDGE_SRC_STRIDE, 160 ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+// 128 in q3 -+// input q8 - q11 -+.macro sao_band_64 -+ vtbl.8 d24, {d0, d1, d2, d3}, d24 -+ vadd.s8 q8, q3 -+ vtbl.8 d25, {d0, d1, d2, d3}, d25 -+ vadd.s8 q9, q3 -+ vtbl.8 d26, {d0, d1, d2, d3}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d0, d1, d2, d3}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0, d1, d2, d3}, d29 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 + vqadd.s8 q9, q13 -+ vtbl.8 d30, {d0, d1, d2, d3}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d0, d1, d2, d3}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 ++ ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 +.endm + -+function ff_hevc_sao_band_w8_neon_8, export=1 -+ init_sao_band -+1: subs r12, #8 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {d18}, [r1, :64], r3 -+ vld1.8 {d19}, [r1, :64], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {d20}, [r1, :64], r3 -+ vld1.8 {d21}, [r1, :64], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {d22}, [r1, :64], r3 -+ vld1.8 {d23}, [r1, :64], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ vst1.8 {d18}, [r0, :64], r2 -+ vst1.8 {d19}, [r0, :64], r2 -+ vst1.8 {d20}, [r0, :64], r2 -+ vst1.8 {d21}, [r0, :64], r2 -+ vst1.8 {d22}, [r0, :64], r2 -+ vst1.8 {d23}, [r0, :64], r2 -+ bne 1b ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+ bx lr ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q12 
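++@ Applies a band offset to the 16 (16-bit) pixels in Q0/Q1: vshrn by
++@ (bit_depth - 5) reduces each sample to its 5-bit band index, the signed
++@ offset is looked up in the 32-entry byte tables XLAT0/XLAT1 (same table
++@ for luma, separate U/V tables for chroma), widen-added back onto the
++@ pixel and clamped to [Q_MIN, Q_MAX]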
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class ++ ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 ++ ++ ldr r12, [sp, #20] @ height ++ pld [r1] ++ ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+function ff_hevc_sao_band_w16_neon_8, export=1 -+ init_sao_band -+1: subs r12, #4 -+ vld1.8 {q8}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {q9}, [r1, :128], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {q11}, [r1, :128], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8}, [r0, :128], r2 -+ vst1.8 {q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bne 1b + -+ bx lr -+endfunc ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+function ff_hevc_sao_band_w32_neon_8, export=1 -+ init_sao_band -+1: subs r12, #2 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128], r2 -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ bne 1b ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ bx lr -+endfunc ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+function ff_hevc_sao_band_w64_neon_8, export=1 -+ init_sao_band ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 ++ ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++ ++ mov sp, r4 ++ ++ ldr r12, [sp, #28] @ height ++ pld [r1] + -+ push {r4, lr} + subs r12, #1 -+ mov r4, r1 -+ it ne -+ addne r4, r3 ++ add r4, r1, r3 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_64_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 + +1: subs r12, #1 + vldm r1, {q8-q11} + pld [r4] -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 + add r1, r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ + it ne + addne r4, r3 + vstm r0, {q8-q11} @@ -2618,8 +5277,113 @@ index 0000000..08a021d + pop {r4, pc} +endfunc + ++@ ff_hevc_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+@ ff_hevc_sao_band_c_w64_neon_8( ++function ff_hevc_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_32_neon_8( +@ uint8_t * dst [r0] +@ uint8_t 
* src [r1] +@ uint32_t dst_stride [r2] @@ -2631,707 +5395,1535 @@ index 0000000..08a021d +@ int width sp[16] +@ int height sp[20] + -+@ As this is often done in-place on the frame buffer it is worth preloading -+@ the pixel values but we want to beware of loading ouside our buffer to avoid -+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++function ff_hevc_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c + -+function ff_hevc_sao_band_c_neon_8, export=1 -+ mov r12, sp -+ push {r4-r8, lr} // 24 bytes ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 + -+ ldm r12, {r4-r7} ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 + -+ add r4, #2 -+ add r6, #2 -+ vld1.16 {d16}, [r4] @ Unaligned -+ lsl r5, r5, #3 -+ vld1.16 {d18}, [r6] -+ pld [r1] -+ vmov.i8 d17, #0 -+ mov r4, r1 -+ vmov.i8 d19, #0 -+ lsl r7, r7, #3 -+ vdup.8 q1, r5 -+ ldr r5, [r12, #16] @ width -+ vdup.8 q2, r7 -+ ldr r12, [r12, #20] -+ vqmovn.s16 d0, q8 -+ cmp r5, #16 @ At some point we may want a table lookup -+ vqmovn.s16 d1, q9 -+ vmov.i8 q3, #128 -+ beq 16f ++ pld [r4] + -+ @ d0 U lookup -+ @ d1 V lookup -+ @ q1 U raw offset -+ @ q2 V raw offset -+ @ q3 #128 ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ @ r4 = r1 = src - Inteded for preload pointer -+ @ r12 = height ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush 
{q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} ++ add r1, r3 ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ add r1, r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ++1: subs r12, #2 ++ vld1.16 { q8, q9 }, [r1, :128], r3 ++ vld1.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ldr lr, [sp, #16] ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.16 { q8}, [r1, :128], r3 ++ vld1.16 { q9}, [r1, :128], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8}, [r0, :128], r2 ++ vst1.16 { q9}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.16 {d16}, [r1, :64], r3 ++ vld1.16 {d17}, [r1, :64], r3 ++ vld1.16 {d18}, [r1, :64], r3 ++ vld1.16 {d19}, [r1, :64], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 {d16}, [r0, :64], r2 ++ vst1.16 {d17}, [r0, :64], r2 ++ vst1.16 {d18}, [r0, :64], r2 ++ vst1.16 {d19}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ vpush {q4-q7} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ 
vdup.i16 q15, lr ++ sub r2, #96 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q4, q5 }, [r1, :128]! ++ vld2.16 { q6, q7 }, [r1, :128]! ++ vld2.16 { q8, q9 }, [r1, :128]! ++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ sub r1, #96 ++ ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + -+ @ Might (unlikely) be called with height == 1 -+ subs r12, #1 + it ne + addne r4, r3 + -+1: -+ subs r12, #1 -+ vld2.8 {q8-q9}, [r1, :128]! -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ sub r1, #32 -+ vsub.u8 q15, q11, q2 -+ pld [r4] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 ++ vst2.16 { q4, q5 }, [r0, :128]! ++ vst2.16 { q6, q7 }, [r0, :128]! ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 + -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r3 @ Do not inc on final pass -+ vst2.8 {q8-q9}, [r0, :128]! -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 + bpl 1b + -+ pop {r4-r8, pc} -+ -+@ -- width 16 (UV pairs) -- -+16: -+ subs r12, #2 -+ it ne -+ addne r4, r4, r3, lsl #1 -+ -+1: -+ subs r12, #2 -+ vld2.8 {q8-q9}, [r1, :128], r3 -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ pld [r4] -+ vsub.u8 q15, q11, q2 -+ pld [r4, r3] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 -+ -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r4, r3, lsl #1 -+ vst2.8 {q8-q9}, [r0, :128], r2 -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ bpl 1b -+ -+ pop {r4-r8, pc} ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm + ++function ff_hevc_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 +endfunc + + -+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 -+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 -+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 -+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 -+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 ++@ ff_hevc_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ sub r2, #32 ++ sub r3, #32 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q8, q9 }, [r1, :128]! 
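++ @ One row is 16 CbCr pairs (64 bytes): the first vld2 post-increments by
++ @ 32 and the second adds the source stride, which was reduced by 32
++ @ above, leaving r1 at the start of the next row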
++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} +.endm + ++function ff_hevc_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc + -+// input -+// a in q0 - q3 -+// c in q4 - q7 -+// b in q8 - q11 -+// offset table r4,r5 and r6,r7 -+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C -+// output in q0 - q3 -+// clobbers q12 - q15 + -+@ a <- c <- b ++@ ff_hevc_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ ldr lr, [sp, #24] @ width ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #2 ++ vld2.16 {d16, d17}, [r1, :128], r3 ++ vld2.16 {d18, d19}, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 {d16, d17}, [r0, :128], r2 ++ vst2.16 {d18, d19}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values +@ -+@ It appears that Neon can stall if you try and use results too soon so we try to -+@ spread our instruction out ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + -+.macro edgeidx64 ++function edge_64b_body_8 + -+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 + -+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q1, q5 -+ vcgt.u8 q2, q2, q6 -+ vcgt.u8 q3, q3, q7 ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 + -+ vsub.s8 q0, q0, q12 // a = sign(c-a) -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 + -+ vsub.s8 q0, q0, q12 -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 + -+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q2, q14 -+ vadd.s8 q3, q3, q15 ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 + -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ @ whilst vmov dn, rm, rn exists it is a vfp instruction -+ @ and causes a stall till neon pipe empty - so don't do that! 
-+ vmov d26[0], r4 -+ vmov d26[1], r5 -+ vmov d27[0], r6 -+ vmov d27[1], r7 -+ vadd.s8 q2, q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 + -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ vld1.8 {d26, d27}, [r5] + -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + -+ vtbl.8 d3, {d27}, d3 ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 + -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 + -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q0, q12 -+ vqadd.s8 q1, q1, q14 -+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d3, {d27}, d3 + -+ vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 + -+ vtbl.8 d7, {d27}, d7 -+ vzip.8 q2, q3 ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + -+ vsub.s8 q0, q0, q15 -+ vqadd.s8 q2, q2, q12 -+ vqadd.s8 q3, q3, q14 -+ vsub.s8 q1, q1, q15 -+ vsub.s8 q2, q2, q15 -+ vsub.s8 q3, q3, q15 ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + -+.endm ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vuzp.8 q0, q1 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vmov.i64 q12, #0 ++ ++ vzip.8 q0, q1 ++ ++ vdup.i16 q13, r4 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 + -+function edge_w64_body -+ edgeidx64 -+ vstm r0, {q0-q3} -+ add r0, r0, r2 + bx lr +endfunc + -+.macro init_edge_64 -+ push {r4-r8,lr} -+ ldr r12, [sp, #24] // height -+ ldr r5, [sp, #28] // sao_offset_val_table -+ ldrd r4, r5, [r5] -+ mov r6, r4 -+ mov r7, r5 -+.endm + -+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #15 -+ vext.8 q1, q4, q5, #15 -+ vext.8 q2, q5, q6, #15 -+ vext.8 q3, q6, q7, #15 -+ // load b -+ vext.8 q8, q4, q5, #1 -+ vext.8 q9, q5, q6, #1 -+ vext.8 q10, q6, q7, #1 -+ vext.8 q11, q7, q12, #1 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ ++ vsub.s8 q0, q3 ++ vsub.s8 q10, q9 ++ vadd.s8 q0, q10 @ a = sign(c-a) ++ ++ vadd.s8 q0, q14 ++ vuzp.8 d0, d1 ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr +endfunc + -+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, r3 ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 ++ vsub.s16 q0, q3 @ a = sign(c-a) ++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.s16 q0, q3 ++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vadd.s8 q0, q0, q14 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 ++ push {r4-r6, lr} @ 16 bytes ++.set sp_base, 16 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++ mov r12, #2 ++.if \is_chroma ++ ldr r4, [sp, #16] ++.set sp_base, sp_base + 4 ++.endif ++ vld1.8 {d16[2]}, [r3], r12 ++ vld1.8 {d16[0]}, [r3], r12 ++ vld1.8 {d16[1]}, [r3], r12 ++ vld1.8 {d16[3]}, [r3], r12 ++ vld1.8 {d16[4]}, [r3] ++.if \is_chroma ++ vld1.8 {d17[2]}, [r4], r12 ++ vld1.8 {d17[0]}, [r4], r12 ++ vld1.8 {d17[1]}, [r4], r12 ++ vld1.8 {d17[3]}, [r4], r12 ++ vld1.8 {d17[4]}, [r4] ++.else ++ vmov d17, d16 ++.endif ++ ++@ Setup constant registers ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++.else ++ vmov.u8 q15, #128 ++.endif ++ vmov.u8 q14, #2 ++.endif ++ movw r3, EDGE_SRC_STRIDE ++ ++@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++.if \setup_64b ++ sub r5, sp, #16 ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may 
have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ cmp r12, #8 ++.endif ++ ldr r12, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.if \check_w4 ++ it lt ++ addlt r6, #16 ++.endif ++ ldr r6, [r6, r12, lsl #2] ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++ blx r6 ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++ blx r6 ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #8 ++1: vldm r1, {d7-d16} ++ subs r12, #1 ++ add r1, r3 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 ++ vext.8 q0, q3, q4, #(16 - \pb) ++ vext.8 q1, q4, q5, #(16 - \pb) ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) + // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 -+ bl edge_w64_body ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {q4-q5}, [r1] ++ sub r1, #\pb ++ vld1.8 {q0-q1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q8-q9}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {q6-q7}, [r1] ++ sub r1, #\pb ++ vld1.8 {q2-q3}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q10-q11}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ sub r3, #\pb * 2 ++ ++1: subs r12, #1 ++ ++ vld1.64 {q0}, [r1] @ load a ++ add r1, #\pb ++ vld1.64 {q1}, [r1, :128] @ load c ++ add r1, #\pb ++ vld1.64 {q2}, [r1], r3 @ load b ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {d2}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d0}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d4}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {d3}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d5}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #4 ++ ++ vld1.32 {d2[0]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[0]}, [r1] ++ add r1, #(\pb * 2) ++ vld1.32 {d4[0]}, [r1], r3 @ R ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[1]}, [r1], r3 @ L ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 @ R ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[1]}, [r1], r3 @ M ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], 
r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0-q1}, [r1, :128]! ++ vld1.8 {q2-q3}, [r1, :128], r3 ++ sub r1, #32 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q8-q9}, [r1, :128]! ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ sub r1, #32 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bgt 1b ++ bx r6 ++.endm + -+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 ++.macro edge_32bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 + // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ sub r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #31 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #1 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #33 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vld1.8 {q0-q1}, [r1, :128], r3 ++ vld1.8 {q4-q5}, [r1, :128], r3 + -+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #33 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #1 -+ vld1.8 {q8-q9}, [r1]! 
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #31 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov q2, q4 ++ vmov q3, q5 ++ vld1.8 {q8-q9}, [r1, :128], r3 ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ vmov q6, q8 ++ vmov q7, q9 + ++ bl \body_fn + -+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( -+@ uint8_t *_dst, r0 -+@ uint8_t *_src, r1 -+@ ptrdiff_t stride_dst, r2 -+@ ptrdiff_t stride_src, r3 -+@ int height, sp[0] -+@ int16_t *sao_offset_table_u, sp[4] -+@ int16_t *sao_offset_table_v); sp[8] -+@ int eo sp[12] ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 + -+function ff_hevc_sao_edge_c_w64_neon_8, export=1 -+ push {r4-r8,lr} // 6 reg = 24 -+ ldr r5, [sp, #28] // sao_offset_val_table_u -+ ldr r7, [sp, #32] // sao_offset_val_table_v -+ -+ @ Load and rearrange offsets -+ @ Also "convert" from 16bit to 8bit -+ ldrb r4, [r5, #2] -+ ldrb r8, [r5, #4] -+ ldrb r6, [r7, #2] -+ ldrb r12, [r7, #4] -+ orr r4, r4, r8, lsl #8 -+ orr r6, r6, r12, lsl #8 -+ ldrb r8, [r5, #6] -+ ldrb r12, [r7, #6] -+ orr r4, r4, r8, lsl #24 -+ orr r6, r6, r12, lsl #24 -+ ldrb r5, [r5, #8] -+ ldrb r7, [r7, #8] -+ -+ ldr r12, [sp, #36] // e0 -+ adr r8, edge_c_tbl_w64 -+ ldr r8, [r8, r12, lsl #2] -+ -+ ldr r12, [sp, #24] // height -+ vpush {d8-d15} -+ mov pc, r8 -+ -+edge_c_tbl_w64: -+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 -+ -+ff_hevc_sao_edge_c_eo0_w64_neon_8: -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #14 -+ vext.8 q1, q4, q5, #14 -+ vext.8 q2, q5, q6, #14 -+ vext.8 q3, q6, q7, #14 -+ // load b -+ vext.8 q8, q4, q5, #2 -+ vext.8 q9, q5, q6, #2 -+ vext.8 q10, q6, q7, #2 -+ vext.8 q11, q7, q12, #2 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+ -+ff_hevc_sao_edge_c_eo1_w64_neon_8: -+ sub r1, r3 -+ // load a -+ vldm r1, {q0-q3} -+ add r1, r3 -+ // load c -+ vldm r1, {q4-q7} -+ add r1, r3 -+1: subs r12, #1 -+ // load b -+ vldm r1, {q8-q11} -+ add r1, r3 -+ bl edge_w64_body + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q8 ++ vmov.64 q1, q9 ++ + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vmov.64 q4, q10 ++ vmov.64 q5, q11 ++ bgt 1b ++ bx r6 ++.endm + -+ff_hevc_sao_edge_c_eo2_w64_neon_8: -+1: sub r1, r3 ++.macro edge_16b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1, :128], r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ // copy c to a ++ vmov.64 q0, q1 ++ // copy b to c ++ vmov.64 q1, q2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {d0}, [r1, :64], r3 ++ vld1.8 {d2}, [r1, :64], r3 ++ ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov.64 d1, d2 ++ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d5}, [r1, :64], r3 ++ vmov.64 d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ ++ // copy c to a ++ vmov.64 d0, d4 ++ // copy b to c ++ vmov.64 d2, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ mov r6, lr ++debug_me: ++ sub r1, r3 ++ // load a ++ vld1.32 {d0[0]}, [r1], r3 ++ vld1.32 {d0[1]}, [r1], r3 ++ ++1: subs r12, #4 ++ @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.32 {d4[0]}, [r1], r3 ++ vld1.32 {d4[1]}, [r1], r3 ++ vld1.32 {d5[0]}, [r1], r3 ++ vld1.32 {d5[1]}, [r1], r3 ++ ++ vmov.32 d1, d4 ++ vext.32 d2, d0, d4, #1 ++ vext.32 d3, d4, d5, #1 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ vmov.32 d0, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #32 ++ sub r3, #(32 - \pb) ++ ++1: sub r1, r3 + // load a + // TODO: fix unaligned load + // don't reload a like in eo1 -+ sub r1, #2 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #30 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #34 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ // load b ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(64 + \pb) ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b + -+ff_hevc_sao_edge_c_eo3_w64_neon_8: -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #2 -+ vld1.8 {q0-q1}, [r1]! 
-+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #34 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #30 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc -+ -+ -+.macro init_edge_32 -+ ldr r12, [sp, #4] // sao_offset_val_table -+ vld1.32 {d31}, [r12] -+ ldr r12, [sp] // height ++ add r3, #(32 - \pb) ++ bx r6 +.endm + -+.macro diff out0, tmp0, in0, in1 -+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 ++.macro edge_32bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ vld1.8 {q0-q1}, [r1], r3 ++ vld1.8 {q2-q3}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {q4-q5}, [r1, :128], r3 ++ vld1.8 {q6-q7}, [r1, :128] ++ // load b ++ add r1, #\pb ++ vld1.8 {q8-q9}, [r1], r3 ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0], r2 ++ vst1.8 {q2-q3}, [r0], r2 ++ bgt 1b ++ ++ bx r6 +.endm + -+.macro table32 -+ vmov.s8 q10, #2 -+ vadd.s8 q0, q10 -+ vadd.s8 q1, q10 -+ vmov.s8 q10, #128 -+ vtbl.8 d0, {d31}, d0 -+ vadd.s8 q11, q2, q10 -+ vtbl.8 d1, {d31}, d1 -+ vadd.s8 q12, q3, q10 -+ vtbl.8 d2, {d31}, d2 -+ vqadd.s8 q11, q0 -+ vtbl.8 d3, {d31}, d3 -+ vqadd.s8 q12, q1 -+ vsub.s8 q0, q11, q10 -+ vsub.s8 q1, q12, q10 -+ vst1.8 {q0-q1}, [r0, :128], r2 ++.macro edge_16b_e2, body_fn, pb ++ mov r6, lr ++ add r3, #\pb ++ ++1: sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load b ++ vld1.8 {q2}, [r1] ++ sub r1, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 +.endm + -+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ sub r1, #4 -+1: subs r12, #1 -+ vld1.8 {q13-q14}, [r1]! 
-+ vld1.32 d30, [r1], r3 -+ sub r1, #32 -+ // a -+ vext.8 q0, q13, q14, #3 -+ vext.8 q1, q14, q15, #3 -+ vshr.u64 d24, d30, #24 -+ // c -+ vext.8 q2, q13, q14, #4 -+ vext.8 q3, q14, q15, #4 -+ vshr.u64 d16, d30, #32 -+ // diff0 -+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 -+ diff d18, d25, d24, d16 -+ // -diff1 -+ vext.s8 q0, q13, q14, #1 -+ vext.s8 q1, q14, q9, #1 ++.macro edge_8bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb + -+ vsub.s8 q0, q13, q0 //diff0 + diff1 -+ vsub.s8 q1, q14, q1 -+ table32 -+ bne 1b -+ vpop {q4-q7} ++1: sub r1, r3 ++ vld1.8 {d0}, [r1], r3 ++ vld1.8 {d1}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {d2}, [r1, :64], r3 ++ vld1.8 {d3}, [r1, :64] ++ // load b ++ add r1, #\pb ++ vld1.8 {d4}, [r1], r3 ++ vld1.8 {d5}, [r1] ++ sub r1, #(\pb * 2) + -+ bx lr ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ @ line 0 {d0[0], -, - } r1 lo ++ vld1.32 {d0[0]}, [r1], r3 ++ subs r12, #4 ++ @ Line 1 {d0[1], d2[0], - } r1 lo ++ vld1.32 {d0[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d2[0]}, [r1], r3 ++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb * 2 ++ vld1.32 {d4[0]}, [r1], r3 ++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb * 2 ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1], r3 ++ @ Line 3 {-, d3[1], d5[0]} r1 mid ++ vld1.32 {d3[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 ++ @ Line 4 {-, -, d5[1]} r1 hi ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_64b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_32bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_16b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_8bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb 
++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ // load a -+ sub r1, r3 -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ // load c -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) -+ vadd.s8 q0, q4, q12 //diff0 + diff1 -+ vadd.s8 q1, q5, q13 -+ table32 -+ // CMP ( c, a ) -+ vneg.s8 q12, q4 -+ vneg.s8 q13, q5 -+ // c -+ vmov.64 q2, q8 -+ vmov.64 q3, q9 -+ bne 1b -+ vpop {q4-q7} -+ bx lr ++@ void ff_hevc_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {d8-d15} -+ // load a -+ sub r1, r3 -+ sub r1, #8 -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #7 -+ vext.8 q1, q11, q12, #7 -+ // load c -+ vld1.8 {d9}, [r1, :64]! -+ vld1.8 {q2-q3}, [r1, :64], r3 -+ sub r1, #8 -+ vext.8 q4, q4, q2, #15 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! 
-+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #9 -+ vext.8 q9, q11, q12, #9 -+ vext.8 q6, q10, q11, #8 -+ vext.8 q7, q11, q12, #8 -+ vext.8 q5, q10, q11, #7 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 -+ // inputs for next loop iteration -+ // a -+ vmov.8 q0, q4 -+ vext.8 q1, q2, q3, #15 -+ // c -+ vmov.8 q2, q6 -+ vmov.8 q3, q7 -+ vmov.8 q4, q5 -+ bne 1b -+ vpop {d8-d15} -+ bx lr ++@ void ff_hevc_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 -+ init_edge_32 -+ sub r1, r3 -+ // load a -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #1 -+ vext.8 q1, q11, q12, #1 -+ // load c -+ vld1.8 {q2-q3}, [r1, :64]! -+ vld1.8 {d30}, [r1, :64], r3 -+ sub r1, #40 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #7 -+ vext.8 q9, q11, q12, #7 -+ vext.8 q14, q12, q10, #7 ++@ void ff_hevc_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] + -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++function ff_hevc_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 ++@ ff_hevc_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+ // inputs for next loop iteration -+ // a -+ vext.8 q0, q2, q3, #1 -+ vext.8 q1, q3, q15, #1 -+ // c -+ vext.8 q2, q8, q9, #1 -+ vext.8 q3, q9, q14, #1 -+ vext.8 d30, d28, d2, #1 -+ bne 1b -+ bx lr ++function ff_hevc_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, 
#8] ++ ++function ff_hevc_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 32 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f ++endfunc ++ ++@ void ff_hevc_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f ++endfunc ++ ++ ++@ ff_hevc_sao_edge_c_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 +endfunc + diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h -index 39713ed..25eb52b 100644 +index 1be52e7a12..bae5df4bc6 100644 --- a/libavcodec/avcodec.h +++ b/libavcodec/avcodec.h @@ -410,6 +410,8 @@ enum AVCodecID { @@ -3343,15 +6935,7 @@ index 39713ed..25eb52b 100644 /* various PCM "codecs" */ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs AV_CODEC_ID_PCM_S16LE = 0x10000, -@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext { - #define FF_BUG_DC_CLIP 4096 - #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. - #define FF_BUG_TRUNCATED 16384 -+#define FF_BUG_GMC_UNSUPPORTED 32768 - - /** - * strictly follow the standard (MPEG-4, ...). 
-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -3361,7 +6945,7 @@ index 39713ed..25eb52b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { +@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext { #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 #endif @@ -3371,11 +6955,12 @@ index 39713ed..25eb52b 100644 + * @author jc (08/02/2016) + */ + void * get_buffer_context; ++ } AVCodecContext; AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c62..ccfa991 100644 +index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; @@ -3394,140 +6979,11 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9d94b72..535ebf0 100644 ---- a/libavcodec/codec_desc.c -+++ b/libavcodec/codec_desc.c -@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { - .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, - }, -+ { -+ .id = AV_CODEC_ID_H264_MVC, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "h264_mvc", -+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, - - /* various PCM "codecs" */ - { -diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index efe3555..16358aa 100644 ---- a/libavcodec/h264.h -+++ b/libavcodec/h264.h -@@ -126,7 +126,9 @@ enum { - NAL_END_STREAM = 11, - NAL_FILLER_DATA = 12, - NAL_SPS_EXT = 13, -+ NAL_SPS_SUBSET = 15, - NAL_AUXILIARY_SLICE = 19, -+ NAL_SLICE_EXT = 20, - NAL_FF_IGNORE = 0xff0f001, - }; - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index ce4bab2..b9b0c78 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -58,6 +58,8 @@ typedef struct H264ParseContext { - uint8_t parse_history[6]; - int parse_history_count; - int parse_last_mb; -+ int is_mvc; -+ int slice_ext; - } H264ParseContext; - - -@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - } else if (state <= 5) { - int nalu_type = buf[i] & 0x1F; - if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || -- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { -+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || -+ nalu_type == NAL_SPS_SUBSET) { - if (pc->frame_start_found) { - i++; - goto found; - } - } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || -- nalu_type == NAL_IDR_SLICE) { -+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { - state += 8; -+ -+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); - continue; - } - state = 7; - } else { - p->parse_history[p->parse_history_count++] = buf[i]; -- if (p->parse_history_count > 5) { -+ if (p->parse_history_count > 8) { - unsigned int mb, last_mb = p->parse_last_mb; - GetBitContext gb; - -- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); -+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); - p->parse_history_count = 0; - mb= get_ue_golomb_long(&gb); - p->parse_last_mb = mb; -@@ -145,7 
+150,7 @@ found: - pc->frame_start_found = 0; - if (p->is_avc) - return next_avc; -- return i - (state & 5) - 5 * (state > 7); -+ return i - (state & 5) - 8 * (state > 7); - } - - static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, -@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, - } - } - -- parse_nal_units(s, avctx, buf, buf_size); -+ if (!p->is_mvc) -+ parse_nal_units(s, avctx, buf, buf_size); - - if (avctx->framerate.num) - avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, - if ((state & 0xFFFFFF00) != 0x100) - break; - nalu_type = state & 0x1F; -- if (nalu_type == NAL_SPS) { -+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { - has_sps = 1; - } else if (nalu_type == NAL_PPS) - has_pps = 1; -@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { - .parser_close = h264_close, - .split = h264_split, - }; -+ -+static av_cold int init_mvc(AVCodecParserContext *s) -+{ -+ H264ParseContext *p = s->priv_data; -+ int ret = init(s); -+ if (ret < 0) -+ return ret; -+ -+ p->is_mvc = 1; -+ return 0; -+} -+ -+AVCodecParser ff_h264_mvc_parser = { -+ .codec_ids = { AV_CODEC_ID_H264_MVC }, -+ .priv_data_size = sizeof(H264ParseContext), -+ .parser_init = init_mvc, -+ .parser_parse = h264_parse, -+ .parser_close = h264_close, -+ .split = h264_split, -+}; diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..955e426 100644 +index c1fa67f67b..6f99021339 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c -@@ -41,8 +41,196 @@ +@@ -41,8 +41,346 @@ #include "hevc.h" #include "profiles.h" @@ -3535,33 +6991,19 @@ index b478065..955e426 100644 + #include "rpi_qpu.h" + #include "rpi_shader.h" + #include "rpi_shader_cmd.h" ++ #include "rpi_shader_template.h" + #include "rpi_zc.h" ++ #include "libavutil/rpi_sand_fns.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 + -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif ++ #include "pthread.h" ++ #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); -+ -+ // We can pred any block height but annoyingly if we we do then the TMU cache -+ // explodes and it goes even slower :-( -+ #if 0 -+ #define Y_P_MAX_H 16 -+ #define Y_B_MAX_H 16 -+ #else -+ #define Y_P_MAX_H 64 -+ #define Y_B_MAX_H 64 -+ #endif +#endif + -+// #define DISABLE_MC -+ -+#define DISABLE_CHROMA 0 +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) @@ -3573,8 +7015,6 @@ index b478065..955e426 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif -+ -+#define Y_B_ONLY 0 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; @@ -3584,18 +7024,23 @@ index b478065..955e426 100644 +#define MC_DUMMY_X (-32) +#define MC_DUMMY_Y (-32) + -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information ++// UV still has min 4x4 pred ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have 
to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) ++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) ++ ++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + -+#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + -+// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { @@ -3609,29 +7054,135 @@ index b478065..955e426 100644 + ENCODE_COEFFS( 2, 10, 58, 2) +}; + -+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ 
const unsigned int n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) ++{ ++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; ++ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++ ++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob *const jb = s->jobs + i; ++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); ++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); ++ } ++} ++ + +#endif + + -+#ifdef RPI_WORKER -+ -+typedef struct worker_global_env_s -+{ -+ volatile int arm_load; -+ pthread_mutex_t lock; -+ -+ unsigned int arm_y; -+ unsigned int arm_c; -+ unsigned int gpu_y; -+ unsigned int gpu_c; -+} worker_global_env_t; -+ -+static worker_global_env_t worker_global_env = -+{ -+ .lock = PTHREAD_MUTEX_INITIALIZER -+}; -+ ++#ifdef RPI + +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); @@ -3639,108 +7190,154 @@ index b478065..955e426 100644 +#define LOG_ENTER +#define LOG_EXIT + ++#define USE_SEM 1 ++ +// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) ++static void worker_submit_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_post(&s->jb0->sem_in); ++ s->jb0->pending = 1; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb0 = s->jobs + s->pass0_job; ++ LOG_EXIT +} + +// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) ++static void worker_complete_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ 
pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_t * const sem = &s->jb1->sem_out; ++ // Must set job no before signalling as otherwise rpi_do_all_passes ++ // may call worker_core from the main thread with a bad job number ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb1 = s->jobs + s->pass1_job; ++ sem_post(sem); ++ LOG_EXIT +} + -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. +static void worker_pass0_ready(HEVCContext *s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ LOG_ENTER ++ HEVCRpiJob * const jb = s->jb0; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; + } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext * const s) ++{ ++ LOG_ENTER ++ unsigned int i; ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob * const jb = s->jobs + i; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; ++ } ++ } ++ LOG_EXIT +} + +static void *worker_start(void *arg) +{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); ++ HEVCContext * const s = (HEVCContext *)arg; + -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ for (;;) + { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); ++ HEVCRpiJob * const jb = s->jb1; ++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) ++ /* Loop */; ++ if (jb->terminate) ++ break; + -+ if (s->kill_worker) { -+ break; ++ LOG_ENTER ++ worker_core(s); ++ worker_complete_job(s); ++ LOG_EXIT + } -+ LOG_ENTER -+ worker_core(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; ++ return NULL; +} + ++static void worker_pic_free_all(HEVCContext * const s) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++ } ++} ++ ++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; 
i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL); ++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); ++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); ++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero apping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ } ++ return 0; ++ ++fail: ++ printf("%s: **** Failed\n", __func__); ++ worker_pic_free_all(s); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} +#endif ++ + /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). -@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { +#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } ++ worker_pic_free_all(s); +#endif ++ +#ifdef RPI_DEBLOCK_VPU + { + int i; @@ -3757,7 +7354,7 @@ index b478065..955e426 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -3766,32 +7363,17 @@ index b478065..955e426 100644 + const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; + const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; + const int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; + + av_assert0(sps); -+// s->max_ctu_count = sps->ctb_width; -+// printf("CTB with=%d\n", sps->ctb_width); -+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); -+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; -+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++#if RPI_ROUND_TO_LINES ++ // Round down to an integral quantity of lines ++ if (s->max_ctu_count > sps->ctb_width) ++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; ++#endif + -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ 
s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } ++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) ++ goto fail; +#endif +#ifdef RPI_DEBLOCK_VPU + { @@ -3847,7 +7429,7 @@ index b478065..955e426 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -137,6 +434,29 @@ fail: +@@ -137,6 +560,29 @@ fail: return AVERROR(ENOMEM); } @@ -3877,16 +7459,18 @@ index b478065..955e426 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +- int ret, i; + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - int ret, i; ++ int ret; pic_arrays_free(s); -@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + s->ps.sps = NULL; +@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -3899,7 +7483,20 @@ index b478065..955e426 100644 #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif -@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND64_10; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ret = ff_thread_get_format(s->avctx, pix_fmts); if (ret < 0) goto fail; @@ -3907,22 +7504,56 @@ index b478065..955e426 100644 s->avctx->pix_fmt = ret; } else { -@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ff_hevc_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); + ff_videodsp_init (&s->vdsp, sps->bit_depth); ++#ifdef RPI ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++#endif + +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +- } ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled && !s->avctx->hwaccel) { +- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +- int c_idx; ++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 
3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; + for(c_idx = 0; c_idx < c_count; c_idx++) { int w = sps->width >> sps->hshift[c_idx]; int h = sps->height >> sps->vshift[c_idx]; -+ // ******** Very very nasty allocation kludge for plaited Chroma - s->sao_pixel_buffer_h[c_idx] = +- s->sao_pixel_buffer_h[c_idx] = - av_malloc((w * 2 * sps->ctb_height) << -+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << - sps->pixel_shift); - s->sao_pixel_buffer_v[c_idx] = +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = - av_malloc((h * 2 * sps->ctb_width) << -+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << - sps->pixel_shift); +- sps->pixel_shift); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; } -@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) + + s->ps.sps = sps; +@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { pred_weight_table(s, gb); } @@ -3934,20 +7565,25 @@ index b478065..955e426 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) ++{ ++ return s->jb0->intra.cmds + s->jb0->intra.n++; ++} ++ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ + // U & V done on U call in the case of sliced frames -+ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) + return; + + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; @@ -3956,7 +7592,7 @@ index b478065..955e426 100644 + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + } -+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { + s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); + } + else { @@ -3969,7 +7605,7 @@ index b478065..955e426 100644 static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -3982,7 +7618,7 @@ index b478065..955e426 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -3994,7 +7630,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -4006,7 +7642,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4018,7 +7654,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4030,7 +7666,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -4077,7 +7713,7 @@ index b478065..955e426 100644 } } } -@@ -1275,47 +1675,120 @@ do { +@@ -1281,47 +1827,119 @@ do { return 0; } @@ -4112,13 +7748,13 @@ index b478065..955e426 100644 - s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, -+#ifdef RPI -+ if 
(rpi_sliced_frame(s->frame)) { -+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + s->frame->linesize[0], + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + -+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), + s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], cb_size >> s->ps.sps->vshift[1], @@ -4157,10 +7793,9 @@ index b478065..955e426 100644 +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +{ -+ int16_t * const coeffs = (buf_no != 3) ? -+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : -+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; -+ s->num_coeffs[s->pass0_job][buf_no] += n; ++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; + return coeffs; +} +#endif @@ -4205,7 +7840,7 @@ index b478065..955e426 100644 + + // Add command + { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; @@ -4223,99 +7858,7 @@ index b478065..955e426 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#if RPI_INTER -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, -+ const struct MvField * const current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, -+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] 
+ s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -4326,7 +7869,7 @@ index b478065..955e426 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -4335,7 +7878,7 @@ index b478065..955e426 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -4346,7 +7889,7 @@ index b478065..955e426 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -4357,7 +7900,7 @@ index b478065..955e426 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1572,6 +2202,10 @@ static 
void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -4368,7 +7911,125 @@ index b478065..955e426 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + _mx1, _my1, block_w); + } + +-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, +- const Mv *mv, int y0, int height) ++#ifdef RPI ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) + { +- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCContext *const fs = ref->tf.owner->priv_data; ++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert0(pwait->req == -1 && pwait->next == NULL); + +- if (s->threads_type == FF_THREAD_FRAME ) +- ff_thread_await_progress(&ref->tf, y, 0); ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ while (sem_wait(sem) != 0) ++ av_assert0(errno == EINTR); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) ++{ ++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRPiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} ++ ++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ av_assert0(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++#endif ++ ++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, ++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ int16_t *const pr = s->jb0->progress + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++ 
else ++#endif ++ // It is a const ThreadFrame but the prototype isn't ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ } + } + + static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } @@ -4378,21 +8039,106 @@ index b478065..955e426 100644 + +#if RPI_INTER + -+static HEVCRpiLumaPred * -+rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) +{ -+ HEVCRpiLumaPred * yp = s->curr_pred_y; -+ HEVCRpiLumaPred * ypt = yp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { + if (ypt->load < yp->load) + yp = ypt; + } + -+// yp->load += load_val; -+ ++yp->load; ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd ++ + return yp; +} + ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); ++ q->load = 0; ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(total_size, &ipe->gptr); ++#else ++ gpu_malloc_uncached(total_size, &ipe->gptr); ++#endif ++} ++ ++ ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int offset_depth_adj(const HEVCContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? 
wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -4401,116 +8147,155 @@ index b478065..955e426 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); -+ -+// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// weight_mul, weight_offset); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + ++ if (my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int bh = nPbH; + -+ // Potentially we could change the assembly code to support taller sizes in one go -+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) ++ for (int start_x = 0; start_x < nPbW; start_x += 16) + { -+ const uint32_t src_yx_y = y1_m3 + start_y; -+ int start_x = 0; -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + -+#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. 
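
[Illustrative note, not part of the patch] rpi_nxt_pred() above spreads inter-prediction commands across the group of QPU command queues by keeping a running cost per queue and picking the cheapest one before appending a command. A stand-alone C sketch of that selection, with invented names (pred_queue, pick_least_loaded) and assuming only that load is a per-queue running cost:

    #include <stddef.h>

    typedef struct pred_queue {
        unsigned int load;   /* accumulated cost of commands already queued */
        /* ... command-list pointers elided ... */
    } pred_queue;

    /* Return the least-loaded queue of the group and charge it for the new
     * work, mirroring the selection loop in rpi_nxt_pred(). */
    static pred_queue *pick_least_loaded(pred_queue *q, size_t n, unsigned int load_val)
    {
        pred_queue *best = &q[0];
        for (size_t i = 1; i < n; i++) {
            if (q[i].load < best->load)
                best = &q[i];
        }
        best->load += load_val;
        return best;
    }

In the patch itself the chosen queue additionally has the next-function link written into the previous command (yp->qpu_mc_curr->data[-1] = fn), which this sketch omits.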
-+ -+ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; -+ -+ last_y8_lx->next_src2_x = x1_m3; -+ last_y8_lx->next_src2_y = src_yx_y; -+ last_y8_lx->next_src2_base = src_vc_address_y; -+ last_y8_p->p.w += bw; -+ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); -+ last_y8_p->p.wo2 = wo; -+ -+ s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; -+ start_x = bw; +#if RPI_TSTATS -+ ++s->tstats.y_pred1_y8_merge; -+#endif ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; + } +#endif + -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. 
++ ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; ++ start_x = bw; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter; -+ cmd_lx->next_src1_x = x1_m3 + start_x; -+ cmd_lx->next_src1_y = src_yx_y; -+ cmd_lx->next_src1_base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ cmd_lx->next_src2_x = MC_DUMMY_X; -+ cmd_lx->next_src2_y = MC_DUMMY_Y; -+ cmd_lx->next_src2_base = s->qpu_dummy_frame; -+ } -+ else -+ { -+ cmd_lx->next_src2_x = x1_m3 + start_x + 8; -+ cmd_lx->next_src2_y = src_yx_y; -+ cmd_lx->next_src2_base = src_vc_address_y; -+ } -+ cmd_y->p.w = bw; -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo; -+ cmd_y->p.wo2 = wo; -+ cmd_y->p.dst_addr = dst_addr + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; + -+ if (bw == 8) { -+ s->last_y8_lx = cmd_lx; -+ s->last_y8_p = cmd_y; -+ } ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; + } + } + } @@ -4524,168 +8309,180 @@ index b478065..955e426 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + -+// rpi_luma_mc_bi(s, s->frame->data[0] 
+ y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// src_frame2, mv2, mv_field); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 + const int x1 = x0 + (mv->x >> 2) - 3; + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; -+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ const int bh = nPbH; + -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ const uint32_t src1_base = get_vc_address_y(src_frame); -+ const uint32_t src2_base = get_vc_address_y(src_frame2); -+ -+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ 
{ // B blocks work 8 at a time -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter_b; -+ cmd_lx->next_src1_x = x1 + start_x; -+ cmd_lx->next_src1_y = y1 + start_y; -+ cmd_lx->next_src1_base = src1_base; -+ cmd_lx->next_src2_x = x2 + start_x; -+ cmd_lx->next_src2_y = y2 + start_y; -+ cmd_lx->next_src2_base = src2_base; -+ cmd_y->p.w = FFMIN(nPbW - start_x, 8); -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo1; -+ cmd_y->p.wo2 = wo2; -+ cmd_y->p.dst_addr = dst + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; -+ } -+ dst += s->frame->linesize[0] * 16; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } +} + -+ -+static HEVCRpiChromaPred * -+rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) -+{ -+ HEVCRpiChromaPred * cp = s->curr_pred_c; -+ HEVCRpiChromaPred * cpt = cp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { -+ if (cpt->load < cp->load) -+ cp = cpt; -+ } -+ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet -+ ++cp->load; -+// cp->load += load_val; -+ return cp; -+} -+ ++// h/v shifts fixed at one as that is all the qasm copes with +static void -+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const Mv * const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; + -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ 
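The dst_addr + (start_x << xshl) arithmetic used by these commands addresses samples in the Raspberry Pi "sand" layout, where a frame is stored as fixed-width vertical stripes instead of one long row stride. The toy layout below is only meant to give an intuition for that kind of addressing; it is an assumption for illustration, not the real geometry, which comes from av_rpi_zc_frame_geometry(), av_rpi_sand_frame_off_y() and av_rpi_sand_frame_xshl() elsewhere in the patch.

    #include <stdio.h>

    /* Toy column-stripe ("sand"-like) layout, for intuition only.
     * Assumptions (not taken from the patch): stripe width 128 bytes,
     * one byte per sample, stripe height = padded frame height. */
    enum { STRIPE_W = 128, FRAME_H = 1088 };

    static unsigned int off_y(unsigned int x, unsigned int y)
    {
        unsigned int stripe = x / STRIPE_W;      /* which vertical stripe */
        unsigned int x_in   = x % STRIPE_W;      /* offset within the stripe row */
        return stripe * STRIPE_W * FRAME_H       /* whole stripes to the left */
             + y * STRIPE_W                      /* rows down within this stripe */
             + x_in;
    }

    int main(void)
    {
        /* Two horizontally adjacent pixels can be far apart in memory once they
         * straddle a stripe boundary, which is why the code above works in
         * stripe-friendly chunks and leaves the layout to the QPU/VPU. */
        printf("off(127,0)=%u off(128,0)=%u\n", off_y(127, 0), off_y(128, 0));
        return 0;
    }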
const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; + -+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[0], c_offsets[0]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[1], c_offsets[1]); -+#endif ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) -+ { -+ const int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ u[-1].next_fn = s->qpu_filter_uv; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = src_base_u; -+ u[0].p.h = bh; -+ u[0].p.w = bw; -+ u[0].p.coeffs_x = x_coeffs; -+ u[0].p.coeffs_y = y_coeffs; -+ u[0].p.wo_u = wo_u; -+ u[0].p.wo_v = wo_v; -+ u[0].p.dst_addr_c = dst_base_u + start_x * 2; -+ cp->last_l0 = u; -+ cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } -+ return; ++ return; +} + ++// 
h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, @@ -4697,89 +8494,72 @@ index b478065..955e426 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); -+#endif ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 
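A minimal sketch of the chroma filter-phase arithmetic used just above, assuming 4:2:0 (hshift = vshift = 1) as the surrounding code does. The real coefficient table rpi_filter_coefs[] is defined elsewhere in the patch, so a placeholder table stands in for it here.

    #include <stdint.h>
    #include <stdio.h>

    /* av_mod_uintp2(v, n) keeps the low n bits of v; re-implemented here so the
     * sketch is self-contained. */
    static inline unsigned int mod_uintp2(int v, unsigned int n)
    {
        return (unsigned int)v & ((1u << n) - 1);
    }

    static const uint32_t filter_coefs[8] = { 0 };  /* placeholder for rpi_filter_coefs[] */

    int main(void)
    {
        int hshift = 1;                  /* 4:2:0 */
        int mv_x = -13;                  /* chroma MV component in 1/8-pel units */

        unsigned int phase = mod_uintp2(mv_x, 2 + hshift) << (1 - hshift); /* 0..7 */
        int x1_c = 10 + (mv_x >> (2 + hshift)) - 1;  /* integer part, minus filter run-up */

        printf("phase=%u coeffs=%08x x1_c=%d\n",
               phase, (unsigned)filter_coefs[phase], x1_c);
        return 0;
    }

With hshift = 1 the chroma vector has three fractional bits, so the table index is simply the low three bits of the component, and the integer part is shifted down by three with an extra -1 for the filter run-up.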
<< (1 - vshift)]; // Fractional part of motion vector ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { -+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ // We are allowed 3/4 powers of two as well as powers of 2 -+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; -+ -+ u[-1].next_fn = s->qpu_filter_uv_b0; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = get_vc_address_u(src_frame); -+ -+ u[0].next_fn = 0; // Ignored - 2 block cmd -+ u[0].next_src_x = x2_c + start_x; -+ u[0].next_src_y = y2_c + start_y; -+ u[0].next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[0].b0.h = (bh<16 ? bh : 16); -+ u[0].b0.w = (bwnext_src_x = x2_c + start_x; -+ last_l1->next_src_y = y2_c + start_y; -+ last_l1->next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[1].b1.dummy0 = 0; // w,h inherited from b0 -+ u[1].b1.coeffs_x = coefs1_x; -+ u[1].b1.coeffs_y = coefs1_y; -+ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); -+ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); -+ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; -+ -+ cp->last_l0 = u; -+ cp->last_l1 = u + 1; -+ cp->qpu_mc_curr = u + 2; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } +} ++ ++ +#endif + + @@ -4796,7 +8576,7 @@ index b478065..955e426 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -4806,7 +8586,7 @@ index b478065..955e426 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4828,7 +8608,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], 
s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -4837,7 +8617,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4859,7 +8639,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -4868,7 +8648,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4901,7 +8681,7 @@ index b478065..955e426 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -4911,21 +8691,22 @@ index b478065..955e426 100644 if (ret < 0) return ret; -@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } +#ifdef RPI +static void rpi_execute_dblk_cmds(HEVCContext *s) +{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; ++ unsigned int i; ++ ++ for (i = 0; i != de->n; ++i) ++ { ++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); + } -+ s->num_dblk_cmds[job] = 0; ++ de->n = 0; +} + +#if 0 @@ -4958,21 +8739,33 @@ index b478065..955e426 100644 +#endif + + ++#define RPI_OPT_SEP_PRED 0 ++ ++ +// I-pred, transform_and_add for all blocks types done here +// All ARM ++#if RPI_OPT_SEP_PRED ++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) ++#else +static void rpi_execute_pred_cmds(HEVCContext * const s) ++#endif +{ + int i; -+ 
int job = s->pass1_job; -+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER ++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++#ifdef RPI + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else + HEVCLocalContext *lc = s->HEVClc; +#endif + -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ for(i = iap->n; i > 0; i--, cmd++) { +// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++#if RPI_OPT_SEP_PRED ++ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { ++ continue; ++ } ++#endif + + switch (cmd->type) + { @@ -4983,7 +8776,7 @@ index b478065..955e426 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) + s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + else + s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); @@ -4991,16 +8784,25 @@ index b478065..955e426 100644 + + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache -+#endif + break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#if RPI_HEVC_SAND + case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#endif + + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); @@ -5011,88 +8813,27 @@ index b478065..955e426 100644 + abort(); + } + } -+ s->num_pred_cmds[job] = 0; ++#if RPI_OPT_SEP_PRED ++ if (do_luma) ++#endif ++ { ++ iap->n = 0; ++ } +} + -+// Do any inter-pred that we want to do in software -+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here -+// All ARM -+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) -+{ -+ unsigned int cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ -+ for(; n>0 ; n--, cmd++) { -+ av_assert0(0); -+ -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ if (b_only) -+ break; -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] 
= cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ break; -+ case RPI_CMD_CHROMA_UNI: -+ if (b_only) -+ break; -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) -+{ -+ const int job = s->pass1_job; -+ -+ if (!qpu_luma || luma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); -+ s->num_mv_cmds_y[job] = 0; -+ if (!qpu_chroma || chroma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); -+ s->num_mv_cmds_c[job] = 0; -+} + +#endif + +#ifdef RPI ++ +// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER -+ int job = s->pass0_job; -+ int i; ++ unsigned int i; ++ HEVCRpiJob * const jb = s->jb0; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + + const uint16_t pic_width_y = s->ps.sps->width; + const uint16_t pic_height_y = s->ps.sps->height; @@ -5100,73 +8841,60 @@ index b478065..955e426 100644 + const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; + const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; + -+ for(i=0; i < QPU_N_UV;i++) { -+ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; -+ qpu_mc_pred_c_t * u = cp->qpu_mc_base; ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + -+ // Chroma setup is a double block with L0 fetch -+ // and other stuff in the 1st block and L1 fetch -+ // in the 2nd along with a lot of dummy vars -+ // This could be packed a lot tighter but it would make -+ // L0, L1 management a lot harder ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; ++ cp->last_l0 = &u->next_src1; + + u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s0.pic_cw = pic_width_c; -+ u->s0.pic_ch = pic_height_c; -+ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); -+ u->s0.stride1 = s->frame->linesize[1]; -+ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; -+ u->s0.dummy0 = 0; -+ cp->last_l0 = u; -+ ++u; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; + -+ u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s1.dummy0 = 0; -+ u->s1.dummy1 = 0; -+ 
u->s1.dummy2 = 0; -+ u->s1.dummy3 = 0; -+ u->s1.dummy4 = 0; -+ u->s1.dummy5 = 0; -+ cp->last_l1 = u; -+ ++u; -+ -+ cp->load = 0; -+ cp->qpu_mc_curr = u; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } -+ s->curr_pred_c = NULL; + -+ for(i=0;i < QPU_N_Y;i++) { -+ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; -+ qpu_mc_pred_y_t * y = yp->qpu_mc_base; ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; + -+ y->next_src1_x = 0; -+ y->next_src1_y = 0; -+ y->next_src1_base = 0; -+ y->next_src2_x = 0; -+ y->next_src2_y = 0; -+ y->next_src2_base = 0; -+ y->s.pic_h = pic_height_y; -+ y->s.pic_w = pic_width_y; -+ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); -+ y->s.stride1 = s->frame->linesize[0]; -+ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; -+ y->s.dummy0 = 0; ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; -+ yp->last_lx = y; -+ ++y; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; + -+ yp->load = 0; -+ yp->qpu_mc_curr = y; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } -+ s->curr_pred_y = NULL; ++ + s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; ++ s->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ jb->progress[i] = -1; ++ } ++ +#endif + s->ctu_count = 0; +} @@ -5174,78 +8902,122 @@ index b478065..955e426 100644 + + +#if RPI_INTER -+static unsigned int mc_terminate_y(HEVCContext * const s, const int job) ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit); -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; + -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_Y; ++i) { -+ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; -+ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (px != yp->qpu_mc_base) -+ tc = 1; -+ -+ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ px->next_src1_x = MC_DUMMY_X; -+ px->next_src1_y = MC_DUMMY_Y; -+ px->next_src1_base = s->qpu_dummy_frame; -+ px->next_src2_x = MC_DUMMY_X; -+ px->next_src2_y = MC_DUMMY_Y; -+ px->next_src2_base = s->qpu_dummy_frame; -+ -+ yp->last_lx = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++#if RPI_CACHE_UNIF_MVS ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++#endif ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; +} ++#endif + -+#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c -+#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) -+ -+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit_c); -+ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; -+ -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_UV; ++i) { -+ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; -+ qpu_mc_pred_c_t *const p0 = cp->last_l0; -+ qpu_mc_pred_c_t *const p1 = cp->last_l1; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (p0 != cp->qpu_mc_base) -+ tc = 1; -+ -+ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->next_src_x = MC_DUMMY_X; -+ p0->next_src_y = MC_DUMMY_Y; -+ p0->next_src_base_c = s->qpu_dummy_frame; -+ p1->next_src_x = MC_DUMMY_X; -+ p1->next_src_y = MC_DUMMY_Y; -+ p1->next_src_base_c = s->qpu_dummy_frame;; -+ -+ cp->last_l0 = NULL; -+ cp->last_l1 = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; +} +#endif + ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++#endif ++ +#ifdef RPI + + @@ -5260,174 +9032,178 @@ index b478065..955e426 100644 +// Core execution tasks +static void worker_core(HEVCContext * const s) +{ -+ worker_global_env_t * const wg = &worker_global_env; -+ int arm_cost = 0; -+// vpu_qpu_wait_h sync_c; ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_wait_h sync_c; ++#endif + vpu_qpu_wait_h sync_y; -+ int qpu_luma = 0; -+ int qpu_chroma = 0; -+ int gpu_load; -+ int arm_load; -+ static const int arm_const_cost = 2; + -+// static int z = 0; -+ -+ const int job = s->pass1_job; -+ unsigned int flush_start = 0; -+ unsigned int flush_count = 0; ++ HEVCRpiJob * const jb = s->jb1; ++ int pred_y, pred_c; + + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0); -+ -+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ } -+ -+ -+#if RPI_INTER -+ pthread_mutex_lock(&wg->lock); -+ -+// ++z; -+ gpu_load = vpu_qpu_current_load(); -+ arm_load = avpriv_atomic_int_get(&wg->arm_load); -+#if 0 // Y_B_ONLY -+ qpu_luma = gpu_load + 2 < arm_load; -+ qpu_chroma = gpu_load < arm_load + 8; -+#elif 0 -+ qpu_luma = gpu_load < arm_load + 2; -+ qpu_chroma = gpu_load < arm_load + 8; -+#else -+ qpu_chroma = 1; -+ qpu_luma = 1; -+#endif -+ -+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); -+ -+ wg->gpu_c += qpu_chroma; -+ wg->gpu_y += qpu_luma; -+ wg->arm_c += !qpu_chroma; -+ wg->arm_y += !qpu_luma; -+ -+ -+// if ((z & 511) == 0) { -+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); -+// } -+ -+ + { -+ int (*d)[2] = s->dblk_cmds[job]; -+ unsigned int high=(*d)[1]; -+ int n; ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const 
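For reference, the mailbox built in mc_terminate_add_qpu() above is just one (uniform-stream address, shader entry point) pair per QPU. The sketch below shows that shape with dummy addresses; QPU_MAIL_EL_VALS is assumed to be 2, as its use as mail[i][0]/mail[i][1] suggests, and the addresses are invented rather than taken from ipe->gptr.vc / yp->code_setup.

    #include <stdint.h>
    #include <stdio.h>

    #define QPU_MAIL_EL_VALS 2   /* assumed: one address pair per QPU */

    int main(void)
    {
        enum { N_QPUS = 3 };
        uint32_t mail[N_QPUS][QPU_MAIL_EL_VALS];
        uint32_t uniforms_vc = 0x40001000;   /* dummy VideoCore bus address */
        uint32_t code_setup  = 0x40100000;   /* dummy shader entry point */

        for (unsigned int i = 0; i != N_QPUS; ++i) {
            mail[i][0] = uniforms_vc + i * 0x800;  /* each QPU gets its own uniform stream */
            mail[i][1] = code_setup;               /* all run the same setup shader */
        }

        for (unsigned int i = 0; i != N_QPUS; ++i)
            printf("qpu %u: uniforms=%08x code=%08x\n",
                   i, (unsigned)mail[i][0], (unsigned)mail[i][1]);
        return 0;
    }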
unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); + -+ flush_start = high; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ unsigned int y = (*d)[1]; -+ flush_start = FFMIN(flush_start, y); -+ high=FFMAX(high,y); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } -+ // Avoid flushing past end of frame -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; + } + -+#if !DISABLE_CHROMA -+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup_c); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); -+ } -+#endif ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity -+// vpu_qpu_job_add_sync_this(vqj, &sync_c); -+ -+ if (qpu_luma && mc_terminate_y(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); -+ } + -+ pthread_mutex_unlock(&wg->lock); -+ -+#endif ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); + + vpu_qpu_job_add_sync_this(vqj, &sync_y); + ++ ++ // We are expecting a contiguous Z-shaped set of blocks ++ // So generate up to 3 blocks: ++ // 1st line ++ // body ++ // last line ++ // This will work even if we don't have the expected geometry ++ if (pred_y || pred_c) ++ { ++ const HEVCRpiDeblkEnv *const de = &jb->deblk; ++ const HEVCRpiDeblkBlk * db = de->blks + 0; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ unsigned int x0 = db->x_ctb; ++ unsigned int xx = x0 + ctb_size; ++ unsigned int y0 = db->y_ctb; ++ ++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}}; ++ unsigned int b = 0; ++ unsigned int i; ++ ++ for (i = 1, ++db; i < de->n; ++i, ++db) ++ { ++ if (db->x_ctb == xx && db->y_ctb 
== y0) { ++ xx += ctb_size; ++ } ++ else ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ if (tlbr[0] > y0) ++ tlbr[0] = y0; ++ if (tlbr[1] > x0) ++ tlbr[1] = x0; ++ if (tlbr[2] < y0 + ctb_size) ++ tlbr[2] = y0 + ctb_size; ++ if (tlbr[3] < xx) ++ tlbr[3] = xx; ++ x0 = db->x_ctb; ++ xx = x0 + ctb_size; ++ y0 = db->y_ctb; ++ b = 1; ++ } ++ } ++ ++ if (blks_tlbr[b][0] != ~0U) ++ ++b; ++ ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ tlbr[0] = y0; ++ tlbr[1] = x0; ++ tlbr[2] = y0 + ctb_size; ++ tlbr[3] = xx; ++ } ++ ++ // ??? Coalesce blocks ??? ++ for (i = 0; i <= b; ++i) { ++ const unsigned int * const tlbr = blks_tlbr[i]; ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); ++ } ++ } ++ ++ + // Having accumulated some commands - do them + rpi_cache_flush_finish(rfe); ++ ++ // Await progress as required ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ if (jb->progress[i] >= 0) { ++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); ++ } ++ } ++ } ++ + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ worker_pic_reset(&jb->coeffs); + -+#if Y_B_ONLY -+ if (qpu_luma) -+ vpu_qpu_wait(&sync_y); ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ else ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif +#endif -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); + ++#if RPI_OPT_SEP_PRED + // Wait for transform completion ++ vpu_qpu_wait(&sync_c); + + // Perform intra prediction and residual reconstruction -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); -+#if Y_B_ONLY -+ if (!qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#else ++ rpi_execute_pred_cmds(s, 0, 1); ++ ++ // Wait for transform completion + vpu_qpu_wait(&sync_y); -+#endif ++ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, 1, 0); ++#else ++ // Wait for transform completion ++ vpu_qpu_wait(&sync_y); ++ ++ // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s); ++#endif + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s); -+ -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} + +static void rpi_do_all_passes(HEVCContext *s) +{ ++ // Called from main thread - must be no pending background jobs ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); ++ + // Do the various passes - common with the worker code + worker_core(s); + // Prepare next batch @@ -5435,99 +9211,90 @@ index b478065..955e426 100644 +} + + -+ +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts 
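The deblock-block walk above reduces a raster-order run of CTBs to at most three flush rectangles (first partial row, body, last partial row). The self-contained sketch below reproduces that idea on made-up coordinates so the bounding-box logic can be followed outside the decoder; it is an illustration of the technique, not code from the patch.

    #include <stdio.h>

    typedef struct { unsigned int t, l, b, r; } rect_t;

    static void add_to(rect_t *rc, unsigned int x0, unsigned int y0,
                       unsigned int x1, unsigned int y1)
    {
        if (rc->t > y0) rc->t = y0;
        if (rc->l > x0) rc->l = x0;
        if (rc->b < y1) rc->b = y1;
        if (rc->r < x1) rc->r = x1;
    }

    int main(void)
    {
        enum { CTB = 64 };
        /* CTB top-left corners in raster order: end of row 0, all of row 1, start of row 2. */
        static const unsigned int xy[][2] = {
            {512, 0}, {576, 0},
            {0, 64}, {64, 64}, {128, 64}, {192, 64}, {256, 64}, {320, 64},
            {384, 64}, {448, 64}, {512, 64}, {576, 64},
            {0, 128}, {64, 128},
        };
        const unsigned int n = sizeof(xy) / sizeof(xy[0]);
        rect_t rc[3] = { {~0u, ~0u, 0, 0}, {~0u, ~0u, 0, 0}, {~0u, ~0u, 0, 0} };
        unsigned int run_x0 = xy[0][0], run_y = xy[0][1], run_x1 = run_x0 + CTB;
        unsigned int b = 0;

        for (unsigned int i = 1; i != n; ++i) {
            if (xy[i][1] == run_y && xy[i][0] == run_x1) {
                run_x1 += CTB;                 /* still the same contiguous row */
            } else {
                add_to(&rc[b], run_x0, run_y, run_x1, run_y + CTB);
                b = 1;                         /* everything after the 1st row goes in the body */
                run_x0 = xy[i][0];
                run_y  = xy[i][1];
                run_x1 = run_x0 + CTB;
            }
        }
        if (rc[b].t != ~0u)
            ++b;                               /* the final run gets its own rectangle */
        add_to(&rc[b], run_x0, run_y, run_x1, run_y + CTB);

        for (unsigned int i = 0; i <= b; ++i)
            printf("rect %u: x=%u..%u y=%u..%u\n", i, rc[i].l, rc[i].r, rc[i].t, rc[i].b);
        return 0;
    }

On the sample coordinates it prints one rectangle for the tail of the first row, one for the full middle row, and one for the start of the last row, matching the "up to 3 blocks" comment above.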
= s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 && -+ s->frame->format == AV_PIX_FMT_SAND128 && -+ !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ s->enable_rpi = ++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || ++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); +#endif + //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif +#ifdef RPI ++ // Worker must be idle at start ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); + rpi_begin(s); +#endif + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; - hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); - -+ - ff_hevc_cabac_init(s, ctb_addr_ts); - - hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#if RPI_INTER -+ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV; -+ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; -+#endif -+ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI ++ // Report progress so we can use our MVs in other frames ++ // If we are tiled then this isn't really optimal but given that tiling ++ // can change on a per pic basis (described in PPS) other schemes are ++ // quite a lot harder ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ + if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ int q_full = 
(++s->ctu_count >= s->max_ctu_count); + -+ if ( s->ctu_count >= s->max_ctu_count ) { -+#ifdef RPI_WORKER -+ if (s->used_for_ref) -+ { -+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) ++ q_full = 1; ++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) ++ q_full = 1; + -+// worker_wait(s); -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); ++ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; ++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; + -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s); ++ if (q_full) { ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); + -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); ++ ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); ++ ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } + } -+#else -+ rpi_do_all_passes(s); -+#endif -+ } + + } +#endif @@ -5536,7 +9303,7 @@ index b478065..955e426 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -5549,12 +9316,10 @@ index b478065..955e426 100644 +#ifdef RPI + -+#ifdef RPI_WORKER + // Wait for the worker to finish all its jobs + if (s->enable_rpi) { + worker_wait(s); + } -+#endif + + // Finish off any half-completed rows + if (s->enable_rpi && s->ctu_count) { @@ -5579,7 +9344,7 @@ index b478065..955e426 100644 if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -5591,10 +9356,13 @@ index b478065..955e426 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; +- if (s->max_ra == INT_MAX) { +- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +- s->max_ra = s->poc; + // The definition of _N unit types is "non-reference for other frames + // with the same temporal_id" so they may/will be ref frames for pics + // with a higher temporal_id. 
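worker_submit_job(), worker_pass0_ready() and worker_wait() are not shown in this hunk; the sketch below is only the generic two-semaphore hand-off that the sem_in/sem_out pair in HEVCRpiJob suggests, with invented names and a trivial payload, not the patch's actual implementation.

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    #define MAX_JOBS 2

    typedef struct {
        sem_t sem_in;    /* posted by the producer when the job is ready */
        sem_t sem_out;   /* posted by the worker when the job slot is free again */
        int payload;
    } job_t;

    static job_t jobs[MAX_JOBS];

    static void *worker(void *arg)
    {
        (void)arg;
        for (unsigned int i = 0;; i = (i + 1) % MAX_JOBS) {
            sem_wait(&jobs[i].sem_in);
            if (jobs[i].payload < 0)            /* terminate marker */
                return NULL;
            printf("worker: job %u payload %d\n", i, jobs[i].payload);
            sem_post(&jobs[i].sem_out);
        }
    }

    int main(void)
    {
        pthread_t th;
        for (unsigned int i = 0; i != MAX_JOBS; ++i) {
            sem_init(&jobs[i].sem_in, 0, 0);
            sem_init(&jobs[i].sem_out, 0, 1);   /* all job slots start free */
        }
        pthread_create(&th, NULL, worker, NULL);

        for (int n = 0; n != 6; ++n) {
            unsigned int i = n % MAX_JOBS;
            sem_wait(&jobs[i].sem_out);         /* "pass0 ready": wait for a free slot */
            jobs[i].payload = n;
            sem_post(&jobs[i].sem_in);          /* "submit job" to the worker */
        }

        for (unsigned int i = 0; i != MAX_JOBS; ++i)
            sem_wait(&jobs[i].sem_out);         /* "worker wait": drain outstanding jobs */
        jobs[0].payload = -1;
        sem_post(&jobs[0].sem_in);              /* wake the worker so it can exit */
        pthread_join(th, NULL);
        return 0;
    }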
@@ -5621,47 +9389,95 @@ index b478065..955e426 100644 + s->is_decoded = 0; + break; + } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; + } else { + if (IS_IDR(s)) + s->max_ra = INT_MIN; +@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } -fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) -+fail: // Also success path -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } -+#if RPI_INTER -+ else if (s->ref && s->enable_rpi) { -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+ } ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++#ifdef RPI ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ++ ff_hevc_progress_signal_all_done(s); ++ } ++#ifdef RPI ++ // * Flush frame will become confused if we pass it something ++ // that doesn't have an expected number of planes (e.g. 400) ++ // So only flush if we are sure we can. ++ else if (s->enable_rpi) { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++#endif ++ } return ret; } -@@ -3064,6 +4764,41 @@ fail: +@@ -3070,6 +5056,83 @@ fail: return AVERROR(ENOMEM); } -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) ++#ifdef RPI ++static av_cold void hevc_init_worker(HEVCContext * const s) +{ + int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ memset(s->jobs, 0, sizeof(s->jobs)); ++ ++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ sem_init(&jb->sem_in, 0, 0); ++ sem_init(&jb->sem_out, 0, 0); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ jb->deblk.n = 0; ++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); ++ } ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++ s->jb0 = s->jobs + 0; ++ s->jb1 = s->jobs + 0; ++ + err = pthread_create(&s->worker_thread, NULL, worker_start, s); + if (err) { + printf("Failed to create worker thread\n"); 
@@ -5669,83 +9485,74 @@ index b478065..955e426 100644 + } +} + ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ +static av_cold void hevc_exit_worker(HEVCContext *s) +{ + void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); ++ unsigned int i; ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ s->jobs[i].terminate = 1; ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ sem_post(&s->jobs[i].sem_in); + pthread_join(s->worker_thread, &res); + -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiJob * const jb = s->jobs + i; + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ sem_destroy(&jb->sem_in); ++ sem_destroy(&jb->sem_out); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ av_freep(&jb->deblk.blks); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ } +} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +#ifdef RPI + -+#ifdef RPI_WORKER + hevc_exit_worker(s); -+#endif -+ -+ for(i=0;iunif_mv_cmds_y[i]); -+ av_freep(&s->unif_mv_cmds_c[i]); -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#if RPI_INTER -+ gpu_free(&s->jobs[i].chroma_mvs_gptr); -+ gpu_free(&s->jobs[i].luma_mvs_gptr); -+#endif -+ } -+ + vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + + av_rpi_zc_uninit(avctx); +#endif + - for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { +@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;ipriv_data; - int i; -+#ifdef RPI -+ unsigned int job; -+#endif - - s->avctx = avctx; - -@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -5759,71 +9566,39 @@ index b478065..955e426 100644 + if (vpu_qpu_init() != 0) + goto fail; + -+ for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); -+ if (!s->unif_mv_cmds_y[job]) -+ goto fail; -+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); -+ if (!s->unif_mv_cmds_c[job]) -+ goto fail; -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ +#if RPI_INTER -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma 
motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. -+ -+ for (job = 0; job < RPI_MAX_JOBS; job++) { -+ HEVCRpiJob * const jb = s->jobs + job; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#else -+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#endif -+ -+ { -+ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_UV; i++) { -+ jb->chroma_mvs[i].qpu_mc_base = p; -+ jb->chroma_mvs[i].qpu_mc_curr = p; -+ p += UV_COMMANDS_PER_QPU; -+ } -+ } -+ { -+ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_Y; i++) { -+ jb->luma_mvs[i].qpu_mc_base = p; -+ jb->luma_mvs[i].qpu_mc_curr = p; -+ p += Y_COMMANDS_PER_QPU; -+ } -+ } ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } -+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); -+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame -+ s->qpu_filter = qpu_fn(mc_filter); -+ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame ++#endif +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + + s->enable_rpi = 0; + -+#ifdef RPI_WORKER ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } + hevc_init_worker(s); +#endif -+ -+#endif + s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; +@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) @@ -5836,7 +9611,7 @@ index b478065..955e426 100644 return 0; } -@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { +@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -5846,88 +9621,63 @@ index b478065..955e426 100644 .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), }; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..dd7d152 100644 +index 162ca0e582..d647232638 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h -@@ -23,6 +23,9 @@ +@@ -23,6 +23,7 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H -+// define RPI to split the CABAC/prediction/transform into separate stages -+#include "config.h" -+ ++#include "rpi_opts.h" #include "libavutil/buffer.h" #include "libavutil/md5.h" -@@ -37,6 +40,45 @@ +@@ -37,6 +38,10 @@ #include "thread.h" #include "videodsp.h" -+// define RPI to split the CABAC/prediction/transform into separate stages -+#ifndef RPI -+ -+ #define RPI_INTER 0 -+ #define RPI_TSTATS 0 -+ #define RPI_HEVC_SAND 0 
-+ -+#else -+ -+ #include "rpi_qpu.h" -+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU -+ -+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks -+ #define RPI_WORKER -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ // This has no effect unless RPI_WORKER is defined -+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as -+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one -+ // free for the foreground to fill in. -+ #define RPI_MAX_JOBS 2 -+ -+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs -+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK -+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) -+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM -+// #define RPI_DEBLOCK_VPU -+ -+ #define RPI_VPU_DEBLOCK_CACHED 1 -+ -+ #if HAVE_NEON -+ #define RPI_HEVC_SAND 1 -+ #else -+ // Sand bust on Pi1 currently - reasons unknown -+ #define RPI_HEVC_SAND 0 -+ #endif -+ -+ #define RPI_TSTATS 0 ++#ifdef RPI ++#include "rpi_qpu.h" +#endif + #define MAX_DPB_SIZE 16 // A.4.1 #define MAX_REFS 16 -@@ -660,17 +702,6 @@ typedef struct CodingUnit { +@@ -463,6 +468,7 @@ typedef struct HEVCSPS { + int implicit_rdpcm_enabled_flag; + int explicit_rdpcm_enabled_flag; + int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; + int persistent_rice_adaptation_enabled_flag; + + ///< coded frame dimension in various units +@@ -660,6 +666,7 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; --typedef struct Mv { -- int16_t x; ///< horizontal component of motion vector -- int16_t y; ///< vertical component of motion vector --} Mv; -- --typedef struct MvField { -- DECLARE_ALIGNED(4, Mv, mv)[2]; -- int8_t ref_idx[2]; -- int8_t pred_flag; --} MvField; -- ++#if 0 + typedef struct Mv { + int16_t x; ///< horizontal component of motion vector + int16_t y; ///< vertical component of motion vector +@@ -670,6 +677,7 @@ typedef struct MvField { + int8_t ref_idx[2]; + int8_t pred_flag; + } MvField; ++#endif + typedef struct NeighbourAvailable { int cand_bottom_left; - int cand_left; -@@ -747,7 +778,17 @@ typedef struct HEVCFrame { +@@ -745,9 +753,23 @@ typedef struct HEVCFrame { + * A combination of HEVC_FRAME_FLAG_* + */ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; } HEVCFrame; -+#ifdef RPI_WORKER ++#ifdef RPI +typedef struct HEVCLocalContextIntra { + TransformUnit tu; + NeighbourAvailable na; @@ -5935,21 +9685,22 @@ index be91010..dd7d152 100644 +#endif + typedef struct HEVCLocalContext { -+ TransformUnit tu; -+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra ++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) 
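/*
 * Illustrative sketch only - not part of the upstream patch. The comment on
 * the line above (and the WARNING comment being removed) rely on the
 * common-initial-sequence idiom: because HEVCLocalContextIntra and
 * HEVCLocalContext begin with the same two members (tu, na), a pointer to the
 * full context can be handed to code that only needs the intra fields. The
 * reduced, hypothetical types below show the idea; it stays valid only while
 * tu and na remain the leading members of both structs.
 */
#include <stdio.h>

typedef struct { int x; } tu_t;   /* stands in for TransformUnit      */
typedef struct { int l; } na_t;   /* stands in for NeighbourAvailable */

typedef struct {                  /* the "Intra" prefix struct        */
    tu_t tu;
    na_t na;
} lc_intra_t;

typedef struct {                  /* the full local context           */
    tu_t tu;                      /* must stay first ...              */
    na_t na;                      /* ... and second, to match above   */
    int other_state[64];
} lc_t;

static void intra_only(const lc_intra_t *ic)
{
    printf("tu=%d na=%d\n", ic->tu.x, ic->na.l);
}

int main(void)
{
    lc_t full = { { 1 }, { 2 }, { 0 } };
    intra_only((const lc_intra_t *)&full);   /* safe only because of the layout */
    return 0;
}
/* end of illustrative sketch */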
++ NeighbourAvailable na; + uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { +@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext { int qPy_pred; - TransformUnit tu; - +- uint8_t ctb_left_flag; uint8_t ctb_up_flag; -@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { + uint8_t ctb_up_right_flag; +@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -5957,20 +9708,18 @@ index be91010..dd7d152 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { +@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; -+ +#ifdef RPI + +// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 +#define RPI_CHUNK_SIZE 12 ++#define RPI_ROUND_TO_LINES 0 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) @@ -5989,9 +9738,6 @@ index be91010..dd7d152 100644 +#define RPI_CMD_CHROMA_BI 3 +#define RPI_CMD_V_BI 4 + -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ +// Command for inter prediction +typedef struct HEVCMvCmd { + uint8_t cmd; @@ -6019,6 +9765,10 @@ index be91010..dd7d152 100644 + RPI_PRED_ADD_RESIDUAL, + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX @@ -6033,8 +9783,14 @@ index be91010..dd7d152 100644 + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; -+ uint32_t stride; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; + } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; + struct { // INTRA + uint16_t x; + uint16_t y; @@ -6052,32 +9808,87 @@ index be91010..dd7d152 100644 +#endif + +#ifdef RPI ++#include + -+struct qpu_mc_pred_c_s; -+struct qpu_mc_pred_y_s; ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; + -+typedef struct HEVCRpiLumaPred ++typedef struct HEVCRpiInterPredQ +{ -+ struct qpu_mc_pred_y_s *qpu_mc_base; -+ struct qpu_mc_pred_y_s *qpu_mc_curr; -+ struct qpu_mc_pred_y_s *last_lx; ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; + unsigned int load; -+} HEVCRpiLumaPred; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; + -+typedef struct HEVCRpiChromaPred ++typedef struct HEVCRpiInterPredEnv +{ -+ struct qpu_mc_pred_c_s *qpu_mc_base; -+ struct qpu_mc_pred_c_s *qpu_mc_curr; -+ struct qpu_mc_pred_c_s *last_l0; -+ struct qpu_mc_pred_c_s *last_l1; -+ unsigned int load; -+} HEVCRpiChromaPred; ++ HEVCRpiInterPredQ * q; ++ unsigned int n; // Number of Qs ++ unsigned int n_grp; // Number of Q in a group ++ unsigned int curr; // Current Q number (0..n-1) ++ int used; 
// 0 if nothing in any Q, 1 otherwise ++ int used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCeoffEnv { ++ unsigned int n; ++ uint16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCeoffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiDeblkBlk { ++ uint16_t x_ctb; ++ uint16_t y_ctb; ++} HEVCRpiDeblkBlk; ++ ++typedef struct HEVCRpiDeblkEnv { ++ unsigned int n; ++ HEVCRpiDeblkBlk * blks; ++} HEVCRpiDeblkEnv; ++ ++typedef struct HEVCRPiFrameProgressWait { ++ int req; ++ struct HEVCRPiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRPiFrameProgressWait; ++ ++typedef struct HEVCRPiFrameProgressState { ++ struct HEVCRPiFrameProgressWait * first; ++ struct HEVCRPiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRPiFrameProgressState; + +typedef struct HEVCRpiJob { -+ GPU_MEM_PTR_T chroma_mvs_gptr; -+ GPU_MEM_PTR_T luma_mvs_gptr; -+ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; -+ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; ++ volatile int terminate; ++ int pending; ++ sem_t sem_in; // set by main ++ sem_t sem_out; // set by worker ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress[32]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiDeblkEnv deblk; ++ HEVCRPiFrameProgressWait progress_wait; +} HEVCRpiJob; + +#if RPI_TSTATS @@ -6105,78 +9916,42 @@ index be91010..dd7d152 100644 typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -798,13 +978,103 @@ typedef struct HEVCContext { - - HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -- -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - uint8_t threads_type; - uint8_t threads_number; - +@@ -805,6 +1025,69 @@ typedef struct HEVCContext { int width; int height; -+ int used_for_ref; -+ ++ int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; -+ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds_y[RPI_MAX_JOBS]; -+ int num_mv_cmds_c[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing ++ unsigned int pass0_job; // Pass0 does coefficient decode ++ unsigned int pass1_job; // Pass1 does pixel processing + int ctu_count; // Number of CTUs done in pass0 so far + int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob * jb0; ++ HEVCRpiJob * jb1; + HEVCRpiJob jobs[RPI_MAX_JOBS]; +#if RPI_TSTATS + HEVCRpiStats tstats; +#endif +#if RPI_INTER -+ HEVCRpiChromaPred * curr_pred_c; -+ HEVCRpiLumaPred * curr_pred_y; -+ struct qpu_mc_pred_y_s * last_y8_p; -+ struct qpu_mc_pred_y_s * last_y8_lx; ++ 
struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; + + // Function pointers -+ uint32_t qpu_filter_uv; -+ uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory -+ uint32_t qpu_filter; -+ uint32_t qpu_filter_b; ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; +#endif + -+#ifdef RPI_WORKER + pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 + +#ifdef RPI_DEBLOCK_VPU ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 + int enable_rpi_deblock; + + int uv_setup_width; @@ -6204,34 +9979,25 @@ index be91010..dd7d152 100644 + unsigned int dvq_n; + +#endif -+ ++ HEVCLocalContextIntra HEVClcIntra; ++ HEVCRPiFrameProgressState progress_states[2]; +#endif + uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1192,9 @@ typedef struct HEVCContext { - uint32_t max_mastering_luminance; - uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif - } HEVCContext; - - int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - int log2_trafo_size, enum ScanType scan_idx, - int c_idx); +@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); + int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id, + uint8_t *buf, int buf_size); +#if RPI_INTER +extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +#endif + - void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); - -@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; + /** + * Reset SEI values that are stored on the Context. 
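/*
 * Illustrative sketch only - not part of the upstream patch. The progress
 * helpers declared just below (ff_hevc_progress_wait_* / _signal_*) all boil
 * down to the same primitive: a producing thread publishes "rows done so far"
 * for a frame and consumers sleep until the row they need is ready, with
 * INT_MAX meaning "whole frame done". The hypothetical mutex/condvar version
 * here shows only that generic primitive; the RPI path in the patch uses the
 * HEVCRPiFrameProgressState waiter list and per-job semaphores instead.
 */
#include <pthread.h>

typedef struct row_progress {
    int row;                         /* highest row reported done, -1 initially */
    pthread_mutex_t lock;
    pthread_cond_t cond;
} row_progress_t;

static void progress_init(row_progress_t *const p)
{
    p->row = -1;
    pthread_mutex_init(&p->lock, NULL);
    pthread_cond_init(&p->cond, NULL);
}

static void progress_report(row_progress_t *const p, const int row)
{
    pthread_mutex_lock(&p->lock);
    if (row > p->row) {              /* progress only moves forward */
        p->row = row;
        pthread_cond_broadcast(&p->cond);
    }
    pthread_mutex_unlock(&p->lock);
}

static void progress_await(row_progress_t *const p, const int row)
{
    pthread_mutex_lock(&p->lock);
    while (p->row < row)             /* e.g. wait for reference rows before MC */
        pthread_cond_wait(&p->cond, &p->lock);
    pthread_mutex_unlock(&p->lock);
}
/* progress_report(p, INT_MAX) then plays the role of "signal all done".
 * end of illustrative sketch */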
+@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; extern const uint8_t ff_hevc_diag_scan8x8_x[64]; extern const uint8_t ff_hevc_diag_scan8x8_y[64]; @@ -6244,11 +10010,85 @@ index be91010..dd7d152 100644 +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) ++{ ++ if (s->enable_rpi && s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ else ++ ff_thread_report_progress(&s->ref->tf, y, 0); ++ } ++} ++ ++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) ++{ ++ if (s->enable_rpi) ++ { ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++ } ++ else ++ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); ++} ++ ++#else ++ ++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world ++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_signal_mv(s, y) ++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) ++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) ++ +#endif ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} + #endif /* AVCODEC_HEVC_H */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 05b2821..733efde 100644 +index 05b2821840..c84886817d 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c @@ -21,14 +21,76 @@ @@ -6260,12 +10100,11 @@ index 05b2821..733efde 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" --#include "cabac_functions.h" + #include "cabac_functions.h" #include "hevc.h" -+#include "cabac_functions.h" -+ + +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +// BY22 is probably faster than simple bypass if the processor has @@ -6287,7 +10126,7 @@ index 05b2821..733efde 100644 +#if ARCH_ARM +#include "arm/hevc_cabac.h" +#endif - ++ #define 
CABAC_MAX_BIN 31 + @@ -6610,7 +10449,7 @@ index 05b2821..733efde 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -6623,7 +10462,7 @@ index 05b2821..733efde 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -6664,7 +10503,7 @@ index 05b2821..733efde 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -6740,7 +10579,7 @@ index 05b2821..733efde 100644 + rv = (rv << 1) | b; + } + return rv; -+} + } +#endif + + @@ -6881,22 +10720,21 @@ index 05b2821..733efde 100644 + int * const pPrev_sig) +{ + while (--i >= 0) { -+ unsigned int x_cg = scan_x_cg[i]; -+ unsigned int y_cg = scan_y_cg[i]; ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but -+ // we use the full Right + Down * 2 when calculating -+ // significant coeff flags so we obtain it here -+ //. ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits -+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | -+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) + { -+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); ++ gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } @@ -6914,35 +10752,128 @@ index 05b2821..733efde 100644 + unsigned int stride = frame->linesize[c_idx]; + unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; + unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = rpi_sliced_frame(frame); ++ const int is_sliced = av_rpi_is_sand_frame(frame); + uint8_t * dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(frame, x, y) : -+ rpi_sliced_frame_pos_c(frame, x, y); ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); + -+// if (c_idx != 0) { -+// return; -+// } + if (s->enable_rpi) { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } + } + else if (!is_sliced || c_idx == 0) { + s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); + } ++#if RPI_HEVC_SAND ++ // * These should probably never happen + else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } + else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } - } ++#endif ++} ++ ++ ++static void rpi_add_dc(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame->linesize[c_idx]; ++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? 
coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++ } ++} ++ ++ +#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, @@ -6985,6 +10916,7 @@ index 05b2821..733efde 100644 +#endif +#ifdef RPI + int use_vpu; ++ int use_dc = 0; +#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero @@ -7006,7 +10938,6 @@ index 05b2821..733efde 100644 + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ // Derive QP for dequant if (!lc->cu.cu_transquant_bypass_flag) { @@ -7015,7 +10946,7 @@ index 05b2821..733efde 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -7036,7 +10967,7 @@ index 05b2821..733efde 100644 } if (c_idx == 0) { -@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -7127,7 +11058,7 @@ index 05b2821..733efde 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -7184,53 +11115,35 @@ index 05b2821..733efde 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+#ifdef RPI -+ use_vpu = 0; -+ if (s->enable_rpi) { -+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; -+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ else -+#endif -+ { -+ coeffs = (int16_t*)(c_idx_nz ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ } -+ } -+ -+ i = num_last_subset; -+ do { - int implicit_non_zero_coeff = 0; +- int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; - +- uint8_t significant_coeff_flag_idx[16]; +- uint8_t nb_significant_coeff_flag = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + - x_cg = scan_x_cg[i]; - y_cg = scan_y_cg[i]; -- ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + - if ((i < num_last_subset) && (i > 0)) { - int ctx_cg = 0; - if (x_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processinmg ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -7238,9 +11151,37 @@ index 05b2821..733efde 100644 - significant_coeff_group_flag[x_cg][y_cg] = - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); -- } -- ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } + } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } + - last_scan_pos = num_coeff - offset - 1; ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -7272,23 +11213,24 @@ index 05b2821..733efde 100644 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; ++ // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } }; @@ -7326,7 +11268,7 @@ index 05b2821..733efde 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -7375,11 +11317,12 @@ index 05b2821..733efde 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; +- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -7427,9 +11370,6 @@ index 05b2821..733efde 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -7440,6 +11380,9 @@ index 05b2821..733efde 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -7450,18 +11393,13 @@ index 05b2821..733efde 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -7469,7 +11407,11 @@ index 05b2821..733efde 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -- ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -7551,9 +11493,6 @@ index 05b2821..733efde 100644 + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -7574,6 +11513,9 @@ index 05b2821..733efde 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -7678,7 +11620,7 @@ index 05b2821..733efde 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -7687,56 +11629,41 @@ index 05b2821..733efde 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - for (i = 0; i < 8; i++) - FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); - } -- - s->hevcdsp.transform_skip(coeffs, log2_trafo_size); - - if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -- s->hevcdsp.idct_4x4_luma(coeffs); -+ s->hevcdsp.idct_4x4_luma(coeffs); - } else { + s->hevcdsp.idct_4x4_luma(coeffs); +- } else { ++ } +#ifdef RPI -+ if (!use_vpu) { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) { -+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -+ } else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ -+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); -+ } -+ } ++ else if (!use_vpu) +#else ++ else ++#endif ++ { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) - 
s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); - } +- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); ++ { ++#ifdef RPI ++ if (use_dc) ++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else +#endif - } - } - if (lc->tu.cross_pf) { -@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) +@@ -1510,7 +2279,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ if (!use_dc) ++ { ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } +#else s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); +#endif @@ -7744,7 +11671,7 @@ index 05b2821..733efde 100644 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..3143b4f 100644 +index 9fbcd1d8b8..df129e2e46 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -7760,26 +11687,31 @@ index 1f33b0c..3143b4f 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -31,6 +37,11 @@ +@@ -31,6 +37,16 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#endif ++#if RPI_HEVC_SAND +#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++#else ++#define RPI_ZC_SAND_8_IN_10_BUF 0 +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; } +static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +{ -+#ifdef RPI -+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +#else + return s->ps.sps->pixel_shift; +#endif @@ -7788,7 +11720,75 @@ index 1f33b0c..3143b4f 100644 static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, intptr_t stride_dst, intptr_t stride_src) { -@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +@@ -161,12 +186,21 @@ int i, j; + } + } + ++// "DSP" these? 
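/*
 * Illustrative sketch only - not part of the upstream patch. The
 * av_rpi_is_sand_frame() / av_rpi_sand_frame_pos_*() calls used by the filter
 * code in this file address pixels in the "sand" (column-striped) layout,
 * where the frame is stored as vertical stripes stride1 bytes wide and (going
 * by the stride2 arithmetic elsewhere in this patch) stride2 rows tall, each
 * stripe laid out contiguously. Under that assumption, a hypothetical helper
 * for the byte offset of a sample at (x, y) would look like this:
 */
#include <stddef.h>

static inline size_t stripe_sample_offset(const unsigned int x,       /* sample column   */
                                          const unsigned int y,       /* row             */
                                          const unsigned int bpp,     /* bytes per sample */
                                          const unsigned int stride1, /* stripe width, bytes */
                                          const unsigned int stride2) /* stripe height, rows */
{
    const size_t xb = (size_t)x * bpp;           /* horizontal byte position  */
    return (xb / stride1) * stride1 * stride2    /* skip whole stripes        */
         + (size_t)y * stride1                   /* rows within this stripe   */
         + xb % stride1;                         /* byte column in the stripe */
}
/* end of illustrative sketch */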
+ static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) + { +- if (pixel_shift) +- *(uint16_t *)dst = *(uint16_t *)src; +- else +- *dst = *src; ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } + } + + static void copy_vert(uint8_t *dst, const uint8_t *src, +@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, + int stride_dst, int stride_src) + { + int i; +- if (pixel_shift == 0) { +- for (i = 0; i < height; i++) { +- *dst = *src; +- dst += stride_dst; +- src += stride_src; +- } +- } else { +- for (i = 0; i < height; i++) { +- *(uint16_t *)dst = *(uint16_t *)src; +- dst += stride_dst; +- src += stride_src; +- } ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; + } + } + +@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, int stride_src, int x, int y, int width, int height, int c_idx, int x_ctb, int y_ctb) { @@ -7797,7 +11797,7 @@ index 1f33b0c..3143b4f 100644 int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s, int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); @@ -7815,21 +11815,27 @@ index 1f33b0c..3143b4f 100644 for (n = 0; n < (min_pu_size >> vshift); n++) { memcpy(src, dst, len); src += stride_src; -@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s, static void sao_filter_CTB(HEVCContext *s, int x, int y) { - static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom -@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) uint8_t right_tile_edge = 0; uint8_t up_tile_edge = 0; uint8_t bottom_tile_edge = 0; -+#ifdef RPI -+ const int sliced = rpi_sliced_frame(s->frame); ++#if RPI_HEVC_SAND ++ const int sliced = av_rpi_is_sand_frame(s->frame); + const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +#else + const int plane_count = (s->ps.sps->chroma_format_idc ? 
3 : 1); @@ -7847,7 +11853,7 @@ index 1f33b0c..3143b4f 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) } } @@ -7856,7 +11862,7 @@ index 1f33b0c..3143b4f 100644 int x0 = x >> s->ps.sps->hshift[c_idx]; int y0 = y >> s->ps.sps->vshift[c_idx]; int stride_src = s->frame->linesize[c_idx]; -@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; @@ -7865,24 +11871,24 @@ index 1f33b0c..3143b4f 100644 + ptrdiff_t stride_dst; uint8_t *dst; -+#ifdef RPI -+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + + if (sliced && c_idx > 1) { @@ -7913,7 +11919,7 @@ index 1f33b0c..3143b4f 100644 + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -7934,9 +11940,11 @@ index 1f33b0c..3143b4f 100644 - s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, - sao->offset_val[c_idx], sao->band_position[c_idx], - width, height); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { ++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); ++ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], @@ -7952,7 +11960,7 @@ index 1f33b0c..3143b4f 100644 } sao->type_idx[c_idx] = SAO_APPLIED; break; -@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) { int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; @@ -8091,7 +12099,7 @@ index 1f33b0c..3143b4f 100644 - vert_edge, - horiz_edge, - diag_edge); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -8121,18 +12129,42 @@ index 1f33b0c..3143b4f 100644 + horiz_edge, + diag_edge); + } ++ // ??? Does this actually work for chroma ??? restore_tqb_pixels(s, src, dst, stride_src, stride_dst, x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; -@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif } +// Returns 2 or 0. 
static int get_pcm(HEVCContext *s, int x, int y) { int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma, beta; @@ -8141,7 +12173,7 @@ index 1f33b0c..3143b4f 100644 uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; -@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -8157,7 +12189,7 @@ index 1f33b0c..3143b4f 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; @@ -8175,14 +12207,14 @@ index 1f33b0c..3143b4f 100644 - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); + } -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + + // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + s->frame->linesize[LUMA], + beta, tc, no_p, no_q, -+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + else +#endif @@ -8217,21 +12249,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; - src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +#endif + &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -8251,17 +12283,19 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } if (s->ps.sps->chroma_format_idc) { -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + const int v = 2; + const int h = 2; + + // vertical filtering chroma + for (y = y0; y < y_end; y += 8 * v) { ++// const int demi_y = y + 4 * v >= s->ps.sps->height; ++ const int demi_y = 0; + for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { + const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; + const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; @@ -8269,7 +12303,7 @@ index 1f33b0c..3143b4f 100644 + if ((bs0 == 2) || (bs1 == 2)) { + const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; + const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; + + // tc_offset here should be set to cur_tc_offset I think + const uint32_t tc4 = @@ -8289,10 +12323,10 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, -+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + no_f); + } + } @@ -8307,6 +12341,9 @@ index 1f33b0c..3143b4f 100644 + x_end2 = x_end - 8 * h; + + for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++// const int demi_x = x + 4 * v >= s->ps.sps->width; ++ const int demi_x = 0; ++ + const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; + const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; + if ((bs0 == 2) || (bs1 == 2)) { @@ -8315,7 +12352,7 @@ index 1f33b0c..3143b4f 100644 + const uint32_t tc4 = + ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | + ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + + if (tc4 == 0) + continue; @@ -8331,7 +12368,7 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, no_f); + } @@ -8343,21 +12380,21 @@ index 1f33b0c..3143b4f 100644 for (chroma = 1; chroma <= 2; chroma++) { int h = 1 << s->ps.sps->hshift[chroma]; int v = 1 << s->ps.sps->vshift[chroma]; -@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? 
++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8381,21 +12418,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8415,7 +12452,7 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -8485,7 +12522,7 @@ index 1f33b0c..3143b4f 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -8511,7 +12548,7 @@ index 1f33b0c..3143b4f 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -8588,7 +12625,7 @@ index 1f33b0c..3143b4f 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -8691,7 +12728,7 @@ index 1f33b0c..3143b4f 100644 } } } -@@ 
-840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -8701,8 +12738,8 @@ index 1f33b0c..3143b4f 100644 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} +#endif @@ -8716,10 +12753,11 @@ index 1f33b0c..3143b4f 100644 + const int d0 = ((int *)f->progress->data)[0]; + const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+ if (curr_y < (unsigned int)f->f->height) { ++ if (curr_y < (unsigned int)s->ps.sps->height) { + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, ++ s->ps.sps->vshift[1], 1, 1); + rpi_cache_flush_finish(rfe); + } + } @@ -8759,7 +12797,7 @@ index 1f33b0c..3143b4f 100644 + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); + } @@ -8796,61 +12834,167 @@ index 1f33b0c..3143b4f 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif - ff_thread_report_progress(&s->ref->tf, y, 0); ++ ff_hevc_progress_signal_recon(s, y); + } } if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size); + } -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { -+ // we no longer need to flush the luma 
buffer as it is in GPU memory when using deblocking on the rpi -+ if (done_deblock) { -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } ++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi ++ if (done_deblock) { ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } + } else { +#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +#endif + } } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c +index 4a6dde0f67..8ee37ebfbc 100644 +--- a/libavcodec/hevc_mvs.c ++++ b/libavcodec/hevc_mvs.c +@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField + return 0; + } + +-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) + { + int tx, scale_factor; + +@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) + (scale_factor * src->y < 0)) >> 8); + } + +-static int check_mvset(Mv *mvLXCol, Mv *mvCol, +- int colPic, int poc, +- RefPicList *refPicList, int X, int refIdxLx, +- RefPicList *refPicList_col, int listCol, int refidxCol) ++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, ++ const int colPic, const int poc, ++ const RefPicList * const refPicList, const int X, const int refIdxLx, ++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) + { + int cur_lt = refPicList[X].isLongTerm[refIdxLx]; + int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; +@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, + refPicList_col, L ## l, temp_col.ref_idx[l]) + + // derive the motion vectors section 8.5.3.1.8 +-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, +- int refIdxLx, Mv *mvLXCol, int X, +- int colPic, RefPicList *refPicList_col) ++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++ const int refIdxLx, Mv * const mvLXCol, const int X, ++ const int colPic, const RefPicList * const refPicList_col) + { +- RefPicList *refPicList = s->ref->refPicList; ++ const RefPicList * const refPicList = s->ref->refPicList; + + if (temp_col.pred_flag == PF_INTRA) + return 0; +@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, + /* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, int refIdxLx, +- Mv *mvLXCol, int X) ++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ Mv * const mvLXCol, const int X) + { + MvField *tab_mvf; + MvField temp_col; + int x, y, x_pu, y_pu; +- int min_pu_width = 
s->ps.sps->min_pu_width; ++ const int min_pu_width = s->ps.sps->min_pu_width; + int availableFlagLXCol = 0; + int colPic; + +- HEVCFrame *ref = s->ref->collocated_ref; ++ HEVCFrame * const ref = s->ref->collocated_ref; + +- if (!ref) { ++ if (ref == NULL || ref->tab_mvf == NULL) { + memset(mvLXCol, 0, sizeof(*mvLXCol)); + return 0; + } +@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + x = x0 + nPbW; + y = y0 + nPbH; + +- if (tab_mvf && +- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); +@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + } + + // derive center collocated motion vector +- if (tab_mvf && !availableFlagLXCol) { ++ if (!availableFlagLXCol) { + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..bcf53dc 100644 +index c1b69a0199..455cdaea1c 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) switch (sps->bit_depth) { case 8: if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; @@ -8863,17 +13007,112 @@ index 83f2ec2..bcf53dc 100644 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; break; -@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - sps->amp_enabled_flag = get_bits1(gb); - sps->sao_enabled = get_bits1(gb); +@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + break; + case 10: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; ++#else + if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; ++#endif + if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; + if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; + break; +@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); + if (sps_extension_flag[0]) { + int extended_precision_processing_flag; +- int high_precision_offsets_enabled_flag; + int cabac_bypass_alignment_enabled_flag; -+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); -+ - sps->pcm_enabled_flag = get_bits1(gb); - if (sps->pcm_enabled_flag) { - sps->pcm.bit_depth = get_bits(gb, 4) + 1; + sps->transform_skip_rotation_enabled_flag = get_bits1(gb); +@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, + "extended_precision_processing_flag not yet implemented\n"); + + sps->intra_smoothing_disabled_flag = get_bits1(gb); +- high_precision_offsets_enabled_flag = get_bits1(gb); +- if (high_precision_offsets_enabled_flag) ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ if (sps->high_precision_offsets_enabled_flag) + av_log(avctx, AV_LOG_WARNING, +- "high_precision_offsets_enabled_flag not yet implemented\n"); ++ "high_precision_offsets_enabled_flag not fully implemented\n"); + + sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); + +diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c +index df52e401ad..8869a4a602 100644 +--- a/libavcodec/hevc_refs.c ++++ b/libavcodec/hevc_refs.c +@@ -23,7 +23,7 @@ + + #include "libavutil/avassert.h" + #include "libavutil/pixdesc.h" +- ++#include "libavutil/rpi_sand_fns.h" + #include "internal.h" + #include "thread.h" + #include "hevc.h" +@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + HEVCFrame *frame = &s->DPB[min_idx]; + AVFrame *dst = out; + AVFrame *src = frame->frame; +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format); ++ const int fmt = src->format; ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); + int pixel_shift = !!(desc->comp[0].depth > 8); + + ret = av_frame_ref(out, src); +@@ -216,12 +217,29 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) + if (ret < 0) + return ret; + +- for (i = 0; i < 3; i++) { +- int hshift = (i > 0) ? desc->log2_chroma_w : 0; +- int vshift = (i > 0) ? desc->log2_chroma_h : 0; +- int off = ((frame->window.left_offset >> hshift) << pixel_shift) + +- (frame->window.top_offset >> vshift) * dst->linesize[i]; +- dst->data[i] += off; ++ if (av_rpi_is_sand_format(fmt)) ++ { ++ // Sand cannot be windowed by offset so add side data if we have an offset ++ const HEVCWindow * const window = &frame->window; ++ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0) ++ { ++ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan)); ++ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ si->left_offset = window->left_offset; ++ si->top_offset = window->top_offset; ++ si->pic_width = s->ps.sps->width; ++ si->pic_height = s->ps.sps->height; ++ } ++ } ++ else ++ { ++ for (i = 0; i < 3; i++) { ++ int hshift = (i > 0) ? desc->log2_chroma_w : 0; ++ int vshift = (i > 0) ? 
desc->log2_chroma_h : 0; ++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++ (frame->window.top_offset >> vshift) * dst->linesize[i]; ++ dst->data[i] += off; ++ } + } + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); +@@ -426,8 +444,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) + frame->sequence = s->seq_decode; + frame->flags = 0; + +- if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_report_progress(&frame->tf, INT_MAX, 0); ++ ff_hevc_progress_set_all_done(frame); + + return frame; + } diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..c4d7250 100644 +index 9d773d960e..c9661c3ab1 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -8997,28 +13236,16 @@ index 9d773d9..c4d7250 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+#ifndef RPI ++#if !RPI_HEVC_SAND +#define SLICED_LOOP_FILTERS(depth) ++#define SLICED_ADD_RESIDUAL(depth) ++#define SLICED_SAO(depth) +#else -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#endif -+ -+ - #define HEVC_DSP(depth) \ - hevcdsp->put_pcm = FUNC(put_pcm, depth); \ - hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ -@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ - hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ - hevcdsp->transform_skip = FUNC(transform_skip, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ ++#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ + hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ + hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ @@ -9027,30 +13254,77 @@ index 9d773d9..c4d7250 100644 + hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ + hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ + hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 
0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#endif ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ +- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \ +- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ +- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ +- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ - hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ + hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ +@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ +- hevcdsp->sao_band_filter[0] = \ +- hevcdsp->sao_band_filter[1] = \ +- hevcdsp->sao_band_filter[2] = \ +- hevcdsp->sao_band_filter[3] = \ +- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ +- hevcdsp->sao_edge_filter[0] = \ +- hevcdsp->sao_edge_filter[1] = \ +- hevcdsp->sao_edge_filter[2] = \ +- hevcdsp->sao_edge_filter[3] = \ +- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ \ -+ hevcdsp->sao_band_filter_c[0] = \ -+ hevcdsp->sao_band_filter_c[1] = \ -+ hevcdsp->sao_band_filter_c[2] = \ -+ hevcdsp->sao_band_filter_c[3] = \ -+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[0] = \ -+ hevcdsp->sao_edge_filter_c[1] = \ -+ hevcdsp->sao_edge_filter_c[2] = \ -+ hevcdsp->sao_edge_filter_c[3] = \ -+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ -+ \ QPEL_FUNCS(depth); \ QPEL_UNI_FUNCS(depth); \ - QPEL_BI_FUNCS(depth); \ -@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -232,6 +383,7 @@ void 
ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_UNI_FUNCS(depth); \ EPEL_BI_FUNCS(depth); \ \ @@ -9058,7 +13332,7 @@ index 9d773d9..c4d7250 100644 hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -@@ -257,6 +404,8 @@ int i = 0; +@@ -257,6 +409,8 @@ int i = 0; break; } @@ -9068,10 +13342,18 @@ index 9d773d9..c4d7250 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..639ecf1 100644 +index 9f1f6dd59f..c4a1b0f09d 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h -@@ -42,11 +42,26 @@ typedef struct SAOParams { +@@ -25,6 +25,7 @@ + #ifndef AVCODEC_HEVCDSP_H + #define AVCODEC_HEVCDSP_H + ++#include "rpi_opts.h" + #include "get_bits.h" + + #define MAX_PB_SIZE 64 +@@ -42,11 +43,40 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -9085,45 +13367,69 @@ index 9f1f6dd..639ecf1 100644 + int8_t ref_idx[2]; + int8_t pred_flag; +} MvField; ++ ++#ifdef RPI ++#define SAO_FILTER_N 6 ++#else ++#define SAO_FILTER_N 5 ++#endif ++ + typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, -+ struct GetBitContext *gb, int pcm_bit_depth); - void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); ++ // add_residual was transform_add - import 3.3 names + void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); ++#if RPI_HEVC_SAND ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); ++ ++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++#endif void (*transform_skip)(int16_t *coeffs, int16_t log2_size); -@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { +@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext { - void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + void (*idct_dc[4])(int16_t *coeffs); + +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++#if RPI_HEVC_SAND ++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); ++#endif /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + 
AV_INPUT_BUFFER_PADDING_SIZE */ - void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, int16_t *sao_offset_val, int sao_eo_class, int width, int height); -+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++#if RPI_HEVC_SAND ++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++#endif void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#if RPI_HEVC_SAND + void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#endif void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width); -@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { +@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); @@ -9147,23 +13453,24 @@ index 9f1f6dd..639ecf1 100644 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c -index b840d17..32b9e47 100644 +index 5bca02342d..122fbe8154 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c -@@ -26,6 +26,9 @@ +@@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" -+#ifdef RPI -+#include "rpi_zc.h" -+#endif ++#include "rpi_shader_template.h" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) -@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height } } +-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride, int size) ++#if RPI_HEVC_SAND +static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +{ @@ -9185,17 +13492,20 @@ index b840d17..32b9e47 100644 + dst += stride; + } +} ++#endif + -+ - static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride, int size) ++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride, int size) { -@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe + int x, y; + pixel *dst = (pixel *)_dst; +@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe } } -+static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride, int size) +-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, +- 
ptrdiff_t stride) ++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) +{ + int x, y; + pixel *dst = (pixel *)_dst; @@ -9203,77 +13513,300 @@ index b840d17..32b9e47 100644 + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ dst[x] = av_clip_pixel(dst[x] + dc); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#if RPI_HEVC_SAND ++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_v, int size) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 4); ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + *res); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ res++; ++ } ++ dst += stride; ++ } + } + +-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_u, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); + res++; + } + dst += stride; + } +} + - static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, unsigned int size) ++{ ++ unsigned int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int16_t * ru = res; ++ const int16_t * rv = res + size * size; ++ ++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); ++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); ++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); ++ } ++ dst += stride; ++ } ++ ++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); ++} ++ ++ ++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int dc_v = dc >> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#endif ++ ++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, coeffs, stride, 4); ++} ++ ++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { -@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, - FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +- FUNC(transquant_bypass)(_dst, coeffs, stride, 8); ++ FUNC(add_residual)(_dst, coeffs, stride, 8); } +-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, ++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 16); ++ FUNC(add_residual)(_dst, coeffs, stride, 16); + } + +-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t 
*coeffs, ++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); ++ FUNC(add_residual)(_dst, coeffs, stride, 32); + } + ++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 32); ++} ++ ++#if RPI_HEVC_SAND +// -- U -- (plaited) + -+static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 4); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); +} + -+static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 8); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); +} + -+static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 16); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); +} + -+static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} + +// -- V -- (plaited) + -+static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 4); ++ FUNC(add_residual_c)(_dst, res, stride, 4); +} + -+static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 8); ++ FUNC(add_residual_c)(_dst, res, stride, 8); +} + -+static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ 
FUNC(add_residual_uv)(_dst + 1, res, stride, 16); ++ FUNC(add_residual_c)(_dst, res, stride, 16); +} + -+static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++#endif + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) { -@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size) + assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ + } while (0) + +-static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++static void FUNC(idct_4x4_luma)(int16_t *coeffs) + { + int i; + int shift = 7; +@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride + } + } + ++ ++#if BIT_DEPTH == 10 ++#if RPI_HEVC_SAND ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++#endif ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 + static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, + int *borders, int _width, int _height, +@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9281,7 +13814,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, width = _width, height = _height; -@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9321,7 +13854,7 @@ index b840d17..32b9e47 100644 height--; } } -@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -417,7 +688,6 @@ 
static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9329,7 +13862,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, init_y = 0, width = _width, height = _height; -@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9370,24 +13903,22 @@ index b840d17..32b9e47 100644 height--; } } -@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } } - ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif + +// --- Plaited chroma versions + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++#if RPI_HEVC_SAND ++ +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, @@ -9413,23 +13944,17 @@ index b840d17..32b9e47 100644 + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} -+#endif -+ -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, @@ -9447,9 +13972,12 @@ index b840d17..32b9e47 100644 + int a_stride, b_stride; + int x, y; + ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ + stride_dst /= sizeof(pixel); + width *= 2; + ++ av_assert0(width <= 64); ++ + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { @@ -9467,43 +13995,42 @@ index b840d17..32b9e47 100644 + dst += stride_dst; + } +} -+#endif + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, 
AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++// Do once ++#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + ++#endif // RPI_HEVC_SAND + + #undef CMP - //////////////////////////////////////////////////////////////////////////////// -@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, +@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, #undef TQ1 #undef TQ2 #undef TQ3 + -+#ifdef RPI ++#if RPI_HEVC_SAND + +// line zero +#define P3 pix_l[0 * xstride] @@ -9717,7 +14244,7 @@ index b840d17..32b9e47 100644 +#endif + diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 02c1766..cea16ea 100644 +index 02c1766059..cea16eade4 100644 --- a/libavcodec/hevcpred.c +++ b/libavcodec/hevcpred.c @@ -24,6 +24,7 @@ @@ -9799,7 +14326,7 @@ index 02c1766..cea16ea 100644 case 9: HEVC_PRED(9); diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h -index eb17663..00ba3f9 100644 +index eb17663683..00ba3f94c0 100644 --- a/libavcodec/hevcpred.h +++ b/libavcodec/hevcpred.h @@ -38,6 +38,17 @@ typedef struct HEVCPredContext { @@ -9821,10 +14348,10 @@ index eb17663..00ba3f9 100644 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..c14dddd 100644 +index 6fe33546b1..2f9f5f2798 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,13 +20,55 @@ +@@ -20,13 +20,110 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -9836,34 +14363,90 @@ index 6ae87cc..c14dddd 100644 #include "hevcpred.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +#define DUMP_PRED 0 + #define POS(x, y) src[(x) + stride * (y)] -+#if PRED_C -+ ++// REPEAT_INCLUDE defined at EOF ++#if defined(RPI) && !defined(INCLUDED_ONCE) +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* 
c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC ++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 + +#if BIT_DEPTH == 8 -+#undef BIT_DEPTH -+#define BIT_DEPTH 16 -+#include "bit_depth_template.c" -+#undef FUNC -+#define FUNC(a) FUNC3(a, 8, _c) ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t +#else -+#undef FUNC -+#define FUNC FUNCC ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_dst_ptr_t ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +#endif + ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 +#endif + -+#if DUMP_PRED -+#ifndef DEBUG_ONCE -+#define DEBUG_ONCE ++ ++#if DUMP_PRED && !defined(INCLUDE_ONCE) +static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +{ + for (unsigned int y = 0; y != size; y++, data += stride * 2) { @@ -9875,17 +14458,16 @@ index 6ae87cc..c14dddd 100644 + printf("\n"); +} +#endif -+#endif + static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, int log2_size, int c_idx) { -@@ -69,8 +111,11 @@ do { \ +@@ -69,8 +166,11 @@ do { \ AV_WN4P(&ptr[i], a); \ else \ a = PIXEL_SPLAT_X4(ptr[i + 3]) - -+#ifdef RPI_WORKER ++#ifdef RPI + HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +#else HEVCLocalContext *lc = s->HEVClc; @@ -9893,7 +14475,7 @@ index 6ae87cc..c14dddd 100644 int i; int hshift = s->ps.sps->hshift[c_idx]; int vshift = s->ps.sps->vshift[c_idx]; -@@ -79,15 +124,23 @@ do { \ +@@ -79,15 +179,23 @@ do { \ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; int size_in_luma_v = size << vshift; int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; @@ -9909,18 +14491,18 @@ index 6ae87cc..c14dddd 100644 - ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); + const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +#if defined(RPI) -+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? ++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? + (pixel*)s->frame->data[c_idx] + x + y * stride : + c_idx == 0 ? 
-+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : -+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +#else pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +#endif int min_pu_width = s->ps.sps->min_pu_width; -@@ -95,14 +148,20 @@ do { \ +@@ -95,14 +203,20 @@ do { \ lc->tu.intra_pred_mode; pixel4 a; pixel left_array[2 * MAX_TB_SIZE + 1]; @@ -9941,7 +14523,7 @@ index 6ae87cc..c14dddd 100644 int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); int cand_left = lc->na.cand_left; int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +173,26 @@ do { \ +@@ -114,6 +228,27 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; @@ -9954,10 +14536,11 @@ index 6ae87cc..c14dddd 100644 +#endif + +#if defined(RPI) -+ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ if (av_rpi_is_sand_frame(s->frame)) { ++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) @@ -9968,7 +14551,7 @@ index 6ae87cc..c14dddd 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +242,24 @@ do { \ +@@ -163,23 +298,24 @@ do { \ top[-1] = 128; } if (cand_up_left) { @@ -10000,29 +14583,29 @@ index 6ae87cc..c14dddd 100644 size - bottom_left_size); } -@@ -268,7 +348,11 @@ do { \ +@@ -268,7 +404,11 @@ do { \ cand_up_left = 1; cand_left = 1; } else { // No samples available -+#if PRED_C && BIT_DEPTH == 16 -+ left[-1] = 0x8080; ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +#else left[-1] = (1 << (BIT_DEPTH - 1)); +#endif EXTEND(top, left[-1], 2 * size); EXTEND(left, left[-1], 2 * size); } -@@ -287,6 +371,9 @@ do { \ +@@ -287,6 +427,9 @@ do { \ top[-1] = left[-1]; // Filtering process -+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to + // worry about chroma smoothing for that case +#if !PRED_C if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { if (mode != INTRA_DC && size != 4){ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,13 +429,46 @@ do { \ +@@ -342,6 +485,30 @@ do { \ mode); break; } @@ -10052,24 +14635,8 @@ index 6ae87cc..c14dddd 100644 +#endif } -+#if !PRED_C || BIT_DEPTH == 16 #define INTRA_PRED(size) \ - static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ - { \ - FUNC(intra_pred)(s, x0, y0, size, c_idx); \ - } -+#else -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#endif - - INTRA_PRED(2) - INTRA_PRED(3) -@@ -357,6 +477,7 @@ INTRA_PRED(5) +@@ -357,6 +524,7 @@ INTRA_PRED(5) #undef INTRA_PRED @@ -10077,7 +14644,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const 
uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int trafo_size) -@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); } @@ -10088,9 +14655,9 @@ index 6ae87cc..c14dddd 100644 +{ + int x, y; + int size = 1 << trafo_size; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { @@ -10105,26 +14672,9 @@ index 6ae87cc..c14dddd 100644 +} +#endif -+#if !PRED_C || BIT_DEPTH == 16 #define PRED_PLANAR(size)\ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ - const uint8_t *left, ptrdiff_t stride) \ - { \ - FUNC(pred_planar)(src, top, left, stride, size + 2); \ - } -+#else -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ -+ abort(); \ -+} -+#endif - - PRED_PLANAR(0) - PRED_PLANAR(1) -@@ -386,6 +540,7 @@ PRED_PLANAR(3) +@@ -386,6 +577,7 @@ PRED_PLANAR(3) #undef PRED_PLANAR @@ -10132,7 +14682,7 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, POS(0, y) = (left[y] + 3 * dc + 2) >> 2; } } @@ -10143,9 +14693,9 @@ index 6ae87cc..c14dddd 100644 +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + @@ -10186,7 +14736,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, -@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const pixel *top = (const pixel *)_top; const pixel *left = (const pixel *)_left; @@ -10202,7 +14752,7 @@ index 6ae87cc..c14dddd 100644 int angle = intra_pred_angle[mode - 2]; pixel ref_array[3 * MAX_TB_SIZE + 4]; pixel *ref_tmp = ref_array + size; -@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } } @@ -10214,26 +14764,26 @@ index 6ae87cc..c14dddd 100644 + int mode, int size) +{ + int x, y; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; -+ uint8_t ref_array[3 * 
MAX_TB_SIZE + 4][2]; -+ c8_dst_ptr_t ref_tmp = ref_array + size; -+ c8_src_ptr_t ref; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { @@ -10247,19 +14797,19 @@ index 6ae87cc..c14dddd 100644 + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { -+ memcpy(src, ref + idx + 1, size * 2); ++ memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { @@ -10286,124 +14836,135 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, const uint8_t *left, -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 099a8c5..bdff2d2 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ +@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); + } -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - - #include "avcodec.h" - #include "internal.h" -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 3adf28d..2f9195f 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - - if (ctx->divx_version >= 0) - s->workaround_bugs |= FF_BUG_HPEL_CHROMA; ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ + #undef EXTEND_LEFT_CIP + #undef EXTEND_RIGHT_CIP + #undef EXTEND_UP_CIP +@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + #undef EXTEND + #undef MIN_TB_ADDR_ZS + #undef POS ++#undef PW ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif + -+ if (ctx->num_sprite_warping_points > 1) -+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; - } - - if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); - -+ avctx->workaround_bugs = s->workaround_bugs; - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - s->codec_id == AV_CODEC_ID_MPEG4 && - avctx->idct_algo == FF_IDCT_AUTO) { diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index bfa2537..1bca89e 100644 +index d36b68bfae..b526dc393d 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c -@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + /* RPI */ +#ifdef RPI + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +#endif + /* special */ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d837056..81256b5 100644 +index d83705645c..8dcdf66158 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c -@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -31,6 +31,8 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -47,6 +49,71 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } -+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) -+{ -+ for (int y = 0; y != frame->height / 2; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; -+ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; -+ for (int i = 0; i < w; ++i) -+ *dst++ = p[i * 2]; -+ } -+ } -+ return dst; -+} -+ -+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ -+ int size = frame->width * frame->height * 3 / 2; ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; + uint8_t * dst; + int ret; + ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3 / 2; + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + -+ // Luma is "easy" -+ for (int y = 0; y != frame->height; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const int w = FFMIN(frame->linesize[0], frame->width - x); -+ memcpy(dst, -+ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); -+ dst += w; -+ } -+ } -+ // Chroma is dull -+ dst = cpy_sand_c(dst, frame, 0); -+ dst = cpy_sand_c(dst, frame, 1); -+ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd = 
av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { -@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +@@ -56,6 +123,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, if (ret < 0) return ret; -+ if (frame->format == AV_PIX_FMT_SAND128) { -+ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); + return ret; + } @@ -10411,13 +14972,4018 @@ index d837056..81256b5 100644 if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) return ret; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 -index 0000000..4309f1c +index 0000000000..391f761df9 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,923 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# HEVC VPU Transform ++# fe ++# Transform matrix can be thought of as ++# output row vector = input row vector * transMatrix2 ++# ++# The even rows of the matrix are symmetric ++# The odd rows of the matrix are antisymmetric ++# ++# So only need to compute the first half of the results, then can compute the remainder with a butterfly ++# ++# EXAMPLE ++# (a b c d) (1 2 2 1) ++# (3 4 -4 -3) ++# (5 6 6 5) ++# (7 8 -8 -7) ++# ++# x=(a c)(1 2) = 1a+5c 2a+6c ++# (5 6) ++# ++# y=(b d)(3 4) = 3b+7d 4b+8d ++# (7 8) ++# ++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d ++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d ++# ++# Final results are (u , v[::-1]) ++# ++# ++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) ++# Apply the even matrix first and stop before rounding ++# Then apply the odd matrix in a full manner: ++# ++# First step is to compute partial products with the first input (16 cycles) ++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output ++# 2a 4b 6c 8d ++# 2a -4b 6c -8d ++# 1a -3b 5c -7d ++# ++# Second step is to sum partial products into final position (8 cycles) ++# 1a+3b+5c+7d ++# 2a+4b+6c+8d ++# 2a-4b+6c-8d ++# 1a-3b+5c-7d ++# ++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) ++# ++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) ++# ++# For 8x8 we could compute two in parallel. ++# ++# ++ ++# Columns are transformed first ++# ++# Store top left half of transMatrix2 in ++# Store bottom left half of transMatrix2 in HX(32,32) ++# ++# For 16x16 ++# HX(0:15,0) contains input data before transform ++# HY(0:15,0) contains 32bit output data after transform ++# HX(32,0) contains even rows of left half of transMatrix2 ++# HX(32,32) contains odd rows of left half of transMatrix2 ++# HY(48,0) contains partial products ready for summing ++# ++ ++ ++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# coeffs32 ++# num32: number of 32x32 transforms ++# command 0 for transform, 1 for memclear16(int16_t *dst,num16) ++# ++ ++.equ TRANS_SHIFT, 20 - BIT_DEPTH ++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) ++.equ TRANS_ASL2, 16 - TRANS_SHIFT ++ ++ ++hevc_trans_16x16: ++ cmp r5,1 ++ beq memclear16 ++ cmp r5,2 ++ beq hevc_deblock_16x16 ++ cmp r5,3 ++ beq hevc_uv_deblock_16x16 ++ cmp r5,4 ++ beq hevc_uv_deblock_16x16_with_clear ++ cmp r5,5 ++ beq hevc_run_command_list ++ ++ push r6-r15, lr # TODO cut down number of used registers ++ mov r14,r3 # coeffs32 ++ mov r15,r4 # num32 ++ mov r3, 16*2 # Stride of transMatrix2 in bytes ++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix ++ ++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix ++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ # Now use r0 to describe which matrix we are working on. 
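++ # (This value is XORed with r8 = 64*16 to flip between the two halves of the VRF.)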
++ # Allows us to prefetch the next block of coefficients for efficiency. ++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++block_loop: ++ eor r0,r8 ++ add r1,r7 ++ # Prefetch the next block ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ eor r0,r8 ++ sub r1,r7 ++ ++ # Transform the current block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++ ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# ++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of 
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF ++ ++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF ++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF ++ vmov HX(decision,0), 1 IFNN ++ vadd H(decision,0),H(decision,3),0 IFN ++ vadd H(decision,16),H(decision,19),0 IFN ++ vmov -,HX(decision,0) SETF # N marks strong filter ++ vmov HX(decision,0), 1 IFNN # NN marks normal filter ++ ++ vadd HX(do_filter,0), HX(d,3), HX(d,0) ++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter ++ vmov HX(decision,0),0 IFNN # Z marks no filter ++ ++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 ++ # First extract out even terms ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 ++ # Now expand back ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 ++ ++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering ++ ++ # Do a quick check to see if there is anything to do ++ mov r11, 0 # Signal no filtering ++ vmov -,1 IFNZ SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ mov r11, 1 # Signal some filtering ++ # And whether there is any strong filtering ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq normal_filtering ++ ++ ############################################################################## ++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) ++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 ++ ++ # Take a copy of the original pixels for use in decision calculation ++ vmov HX(P0,32),HX(P0,0) ++ vmov HX(Q0,32),HX(Q0,0) ++ vmov HX(P1,32),HX(P1,0) ++ vmov HX(Q1,32),HX(Q1,0) ++ vmov HX(P2,32),HX(P2,0) ++ vmov HX(Q2,32),HX(Q2,0) ++ ++ vadd -,HX(P2,32),4 CLRA SACC ++ vshl -,HX(P1,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl HX(delta,0),HX(Q1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN ++ ++ vadd -,HX(P2,32),2 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vshl HX(delta,0),HX(Q0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(P1,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q0,32),4 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vmul -,HX(P2,32),3 SACC ++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN ++ #vmov HX(P2,0),3 IFN ++ ++ # Now reverse all P/Qs ++ ++ vadd -,HX(Q2,32),4 CLRA SACC ++ vshl -,HX(Q1,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl HX(delta,0),HX(P1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q2,32),2 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vshl HX(delta,0),HX(P0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(Q1,32) ++ vclamps HX(delta,0), HX(delta,0), 
HX(tc2,0) ++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN ++ ++ vadd -,HX(P0,32),4 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vmul -,HX(Q2,32),3 SACC ++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN ++ ++ ############################################################################## ++ # Normal filtering ++normal_filtering: ++ # Invert the decision flags ++ # make instruction more complicated as assembler has error and loses SETF ++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering ++ vmov -, HX(tc10,0) SETF # IFN means normal filtering ++ ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ ++ vasr HX(tc2,0), HX(tc,0), 1 ++ vmul HX(tc10,0), HX(tc,0), 10 ++ ++ vasr HX(thresh,0), HX(beta,0), 1 ++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) ++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC ++ ++ vadd HX(ptest,0),HX(dp,3),HX(dp,0) ++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel ++ vadd HX(qtest,0),HX(dq,3),HX(dq,0) ++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel ++ # Expand ptest and qtest together ++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q ++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ ++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq ++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) ++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) ++ ++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) ++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) ++ vmov -,8 CLRA SACC ++ vmul -,HX(delta0,0), 9 SACC ++ vmul HX(delta0,0),HX(delta1,0), r6 SACC ++ vasr HX(delta0,0), HX(delta0,0), 4 ++ vdist HX(deltatest,0), HX(delta0,0), 0 ++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something ++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later ++ ++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) ++ ++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) ++ vadd HX(deltap1,0), HX(deltap1,0), 1 ++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC ++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC ++ vasr HX(deltap1,0), HX(deltap1,0), 1 ++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) ++ ++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) ++ vadd HX(deltaq1,0), HX(deltaq1,0), 1 ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC ++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) ++ vrsub -, HX(delta0,0), 0 SACC ++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 ++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) ++ ++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN ++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN ++ ++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 ++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN ++ ++ vmov -,HX(deltatest,0) SETF ++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 ++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN ++ ++ #vmov HX(P2,0),1 IFN ++ ++filtering_done: ++ b lr ++ ++ ++hevc_uv_deblock_16x16: ++ push r6-r15, lr ++ mov r14,0 ++ b hevc_uv_start ++hevc_uv_deblock_16x16_with_clear: ++ push r6-r15, lr ++ mov r14,1 ++ b hevc_uv_start ++ ++hevc_uv_start: ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is 
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
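++ # r0/r2 advance by one 16-pixel-wide block; r7 counts the blocks still to load in this row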
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
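++ # The remaining five command words are loaded into r1-r5 the same way before calling hevc_trans_16x16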
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..b0e9902d82 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,3070 @@ -+unsigned char rpi_hevc_transform [] = { ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++220, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, 
++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, 
++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, 
++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, 
++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++143, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++135, ++0, ++131, ++102, ++0, ++158, ++71, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++112, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++104, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++123, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++112, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++178, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, 
++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++95, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++8, ++250, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..2901b6568d +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3070 @@ ++static const unsigned char rpi_hevc_transform8 [] = { +21, +106, +0, @@ -13487,932 +22053,9 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000..5543093 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,917 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+# -+# Module : HEVC -+# Author : Peter de Rivaz -+# ****************************************************************************** -+ -+# HEVC VPU Transform -+# -+# Transform matrix can be thought of as -+# output row vector = input row vector * transMatrix2 -+# -+# The even rows of the matrix are symmetric -+# The odd rows of the matrix are antisymmetric -+# -+# So only need to compute the first half of the results, then can compute the remainder with a butterfly -+# -+# EXAMPLE -+# (a b c d) (1 2 2 1) -+# (3 4 -4 -3) -+# (5 6 6 5) -+# (7 8 -8 -7) -+# -+# x=(a c)(1 2) = 1a+5c 2a+6c -+# (5 6) -+# -+# y=(b d)(3 4) = 3b+7d 4b+8d -+# (7 8) -+# -+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d -+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d -+# -+# Final results are (u , v[::-1]) -+# -+# -+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) -+# Apply the even matrix first and stop before rounding -+# Then apply the odd matrix in a full manner: -+# -+# First step is to compute partial products with the first input (16 cycles) -+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output -+# 2a 4b 6c 8d -+# 2a -4b 6c -8d -+# 1a -3b 5c -7d -+# -+# Second step is to sum partial products into final position (8 cycles) -+# 1a+3b+5c+7d -+# 2a+4b+6c+8d -+# 2a-4b+6c-8d -+# 1a-3b+5c-7d -+# -+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) -+# -+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) -+# -+# For 8x8 we could compute two in parallel. -+# -+# -+ -+# Columns are transformed first -+# -+# Store top left half of transMatrix2 in -+# Store bottom left half of transMatrix2 in HX(32,32) -+# -+# For 16x16 -+# HX(0:15,0) contains input data before transform -+# HY(0:15,0) contains 32bit output data after transform -+# HX(32,0) contains even rows of left half of transMatrix2 -+# HX(32,32) contains odd rows of left half of transMatrix2 -+# HY(48,0) contains partial products ready for summing -+# -+ -+ -+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# coeffs32 -+# num32: number of 32x32 transforms -+# command 0 for transform, 1 for memclear16(int16_t *dst,num16) -+# -+hevc_trans_16x16: -+ cmp r5,1 -+ beq memclear16 -+ cmp r5,2 -+ beq hevc_deblock_16x16 -+ cmp r5,3 -+ beq hevc_uv_deblock_16x16 -+ cmp r5,4 -+ beq hevc_uv_deblock_16x16_with_clear -+ cmp r5,5 -+ beq hevc_run_command_list -+ -+ push r6-r15, lr # TODO cut down number of used registers -+ mov r14,r3 # coeffs32 -+ mov r15,r4 # num32 -+ mov r3, 16*2 # Stride of transMatrix2 in bytes -+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix -+ -+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix -+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ # Now use r0 to describe which matrix we are working on. -+ # Allows us to prefetch the next block of coefficients for efficiency. 
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,1<<11 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add 
r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, 1<<11 # Constant used for rounding second pass -+ mov r5, 4 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for 
storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. -+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 
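For reference, a small C sketch of the setup[] parameter table laid out exactly as the comments above describe (4 bytes per parameter row, 8 per edge, 16 per direction, 32 per 16x16 block). The typedef and helper names are illustrative assumptions, not code from the patch.

#include <stdint.h>
#include <string.h>

/* setup[block_idx][0=vert,1=horz][0=first edge,1=second edge][0=beta,1=tc][0..3=edge number] */
typedef uint8_t deblock_setup_t[2][2][2][4];

/* Hypothetical ARM-side helper: record beta/tc for one 4-pixel segment of one edge */
static void set_edge_params(deblock_setup_t *setup, unsigned block_idx,
                            unsigned horz, unsigned second_edge, unsigned seg,
                            uint8_t beta, uint8_t tc)
{
    setup[block_idx][horz][second_edge][0][seg] = beta;
    setup[block_idx][horz][second_edge][1][seg] = tc;
}

int main(void)
{
    deblock_setup_t setup[2];            /* two 16x16 blocks => 64 bytes */
    memset(setup, 0, sizeof(setup));     /* start from a cleared table */
    set_edge_params(setup, 0, 0, 0, 2, 48, 10);
    return sizeof(deblock_setup_t) == 32 ? 0 : 1;  /* 32 bytes per block, as quoted above */
}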
-+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) -+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), 
HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA 
SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current 
block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
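The per-sample arithmetic performed by do_chroma_filter above (a delta built from the four pixels either side of the edge, clamped to +/-tc, then applied with a saturating write-back) corresponds to this scalar C sketch, written out purely for illustration.

#include <stdint.h>

static int clip3(int x, int lo, int hi) { return x < lo ? lo : x > hi ? hi : x; }

/* p1 p0 | q0 q1 are the pixels across the edge, tc is the clamp for this segment */
static void chroma_filter_sample(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc)
{
    int delta = ((*q0 - *p0) * 4 + (p1 - q1) + 4) >> 3;
    delta = clip3(delta, -tc, tc);
    *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);  /* saturating store, as the vadds ...,0 write-back does */
    *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
}

int main(void)
{
    uint8_t p0 = 100, q0 = 140;
    chroma_filter_sample(&p0, &q0, 98, 142, 4);   /* delta clamps to +4 */
    return (p0 == 104 && q0 == 136) ? 0 : 1;
}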
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..0255f5d +index 0000000000..0255f5dd44 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,149 @@ @@ -14567,7 +22210,7 @@ index 0000000..0255f5d + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b316878 +index 0000000000..b3168788d2 --- /dev/null +++ b/libavcodec/rpi_mailbox.h @@ -0,0 +1,58 @@ @@ -14629,12 +22272,64 @@ index 0000000..b316878 +int mbox_get_image_params(int fd, VC_IMAGE_T * img); + +#endif +diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h +new file mode 100644 +index 0000000000..e6127749ea +--- /dev/null ++++ b/libavcodec/rpi_opts.h +@@ -0,0 +1,46 @@ ++#ifndef AVCODEC_RPI_OPTS_H ++#define AVCODEC_RPI_OPTS_H ++ ++// define RPI to split the CABAC/prediction/transform into separate stages ++#ifndef RPI ++ ++ #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 ++ ++#else ++ #include "config.h" ++ ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ ++ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM ++// #define RPI_DEBLOCK_VPU ++ ++ #define RPI_VPU_DEBLOCK_CACHED 1 ++ ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ ++ #define RPI_QPU_EMU_Y 0 ++ #define RPI_QPU_EMU_C 0 ++ ++ #define RPI_TSTATS 0 ++#endif ++ ++#endif ++ diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..7c0eedd +index 0000000000..e872b855b7 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,902 @@ +@@ -0,0 +1,935 @@ +#ifdef RPI +#include +#include @@ -14653,8 +22348,9 @@ index 0000000..7c0eedd +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" -+#include "rpi_hevc_transform.h" -+#include "rpi_zc.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -14678,26 +22374,13 @@ index 0000000..7c0eedd +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+// On Pi2 there is no way to access the VPU L2 cache -+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) -+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly -+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug. 
-+#define GPU_MEM_FLG 0x4 -+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) -+#define GPU_MEM_MAP 0x0 -+ +#define vcos_verify_ge0(x) ((x)>=0) + -+/*static const unsigned code[] = -+{ -+ #include "rpi_shader.hex" -+};*/ -+ +// Size in 32bit words -+#define QPU_CODE_SIZE 2048 ++#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 2048 + -+const short rpi_transMatrix2even[32][16] = { // Even rows first ++static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, @@ -14737,7 +22420,8 @@ index 0000000..7c0eedd +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code[VPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + @@ -14749,8 +22433,9 @@ index 0000000..7c0eedd +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + +struct rpi_cache_flush_env_s { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; +}; + +#define WAIT_COUNT_MAX 16 @@ -14774,7 +22459,6 @@ index 0000000..7c0eedd +typedef struct vq_wait_s +{ + sem_t sem; -+ unsigned int cost; + struct vq_wait_s * next; +} vq_wait_t; + @@ -14793,7 +22477,7 @@ index 0000000..7c0eedd + int open_count; + int init_count; + int mb; -+ unsigned int current_load; ++ int vpu_i_cache_flushed; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -14866,8 +22550,8 @@ index 0000000..7c0eedd + +// GPU_MEM_PTR_T alloc fns +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); @@ -14878,12 +22562,14 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); ++ + return 0; +} + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + av_assert0(p->vcsm_handle); + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + av_assert0(p->vc_handle); @@ -14891,6 +22577,7 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + return 0; +} + @@ -14899,6 +22586,7 @@ index 0000000..7c0eedd + vcsm_unlock_ptr(p->arm); + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++// 
printf("***** %s\n", __func__); +} + + @@ -14955,9 +22643,14 @@ index 0000000..7c0eedd + } + // And the VPU code + { -+ int num_bytes = sizeof(rpi_hevc_transform); ++ int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); @@ -15048,10 +22741,18 @@ index 0000000..7c0eedd + gpu_unlock_unref(ge); +} + -+unsigned int vpu_get_fn(void) { ++unsigned int vpu_get_fn(const unsigned int bit_depth) { + // Make sure that the gpu is initialized + av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; +} + +unsigned int vpu_get_constants(void) { @@ -15081,95 +22782,75 @@ index 0000000..7c0eedd +// +// Cache flush functions + ++#define CACHE_EL_MAX 16 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); -+ if (rfe == NULL) -+ return NULL; ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+ rfe->n = 0; -+ return rfe; ++ rfe->v.op_count = 0; ++ return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ if (rfe != NULL) ++ free(rfe); +} + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = 0; -+ unsigned int na; -+ unsigned int nr; ++ int rc = 0; + -+ // Clear any reamaining ents in the final block -+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) -+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ rc = -1; + -+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) -+ { -+ if (vcsm_clean_invalid(rfe->a + na) != 0) -+ rc = -1; -+ } ++ free(rfe); + -+ free(rfe); ++ if (rc == 0) ++ return 0; + -+ if (rc == 0) -+ return 0; -+ -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); -+ return rc; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; +} + -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || gm->numbytes == 0) -+ return; ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+ av_assert0(rfe->n < 
CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm; -+ a->s[n].size = gm->numbytes; -+ ++rfe->n; -+ } ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + const unsigned int offset, const unsigned int size) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); + -+ av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; -+ -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm + offset; -+ a->s[n].size = size; -+ ++rfe->n; -+ } ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +} + ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) +{ +#if !RPI_ONE_BUF @@ -15186,21 +22867,27 @@ index 0000000..7c0eedd + } +} + -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) +{ -+ const unsigned int y_offset = frame->linesize[0] * start_line; -+ const unsigned int y_size = frame->linesize[0] * n; ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; + // Round UV up/down to get everything + const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good + // As all unsigned they will also reject -ve + // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + av_assert0(n <= (unsigned int)frame->height); + 
av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif + + if (!gpu_is_buf1(frame)) + { @@ -15212,7 +22899,7 @@ index 0000000..7c0eedd + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + } + } -+ else if (!rpi_sliced_frame(frame)) ++ else if (!av_rpi_is_sand_frame(frame)) + { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { @@ -15225,16 +22912,30 @@ index 0000000..7c0eedd + } + else + { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, -+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size); -+ } ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; + } + } +} @@ -15275,13 +22976,11 @@ index 0000000..7c0eedd + + +// If sem_init actually takes time then maybe we want a pool... 
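As a usage sketch only (not code from the patch; it assumes rpi_qpu.h has been included and that valid GPU_MEM_PTR_T buffers are passed in), the batched flush API above is driven as init / add / finish, with all accumulated ranges handed to a single vcsm_clean_invalid2 call by rpi_cache_flush_finish.

static int flush_two_buffers(const GPU_MEM_PTR_T *gm0, const GPU_MEM_PTR_T *gm1,
                             const rpi_cache_flush_mode_t mode)
{
    rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init();
    if (rfe == NULL)
        return -1;
    rpi_cache_flush_add_gm_ptr(rfe, gm0, mode);                           /* whole buffer */
    rpi_cache_flush_add_gm_range(rfe, gm1, mode, 0, gm1->numbytes / 2);   /* partial buffer */
    return rpi_cache_flush_finish(rfe);   /* issues the flush, frees rfe, returns 0 on success */
}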
-+static vq_wait_t * vq_wait_new(const unsigned int cost) ++static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; -+ ge->current_load += cost; -+ wait->cost = cost; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -15337,17 +23036,13 @@ index 0000000..7c0eedd + +static void vq_wait_post(vq_wait_t * const wait) +{ -+#if !RPI_TRACE_TIME_VPU_QPU_WAIT -+ if (wait->cost != 0) -+#endif ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); -+ ge->current_load -= wait->cost; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_end(&ge->ttw.active, ns_time()); -+#endif + gpu_unlock(); + } ++#endif + + sem_post(&wait->sem); +} @@ -15363,7 +23058,6 @@ index 0000000..7c0eedd +{ + unsigned int n; + unsigned int mask; -+ unsigned int cost; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; + @@ -15396,23 +23090,26 @@ index 0000000..7c0eedd + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; -+ j->u.v.q[0] = vpu_code; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + j->u.v.q[1] = r0; + j->u.v.q[2] = r1; + j->u.v.q[3] = r2; + j->u.v.q[4] = r3; + j->u.v.q[5] = r4; + j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; + } +} + +// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) +{ + if (n != 0) { + struct gpu_job_s *const j = new_job(vqj); + vqj->mask |= VPU_QPU_MASK_QPU; -+ vqj->cost += cost; + + j->command = EXECUTE_QPU; + j->u.q.jobs = n; @@ -15442,7 +23139,7 @@ index 0000000..7c0eedd + } + + // We are going to want a sync object -+ wait = vq_wait_new(vqj->cost); ++ wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs @@ -15464,7 +23161,6 @@ index 0000000..7c0eedd + j->callback.cookie = wait; + } + -+ vqj->cost = 0; + vqj->mask = 0; + *wait_h = wait; +} @@ -15483,11 +23179,6 @@ index 0000000..7c0eedd + return rv; +} + -+unsigned int vpu_qpu_current_load(void) -+{ -+ return gpu_ptr()->current_load; -+} -+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) @@ -15536,13 +23227,50 @@ index 0000000..7c0eedd + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = 
qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..a95f7d9 +index 0000000000..485a08f8ba --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,200 @@ +@@ -0,0 +1,206 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + @@ -15687,21 +23415,35 @@ index 0000000..a95f7d9 +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + +// init, add, finish for one gm ptr +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + +// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_GRP_UV 4 -+#define QPU_N_UV 8 -+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU -+#define QPU_N_Y 12 ++#define QPU_N_GRP 4 ++#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 + @@ -15717,27 +23459,19 @@ index 0000000..a95f7d9 +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ -+extern unsigned int vpu_get_fn(void); ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+unsigned int vpu_qpu_current_load(void); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + -+// Simple test of shader code -+extern int 
rpi_test_shader(void); -+ -+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); -+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); -+ +extern int gpu_get_mailbox(void); +void gpu_ref(void); +void gpu_unref(void); @@ -15745,10 +23479,10 @@ index 0000000..a95f7d9 +#endif diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..0898ecd +index 0000000000..2c6541a8fb --- /dev/null +++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,670 @@ +@@ -0,0 +1,1570 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -15772,648 +23506,1548 @@ index 0000000..0898ecd +__attribute__((aligned(8))) +#endif +unsigned int rpi_shader[] = { -+// ::mc_setup_c -+/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif -+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 -+/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 -+/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 -+/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 -+/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 -+/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b -+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 -+/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 -+/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // 
mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y ++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif -+/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 -+/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a -+/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b -+/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif -+/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif -+/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 -+/* [0x000001e0] 
*/ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch -+/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 -+/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif -+/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif -+/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif -+/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 -+// ::mc_filter_uv -+/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 -+/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif -+/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif -+/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a -+/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b -+/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b -+/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c -+/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 -+// :uvloop -+/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000338] */ 0x936807f6, 
0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 -+/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 -+/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 -+/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_uv_b0 -+/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 -+/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* 
[0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 -+/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a -+/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c -+/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif -+// :uvloop_b0 -+/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, 
// add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 -+/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 -+/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 -+/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 -+/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 -+/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 -+/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 -+/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin -+/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif -+/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif -+/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 -+/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 -+/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 -+/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin -+/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+// :uv_b0_post12 -+/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 -+/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 -+/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 -+/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 -+// :uv_b0_post_fin -+/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 -+/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif -+/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a -+/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add 
rb_base2_next, r3, r0 -+/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b -+/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c -+/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d -+/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 -+/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+// :uvloop_b -+/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 -+/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 -+/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 -+/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 -+/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 -+/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 -+/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 -+/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 -+/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* 
[0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 -+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 -+/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 -+/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_interrupt_exit8c -+/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 -+/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit -+// ::mc_exit_c -+/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit12 -+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit1 -+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* 
[0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif -+/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch -+/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 -+/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b -+/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 -+/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 -+/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 -+/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 -+/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b -+/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 -+/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, 
qpu_num -+/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 -+/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 -+/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch -+/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base -+/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 -+// :per_block_setup -+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num -+/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next -+/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next -+/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 -+/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b -+/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 -+/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b -+/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif -+/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov 
vw_setup, rb28 -+/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width -+/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height -+/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 -+/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 -+/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width -+/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif -+/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 -+/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a -+/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d -+/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c -+/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d -+/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c -+/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif -+/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b -+/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c -+/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 -+/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif -+/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 -+/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 -+/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+// ::mc_filter -+/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 -+// :yloop -+/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // 
mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001010] */ 
0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop -+/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_filter_b -+// :yloopb -+/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00001188] */ 0x40027030, 0x180049e3, // 
nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 -+/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, 
rb13 -+/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb -+/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001d8] */ 
0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c_p ++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* 
[0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_p_l1 ++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov 
vrx_xshift, vrx_xshift_next ++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add 
r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_b ++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x000006f0] */ 
0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000828] */ 
0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync_q0 ++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000980] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) 
++// ::mc_sync_q9 ++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_qn ++// ::mc_exit_y_qn ++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_q0 ++// ::mc_exit_y_q0 ++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cc0] */ 
0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* 
[0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_8 ++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000fb0] */ 0x90227383, 0x1c424044, 
// ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y_pxx ++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* 
[0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, 
r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000014b0] 
*/ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x000015f8] */ 0x8e5509bf, 
0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_setup_c10_q0 ++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c10_qn ++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, 
elem_num ++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000018e0] */ 0x00000000, 
0xf0f7c9e7, // bra -, ra_link ++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c10_p ++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_p_l1 ++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, 
r1, ra_blk_height ++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait 
++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov 
ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8) ++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 
++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00002090] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov 
dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000023c8] */ 0xf49dc1d2, 
0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002550] */ 
0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026b8] */ 
0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y10_pxx ++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, 
ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* 
[0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf 
-, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002ba8] */ 
0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002cf8] */ 0x8c613cbf, 
0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -16421,35 +25055,79 @@ index 0000000..0898ecd +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..d17b9fd +index 0000000000..82bf380eb4 --- /dev/null +++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,19 @@ +@@ -0,0 +1,63 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + +extern unsigned int rpi_shader[]; + -+#define mc_setup_c (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 152) -+#define mc_filter_uv_b0 (rpi_shader + 280) -+#define mc_interrupt_exit8c (rpi_shader + 554) -+#define mc_exit (rpi_shader + 582) -+#define mc_exit_c (rpi_shader + 582) -+#define mc_interrupt_exit12 (rpi_shader + 598) -+#define mc_exit1 (rpi_shader + 634) -+#define mc_setup (rpi_shader + 650) -+#define mc_filter (rpi_shader + 942) -+#define mc_filter_b (rpi_shader + 1094) -+#define mc_end (rpi_shader + 1246) ++#define mc_setup_c_q0 (rpi_shader + 0) ++#define mc_start (rpi_shader + 0) ++#define mc_setup_c_qn (rpi_shader + 2) ++#define mc_filter_c_p (rpi_shader + 142) ++#define mc_filter_c_p_l1 (rpi_shader + 272) ++#define mc_filter_c_b (rpi_shader + 402) ++#define mc_sync_q0 (rpi_shader + 590) ++#define mc_sync_q1 (rpi_shader + 608) ++#define mc_sync_q2 (rpi_shader + 620) ++#define mc_sync_q3 (rpi_shader + 632) ++#define mc_sync_q4 (rpi_shader + 644) ++#define mc_sync_q5 (rpi_shader + 662) ++#define mc_sync_q6 (rpi_shader + 674) ++#define mc_sync_q7 (rpi_shader + 686) ++#define mc_sync_q8 (rpi_shader + 698) ++#define mc_sync_q9 (rpi_shader + 716) ++#define mc_sync_q10 (rpi_shader + 728) ++#define mc_sync_q11 (rpi_shader + 740) ++#define 
mc_exit_c_qn (rpi_shader + 752) ++#define mc_exit_y_qn (rpi_shader + 752) ++#define mc_exit_c_q0 (rpi_shader + 770) ++#define mc_exit_y_q0 (rpi_shader + 770) ++#define mc_setup_y_q0 (rpi_shader + 790) ++#define mc_setup_y_qn (rpi_shader + 792) ++#define mc_filter_y_pxx (rpi_shader + 1032) ++#define mc_filter_y_bxx (rpi_shader + 1162) ++#define mc_filter_y_p00 (rpi_shader + 1292) ++#define mc_filter_y_b00 (rpi_shader + 1382) ++#define mc_setup_c10_q0 (rpi_shader + 1462) ++#define mc_setup_c10_qn (rpi_shader + 1464) ++#define mc_filter_c10_p (rpi_shader + 1600) ++#define mc_filter_c10_p_l1 (rpi_shader + 1728) ++#define mc_filter_c10_b (rpi_shader + 1856) ++#define mc_sync10_q0 (rpi_shader + 2042) ++#define mc_sync10_q1 (rpi_shader + 2060) ++#define mc_sync10_q2 (rpi_shader + 2072) ++#define mc_sync10_q3 (rpi_shader + 2084) ++#define mc_sync10_q4 (rpi_shader + 2096) ++#define mc_sync10_q5 (rpi_shader + 2114) ++#define mc_sync10_q6 (rpi_shader + 2126) ++#define mc_sync10_q7 (rpi_shader + 2138) ++#define mc_sync10_q8 (rpi_shader + 2150) ++#define mc_sync10_q9 (rpi_shader + 2168) ++#define mc_sync10_q10 (rpi_shader + 2180) ++#define mc_sync10_q11 (rpi_shader + 2192) ++#define mc_exit_c10_q0 (rpi_shader + 2204) ++#define mc_exit_y10_q0 (rpi_shader + 2204) ++#define mc_exit_c10_qn (rpi_shader + 2224) ++#define mc_exit_y10_qn (rpi_shader + 2224) ++#define mc_setup_y10_q0 (rpi_shader + 2242) ++#define mc_setup_y10_qn (rpi_shader + 2244) ++#define mc_filter_y10_pxx (rpi_shader + 2494) ++#define mc_filter_y10_p00 (rpi_shader + 2624) ++#define mc_filter_y10_bxx (rpi_shader + 2716) ++#define mc_filter_y10_b00 (rpi_shader + 2846) ++#define mc_end (rpi_shader + 2926) + +#endif diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm new file mode 100644 -index 0000000..aa3fe47 +index 0000000000..ba6cc13a95 --- /dev/null +++ b/libavcodec/rpi_shader.qasm -@@ -0,0 +1,1259 @@ +@@ -0,0 +1,1741 @@ + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress +# the warning that we are using rotation & ra/rb registers. r0..3 can be @@ -16457,102 +25135,197 @@ index 0000000..aa3fe47 +# local 4. As it happens this is what is wanted here as we do not want the +# constants from the other half of the calc. + ++# PREREAD is the number of requests that we have sitting in the TMU request ++# queue. ++# ++# There are 8 slots availible in the TMU request Q for tm0s requests, but ++# only 4 output FIFO entries and overflow is bad (corruption or crash) ++# (If threaded then only 2 out FIFO entries, but we aren't.) ++# In s/w we are effectively limited to the min vertical read which is >= 4 ++# so output FIFO is the limit. ++# ++# However in the current world there seems to be no benefit (and a small ++# overhead) in setting this bigger than 2. 
++ ++.set PREREAD, 4 ++ ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 ++ +# register allocation +# -+# ra0...ra7 eight horizontal filter coefficients -+# -+# rb0 rx_shift2 -+# rb1 rb_y2_next -+# -+# rb4...rb7 -+# -+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent) -+# -+# (ra15 isn't clamped to zero - this happens during the -+# copy to ra14, and during its use in the vertical filter) -+# -+# rb8...rb11 eight vertical filter coefficients + -+# ra4 y: Fiter, UV: part -of b0 -> b stash ++# ra0-3 ++# Used as temp and may be loop filter coeffs (split into .8s) ++# or temp in loop. Check usage on an individual basis. + -+# rb12 offset to add before shift (round + weighting offsets) -+# rb13 shift: denom + 6 + 9 -+# rb14 L0 weight (U on left, V on right) -+# rb15 -- free -- -+# -+# ra16 width:height -+# ra17 ra_y:ra_xshift -+# ra18 L1 weight (Y) -+# ra19 ra_y_next:ra_xshift_next -+# -+# rb16 pitch -+# rb17 height + 1 -+# rb18 max(height,16) + 3 -+# rb19 frame_base2_next -+# -+# ra20 1 -+# ra21 ra_y2_next:ra_y2 (luma); free (chroma) -+# ra22 ra_k256 256 -+# ra23 0 -+# -+# rb20 -- free -- -+# rb21 -- free -- -+# rb22 rb_k255 255 -+# rb23 dest (Y) -+# -+# rb24 vdw_setup_1(dst_pitch) -+# rb25 frame width-1 -+# rb26 height<<23 + width<<16 + vdw_setup_0 -+# rb27 vdw_setup_0 (depends on QPU number) -+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM -+# rb29 vdw_setup_1(dst_pitch-width) -+# rb30 frame height-1 -+# rb31 used as temp to count loop iterations -+# -+# ra24 src frame base -+# ra25 src frame base 2 -+# ra26 next ra24 -+# ra27 next ra25 -+# ra28 -- free -- -+# ra29 -- free -- -+# -+# Use an even numbered register as a link register to avoid corrupting flags -+# ra30 next kernel address -+# ra31 chroma-B height+3; free otherwise ++# ra4-7 ++# C: L0 H filter out FIFO ++# otherwise -- free -- + -+.set rb_max_x, rb25 -+.set rb_max_y, rb30 -+.set rb_pitch, rb16 ++# ra8-11 ++# temp in some places - check usage ++# Y: (with rb8-11) horiz out FIFO ++ ++# ra12-15 ++# -- free -- ++ ++# uniform: width:height +.set ra_width_height, ra16 +.set ra_width, ra16.16b +.set ra_height, ra16.16a -+.set ra_y2, ra21.16a -+.set ra_y2_next, ra21.16b + -+.set rb_base2_next, rb19 ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b + -+.set rb_dest, rb23 ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff100100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k16, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# Max 
pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# -- free -- ra23.8d ++ ++# Loop: src frame base (L0) +.set ra_base, ra24 -+.set ra_base_next, ra26 -+.set ra_xshift, ra17.16a + ++# Loop: src frame base (L1) +.set ra_base2, ra25 + -+# Note ra_xy & ra_xy_next should have same structure! -+.set ra_xshift_next, ra19.16a ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# -- free -- ra27 ++# -- free -- ra28 ++# -- free -- ra29 ++ ++# Use an even numbered register as a link register to avoid corrupting flags ++.set ra_link, ra30 ++ ++# -- free -- ra31 ++ +.set rb_xshift2, rb0 +.set rb_xshift2_next, rb1 + -+.set ra_y_next, ra19.16b -+.set ra_y, ra17.16b ++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 + -+.set ra_k1, ra20 ++# El Flags ++# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n ++.set rb_ef, rb3 ++ ++# rb4-7 ++# C-B: L1 H filter out FIFO ++# Y: (with ra2.8x) Y vertical filter coeffs ++ ++# rb8-11 ++# C: Vertical filter coeffs ++# Y: (with ra8-11) horiz out FIFO ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# Setup: denom + 6 + 9 ++.set rb_wt_den_p15, rb13 ++ ++# -- free -- rb14 ++# -- free -- rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice +.set rb_xpitch, rb20 -+.set rb_k255, rb22 -+.set ra_k256, ra22 -+.set ra_k0, ra23 + -+.set ra_link, ra30 ++# -- free -- rb21 ++ ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb22 ++ ++# Loop: destination address ++.set rb_dest, rb23 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb24 ++ ++# Setup: pic width - 1 ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. ++.set rb_max_x, rb25 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set rb_dma0, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# -- free -- rb31 ++ ++ ++ + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
+.set i_shift16, -16 @@ -16564,8 +25337,10 @@ index 0000000..aa3fe47 +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + -+.macro m_calc_dma_regs, r_vpm, r_dma ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 @@ -16576,811 +25351,983 @@ index 0000000..aa3fe47 + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 -+ add r_dma, r0, r1 # DMA out -+.endm + -+# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 -+.macro m_calc_dma_regs_c, r_vpm, r_dma -+ mov r2, qpu_num ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else + shl r1, r1, 5 ++.endif + and r0, r2, 1 + or r0, r0, r1 + -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 ++.endif + add r_dma, r0, r1 # DMA out +.endm + + ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_c -+ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 # No swap TMUs + +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 + + mov ra_base, unif # Store frame c base + +# Read image dimensions -+ sub rb_max_x, unif, 1 # pic c width -+ sub rb_max_y, unif, 1 # pic c height ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + -+# touch registers to keep simulator happy -+ -+ # ra/b4..7: B0 -> B stash registers -+ mov ra4, 0 ; mov rb4, 0 -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+ -+ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base -+ -+# ; ra12..15: vertical scroll registers +# get source pitch -+ mov rb_xpitch, unif ; mov ra12, 0 # stride2 -+ mov rb_pitch, unif ; mov ra13, 0 # stride1 -+ mov r0, elem_num ; mov ra14, 0 -+# get destination vdw setup -+ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 ++ mov rb_xpitch, unif # stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ 
add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, r0, ra0.16b # Add elem no to x to get X for this slice ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift -+ and r1, r0, 1 -+ shl ra_xshift_next, r1, 4 ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y ++ add r0, r0, r1 + add ra_base, ra_base, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+ add rb13, 9, unif # denominator -+ mov -, unif # Unused ++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + +# Compute part of VPM to use for DMA output -+ m_calc_dma_regs_c rb28, rb27 ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + -+# ----------------- +# And again for L1, but only worrying about frame2 stuff + -+ mov ra_link, unif # Next fn -+ +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y + -+ mov ra_base2, unif # Store frame c base ++ mov ra_base2, unif # [ra0 delay] Store frame c base + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ mov ra_y2, ra0.16a # Store y -+ mov r0, ra0.16b # Load x -+ add r0, r0, elem_num # Add QPU slice -+ max r0, r0, 0 ; mov -, unif # Unused 0 -+ min r0, r0, rb_max_x ; mov -, unif # Unused 1 ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ++ min r0, r0, rb_max_x + -+# Get shift -+ and r1, r0, 1 ; mov -, unif # Unused 2 -+ shl rb_xshift2_next, r1, 4 ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y2 ++ add r0, r0, r1 ; mov r2, ra_y2 + add ra_base2, ra_base2, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y ++# Do preloads ++# r0 = ra_y, r2 = ra_y2 ++ mov r3, PREREAD ; mov r0, ra_y + -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 ; mov -, unif # Unused 4 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + ++ mov ra_link, unif # link ++# touch registers to keep simulator happy ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 + bra -, ra_link -+ -+ min r0, r0, rb_max_y ; mov -, unif # Unused 5 -+ add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 -+ ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 +# >>> ra_link -+ -+ -+.macro setf_nz_if_v -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +.endm + ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 + +################################################################################ + -+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv -+ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set 
v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convienient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convienient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, ra_base2 ++.set vr_txs, t1s ++.endif + +# per-channel shifts were calculated on the *previous* invocation -+ +# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ setf_nz_if_v # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + -+ shl ra_xshift_next, r0, 4 -+ -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y_next, ra2.16a -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
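++# (Assuming rb_pitch is a power of two - e.g. the 128 byte sand stripe width -
++#  r1 = r0 & -rb_pitch is the whole-stripe part of the byte offset and the xor
++#  below leaves the in-stripe part in r0; scaling the stripe part by rb_xpitch
++#  (stride2) places horizontally adjacent stripes rb_pitch * rb_xpitch bytes
++#  apart in memory.)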
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write -+ -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight -+ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# ; unpack filter coefficients + -+ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area -+ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight ++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ shl r1, r1, rb13 ; mov rb10, ra3.8c -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+ asr rb12, r1, 1 -+ shl rb14, ra1.16a, 1 # b14 = weight*2 ++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + -+# rb14 - weight L0 * 2 -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link ++ sub ra3, rb_wt_den_p15, ra_k1 + ++# r5 = 0 (loop counter) ++# ra9 = alias for rb_max_y ++# ra_wt_mul_l0 = weight L0 ++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] ++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
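++# (Illustrative: rb_wt_off = (offset * 2 + 1) << (ra3 - 1)
++#              = (offset << ra3) + (1 << (ra3 - 1)),
++#  so the single "asr ..., ra3" at the end of the loop below applies both
++#  round-to-nearest and the weighting offset in one add + shift.)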
++ ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# r3 = 0 -+:uvloop -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++.endif + -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, ra9 ; mov.ifnc r0, r2 + -+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+ min r2, r2, rb_max_y -+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ setf_nz_if_v ++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + +# apply horizontal filter +# The filter coeffs for the two halves of this are the same (unlike in the +# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are vaild for all QPUs + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 -+ mov ra14, ra15 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 + -+# apply vertical filter and write to VPM ++# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) ++# Have to dup block as we need to move the brr - code is more common than it ++# looks at first glance ++.if v_bit_depth <= 8 ++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mov ra5, ra6 ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++.else ++ add r2, r2, r3 ; mov ra5, ra6 ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r1, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++ asr ra7, r2, v_bit_depth - 8 ++.endif ++# >>> .anyn 1b + -+ sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+ add r1, r1, r0 ; mul24 r0, ra15, rb11 ++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay] ++ add r1, 
r1, r0 ; mul24 r0, ra7, rb11 + sub r1, r1, r0 -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ shl r1, r1, 8 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, ra3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b + -+ add r1, r1, rb12 -+ asr ra1.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop -+ asr ra1.8bs, r1, rb13 -+ mov -, vw_wait -+ mov vpm, ra1 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+# >>> ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# DMA out for U & stash for V -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # u_dst_addr -+# >>> ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++# At 10 bits ++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) ++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 ++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) ++# (P) ++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) ++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) ++# ... should be OK ++# ++# (B) ++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) ++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) ++# So signed overflow if we sign extend here :-( ++# ++# In practice this doesn't happen (we need a maximal offset and a very unlucky ++# filter). ++# ++# This could be fixed by offsetting the filters s.t. 
they are unsigned until ++# weight mul and then removing the offset with the weighting offset (I think ++# this should work) or splitting the rounding & offsetting ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_c_b + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv_b0 -+ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ setf_nz_if_v # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + -+ shl ra_xshift_next, r0, 4 ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif + -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y_next, ra2.16a -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 -+ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + -+# ; unpack filter coefficients ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, 
rb_dma0_base ; mov ra1, unif # ; H filter coeffs + -+ add r0, r0, r2 ; mov rb9, ra3.8b -+ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ++# L1 - uniform layout could possibly be optimized + -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ shl r0, ra3.16b, v_x_shift # r0=x*2 ++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs ++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight ++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs ++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + -+ mov rb14, unif # U weight -+ mov.ifnz rb14, unif # V weight ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + -+# rb14 unused in b0 but will hang around till the second pass -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# r3 = 0 -+:uvloop_b0 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y -+ -+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+ min r2, r2, rb_max_y -+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b0 -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b0 -+ -+# apply vertical filter and write to B-FIFO -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap -+ sub r1, r1, r0 ; mov ra7, rb6 -+ -+# FIFO goes: -+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b -+# This arrangement optimizes the inner loop FIFOs at the expense of making the -+# bulk shift between loops quite a bit nastier -+# a8 used as temp -+ -+ sub.setf -, r3, ra31 -+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad -+ brr.anyn -, r:uvloop_b0 -+ mov ra5, rb4 ; mov rb4, ra4 -+ mov ra4, rb5 ; mov rb5, ra6 -+ mov ra6, rb7 ; mov rb7, ra8 -+# >>> -+ -+# 1st half done all results now in the a/b4..7 fifo -+ -+# Need to bulk rotate FIFO for heights other than 16 -+# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with -+# we are allowed 3/4 cb_size w/h :-( -+ -+# Destination uniforms discarded -+# At the end drop through to _b - we will always do b after b0 -+ -+ sub.setf -, 15, r3 # 12 + 3 of preroll -+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) -+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr -+ mov r0, i_shift16 ; mov ra_link, unif -+ mov r1, 0x10000 -+# >>> -+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially -+# If h != 16 && h != 12 then h <= 8 so -+# shift 8 with discard (.16b = .16a on all regs) -+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+# >>> -+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+ -+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N -+# Shift 4 -+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+ # If we shifted by 4 here then the max length remaining is 4 -+ # so that is it -+ -+ brr -, r:uv_b0_post_fin -+# Shift 2 -+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+ # 6 / 2 so need 6 outputs -+# >>> -+ -+:uv_b0_post12 -+# this one is annoying as we need to swap halves of things that don't -+# really want to be swapped -+ -+# b7a, a6a, b5a, a4a -+# b4a, a5a, b6a, a7a -+# b7b, a6b, b5b, a4b -+# b4b, a5b, b6b, a7b -+ -+ mov r2, ra6 ; mov r3, rb7 -+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+ mov ra5, r2 ; mov rb4, r3 -+ -+ mov r2, ra4 ; mov r3, rb5 -+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+ mov ra7, r2 ; mov rb6, r3 -+ -+:uv_b0_post_fin -+ -+##### L1 B processing -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num -+ -+ setf_nz_if_v # Also acts as delay slot for ra2 -+ -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov -, unif # ; width_height -+ -+ shl rb_xshift2_next, r0, 4 -+ -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight ++ and r1, r0, r1 ; mov rb10, ra3.8c + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + add rb_base2_next, r3, r0 + -+ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight -+ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight ++ mov ra9, rb_max_y ; mov rb11, ra3.8d ++ shl r1, 
ra_wt_off_l1, rb_wt_den_p15 ++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + -+ nop ; mov rb11, ra3.8d -+ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 -+ asr rb12, r1, 1 -+ -+# ra1.16a used directly in the loop ++# r5 loop counter ++# ra0 H coeffs L0 ++# ra1 H coeffs L1 ++# ra2 V coeffs L0 ++# ra3 temp ++# ra4-7 L0 H FIFO ++# rb4-7 L1 H FIFO ++# rb8-rb11 V coeffs L1 ++# ra9 rb_max_y alias + ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++ add ra_y, 1, ra_y ; mov r3, ra_y + -+# r3 = 0 ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+:uvloop_b -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 ++# L0 H-filter ++# H FIFO scrolls are spread all over this loop ++ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves + -+ max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+ min r2, r2, rb_max_y -+ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++.if v_bit_depth <= 8 ++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++.else ++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++ asr ra3, r2, (v_bit_depth - 8) ++.endif + -+# generate seven shifted versions -+# interleave with scroll of vertical context ++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++ shr r1, r2, v_v_shift ; mov r3, ra_y2 ++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# apply vertical filter and write to VPM ++# L1 H-filter + -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 -+ sub r1, r1, r0 ; mul24 r0, 
ra7.16b, rb14 -+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 ++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++# V filters - start in branch delay slots of H ++# Final asr not needed for 8-bit but we can#t (currently) save a whole instruction ++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r3, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++# >>> .anyn 1b + -+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+ add r1, r1, r0 ; mov rb4, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] ++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 + -+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend -+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 + -+ sub.setf -, r3, ra31 ; mov ra6, rb7 -+ asr ra3.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop_b -+ asr ra3.8bs, r1, rb13 -+ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov -+ mov vpm, ra3 -+# >>> ++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) ++ add r1, r1, r2 ; mov r3, ra_blk_height ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # c_dst_addr ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_c_b ++ m_filter_c_b 8 + +################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ 
# >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 + +# mc_exit() -+ -+::mc_interrupt_exit8c -+ ldtmu0 -+ ldtmu1 -+ ldtmu1 -+ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW -+ -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+# mov -,sacq(0) # 8 -+# mov -,sacq(0) # 9 -+# mov -,sacq(0) # 10 -+# mov -,sacq(0) # 11 -+ -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ +# Chroma & Luma the same now -+::mc_exit_c -+::mc_exit -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,srel(0) ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn + -+ nop ; nop ; thrend -+ nop ; nop # delay slot 1 -+ nop ; nop # delay slot 2 + + +# mc_interrupt_exit12() -+::mc_interrupt_exit12 -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+ mov -,sacq(0) # 8 -+ mov -,sacq(0) # 9 -+ mov -,sacq(0) # 10 -+ mov -,sacq(0) # 11 ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm + -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ -+::mc_exit1 -+ mov 
-, vw_wait # wait on the VDW -+ -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ ldtmu1 -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. +# For P frames we make the second x,y coordinates offset by +8 + ++ +################################################################################ -+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) -+::mc_setup ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ mov ra11, unif # ref_y2_base ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ++ ++ ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ ++# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) -+ or rb24, r1, rb_pitch ++ or rb_dma1_base, r1, rb_pitch + +# Compute base address for first and second access + mov r3, elem_num -+ add r0, ra8.16a, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + -+ -+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + -+ mov r1, ra8.16b # Load y -+ add ra_y, r1, 1 # Set for next -+ max r1, r1, 0 -+ min r1, r1, rb_max_y -+ -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 -+ -+ + # r3 still contains elem_num -+ add r0, ra10.16a, r3 # Load x ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, 
rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base2, ra11, r0 + -+ mov r1, ra10.16b # Load y -+ add ra_y2, r1, 1 # Set for next -+ max r1, r1, 0 ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 + min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + -+# load constants ++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn + +# touch vertical context to keep simulator happy -+ + mov ra8, 0 ; mov rb8, 0 ++ bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm + -+# Compute part of VPM to use -+ m_calc_dma_regs rb28, rb27 -+ -+# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+# submit texture requests for second line -+ max r1, ra_y, 0 -+ min r1, r1, rb_max_y -+ add ra_y, ra_y, 1 -+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; -+ add t0s, r1, ra_base -+ -+ max r1, ra_y2, 0 -+ min r1, r1, rb_max_y -+ add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, r1, ra_base2 -+ -+# FALL THROUGHT TO PER-BLOCK SETUP ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 + ++################################################################################ ++# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space -+:per_block_setup -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra_link, unif -+#### We do all the setup even if we are about to exit - reading junk from unif.... + -+ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ -+# per-channel shifts were calculated on the *previous* invocation -+ mov ra_xshift, ra_xshift_next -+ mov rb_xshift2, rb_xshift2_next ++# luma_setup_delay3 done in delay slots of branch that got us here + +# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation + -+ add r0, ra1.16a, r3 # Load x -+ max r0, r0, 0 ++# 1st 3 instructions of per_block-setup in branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# ++ ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... ++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? 
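++# (The unif reads in this macro and in per_block_setup appear to consume the
++#  qpu_mc_pred_y_p_t fields above in declaration order: next_src1 x_y/base,
++#  next_src2 x_y/base, h/w, mymx21, wo1, wo2, dst_addr, next_fn.)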
++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm ++ ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base_next, unif, r0 # Base1 -+ mov ra_y_next, ra1.16b # Load y -+ mov ra1, unif # x2_y2 -+ nop # ra1 delay -+ -+ add r0, ra1.16a, r3 # Load x2 -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ -+ shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 -+ and r1, r0, r2 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2_next, unif, r0 # Base1 -+ mov ra_y2_next, ra1.16b # Load y -+ mov ra_width_height, unif # width_height ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] + -+# set up VPM write -+ mov vw_setup, rb28 # [ra1 delay] ++ add r0, ra1.16b, r3 # Load x2 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ add rb_base2_next, rb_base2_next, r0 + -+# get width,height of block (unif load above) -+ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width) -+ add rb17, ra_height, 5 ; mov r0, ra_height -+ mov r1, 16 -+ min r0, r0, r1 -+ add rb18, r0, 7 -+ shl r0, r0, 7 -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, 7 ++ shl r0, r0, v_dma_h_shift ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight -+ mov r2, 0x01040400 # [ra5 delay] -+ shl ra8, r0, 3 ; mov rb14, ra5.16a ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov r3, ra_k255 + +# Pack the 1st 4 filter coefs for H & V tightly ++# Coeffs are all abs 
values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve ++ mov r1,0x00010100 # -ve [ra8 delay] + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ ror ra2.8b, r2, ra8.8d -+ ror ra0.8b, r2, ra8.8c ++ mov r1, 0x01040400 ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -17390,49 +26337,44 @@ index 0000000..aa3fe47 + ror ra2.8d, r1, ra8.8d + ror ra0.8d, r1, ra8.8c + -+# In the 2nd vertical half we use b registers due to -+# using a-side fifo regs. The easiest way to achieve this to pack it -+# and then unpack! ++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror ra3.8a, r1, ra8.8d -+ ror ra1.8a, r1, ra8.8c ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + mov r1,0x0a0b0500 # -ve -+ ror ra3.8b, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++ ++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d -+ ror ra1.8d, r1, ra8.8c -+ -+# Extract weighted prediction information in parallel -+# We are annoyingly A src limited here -+ -+ mov rb4, ra3.8a ; mov ra18, unif -+ mov rb5, ra3.8b -+ mov rb6, ra3.8c -+ mov.ifnz ra5, ra18 -+ -+ mov rb_dest, unif # Destination address ++ ror r0, r1, ra8.8d + + bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 + -+ shl r0, ra5.16b, rb13 # Offset calc -+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ mov r3, 0 ; mov rb7, ra3.8d ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link -+# -+# r3 = 0 -+# ra18.16a = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+# rb13 = weight denom + 6 + 9 -+# rb14 = weight L0 ++ ++# r5 = 0 ++# ra_wt_mul_l1 = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_mul_l0 = weight L0 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ + + +################################################################################ @@ -17440,137 +26382,118 @@ index 0000000..aa3fe47 +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + -+::mc_filter -+# ra5.16a = weight << 16; We want weight * 2 in rb14 ++.macro m_filter_y_pxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+ shl rb14, ra5.16a, 1 ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + -+# r3 = 0 ++# r5 = 0 (loop count) + -+:yloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? -+ +# N.B. 
Whilst y == y2 as far as this loop is concerned we will start +# the grab for the next block before we finish with this block and that +# might be B where y != y2 so we must do full processing on both y and y2 + -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ 
"mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # >>> .anyn yloop ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov -, vw_wait ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 +# At this point r1 is a 22-bit signed quantity: 8 (original sample), +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) +# The top 8 bits have rubbish in them as mul24 is unsigned +# The low 6 bits need discard before weighting -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ add r1, r1, rb12 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch + -+ shl r1, r1, 8 -+ brr.anyn -, r:yloop -+ asr r1, r1, rb13 -+# We have a saturating pack unit - I can't help feeling it should be useful here -+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 -+ max vpm, r1, 0 # Delay 3 +# >>> branch.anyn yloop + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup -+ -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 -+ -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloop -+ nop -+ nop -+ nop -+# >>> -+ ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 + + +################################################################################ @@ -17578,243 +26501,1106 @@ index 0000000..aa3fe47 +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, only the first half of coefficients contain used information. +# At this point we have already issued two pairs of texture requests for the current block -+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) -+# Can fill in the coefficients so only -+# Can also assume default weighted prediction for B frames. +# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +# Or possibly by taking advantage of symmetry? -+# From 19->7 32bits per command. + -+::mc_filter_b -+ # r0 = weightL0 << 16, we want it in rb14 -+# asr rb14, r0, i_shift16 ++.macro m_filter_y_bxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+:yloopb -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? 
-+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloopb -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # 
>>> .anyn yloopb ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov r2, rb12 ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 ; mov r2, rb_wt_off +# As with P-pred r1 is a 22-bit signed quantity in 32-bits +# Top 8 bits are bad - low 6 bits should be discarded -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + + asr r1, r1, 14 -+ nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++ nop ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 + -+ add r1, r1, r0 ; mov -, vw_wait -+ shl r1, r1, 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+ brr.anyn -, r:yloopb -+ asr r1, r1, rb13 # Delay 1 -+ min r1, r1, rb_k255 # Delay 2 -+ max vpm, r1, 0 # Delay 3 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still -+ -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloopb -+ nop -+ nop -+ nop ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 + +################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x ++ mov ra_xshift, ra_xshift_next # [ra0 delay] ++ add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr ++ add rb_dma0, r0, rb_dma0_base ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) ++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r0, 7 ++ sub rb_i_tmu, rb_i_tmu, r0 ++ sub rb_lcount, rb_lcount, r0 ++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++ shl rb_wt_off, rb_wt_off, r0 ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ 
brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..27cbb59 +index 0000000000..9f8983da52 --- /dev/null +++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,88 @@ +@@ -0,0 +1,128 @@ +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + -+typedef struct qpu_mc_pred_c_s { ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; -+ int16_t next_src_y; -+ int16_t next_src_x; -+ uint32_t next_src_base_c; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ uint32_t weight_u1; ++ uint32_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { + union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } p; -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t weight_u; -+ uint32_t weight_v; -+ uint32_t dummy0; -+ } b0; -+ struct { -+ uint32_t dummy0; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } b1; -+ struct { -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s0; -+ struct { -+ uint32_t dummy0; -+ uint32_t dummy1; -+ uint32_t dummy2; -+ uint32_t dummy3; -+ uint32_t dummy4; -+ uint32_t dummy5; -+ } s1; ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; + }; +} qpu_mc_pred_c_t; + -+typedef struct qpu_mc_pred_y_s { -+ int16_t next_src1_x; -+ int16_t next_src1_y; -+ uint32_t next_src1_base; -+ int16_t next_src2_x; -+ int16_t next_src2_y; -+ uint32_t next_src2_base; -+ union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ uint32_t dst_addr; -+ } p; -+ struct { -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s; -+ }; ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; 
++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; +} qpu_mc_pred_y_t; + ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ uint32_t data[1]; ++} qpu_mc_pred_cmd_t; ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c +new file mode 100644 +index 0000000000..1925ab7a79 +--- /dev/null ++++ b/libavcodec/rpi_shader_template.c +@@ -0,0 +1,65 @@ ++#ifdef RPI ++ ++#include "hevc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_shader_cmd.h" ++#include "rpi_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_shader_template_fn.h" ++ ++#endif ++ +diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h +new file mode 100644 +index 0000000000..ecf5b8185a +--- /dev/null ++++ b/libavcodec/rpi_shader_template.h +@@ -0,0 +1,24 @@ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++#ifdef RPI ++struct HEVCContext; ++struct HEVCRpiInterPredEnv; ++ ++void rpi_shader_c8(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_shader_c16(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++#endif ++ +diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h +new file mode 100644 +index 0000000000..b5ac2ceed6 +--- /dev/null ++++ b/libavcodec/rpi_shader_template_fn.h +@@ -0,0 +1,477 @@ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define 
PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ 
FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// w, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(rpi_shader_c)(HEVCContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 
0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ 
s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other 
sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..b061fe0 +index 0000000000..b502de0a2c --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,581 @@ +@@ -0,0 +1,745 @@ +#include "config.h" +#ifdef RPI ++#include "libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" +#include + +#include "libavutil/buffer_internal.h" @@ -17841,21 +27627,11 @@ index 0000000..b061fe0 + struct ZcPool * pool; +} ZcPoolEnt; + -+#if 1 -+//#define ALLOC_PAD 0x1000 -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+//#define ALLOC_N_OFFSET 0x100 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 0x80 -+#define STRIDE_OR 0x80 -+#else +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 32 ++#define STRIDE_ROUND 64 +#define STRIDE_OR 0 -+#endif + +#define DEBUG_ZAP0_BUFFERS 0 + @@ -18032,13 +27808,22 @@ index 0000000..b061fe0 + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; -+ // geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; + break; + + case AV_PIX_FMT_SAND128: @@ -18073,6 +27858,7 @@ index 0000000..b061fe0 + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); + @@ -18081,6 +27867,45 @@ index 0000000..b061fe0 + break; + } + ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ + default: + memset(&geo, 0, sizeof(geo)); + break; @@ -18153,8 +27978,12 @@ index 0000000..b061fe0 + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + 
frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply + if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ frame->linesize[3] = geo.height_y + geo.height_c; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; @@ -18164,6 +27993,11 @@ index 0000000..b061fe0 + frame->extended_data = frame->data; + // Leave extended buf alone + ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++ + return 0; +} + @@ -18182,7 +28016,7 @@ index 0000000..b061fe0 + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || -+ frame->format == AV_PIX_FMT_SAND128) ++ av_rpi_is_sand_frame(frame)) + { + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + } @@ -18212,6 +28046,7 @@ index 0000000..b061fe0 + unsigned int i; + uint8_t * psrc, * pdest; + ++ dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + @@ -18243,29 +28078,142 @@ index 0000000..b061fe0 +} + + ++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * psrc2, * pdest; ++ ++ memset(dest, 0, sizeof(*dest)); ++ dest->format = AV_PIX_FMT_SAND128; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ uint16_t * s = (uint16_t*)psrc; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); ++ for (unsigned int j = 0; j != n; ++j) ++ *d++ = (uint8_t)(*s++ >> 2); ++ d += (dest->linesize[3] - 1) * dest->linesize[0]; ++ } ++ } ++ ++ // C ++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) ++ { ++ const uint16_t * su = (uint16_t*)psrc; ++ const uint16_t * sv = (uint16_t*)psrc2; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; ++ for (unsigned int j = 0; j != n; ++j) ++ { ++ *d++ = (uint8_t)(*su++ >> 2); ++ *d++ = (uint8_t)(*sv++ >> 2); ++ } ++ d += (dest->linesize[3] - 1) * dest->linesize[1]; ++ } ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ AVFrame dest_frame = { ++ .format = AV_PIX_FMT_SAND128, ++ .width = src->width, ++ .height = src->height ++ }; ++ AVFrame * const dest = &dest_frame; ++ const unsigned int shr = src_bits - 8; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ av_rpi_sand16_to_sand8(dest->data[0], 
dest->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height, shr); ++ // C ++ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height / 2, shr); ++ ++ return dest->buf[0]; ++} ++ ++ ++ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy) ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + assert(s != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_SAND128) ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) + { + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + -+ if (frame->buf[1] != NULL) ++ if (frame->buf[1] != NULL || frame->format != expected_format) + { -+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ + if (maycopy) + { -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ return zc_copy(s, frame); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(s, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(s, frame, 10); ++ ++ default: ++ return zc_copy(s, frame); ++ } + } + else + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } @@ -18392,10 +28340,10 @@ index 0000000..b061fe0 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f4aeb78 +index 0000000000..26fb3be999 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,105 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -18406,23 +28354,33 @@ index 0000000..f4aeb78 +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
+ -+#include "libavutil/frame.h" -+#include "libavcodec/avcodec.h" ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference -+typedef AVBufferRef * AVRpiZcRefPtr; ++typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ -+ unsigned int stride_y; -+ unsigned int height_y; -+ unsigned int stride_c; -+ unsigned int height_c; -+ unsigned int planes_c; -+ unsigned int stripes; ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma stride (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; +} AVRpiZcFrameGeometry; + + @@ -18448,7 +28406,7 @@ index 0000000..f4aeb78 +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy); ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid @@ -18489,52 +28447,10 @@ index 0000000..f4aeb78 + + + -+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x = x_c * 2; -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); -+} -+ -+static inline int rpi_sliced_frame(const AVFrame * const frame) -+{ -+ return frame->format == AV_PIX_FMT_SAND128; -+} -+ -+ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..3b398a3 100644 +index c4af9cbb17..c1b806e51b 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -18550,7 +28466,15 @@ index f7adb52..3b398a3 100644 #include "libavutil/atomic.h" #include "libavutil/attributes.h" #include "libavutil/avassert.h" -@@ -64,6 +70,10 @@ +@@ -39,6 +45,7 @@ + #include 
"libavutil/mathematics.h" + #include "libavutil/mem_internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" + #include "libavutil/imgutils.h" + #include "libavutil/samplefmt.h" + #include "libavutil/dict.h" +@@ -64,6 +71,10 @@ #include "libavutil/ffversion.h" const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; @@ -18561,7 +28485,7 @@ index f7adb52..3b398a3 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -18609,7 +28533,7 @@ index f7adb52..3b398a3 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -18624,20 +28548,20 @@ index f7adb52..3b398a3 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags +@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags { int ret; +#ifdef RPI + // This is going to end badly if we let it continue -+ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++ av_assert0(!av_rpi_is_sand_frame(frame)); +#endif + if ((ret = update_frame_pool(avctx, frame)) < 0) return ret; diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index 21f8d9e..71ce7b9 100644 +index 21f8d9e00d..71ce7b9186 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) @@ -18649,7 +28573,7 @@ index 21f8d9e..71ce7b9 100644 #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index b31d233..2767306 100644 +index 6767b65ec8..f270190d57 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { @@ -18662,10 +28586,10 @@ index b31d233..2767306 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6f343f2..83f26d5 100644 +index 5a35953d24..d36fdc3199 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c -@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in +@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { for (i = 0; i < s->nb_streams; i++) { @@ -18674,8 +28598,84 @@ index 6f343f2..83f26d5 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 1e061763a2..cbc9bc145b 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -59,6 +59,8 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ 
++ rpi_sand_fn_pw.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -136,6 +138,7 @@ OBJS = adler32.o \ + reverse.o \ + rc4.o \ + ripemd.o \ ++ rpi_sand_fns.o \ + samplefmt.o \ + sha.o \ + sha512.o \ +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..dbffdaefa4 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,40 @@ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 694e116..203ca7b 100644 +index 694e116a3c..203ca7b3a8 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c @@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) @@ -18689,7 +28689,7 @@ index 694e116..203ca7b 100644 + return buf->opaque; +} diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 0c0ce12..82e0bc3 100644 +index 0c0ce12cf2..82e0bc3058 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h @@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); @@ -18702,11 +28702,51 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 2b5c3320c3..990347e484 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -120,7 +120,20 @@ enum AVFrameSideDataType { + * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer. + * This is set on the first frame of a GOP that has a temporal reference of 0. + */ +- AV_FRAME_DATA_GOP_TIMECODE ++ AV_FRAME_DATA_GOP_TIMECODE, ++ ++ /** ++ * The data represents the AVSphericalMapping structure defined in ++ * libavutil/spherical.h. 
++ */ ++ AV_FRAME_DATA_SPHERICAL, ++ ++ /** ++ * Extra data required to deal with a cropped Sand frame ++ * AVFrame holds the cropped size, but we cannot simply offset the start ++ * address to get the picture as we can for planar formats ++ */ ++ AV_FRAME_DATA_SAND_INFO, + }; + + enum AVActiveFormatDescription { +@@ -133,6 +146,13 @@ enum AVActiveFormatDescription { + AV_AFD_SP_4_3 = 15, + }; + ++typedef struct AVFrameDataSandInfo ++{ ++ unsigned int left_offset; ++ unsigned int top_offset; ++ unsigned int pic_width; ++ unsigned int pic_height; ++} AVFrameDataSandInfo; + + /** + * Structure to hold side data for an AVFrame. diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index 0dffa4d..5644176 100644 +index 0dffa4dbdb..17134b4f38 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { +@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, }, @@ -18721,35 +28761,481 @@ index 0dffa4d..5644176 100644 + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, -+ } ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 0ed01c4..4705e80 100644 +index 0ed01c4844..2155b78704 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h -@@ -303,7 +303,10 @@ enum AVPixelFormat { +@@ -303,7 +303,22 @@ enum AVPixelFormat { AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian - AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions ++ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec ++ ++ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian ++ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian ++ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian ++ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian ++ ++ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian ++ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian ++ +// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..52d52a2a83 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,182 @@ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 
1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * 
dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..b8bfad915e +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,96 @@ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if HAVE_NEON ++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); ++#endif ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == 
src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..48948ecb47 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,127 @@ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, 
const unsigned int shr);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++ // * We could replace this with a fixed 128 which would allow the compiler
++ // to optimize a whole lot better
++ return frame->linesize[0];
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
diff --git a/libswscale/input.c b/libswscale/input.c
index 14ab5ab..e61b67a 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
-@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 }
 
-+
+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
@@ -18760,112 +29246,418 @@ index 14ab5ab..e61b67a 100644
 #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) +@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_P010BE: c->chrToYV12 = p010BEToUV_c; break; + case AV_PIX_FMT_SAND128: -+ c->chrToYV12 = sand128ToUV_c; ++ case AV_PIX_FMT_SAND64_10: ++ c->chrToYV12 = sand128ToUV_c; // NIF + break; } if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index 576d8f0..d7206cc 100644 +index 576d8f0d5a..fd88a5e51e 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c -@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { +@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_AYUV64LE] = { 1, 1}, [AV_PIX_FMT_P010LE] = { 1, 0 }, [AV_PIX_FMT_P010BE] = { 1, 0 }, +#ifdef RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, ++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) -diff --git a/pi-util/conf.sh b/pi-util/conf.sh -new file mode 100755 -index 0000000..8b596a2 ---- /dev/null -+++ b/pi-util/conf.sh -@@ -0,0 +1,33 @@ -+echo "Configure for Pi2/3" -+ -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000..160e149 +index 0000000000..b1e99a6a89 --- /dev/null -+++ b/pi-util/conf1.sh -@@ -0,0 +1,34 @@ -+echo "Configure for Pi1" ++++ b/pi-util/BUILD.txt +@@ -0,0 +1,25 @@ ++Building Pi FFmpeg ++================== + -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads 
-I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" ++Configuration: ++============= + -+./configure --enable-cross-compile\ -+ --cpu=arm1176jzf-s\ -+ --arch=armv\ -+ --disable-neon\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++pi-util/conf_pi2.sh ++ ++contains suitable options to build the code for Pi2/3. It expects to find ++git clones of ++ ++https://github.com/raspberrypi/tools ++https://github.com/raspberrypi/firmware ++ ++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a ++lot of history you don't want. ++ ++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be ++rebuilt. Otherwise the prebuilt .c & .h files will be used. ++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild ++ ++pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time ++H265 QPU acceleration is broken on Pi1 and so it is disabled. + + -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..f05b7753f7 +--- /dev/null ++++ b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,193 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 ++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 ++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 ++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 +diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv +new file mode 100644 +index 0000000000..6082641271 +--- /dev/null ++++ b/pi-util/conf_h265.2016_HEVC_v1.csv +@@ -0,0 +1,147 @@ 
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..fc14f2a +index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ @@ -19013,14 +29805,88 @@ index 0000000..fc14f2a +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000000..ec25b81c31 +--- /dev/null ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,31 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ 
--arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000000..f8e5e75375 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,30 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py -new file mode 100644 -index 0000000..c896bc6 +new file mode 100755 +index 0000000000..70f7be22bb --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,154 @@ +@@ -0,0 +1,174 @@ +#!/usr/bin/env python + ++import string +import os +import subprocess +import re @@ -19029,12 +29895,20 @@ index 0000000..c896bc6 +import csv +from stat import * + -+conf_root = "/opt/conform/h265" +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, name, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file): + tmp_root = "/tmp" + ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) @@ -19079,10 +29953,10 @@ index 0000000..c896bc6 + +def scandir(root): + aconf = [] -+ ents = os.listdir(conf_root) ++ ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: -+ test_path = os.path.join(conf_root, name) ++ test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" 
@@ -19093,7 +29967,7 @@ index 0000000..c896bc6 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5": ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -19105,13 +29979,15 @@ index 0000000..c896bc6 + if not tests: + return True + for t in tests: -+ if name[0:len(t)] == t: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: + return True -+ return False ++ return False + -+def doconf(csva, tests): -+ failures = [] ++def doconf(csva, tests, test_root): ++ unx_failures = [] + unx_success = [] ++ failures = 0 ++ successes = 0 + for a in csva: + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): @@ -19119,17 +29995,25 @@ index 0000000..c896bc6 + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ + if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ elif exp_test > 1 and rv == 1: ++ elif exp_test == 2 and rv == 1: + print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" + else: -+ failures.append(name) ++ unx_failures.append(name) + if rv == 1: + print ": * FAIL *" + elif (rv == 2) : @@ -19139,11 +30023,11 @@ index 0000000..c896bc6 + else : + print ": * BANG *" + -+ if failures or unx_success: -+ print "Unexpected Failures:", failures ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures + print "Unexpected Success: ", unx_success + else: -+ print "All tests normal" ++ print "All tests normal:", successes, "ok,", failures, "failed" + + +class ConfCSVDialect(csv.Dialect): @@ -19159,2536 +30043,184 @@ index 0000000..c896bc6 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(conf_root)) ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests) ++ doconf(csva, args.tests, args.test_root) + -diff --git a/pi-util/qasm.py b/pi-util/qasm.py -new file mode 100644 -index 0000000..1eacc04 +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..27cc453963 --- /dev/null -+++ b/pi-util/qasm.py -@@ -0,0 +1,2502 @@ -+#!/usr/bin/env python ++++ b/pi-util/ffperf.py +@@ -0,0 +1,124 @@ ++#!/usr/bin/env python3 + -+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment -+# add r0, r0, 1 # implicit mul nop -+# nop # explicit add nop, implicit mul nop -+# bkpt # implicit add/mul nop -+# mov r0, 0x1234 # hex immediate -+# mov r0, 20 * 40 # expressions... 
-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits -+# mov r0, a:label # put address of label in r0 -+# :label -+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address -+# :1 -+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address -+# :1 # multiple definitions of numeric labels (differentiated using f/b) -+# .set my_val, 3 # introduce alias for 3 -+# .set my_reg, r0 # and for r0 -+# mov my_reg, my_val # then use them -+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 -+# .macro my_add, a, b, c # a, b, c act as if .set on entry -+# .set my_val, 10 -+# add a, b, c -+# mov r0, my_val # 10 -+# .endm # forget all .sets since .macro (including arg .sets) -+# mov r0, my_val # 3 -+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) -+ -+import math -+import optparse -+import os -+import random -+import re -+import struct -+import sys +import time -+ -+############################################################################### -+# constants -+############################################################################### -+ -+# ops -+###### -+ -+# negatives are internal qasm ops -+ -+AOP_MOV = -3 # two operands -+AOP_BRA = -2 # two operands -+AOP_BRR = -1 # two operands -+AOP_NOP = 0x00 # no operands -+AOP_FADD = 0x01 -+AOP_FSUB = 0x02 -+AOP_FMIN = 0x03 -+AOP_FMAX = 0x04 -+AOP_FMINABS = 0x05 -+AOP_FMAXABS = 0x06 -+AOP_FTOI = 0x07 # two operands -+AOP_ITOF = 0x08 # two operands -+AOP_ADD = 0x0c -+AOP_SUB = 0x0d -+AOP_SHR = 0x0e -+AOP_ASR = 0x0f -+AOP_ROR = 0x10 -+AOP_SHL = 0x11 -+AOP_MIN = 0x12 -+AOP_MAX = 0x13 -+AOP_AND = 0x14 -+AOP_OR = 0x15 -+AOP_XOR = 0x16 -+AOP_NOT = 0x17 # two operands -+AOP_CLZ = 0x18 # two operands -+AOP_V8ADDS = 0x1e -+AOP_V8SUBS = 0x1f -+ -+MOP_MOV = -1 # two operands -+MOP_NOP = 0x0 # no operands -+MOP_FMUL = 0x1 -+MOP_MUL24 = 0x2 -+MOP_V8MULD = 0x3 -+MOP_V8MIN = 0x4 -+MOP_V8MAX = 0x5 -+MOP_V8ADDS = 0x6 -+MOP_V8SUBS = 0x7 -+ -+# ldi modes -+############ -+ -+LDI_32 = 0 -+LDI_EL_SIGNED = 1 -+LDI_EL_UNSIGNED = 3 -+LDI_SEMA = 4 -+ -+# conds -+######## -+ -+COND_NEVER = 0 -+COND_ALWAYS = 1 -+COND_IFZ = 2 -+COND_IFNZ = 3 -+COND_IFN = 4 -+COND_IFNN = 5 -+COND_IFC = 6 -+COND_IFNC = 7 -+ -+BCOND_ALLZ = 0 -+BCOND_ALLNZ = 1 -+BCOND_ANYZ = 2 -+BCOND_ANYNZ = 3 -+BCOND_ALLN = 4 -+BCOND_ALLNN = 5 -+BCOND_ANYN = 6 -+BCOND_ANYNN = 7 -+BCOND_ALLC = 8 -+BCOND_ALLNC = 9 -+BCOND_ANYC = 10 -+BCOND_ANYNC = 11 -+BCOND_ALWAYS = 15 -+ -+# packing/unpacking -+#################### -+ -+# regfile a pack modes -+PACK_A_NOP = 0 -+PACK_A_16A = 1 -+PACK_A_16B = 2 -+PACK_A_8888 = 3 -+PACK_A_8A = 4 -+PACK_A_8B = 5 -+PACK_A_8C = 6 -+PACK_A_8D = 7 -+PACK_A_32S = 8 -+PACK_A_16AS = 9 -+PACK_A_16BS = 10 -+PACK_A_8888S = 11 -+PACK_A_8AS = 12 -+PACK_A_8BS = 13 -+PACK_A_8CS = 14 -+PACK_A_8DS = 15 -+ -+# mul unit pack modes -+PACK_MUL_NOP = 0 -+PACK_MUL_8888 = 3 -+PACK_MUL_8A = 4 -+PACK_MUL_8B = 5 -+PACK_MUL_8C = 6 -+PACK_MUL_8D = 7 -+ -+# regfile a unpack modes -+UNPACK_A_NOP = 0 -+UNPACK_A_16A = 1 -+UNPACK_A_16B = 2 -+UNPACK_A_8R = 3 -+UNPACK_A_8A = 4 -+UNPACK_A_8B = 5 -+UNPACK_A_8C = 6 -+UNPACK_A_8D = 7 -+ -+# r4 unpack modes -+UNPACK_R4_NOP = 0 -+UNPACK_R4_16A = 1 -+UNPACK_R4_16B = 2 -+UNPACK_R4_8R = 3 -+UNPACK_R4_8A = 4 -+UNPACK_R4_8B = 5 -+UNPACK_R4_8C = 6 -+UNPACK_R4_8D = 7 -+ -+PACK_TYPE_INT = 0 -+PACK_TYPE_FLOAT = 1 -+PACK_TYPE_EITHER = -1 -+ -+PACK_MODE_A = 0 # regfile a -+PACK_MODE_M = 1 # mul unit -+PACK_MODE_EITHER = -1 -+ -+UNPACK_LOC_A = 0 # regfile a -+UNPACK_LOC_R4 = 1 # r4 -+UNPACK_LOC_AB = 2 # either 
regfile a or regfile b -+UNPACK_LOC_OTHER = 3 # somewhere else -+ -+# args -+####### -+ -+# loc_t, ie internal -+MUX_AC = 0 -+MUX_ANY = 1 -+MUX_A = 2 -+MUX_B = 3 -+RW_EITHER = 0 -+RW_READ = 1 -+RW_WRITE = 2 -+ -+RADDR_NOP = 39 -+ -+# negatives are for internal use -+RMUX_SEMA = -6 -+RMUX_LABEL = -5 -+RMUX_IMMV = -4 -+RMUX_IMM = -3 -+RMUX_AC = -2 -+RMUX_ANY = -1 -+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 -+RMUX_A = 6 -+RMUX_B = 7 -+ -+WADDR_R0 = 32 # followed by R1, R2, R3 -+WADDR_NOP = 39 -+ -+WMUX_ANY = 0 -+WMUX_A = 1 -+WMUX_B = 2 -+ -+# signals -+########## -+ -+SIG_BKPT = 0 -+SIG_NORMAL = 1 -+SIG_THRSW = 2 -+SIG_THREND = 3 -+SIG_SBWAIT = 4 -+SIG_SBDONE = 5 -+SIG_INT = 6 # on a0 -+SIG_LTHRSW = 6 # on b0 -+SIG_LOADCV = 7 -+SIG_LOADC = 8 -+SIG_LDCEND = 9 -+SIG_LDTMU0 = 10 -+SIG_LDTMU1 = 11 -+SIG_ROTATE = 12 # on a0 -+SIG_LOADAM = 12 # on b0 -+SIG_SMALLIMMED = 13 -+SIG_IMMED = 14 -+SIG_BRANCH = 15 -+ -+# multi-line assembler constructs -+################################## -+ -+CONSTRUCT_MACRO = 0x1 -+CONSTRUCT_IF = 0x2 -+CONSTRUCT_ELSE = 0x4 -+CONSTRUCT_REP = 0x8 -+ -+############################################################################### -+# helpers -+############################################################################### -+ -+def asm_error(message, location = None): -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm ERROR: %s\n' % message) -+ else: -+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) -+ sys.exit(-1) -+ -+def asm_warning(message, location = None): -+ if disable_warnings or (nwarn_level != 0): -+ return -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm WARNING: %s\n' % message) -+ else: -+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) -+ if warnings_are_errors: -+ asm_error('warnings are errors!', location) -+ -+# smart_split('') = [] -+# smart_split('a') = ['a'] -+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] -+def smart_split(s, delim = ',', count = 0): -+ if len(s) == 0: -+ return [] -+ parts = [] -+ depth = 0 -+ i = 0 -+ for j in xrange(len(s)): -+ if s[j] in '([{': -+ depth += 1 -+ elif s[j] in ')]}': -+ depth -= 1 -+ elif (s[j] == delim) and (depth == 0): -+ parts.append(s[i:j]) -+ i = j + 1 -+ if len(parts) == count: -+ break -+ if depth != 0: -+ asm_error('bracket nesting fail') -+ parts.append(s[i:]) -+ return parts -+ -+def is_int(x): -+ return isinstance(x, int) or isinstance(x, long) -+ -+############################################################################### -+# "parsing" stuff -+############################################################################### -+ -+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') -+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') -+re_include = re.compile('\\.include\\s(?P.+)$') -+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') -+re_unset = re.compile('\\.unset\\s+(?P\\w+)$') -+re_eval = re.compile('\\.eval\\s(?P.+)$') -+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') -+re_assert = re.compile('\\.assert\\s(?P.+)$') -+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') -+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') -+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') -+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') 
-+re_label_ref_left = re.compile('\\b([ar]):') -+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') -+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... -+ -+# ops -+###### -+ -+aops = { -+ 'mov': (AOP_MOV, 2), -+ 'bra': (AOP_BRA, 2), -+ 'brr': (AOP_BRR, 2), -+ 'nop': (AOP_NOP, 0), -+ 'fadd': (AOP_FADD, 3), -+ 'fsub': (AOP_FSUB, 3), -+ 'fmin': (AOP_FMIN, 3), -+ 'fmax': (AOP_FMAX, 3), -+ 'fminabs': (AOP_FMINABS, 3), -+ 'fmaxabs': (AOP_FMAXABS, 3), -+ 'ftoi': (AOP_FTOI, 2), -+ 'itof': (AOP_ITOF, 2), -+ 'add': (AOP_ADD, 3), -+ 'sub': (AOP_SUB, 3), -+ 'shr': (AOP_SHR, 3), -+ 'asr': (AOP_ASR, 3), -+ 'ror': (AOP_ROR, 3), -+ 'shl': (AOP_SHL, 3), -+ 'min': (AOP_MIN, 3), -+ 'max': (AOP_MAX, 3), -+ 'and': (AOP_AND, 3), -+ 'or': (AOP_OR, 3), -+ 'xor': (AOP_XOR, 3), -+ 'not': (AOP_NOT, 2), -+ 'clz': (AOP_CLZ, 2), -+ 'v8adds': (AOP_V8ADDS, 3), -+ 'v8subs': (AOP_V8SUBS, 3)} -+ -+def get_aop(aop): -+ if aop not in aops: -+ asm_error('invalid aop') -+ return aops[aop] -+ -+mops = { -+ 'mov': (MOP_MOV, 2), -+ 'nop': (MOP_NOP, 0), -+ 'fmul': (MOP_FMUL, 3), -+ 'mul24': (MOP_MUL24, 3), -+ 'v8muld': (MOP_V8MULD, 3), -+ 'v8min': (MOP_V8MIN, 3), -+ 'v8max': (MOP_V8MAX, 3), -+ 'v8adds': (MOP_V8ADDS, 3), -+ 'v8subs': (MOP_V8SUBS, 3)} -+ -+def get_mop(mop): -+ if mop not in mops: -+ asm_error('invalid mop') -+ return mops[mop] -+ -+# conds -+######## -+ -+conds = { -+ 'ifz': COND_IFZ, -+ 'ifnz': COND_IFNZ, -+ 'ifn': COND_IFN, -+ 'ifnn': COND_IFNN, -+ 'ifc': COND_IFC, -+ 'ifnc': COND_IFNC} -+ -+def get_cond(cond): -+ if not cond: -+ return COND_ALWAYS -+ if cond not in conds: -+ asm_error('invalid cond') -+ return conds[cond] -+ -+bconds = { -+ 'allz': BCOND_ALLZ, -+ 'allnz': BCOND_ALLNZ, -+ 'anyz': BCOND_ANYZ, -+ 'anynz': BCOND_ANYNZ, -+ 'alln': BCOND_ALLN, -+ 'allnn': BCOND_ALLNN, -+ 'anyn': BCOND_ANYN, -+ 'anynn': BCOND_ANYNN, -+ 'allc': BCOND_ALLC, -+ 'allnc': BCOND_ALLNC, -+ 'anyc': BCOND_ANYC, -+ 'anync': BCOND_ANYNC} -+ -+def get_bcond(bcond): -+ if not bcond: -+ return BCOND_ALWAYS -+ if bcond not in bconds: -+ asm_error('invalid bcond') -+ return bconds[bcond] -+ -+def get_setf(setf): -+ if not setf: -+ return False -+ return True -+ -+# packing/unpacking -+#################### -+ -+packs = { -+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), -+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), -+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), -+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} -+ -+def get_pack(pack): -+ if not pack: -+ 
return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) -+ if pack not in packs: -+ asm_error('invalid pack') -+ return packs[pack] -+ -+a_unpacks = { -+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), -+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), -+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), -+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), -+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), -+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), -+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), -+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), -+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), -+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), -+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), -+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), -+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} -+ -+def get_a_unpack(unpack): -+ if not unpack: -+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) -+ if unpack not in a_unpacks: -+ asm_error('invalid ra unpack') -+ return a_unpacks[unpack] + (UNPACK_LOC_A,) -+ -+r4_unpacks = { -+ '16af': UNPACK_R4_16A, -+ '16bf': UNPACK_R4_16B, -+ '8dr': UNPACK_R4_8R, -+ '8ac': UNPACK_R4_8A, -+ '8bc': UNPACK_R4_8B, -+ '8cc': UNPACK_R4_8C, -+ '8dc': UNPACK_R4_8D} -+ -+def get_r4_unpack(unpack): -+ if not unpack: -+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ if unpack not in r4_unpacks: -+ asm_error('invalid r4 unpack') -+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ -+# args -+####### -+ -+class loc_t: -+ def __init__(self, mux, i, rot, r5_rot, pack, rw): -+ self.mux = mux -+ self.i = i -+ self.rot = rot % 16 -+ self.r5_rot = r5_rot % 16 -+ self.pack = pack -+ self.rw = rw -+ -+ def copy(self): -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __add__(self, i): -+ if not is_int(i): -+ raise Exception('can only add integer to loc') -+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __sub__(self, i): -+ if not is_int(i): -+ raise Exception('can only subtract integer from loc') -+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __cmp__(self, other): -+ if is_int(other): -+ return cmp(self.i, other) -+ if not isinstance(other, loc_t): -+ raise Exception('can only compare loc to integer or other loc') -+ if self.mux != other.mux: -+ return cmp(self.mux, other.mux) -+ if self.i != other.i: -+ return cmp(self.i, other.i) -+ if self.rot != other.rot: -+ return cmp(self.rot, other.rot) -+ if self.r5_rot != other.r5_rot: -+ return cmp(self.r5_rot, other.r5_rot) -+ return cmp(self.pack, other.pack) -+ -+ def is_r5(self): -+ return (self.mux == MUX_AC) and (self.i == 5) -+ -+ def shift(self, rot, left): -+ if isinstance(rot, loc_t) and rot.is_r5(): -+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: -+ raise Exception('can\'t rotate by rotated/unpacked r5') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) -+ if not is_int(rot): -+ raise Exception('can only rotate by integer or r5') -+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) -+ -+ def __lshift__(self, rot): -+ return self.shift(rot, True) -+ -+ def __rshift__(self, rot): -+ return self.shift(rot, False) -+ -+ def __getattr__(self, name): -+ # discard the first character if it is an underscore. 
this is a total hack -+ # to allow packs starting with a digit to work -+ if name[0] == '_': -+ name = name[1:] -+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): -+ if self.pack: -+ raise Exception('can\'t specify two packs') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) -+ raise AttributeError() -+ -+ def __str__(self): -+ if self.mux == MUX_AC: -+ return 'r%d' % self.i -+ if self.mux == MUX_ANY: -+ return 'rany%d' % self.i -+ if self.mux == MUX_A: -+ return 'ra%d' % self.i -+ if self.mux == MUX_B: -+ return 'rb%d' % self.i -+ assert 0 -+ -+class sema_t: -+ def __init__(self, acq, i): -+ if not is_int(i): -+ raise Exception('semaphore index must be integer') -+ self.acq = acq -+ self.i = i -+ -+class label_t: -+ def __init__(self, rel, name, offset): -+ self.rel = rel -+ self.name = name -+ self.offset = offset -+ -+ def __add__(self, offset): -+ return label_t(self.rel, self.name, self.offset + offset) -+ -+ def __sub__(self, offset): -+ return label_t(self.rel, self.name, self.offset - offset) -+ -+class label_maker_t: -+ def __init__(self, rel): -+ self.rel = rel -+ -+ def __getattr__(self, name): -+ # we discard the first character. this is a total hack to allow numeric labels to work -+ if not re_label_ref_right.match(name[1:]): -+ raise Exception('invalid label reference') -+ return label_t(self.rel, name[1:], 0) -+ -+def bits(x, n): -+ if (x >> n) != 0: -+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) -+ return x -+ -+def bitsw(x, n): -+ if x == (1 << n): -+ x = 0 -+ return bits(x, n) -+ -+def bitsws(x, n): -+ if x == (1 << (n - 1)): -+ x = 0 -+ if -(1 << (n - 1)) <= x < 0: -+ x += 1 << n -+ return bits(x, n) -+ -+def vpm_setup(n, stride, addr, v2 = False): -+ horiz, laned, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if horiz: -+ if x != 0: -+ raise Exception('horizontal accesses must have x of 0') -+ else: -+ if (y & 0xf) != 0: -+ raise Exception('vertical accesses must be 16 row aligned') -+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) -+ if v2: -+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | -+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) -+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | -+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) -+ -+def vdw_setup_0(n, m, addr): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | -+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) -+ -+def vdr_setup_0(n, m, addr, vpm_stride, stride): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if (stride < 8) or (stride & (stride - 1)): -+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') -+ log2_stride = 3 -+ while (1 << log2_stride) != stride: -+ log2_stride += 1 -+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | -+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | -+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) -+ -+class allocator_t: -+ def __init__(self, *available): -+ self.available = list(available) -+ self.allocated = {} -+ self.reserved = [] -+ -+ def copy(self): -+ a = allocator_t() -+ a.available = self.available[:] -+ 
a.allocated = self.allocated.copy() -+ a.reserved = self.reserved[:] -+ return a -+ -+ def forget(self): -+ self.__init__(self.available + self.allocated.values() + self.reserved) -+ -+ def reserve(self, *rs): -+ for r in rs: -+ self.available.remove(r) -+ self.reserved.append(r) -+ -+ def retire(self, name): -+ r = self.allocated.pop(name) -+ del r.__invert__ -+ del r.retire -+ self.available.append(r) -+ return r -+ -+ def __getattr__(self, name): -+ if name not in self.allocated: -+ r = self.available.pop() -+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax -+ r.__invert__ = r.retire -+ self.allocated[name] = r -+ return self.allocated[name] -+ -+def pragma_allow_xor_0(x): -+ global allow_xor_0 -+ -+ if not isinstance(x, bool): -+ raise Exception('allow_xor_0 must be bool') -+ x, allow_xor_0 = allow_xor_0, x -+ return x -+ -+def pragma_dont_warn_when_mul_rot_inp_r5(x): -+ global dont_warn_when_mul_rot_inp_r5 -+ -+ if not isinstance(x, bool): -+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') -+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x -+ return x -+ -+arg_defs = { -+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) -+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), -+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), -+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), -+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), -+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), -+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), -+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), -+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), -+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), -+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), -+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), -+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), -+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), -+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), -+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), -+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), -+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), -+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), -+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), -+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), -+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), -+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), -+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), -+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), -+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), -+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), -+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), -+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), -+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), -+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), -+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), -+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), -+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), -+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), -+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), -+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), -+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), -+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), -+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), -+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), -+ -+ # semaphore acq/rel -+ 'sacq': 
lambda i: sema_t(True, i), -+ 'srel': lambda i: sema_t(False, i), -+ -+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) -+ 'r_label_maker': label_maker_t(True), -+ 'a_label_maker': label_maker_t(False), -+ -+ # handy functions -+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], -+ 'sqrt': math.sqrt, -+ 'sin': math.sin, -+ 'cos': math.cos, -+ 'atan2': math.atan2, -+ 'pi': math.pi, -+ 'rseed': random.seed, -+ 'rand': lambda: int(random.getrandbits(32)), -+ 'bits': bits, -+ 'bitsw': bitsw, -+ 'bitsws': bitsws, -+ -+ # handy vpm/vdw/vdr stuff -+ 'h32': lambda y: (1, 0, 0, y, 0, 0), -+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), -+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), -+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), -+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), -+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), -+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), -+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), -+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), -+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), -+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), -+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), -+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), -+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), -+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), -+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), -+ 'vpm_setup': vpm_setup, -+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), -+ 'vdw_setup_0': vdw_setup_0, -+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), -+ 'vdr_setup_0': vdr_setup_0, -+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride -+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), -+ -+ # annotations -+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), -+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), -+ 'preserve_cond': ('preserve_cond', 1), -+ -+ # somewhat experimental register allocator -+ 'allocator_t': allocator_t, -+ -+ # pragmas -+ 'pragma_allow_xor_0': pragma_allow_xor_0, -+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} -+ -+# accumulators and regs (regular names -- r0, ra0, etc) -+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) -+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+ -+def arg_eval(arg, sets): -+ s = (arg.strip().split('.', 1) + [None])[:2] -+ if s[0] == '-': -+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) -+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... -+ arg = re_pack.sub('._\\1', arg) -+ try: -+ # todo: i would like to be able to pass both arg_defs and sets in here -+ # (with sets hiding arg_defs in the case of conflicts), but the obvious -+ # dict(arg_defs, **sets) won't permit things such as: -+ # .set f, lambda x: y -+ # .set y, 4 -+ # (the y in the lambda will be looked up in the temporary dict we created -+ # when evaluating the f .set, which doesn't contain y) -+ # -+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the -+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. 
-+ # also, before dumping sets at the end, we strip out the arg_defs stuff -+ # (this isn't entirely correct as we want to dump sets that are hiding -+ # arg_defs) -+ return eval(arg, sets) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while evaluating argument') -+ -+# doesn't check/fixup pack -+def check_and_fixup_loc(loc, read): -+ if (not read) and (loc.rw == RW_READ): -+ asm_error('writing to read-only hardware register') -+ if read and (loc.rw == RW_WRITE): -+ asm_error('reading from write-only hardware register') -+ if not read: -+ # conceptually, we are writing to a location rotated right by -+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by -+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location -+ loc.rot = -loc.rot % 16 -+ loc.r5_rot = -loc.r5_rot % 16 -+ if (loc.rot != 0) and (loc.r5_rot != 0): -+ asm_error('can\'t rotate by both r5 and immediate') -+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): -+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) -+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later -+ if not read: -+ asm_error('target doesn\'t support write rotation') -+ if loc.mux == MUX_ANY: -+ loc.mux = MUX_A # can't do rotated read from regfile b -+ if loc.mux != MUX_A: -+ asm_error('rotation on read only allowed from regfile a') -+ if loc.i >= 32: -+ asm_warning('rotation only works from physical regfile') -+ if loc.mux == MUX_AC: -+ if (loc.i < 0) or (loc.i >= 6): -+ asm_error('reg out of range') -+ if not read: -+ if loc.i == 4: -+ asm_error('not allowed to write to r4') -+ if loc.i == 5: -+ -+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') -+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): -+ if (loc.i < 0) or (loc.i >= 64): -+ asm_error('reg out of range') -+ else: -+ assert 0 -+ -+def get_dst(dst, sets): -+ if not dst: -+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 -+ dst = arg_eval(dst, sets) -+ if not isinstance(dst, loc_t): -+ asm_error('invalid dst') -+ dst = dst.copy() -+ check_and_fixup_loc(dst, False) -+ pack = get_pack(dst.pack) -+ if dst.mux == MUX_AC: -+ if pack[2] == PACK_MODE_A: -+ asm_warning('ra packing only works when writing to physical regfile') -+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation -+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): -+ asm_warning('ra packing only works when writing to physical regfile') -+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_ANY: -+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_B: -+ if pack[2] == PACK_MODE_A: -+ asm_error('this packing operation can only be used for regfile a') -+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot -+ assert 0 -+ -+def get_src(src, sets): -+ if not src: -+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None -+ src = arg_eval(src, sets) -+ if isinstance(src, sema_t): -+ if not have_sema: -+ asm_error('target does not support semaphores') -+ if (src.i < 0) or (src.i >= 16): -+ asm_error('semaphore number must be in [0, 16)') -+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, label_t): -+ return (src.name, src.rel, src.offset), 
RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, list): -+ if len(src) != 16: -+ asm_error('vector immediate must have length 16') -+ src = src[:] -+ for i in xrange(16): -+ if not is_int(src[i]): -+ asm_error('all elements of vector immediate must be integers') -+ src[i] &= (1 << 32) - 1 -+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if is_int(src): -+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if not isinstance(src, loc_t): -+ asm_error('invalid src') -+ src = src.copy() -+ check_and_fixup_loc(src, True) -+ if mulw_rotate: -+ srot, sr5rot = 0, 0 -+ drot, dr5rot = src.rot, src.r5_rot -+ else: -+ srot, sr5rot = src.rot, src.r5_rot -+ drot, dr5rot = 0, 0 -+ if src.mux == MUX_AC: -+ if src.i == 4: -+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b -+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot -+ if src.mux == MUX_ANY: -+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot -+ if src.mux == MUX_B: -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ assert 0 -+ -+# signals -+########## -+ -+sigs = { -+ 'bkpt': SIG_BKPT, -+ 'thrsw': SIG_THRSW, -+ 'thrend': SIG_THREND, -+ 'sbwait': SIG_SBWAIT, -+ 'sbdone': SIG_SBDONE, -+ 'int': SIG_INT, -+ 'loadcv': SIG_LOADCV, -+ 'loadc': SIG_LOADC, -+ 'ldcend': SIG_LDCEND, -+ 'ldtmu0': SIG_LDTMU0, -+ 'ldtmu1': SIG_LDTMU1} -+ -+def get_sig(sig): -+ if sig not in sigs: -+ return SIG_NORMAL -+ return sigs[sig] -+ -+# annotations -+############## -+ -+def get_annots(annot, sets): -+ annots = arg_eval(annot, sets) -+ if isinstance(annots, list): -+ annots = annots[:] -+ else: -+ annots = [annots] -+ for i, annot in enumerate(annots): -+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or -+ (not is_int(annot[1]))): -+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') -+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) -+ return annots -+ -+############################################################################### -+# core -+############################################################################### -+ -+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): -+ needfloat = PACK_TYPE_EITHER -+ havefloata = False -+ havefloatr4 = False -+ unpacka = None -+ unpackr4 = None -+ forcebs = [False, False, False, False] -+ forcerafloat = False -+ -+ pm = PACK_MODE_EITHER -+ for i in (0, 1, 2, 3): -+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): -+ assert rpacks[i][0] == 0 -+ else: -+ if rpacks[i][2] == UNPACK_LOC_A: -+ if unpacka is None: -+ unpacka = rpacks[i][0] -+ elif unpacka != rpacks[i][0]: -+ asm_error('conflicting unpack operations on regfile a') -+ havefloata = havefloata or rfloats[i] -+ elif rpacks[i][2] == UNPACK_LOC_R4: -+ if unpackr4 is None: -+ unpackr4 = rpacks[i][0] -+ elif unpackr4 != rpacks[i][0]: -+ asm_error('conflicting unpack operations on r4') -+ havefloatr4 = havefloatr4 or rfloats[i] -+ else: -+ assert 0 -+ -+ if rpacks[i][1] != PACK_TYPE_EITHER: -+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): 
-+ asm_error('conflicting unpack float requirements') -+ needfloat = rpacks[i][1] -+ for i in (0, 1, 2, 3): -+ if rpacks[i][2] == UNPACK_LOC_AB: -+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): -+ forcebs[i] = True # non-nop unpack from regfile a. must use b -+ -+ if unpacka: -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: -+ havefloata = True -+ forcerafloat = True -+ havefloat = havefloata -+ else: -+ havefloat = havefloatr4 -+ -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): -+ asm_error('float unpack operation used in integer alu operations') -+ if (needfloat == PACK_TYPE_INT) and havefloat: -+ asm_error('integer unpack operation used in float alu operation') -+ -+ unpack = 0 -+ if unpacka and unpackr4: -+ asm_error('cannot specify pack operation for both regfile a and r4') -+ if unpacka: -+ pm = PACK_MODE_A -+ unpack = unpacka -+ elif unpackr4: -+ pm = PACK_MODE_M -+ unpack = unpackr4 -+ -+ pack = 0 -+ if wpacks[0][2] == PACK_MODE_M: -+ asm_error('mul-unit pack operation used on add result') -+ for i in (0, 1): -+ if wpacks[i][2] == PACK_MODE_A: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_A -+ pack = wpacks[i][0] -+ elif wpacks[i][2] == PACK_MODE_M: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_M -+ pack = wpacks[i][0] -+ -+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): -+ asm_error('float pack operation used with integer alu result') -+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: -+ asm_error('integer pack operation used with float alu result') -+ -+ if pm == PACK_MODE_EITHER: -+ pm = PACK_MODE_A -+ return pm, pack, unpack, forcebs, forcerafloat -+ -+# immediates that can be encoded with SIG_SMALLIMMED -+bimms = {} -+bimms.update((i, i) for i in xrange(16)) -+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) -+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) -+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) -+ -+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): -+ if rmux == RMUX_SEMA: -+ asm_error('semaphore op can only be used with mov') -+ if rmux == RMUX_LABEL: -+ asm_error('label not allowed here') -+ if rmux == RMUX_IMMV: -+ asm_error('vector immediate can only be used with mov') -+ if rmux == RMUX_IMM: -+ if raddr not in bimms: -+ asm_error('can\'t encode immediate 0x%08x' % raddr) -+ raddr = bimms[raddr] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and immediates don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if rmux == RMUX_AC: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr -+ if rmux == RMUX_ANY: -+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if (not immb) and (raddr_b == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if raddr_a is None: -+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) -+ raddr_a = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if raddr_b is None: -+ assert not immb -+ raddr_b = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ asm_error('no free read slots') -+ if rmux == RMUX_A: -+ if (not mulw_rotate) and (raddr_a is not None) and ( -+ ((raddr[1] != 0) | ((raddr[2] != 
0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): -+ asm_error('conflicting rotations from regfile a') -+ if raddr_a is None: -+ raddr_a = raddr[0] -+ elif raddr_a != raddr[0]: -+ asm_error('can only read from one location in each regfile') -+ arot_r5 = raddr[2] -+ if raddr[1] == 0: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ raddr = 48 + raddr[1] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and rotation don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if rmux == RMUX_B: -+ if immb: -+ asm_error('regfile b and rotation/immediates don\'t mix') -+ if raddr_b is None: -+ raddr_b = raddr -+ elif raddr_b != raddr: -+ asm_error('can only read from one location in each regfile') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ assert 0 -+ -+# ok if: -+# - accumulator (r0-r3) -+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, -+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it -+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: -+# what about vr_wait/vw_wait/mutex? -+def read_rot_ok(rmux, raddr_a, raddr_b): -+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or -+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy -+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy -+ -+def asm_flush_prog_data(): -+ global prog_data -+ -+ while len(prog_data) & 7: -+ prog_data.append(0) -+ for i in xrange(0, len(prog_data), 8): -+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), -+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) -+ prog_data = [] -+ -+def asm_line(sets, location, line): -+ global current_location, construct, nwarn_level -+ -+ prev_location = current_location -+ current_location = location -+ -+ try: -+ if construct != None: -+ if re_macro.match(line): -+ construct_stack.append(CONSTRUCT_MACRO) -+ elif re_if.match(line): -+ construct_stack.append(CONSTRUCT_IF) -+ elif re_rep.match(line): -+ construct_stack.append(CONSTRUCT_REP) -+ else: -+ else_m = line == '.else' -+ elif_m = re_elif.match(line) -+ if elif_m: -+ end_construct = CONSTRUCT_IF -+ else: -+ end_construct = { -+ '.endm': CONSTRUCT_MACRO, -+ '.else': CONSTRUCT_IF, -+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, -+ '.endr': CONSTRUCT_REP}.get(line) -+ if end_construct is not None: -+ end_construct &= construct_stack.pop() -+ if end_construct == 0: -+ if elif_m: -+ asm_error('unexpected .elif') -+ asm_error('unexpected %s' % line) -+ if len(construct_stack) == 0: -+ lines = construct -+ construct = None -+ if end_construct == CONSTRUCT_MACRO: -+ return -+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): -+ condition_if, condition_else = lines[0] -+ lines = lines[1:] -+ if condition_if: -+ for location, line in lines: -+ asm_line(sets, location, line) -+ if else_m: -+ construct = [(condition_else, False)] -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ if elif_m.group('set'): -+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) -+ else: -+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) -+ condition_else = condition_else and (not condition_if) -+ construct = [(condition_if, condition_else)] 
-+ construct_stack.append(CONSTRUCT_IF) -+ return -+ if end_construct == CONSTRUCT_REP: -+ name, count = lines[0] -+ lines = lines[1:] -+ for i in xrange(count): -+ sets[name] = i -+ for location, line in lines: -+ asm_line(sets, location, line) -+ return -+ assert 0 -+ if else_m: -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ construct_stack.append(CONSTRUCT_IF) -+ construct.append((current_location, line)) -+ return -+ -+ if line in ('.endm', '.else', '.endif', '.endr'): -+ asm_error('unexpected %s' % line) -+ if re_elif.match(line): -+ asm_error('unexpected .elif') -+ -+ m = re_macro.match(line) -+ if m: -+ construct = [] -+ construct_stack.append(CONSTRUCT_MACRO) -+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) -+ return -+ -+ m = re_if.match(line) -+ if m: -+ if m.group('set'): -+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) -+ else: -+ # not not forces condition to a bool (this matters if condition is -+ # something mutable like a list) -+ condition = not not arg_eval(m.group('condition'), sets) -+ construct = [(condition, not condition)] -+ construct_stack.append(CONSTRUCT_IF) -+ return -+ -+ m = re_rep.match(line) -+ if m: -+ count = arg_eval(m.group('count'), sets) -+ if not is_int(count): -+ asm_error('.rep count must be integer') -+ construct = [(m.group('name'), count)] -+ construct_stack.append(CONSTRUCT_REP) -+ return -+ -+ m = re_include.match(line) -+ if m: -+ filename = arg_eval(m.group('filename'), sets) -+ if not isinstance(filename, str): -+ asm_error('expected string') -+ asm_file(sets, '%s: %s' % (current_location, filename), filename) -+ return -+ -+ m = re_set.match(line) -+ if m: -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) -+ return -+ -+ m = re_unset.match(line) -+ if m: -+ name = m.group('name') -+ if name not in sets: -+ asm_error('%s not set' % name) -+ if name in arg_defs: # todo: see arg_eval -+ sets[name] = arg_defs[name] -+ else: -+ del sets[name] -+ return -+ -+ m = re_eval.match(line) -+ if m: -+ arg_eval(m.group('expr'), sets) -+ return -+ -+ m = re_print_info_warn_error.match(line) -+ if m: -+ def print_fn(message): -+ print message -+ def info_fn(message): -+ sys.stderr.write('%s\n' % message) -+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ -+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) -+ return -+ -+ m = re_assert.match(line) -+ if m: -+ if not arg_eval(m.group('condition'), sets): -+ asm_error('assertion failure: \'%s\'' % m.group('condition')) -+ return -+ -+ m = re_data.match(line) -+ if m: -+ size = int(m.group('size')) -+ for datum in smart_split(m.group('data')): -+ datum = arg_eval(datum, sets) -+ if not is_int(datum): -+ asm_error('datum must be integer') -+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) -+ return -+ -+ m = re_macro_inst.match(line) -+ if m: -+ name = m.group('name') -+ if name in macros: -+ params, lines = macros[name] -+ args = smart_split(m.group('args')) -+ if len(args) > len(params): -+ asm_error('too many arguments to macro') -+ sets = sets.copy() -+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) -+ for param in params[len(args):]: -+ if param in sets: -+ if param in arg_defs: # todo: see arg_eval -+ sets[param] = arg_defs[param] -+ else: -+ del sets[param] -+ for location, line in lines: -+ asm_line(sets, '%s: %s' % (current_location, location), line) -+ return -+ -+ if line == '.pushnwarn': -+ nwarn_level += 1 -+ 
return -+ if line == '.popnwarn': -+ if nwarn_level == 0: -+ asm_error('.popnwarn without .pushnwarn') -+ nwarn_level -= 1 -+ return -+ -+ # everything below assumes prog is up to date -+ asm_flush_prog_data() -+ -+ m = re_label.match(line) -+ if m: -+ name = m.group('name') -+ if name[0].isdigit(): -+ labels.setdefault(name, []).append(len(prog)) -+ else: -+ if name[0] == ':': -+ undecorated_name = name[1:] -+ else: -+ undecorated_name = name -+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): -+ asm_error('named label defined twice') -+ labels[name] = len(prog) -+ return -+ -+ annots = line.split('@') -+ ops = [op.strip() for op in annots[0].split(';')] -+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) -+ sig = get_sig(ops[-1]) -+ if sig != SIG_NORMAL: -+ ops = ops[:-1] -+ if len(ops) > 2: -+ asm_error('too many ops') -+ elif (len(ops) == 1) and (ops[0] == ''): -+ ops = [] -+ ops = (ops + ['nop', 'nop'])[:2] -+ m = re_op.match(ops[0]) -+ if not m: -+ asm_error('invalid syntax') -+ aop, aargs_n = get_aop(m.group('op')) -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ acond = get_bcond(m.group('cond')) -+ else: -+ acond = get_cond(m.group('cond')) -+ asf = get_setf(m.group('sf')) -+ aargs = smart_split(m.group('args')) -+ if len(aargs) != aargs_n: -+ asm_error('wrong operand count') -+ ard, ara, arb = (aargs + [None, None, None])[:3] -+ m = re_op.match(ops[1]) -+ if not m: -+ asm_error('invalid syntax') -+ mop, margs_n = get_mop(m.group('op')) -+ mcond = get_cond(m.group('cond')) -+ msf = get_setf(m.group('sf')) -+ margs = smart_split(m.group('args')) -+ if len(margs) != margs_n: -+ asm_error('wrong operand count') -+ mrd, mra, mrb = (margs + [None, None, None])[:3] -+ # eval srcs first so allocator can retire and reuse registers for dst -+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) -+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) -+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) -+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) -+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) -+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) -+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or -+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): -+ asm_error('cannot have 2 arguments with different rotations') -+ if aarmux is not None: -+ awrot = (awrot + aadrot) % 16 -+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 -+ if (awrot != 0) or awrot_r5: -+ asm_error('rotate not allowed on add write') -+ if marmux is not None: -+ mwrot = (mwrot + madrot) % 16 -+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 -+ -+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) -+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) -+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( -+ [aarpack, abrpack, marpack, mbrpack], -+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], -+ aop == AOP_FTOI, -+ [awpack, mwpack], -+ [afloatw, mop == MOP_FMUL]) -+ if forcebs[0]: -+ aarmux = RMUX_B -+ if forcebs[1]: -+ abrmux = RMUX_B -+ if forcebs[2]: -+ marmux = RMUX_B -+ if forcebs[3]: -+ mbrmux = RMUX_B -+ -+ # extend nops to 3 operands -+ if aop == AOP_NOP: -+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC -+ if mop == MOP_NOP: -+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 
0, RMUX_AC, 0, RMUX_AC -+ -+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) -+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): -+ if forcerafloat: -+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand -+ # instead of duplicating the 2nd operand, take the ra operand from -+ # the mul op thus forcing the ra value to be considered a float for -+ # the purposes of unpacking -+ if marmux == RMUX_A: -+ abraddr, abrmux = maraddr, marmux -+ else: -+ assert mbrmux == RMUX_A -+ abraddr, abrmux = mbraddr, mbrmux -+ else: -+ abraddr, abrmux = aaraddr, aarmux -+ else: -+ assert not forcerafloat # can only forcerafloat if we have an unused operand -+ -+ # handle write addrs -+ if (awmux == mwmux) and (awmux != WMUX_ANY): -+ asm_error('add/mul ops not allowed to write to same regfile') -+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A) -+ -+ # handle branch -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ # check setf -+ if asf: -+ asm_error('setf not allowed on bra/brr') -+ -+ # check pack/unpack -+ if (pack != 0) or (unpack != 0): -+ asm_error('pack/unpack not allowed with bra/brr') -+ -+ # handle read address -+ if aarmux == RMUX_LABEL: -+ if (aop == AOP_BRA) and aaraddr[1]: -+ asm_warning('bra with rel label') -+ if (aop == AOP_BRR) and (not aaraddr[1]): -+ asm_warning('brr with abs label') -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ if aarmux == RMUX_ANY: -+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A -+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): -+ asm_error('branch destination must be either label, immediate, or from regfile a') -+ if aarmux == RMUX_IMM: -+ imm = aaraddr -+ raddr = 0 # can't use RADDR_NOP -+ elif aarmux == RMUX_A: -+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): -+ asm_error('rotation of read from regfile a not allowed with branch') -+ if aop == AOP_BRR: -+ asm_warning('brr with ra') -+ imm = 0 -+ raddr = aaraddr[0] -+ else: -+ assert 0 -+ -+ # check mul op is nop -+ if mop != MOP_NOP: -+ asm_error('mul op not allowed with branch') -+ -+ # check sig -+ if sig != SIG_NORMAL: -+ asm_error('no signal allowed with branch') -+ -+ if raddr >= 32: -+ asm_error('can only branch to register locations in physical regfile') -+ if raddr & 1: -+ asm_warning('branch instruction will destroy flags (see hw-2780)') -+ -+ # construct branch instruction -+ prog.append((imm, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), -+ line, annots)) -+ -+ return -+ -+ # use COND_NEVER when possible (might save power / allow mul setf) -+ if not dict(annots).get('preserve_cond', 0): -+ if (awaddr == WADDR_NOP) and (not asf): -+ acond = COND_NEVER -+ if (mwaddr == WADDR_NOP) and (not msf): -+ mcond = COND_NEVER -+ -+ # attempt to convert movs to ldi -+ if (# no mul setf -+ (not msf) and -+ # ops must either be nop or mov of sema/label/imm/immv -+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ # but we don't want 2 nops -+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and -+ # if both ops are movs, srcs must be identical -+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and -+ # no signal -+ (sig == SIG_NORMAL)): -+ # make sure aarmux/aaraddr contains the value -+ if aop != AOP_MOV: -+ aarmux = marmux 
-+ aaraddr = maraddr -+ -+ # convert immediate -+ if aarmux == RMUX_SEMA: -+ ldi_mode = LDI_SEMA -+ elif aarmux == RMUX_LABEL: -+ ldi_mode = LDI_32 -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ elif aarmux == RMUX_IMMV: -+ signed, unsigned = True, True -+ imm = 0 -+ for i, elem in enumerate(aaraddr): -+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): -+ signed = False -+ if elem not in (0, 1, 2, 3): -+ unsigned = False -+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) -+ if not (signed or unsigned): -+ asm_error('can\'t encode vector immediate') -+ if signed: -+ ldi_mode = LDI_EL_SIGNED -+ else: -+ ldi_mode = LDI_EL_UNSIGNED -+ aaraddr, aarmux = imm, RMUX_IMM -+ elif aarmux == RMUX_IMM: -+ ldi_mode = LDI_32 -+ else: -+ assert 0 -+ -+ # construct ldi instruction -+ prog.append((aaraddr, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), -+ line, annots)) -+ -+ return -+ -+ # convert movs to alu ops -+ if aop == AOP_MOV: -+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): -+ aop = AOP_XOR -+ aaraddr, aarmux = 0, RMUX_AC -+ abraddr, abrmux = 0, RMUX_AC -+ else: -+ aop = AOP_OR -+ abraddr, abrmux = aaraddr, aarmux -+ if mop == MOP_MOV: -+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): -+ mop = MOP_V8SUBS -+ maraddr, marmux = 0, RMUX_AC -+ mbraddr, mbrmux = 0, RMUX_AC -+ else: -+ mop = MOP_V8MIN -+ mbraddr, mbrmux = maraddr, marmux -+ -+ # normal alu instruction... -+ -+ # handle setf -+ if asf and (aop == AOP_NOP): -+ asm_error('nop.setf is not allowed in add pipe') -+ if msf and (mop == MOP_NOP): -+ asm_warning('nop.setf, really?') -+ if (aop == AOP_NOP) or (acond == COND_NEVER): -+ sf = msf -+ else: -+ if msf: -+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') -+ sf = asf -+ -+ # handle read addrs -+ raddr_a = None -+ raddr_b = None -+ immb = False -+ arot_r5 = False -+ muxes = [0, 0, 0, 0] -+ if mwrot != 0: -+ raddr_b = 48 + mwrot -+ immb = True -+ if mwrot_r5 and have_am: -+ raddr_b = 48 -+ immb = True -+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last -+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): -+ if f(rmux): -+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) -+ add_a, add_b, mul_a, mul_b = muxes -+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): -+ # some output elements might not be as expected -+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): -+ bad_elems = 0xffff -+ else: -+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 -+ if mwrot > 12: -+ bad_elems ^= 0xffff -+ bad_elems &= dict(annots).get('mul_used', 0xffff) -+ if not msf: -+ if mwaddr == WADDR_NOP: -+ # not writing anywhere and not setting flags. no elements used -+ bad_elems = 0 -+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or -+ ((not ws) and (mwaddr == 37))): -+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ -+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. -+ # only use element 0 -+ bad_elems &= 0x0001 -+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or -+ ((not ws) and (mwaddr == 42))): -+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting -+ # flags. 
only use elements 0, 4, 8, and 12 -+ bad_elems &= 0x1111 -+ if bad_elems: -+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected') -+ if raddr_a is None: -+ raddr_a = RADDR_NOP -+ if raddr_b is None: -+ raddr_b = RADDR_NOP -+ if immb: -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates and signal don\'t mix') -+ sig = SIG_SMALLIMMED -+ if arot_r5 or (mwrot_r5 and (not have_am)): -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates/signal don\'t mix') -+ sig = SIG_ROTATE -+ -+ # construct instruction -+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), -+ line, annots)) -+ finally: -+ current_location = prev_location -+ -+def preprocess_passthrough(file): -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ yield line_number, line -+ -+def asm_file(sets, location, filename, preprocess = None): -+ global current_dir, current_location -+ -+ if filename is None: -+ location = '' -+ file = sys.stdin -+ -+ prev_dir = current_dir -+ else: -+ filename = os.path.normpath(os.path.join(current_dir, filename)) -+ -+ try: -+ file = open(filename) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while opening file %s' % filename) -+ -+ prev_dir = current_dir -+ current_dir = os.path.dirname(filename) -+ -+ prev_location = current_location -+ current_location = location -+ -+ if preprocess is None: -+ preprocess = preprocess_passthrough -+ -+ try: -+ for line_number, line in preprocess(file): -+ # strip off comments and whitespace -+ line = line.split('#')[0].strip() -+ if line == '': -+ continue -+ -+ asm_line(sets, '%s: %d' % (current_location, line_number), line) -+ finally: -+ current_dir = prev_dir -+ current_location = prev_location -+ -+def asm_end_prog(): -+ # check we aren't in a multi-line construct (eg .macro or .rep) -+ if construct != None: -+ asm_error({ -+ CONSTRUCT_MACRO: '.macro without .endm', -+ CONSTRUCT_IF: '.if/.elif without .endif', -+ CONSTRUCT_ELSE: '.else without .endif', -+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) -+ -+ # check no warnings level back to 0 -+ if nwarn_level != 0: -+ asm_error('.pushnwarn without .popnwarn') -+ -+ # flush queued up data -+ asm_flush_prog_data() -+ -+ # fixup all the label references we can -+ for pc in xrange(len(prog)): -+ if isinstance(prog[pc][0], tuple): -+ location, label, rel, offset = prog[pc][0] -+ if label[0].isdigit(): -+ label_pcs = labels.get(label[:-1], []) -+ if label[-1] == 'b': -+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] -+ else: -+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] -+ if label_pcs == []: -+ asm_error('search for label reached begin/end of file', location = location) -+ imm = label_pcs[0] -+ elif label in labels: -+ imm = labels[label] -+ elif (':' + label) in labels: -+ imm = labels[':' + label] -+ elif external_link: -+ continue # let the external linker deal with it -+ else: -+ asm_error('undefined label', location = location) -+ imm = (imm * 8) + offset -+ if rel: -+ imm -= (pc + 4) * 8 # relative to instruction after delay slots -+ imm &= (1 << 32) - 1 -+ else: -+ if not external_link: -+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) -+ imm = (location, label, rel, offset, imm) -+ prog[pc] = (imm,) + prog[pc][1:] -+ -+def asm_init(): -+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level -+ -+ current_dir = os.getcwd() -+ current_location = '' -+ prog = [] -+ prog_data = [] -+ macros = { -+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), -+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} -+ labels = {} -+ construct = None -+ construct_stack = [] -+ nwarn_level = 0 -+ -+def asm_reset_prog(): -+ global prog, labels -+ -+ prog = [] -+ labels = {} -+ -+############################################################################### -+# dumping -+############################################################################### -+ -+def print_lines(lines): -+ for line in lines: -+ print line -+ -+class dumper_t: -+ def external_link(self): return False -+ def begin(self): pass -+ def label(self, pc, name): pass -+ def line(self, pc, ls, ms, line, annots, first): pass -+ def end(self): pass -+ def sets(self, sets): pass -+ def direct(self, line): pass -+ -+class clif_dumper_t(dumper_t): -+ def __init__(self): -+ self.annot_mode = 0 -+ -+ def external_link(self): -+ return True -+ -+ def parse_annot_mode(self, line): -+ l = line.split(',') -+ self.annot_mode = int(l[0]) -+ if self.annot_mode not in (0, 1, 2): -+ asm_error('bad annot mode') -+ if self.annot_mode == 2: -+ if len(l) != 2: -+ asm_error('expected buffer name') -+ self.annot_name = l[1].strip() -+ self.annot_offset = 0 -+ elif len(l) != 1: -+ asm_error('unexpected comma') -+ -+ def label(self, pc, name): -+ if (self.annot_mode != 1) and (name[0] == ':'): -+ if self.annot_mode == 2: -+ name = name + '_annotations' -+ print '@label %s' % name[1:] -+ else: -+ print '// :%s' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annot_mode == 0: -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog) -+ else: -+ location, label, rel, offset = ls -+ if rel: -+ asm_error('relative external label references not allowed in this mode', location = location) -+ ls = '[%s + %d]' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '%s 0x%08x // %s' % (ls, ms, line) -+ elif self.annot_mode == 1: -+ print '// %s' % line -+ for annot in annots: -+ print '0x%08x 0x%08x // %s' % ({ -+ # todo: would rather not have these hard coded -+ 'mul_used': 1, -+ 'preserve_cond': 2, -+ 'geomd_open': 3, -+ 'geomd_i': 4, -+ 'geomd_tris_clear': 5, -+ 'geomd_verts': 6, -+ 'geomd_tris_add': 7, -+ 'geomd_tris_set_center': 8, -+ 'geomd_region_clear': 9, -+ 'geomd_region_set': 10, -+ 'geomd_images_clear': 11, -+ 'geomd_images_l': 12, -+ 'geomd_images_b': 13, -+ 'geomd_images_r': 14, -+ 'geomd_images_t': 15, -+ 'geomd_images_add_vpm': 16, -+ 'trace_4c': 17, -+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0]) -+ if len(annots) != 0: -+ print '0x00000000 // end' -+ else: -+ assert self.annot_mode == 2 -+ if len(annots) == 0: -+ print '0x00000000 // %s' % line -+ else: -+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line) -+ self.annot_offset += (len(annots) * 8) + 4 -+ -+ def direct(self, line): -+ print line -+ -+class plain_dumper_t(dumper_t): -+ def line(self, pc, ls, ms, line, annots, first): -+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line) -+ -+class c_c_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, array_name): -+ self.header_name = header_name -+ self.array_name = array_name -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines = [] -+ -+ print '#include "%s.h"' % self.header_name -+ print '' -+ print '#ifdef _MSC_VER' -+ print ' #include ' -+ print ' /* cast through uintptr_t to avoid warnings */' -+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))' -+ print '#else' -+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))' -+ print '#endif' -+ print '' -+ print '#ifdef __cplusplus' -+ print 'extern "C" { /* the types are probably wrong... 
*/' -+ print '#endif' -+ -+ def label(self, pc, name): -+ self.lines.append('// :%s' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog) -+ else: -+ location, label, rel, offset = ls -+ if rel: -+ asm_error('relative external label references not allowed in this mode', location = location) -+ if label not in self.external_labels: -+ self.external_labels.add(label) -+ print 'extern uint8_t %s[];' % label -+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) -+ -+ def end(self): -+ print '#ifdef __cplusplus' -+ print '}' -+ print '#endif' -+ print '' -+ print '#ifdef _MSC_VER' -+ print '__declspec(align(8))' -+ print '#elif defined(__GNUC__)' -+ print '__attribute__((aligned(8)))' -+ print '#endif' -+ print 'unsigned int %s[] = {' % self.array_name -+ print_lines(self.lines) -+ print '};' -+ print '#ifdef __HIGHC__' -+ print '#pragma Align_to(8, %s)' % self.array_name -+ print '#endif' -+ -+class c_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, array_name): -+ self.full_header_name = full_header_name -+ self.array_name = array_name -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ print 'extern unsigned int %s[];' % self.array_name -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) -+ -+ def end(self): -+ print '' -+ print '#endif' -+ -+class ml_c_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.header_name = header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ if self.annots: -+ self.annot_lines = [] -+ self.lines = [] -+ self.external_labels = set() -+ self.link_lines = [] -+ -+ print '#include "%s.h"' % self.header_name -+ print '#include ' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print '#include ' -+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"' -+ print '' -+ -+ def label(self, pc, name): -+ self.lines.append('// :%s' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annots: -+ if len(annots) == 0: -+ self.annot_lines.append('NULL,') -+ else: -+ print 'static unsigned int const annotations_%d[] = {' % pc -+ for annot in annots: -+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]) -+ print ' SIMPENROSE_SHADER_ANNOTATION_END};' -+ print '' -+ self.annot_lines.append('annotations_%d,' % pc) -+ if isinstance(ls, tuple): -+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2)) -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog)) -+ else: -+ location, label, rel, offset = ls -+ self.external_labels.add(label) -+ if rel: -+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8)) -+ else: -+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset)) -+ ls = '0xdeadbeef' -+ else: -+ ls = '0x%08x' % ls -+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc 
* 8, ls, ms, line)) -+ -+ def end(self): -+ if self.annots: -+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name -+ print_lines(self.annot_lines) -+ print '};' -+ print '#endif' -+ print '' -+ print 'static unsigned int const array[] = {' -+ print_lines(self.lines) -+ print '};' -+ print '' -+ print 'void %s_link(void *p_in, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' )' -+ print '{' -+ print ' unsigned int *p = (unsigned int *)p_in;' -+ print ' unsigned int i;' -+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() -+ print ' p[i] = array[i];' -+ print ' }' -+ print_lines(self.link_lines) -+ print '}' -+ -+class ml_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.full_header_name = full_header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines_n = 0 -+ -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name -+ print '#endif' -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) -+ print '#endif' -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple) and (len(ls) != 5): -+ self.external_labels.add(ls[1]) -+ self.lines_n += 1 -+ -+ def end(self): -+ print '' -+ print 'extern void %s_link(void *p, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' );' -+ print '' -+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) -+ print '' -+ print '#endif' -+ -+def print_lines_lc(lines): -+ for line in lines: -+ print '%s \\' % line -+ -+def print_groups_lc(groups): -+ first = True -+ for group in groups: -+ if first: -+ print '{ \\' -+ else: -+ print ', { \\' -+ print_lines_lc(group) -+ print '} \\' -+ first = False -+ -+class inline_c_dumper_t(dumper_t): -+ def __init__(self, annots): -+ self.annots = annots -+ self.iteration = False -+ -+ def begin_iteration(self): -+ assert not self.iteration -+ self.iteration = True -+ self.iteration_lines = [] -+ if self.annots: -+ self.iteration_annot_lines = [] -+ self.annot_arrs = [] -+ -+ def end_iteration(self): -+ assert self.iteration -+ self.iteration = False -+ print '%d, \\' % self.iteration_n -+ if self.annots: -+ print '( \\' -+ print_groups_lc(self.iteration_lines) -+ if self.annots: -+ print '), ( \\' -+ print_groups_lc(self.iteration_annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def begin(self): -+ self.n = 0 -+ self.lines = [] -+ if self.annots: -+ self.annot_lines = [] -+ if not self.iteration: -+ self.annot_arrs = [] -+ -+ def label(self, pc, name): -+ self.lines.append('/* :%s */' % name) -+ if self.annots: -+ self.annot_lines.append('/* :%s */' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ self.n += 1 -+ if first: -+ prefix = '' -+ else: -+ prefix = ', ' -+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) -+ if self.annots: -+ if len(annots) == 
0: -+ a = 'NULL' -+ else: -+ a = 'annotations_%d' % len(self.annot_arrs) -+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] -+ for annot in annots: -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') -+ self.annot_arrs.append(annot_arr) -+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) -+ -+ def end(self): -+ if self.iteration: -+ if len(self.iteration_lines) == 0: -+ self.iteration_n = self.n -+ elif self.iteration_n != self.n: -+ asm_error('number of instructions differs between iterations') -+ self.iteration_lines.append(self.lines) -+ if self.annots: -+ self.iteration_annot_lines.append(self.annot_lines) -+ else: -+ if self.annots: -+ print '( \\' -+ print_lines_lc(self.lines) -+ if self.annots: -+ print '), ( \\' -+ print_lines_lc(self.annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def direct(self, line): -+ print line -+ -+class asvc_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '.align 8' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '%s::' % name[1:] -+ else: -+ print '%s:' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ location, label, rel, offset = ls[:4] -+ if rel: -+ ls = '%s + %d - (. + 32)' % (label, offset) -+ else: -+ ls = '%s + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) -+ -+def is_ra_or_rb(val): -+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) -+ -+class aliases_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef JUST_DQASM_ARGS' -+ -+ def label(self, pc, name): -+ if not name[0].isdigit(): -+ if name[0] == ':': -+ name = name[1:] -+ print '"bs%s", "bs%x",' % (name, pc * 8) -+ print '"bu%s", "bu%x",' % (name, pc * 8) -+ -+ def end(self): -+ print '#endif' -+ -+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm -+ def sets(self, sets): -+ dqasm_args = [] -+ print '#ifndef JUST_DQASM_ARGS' -+ for name in sets: -+ if is_ra_or_rb(sets[name]): -+ dqasm_args.append('-r%s=%s' % (sets[name], name)) -+ print '"%s", "%s",' % (name, sets[name]) -+ elif isinstance(sets[name], list): -+ for i, val in enumerate(sets[name]): -+ if is_ra_or_rb(val): -+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) -+ print '"%s[%d]", "%s",' % (name, i, val) -+ print '#endif' -+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) -+ -+def dump(dumper): -+ if (len(prog) != 0) or (len(labels) != 0): -+ dumper.begin() -+ -+ sorted_labels = [] -+ for name in labels: -+ if name[0].isdigit(): -+ for pc in labels[name]: -+ sorted_labels.append((pc, name)) -+ else: -+ sorted_labels.append((labels[name], name)) -+ sorted_labels.sort(reverse = True) -+ -+ first = True -+ for pc in xrange(len(prog)): -+ ls, ms, line, annots = prog[pc] -+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): -+ dumper.label(*sorted_labels.pop()) -+ dumper.line(pc, ls, ms, line, annots, first) -+ first = False -+ for sorted_label in sorted_labels: -+ assert sorted_label[0] == len(prog) -+ dumper.label(*sorted_label) -+ -+ dumper.end() -+ -+############################################################################### -+# preprocessing -+############################################################################### -+ -+def preprocess_inline_c(dumper): -+ def preprocess(file): -+ ls = None -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ while True: -+ if ls is None: -+ l = line.split('%[', 1) -+ if len(l) == 1: -+ dumper.direct(l[0].rstrip()) -+ break -+ dumper.direct('%s \\' % l[0].rstrip()) -+ line = l[1] -+ ls = [] -+ else: -+ l = line.split('%]', 1) -+ ls.append((line_number, l[0])) -+ if len(l) == 1: -+ break -+ line = l[1] -+ l = ls[-1][1].split('%|', 1) -+ if len(l) == 1: -+ for l_number, l in ls: -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ else: -+ ls[-1] = (ls[-1][0], l[0]) -+ if hasattr(dumper, 'begin_iteration'): -+ dumper.begin_iteration() -+ for repls in l[1].split('%,'): -+ repls = [repl.strip() for repl in repls.split('%/')] -+ for l_number, l in ls: -+ for i, repl in enumerate(repls): -+ l = l.replace('%' + str(i), repl) -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if hasattr(dumper, 'end_iteration'): -+ dumper.end_iteration() -+ ls = None -+ return preprocess -+ -+def preprocess_clif(dumper): -+ def preprocess(file): -+ in_asm = False -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ if in_asm: -+ if line.strip() == '%]': -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ in_asm = False -+ else: -+ yield line_number, line -+ else: -+ if line.strip() == '%[': -+ in_asm = True -+ elif (line[:1] == '%') and (line[:2] != '%@'): -+ yield line_number, line[1:] -+ else: -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if line[:2] == '%@': -+ if hasattr(dumper, 'parse_annot_mode'): -+ dumper.parse_annot_mode(line[2:]) -+ else: -+ dumper.direct(line.rstrip()) -+ return preprocess -+ -+############################################################################### -+# main -+############################################################################### ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if 
stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def __lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return s1[:i+1] + +def main(): -+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 -+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate ++ global flog + -+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work ++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" ++To blank the screen before starting use "xdg-screensaver activate" ++(For some reason this doesn't seem to work from within python). 
++""")
+
++    argp.add_argument("streams", nargs='*')
++    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++    argp.add_argument("--csv_in", help="CSV input filename")
++    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
+
-+    # parse command line
-+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <file>')
-+    parser.add_option('-m', '--mode', dest = 'mode',
-+        help = '<mode> should be clif, plain, ' +
-+        'c_c:<header name>,<full header name>,<name>, ' +
-+        'c_h:<header name>,<full header name>,<name>, ' +
-+        'ml_c:<header name>,<full header name>,<name>[,annots], ' +
-+        'ml_h:<header name>,<full header name>,<name>[,annots], ' +
-+        'inline_c[:annots], asvc, or aliases[:inline_c]', metavar = '<mode>')
-+    parser.add_option('-t', '--target', dest = 'target',
-+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
-+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
-+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
-+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
-+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
-+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
-+    options, args = parser.parse_args()
-+    if len(args) == 0:
-+        filename = None
-+    elif len(args) == 1:
-+        filename = args[0]
-+    else:
-+        parser.print_help()
-+        sys.exit(-1)
++
++    args = argp.parse_args()
+
-+    # handle mode
-+    mode = options.mode or 'clif' # assume clif if no mode specified
-+    if mode == 'clif':
-+        dumper = clif_dumper_t()
-+        preprocess = preprocess_clif(dumper)
-+    elif mode == 'plain':
-+        dumper = plain_dumper_t()
-+        preprocess = None
-+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
-+        mode_options = mode[4:].split(',')
-+        if len(mode_options) != 3:
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
-+        preprocess = None
-+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
-+        mode_options = mode[5:].split(',')
-+        if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')):
-+            asm_error('badly formatted mode on command line')
-+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
-+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
-+        preprocess = None
-+    elif mode == 'inline_c':
-+        dumper = inline_c_dumper_t(False)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'inline_c:annots':
-+        dumper = inline_c_dumper_t(True)
-+        preprocess = preprocess_inline_c(dumper)
-+    elif mode == 'asvc':
-+        dumper = asvc_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases':
-+        dumper = aliases_dumper_t()
-+        preprocess = None
-+    elif mode == 'aliases:inline_c':
-+        dumper = aliases_dumper_t()
-+        preprocess = preprocess_inline_c(dumper)
-+    else:
-+        asm_error('invalid mode')
-+    external_link = dumper.external_link()
++
+
-+    # handle target
-+    target = options.target or 'b0' # assume b0 if no target specified
-+    if target == 'a0':
-+        have_sema = False
-+        have_am = False
-+        mulw_rotate = False
-+        have_lthrsw = False
-+    elif target == 'b0':
-+        have_sema = True
-+        have_am = True
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    elif target == 'hera':
-+        have_sema = True
-+        have_am = False
-+        mulw_rotate = True
-+        have_lthrsw = True
-+    else:
-+        asm_error('invalid target')
-+    if have_am:
-+        sigs['loadam'] = SIG_LOADAM
-+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
-+    if have_lthrsw:
-+        sigs['lthrsw'] = SIG_LTHRSW
-+        del sigs['int']
-+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
++    csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
++    csv_out.writeheader()
+
-+    # handle misc options
-+    allow_xor_0 = options.allow_xor_0
-+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
-+    warnings_are_errors = options.warnings_are_errors
-+    disable_warnings = options.disable_warnings
++
++    stats_in = {}
++    if args.csv_in != None:
++        with open(args.csv_in, 'r', newline='') as f_in:
++            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
+
-+    # make options visible to asm
-+    arg_defs['mode'] = mode
-+    arg_defs['target'] = target
++    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
+
-+    # arg_defs all setup at this point
-+    sets = arg_defs.copy() # todo: see arg_eval
++    streams = args.streams
++    if not streams:
++        if not stats_in:
++            print ("No source streams specified")
++            return 1
++        prefix = "" if args.prefix == None else args.prefix
++        streams = [k for k in stats_in]
++    elif args.prefix != None:
++        prefix = args.prefix
++    else:
++        prefix = streams[0]
++        for f in streams[1:]:
++            prefix = common_prefix(prefix, f)
++        pp = prefix.rpartition(os.sep)
++        prefix = pp[0] + pp[1]
++        streams = [s[len(prefix):] for s in streams]
+
-+    # handle command line sets
-+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
-+    for options_set in options.sets:
-+        m = re_options_set.match(options_set)
-+        if not m:
-+            asm_error('badly formatted set on command line')
-+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
++    for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++        print ("====", f)
++
++        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++        for i in range(3):
++            t = tstats.time_file(f, prefix)
++            print ("...", t.times_str())
++            if t0 > t:
++                t0 = t
++
++        if t0.name in stats_in:
++            pstat = stats_in[t0.name]
++            print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++        csv_out.writerow(t0.dict())
++
++        print ()
++
++    return 0
+
-+    # assemble input file and dump
-+    asm_file(sets, filename, filename, preprocess)
-+    asm_end_prog()
-+    dump(dumper)
-+    for name in arg_defs: # todo: see arg_eval
-+        del sets[name]
-+    dumper.sets(sets)
+
+if __name__ == '__main__':
-+    main()
++    exit(main())
++
+diff --git a/pi-util/make_array.py b/pi-util/make_array.py
+new file mode 100755
+index 0000000000..864fa5e704
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,19 @@
++#!/usr/bin/env python
++
++# Usage
++# make_array file.bin
++# Produces file.h with array of bytes.
++#
++import sys
++for file in sys.argv[1:]:
++    prefix,suffix = file.split('.')
++    assert suffix=='bin'
++    name=prefix.split('/')[-1]
++    print 'Converting',file
++    with open(prefix+'.h','wb') as out:
++        print >>out, 'static const unsigned char',name,'[] = {'
++        with open(file,'rb') as fd:
++            for byte in fd.read():
++                print >>out, '%d,' % ord(byte)
++        print >>out,'};'
++
diff --git a/pi-util/qem.sh b/pi-util/qem.sh
-new file mode 100644
-index 0000000..47dd071
+new file mode 100755
+index 0000000000..5ce2eeaf72
--- /dev/null
+++ b/pi-util/qem.sh
@@ -0,0 +1,9 @@
+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
-+QASM=python\ pi-util/qasm.py
++QASM=python\ ../local/bin/qasm.py
+SRC_FILE=libavcodec/rpi_shader.qasm
+DST_BASE=shader
+
@@ -21696,101 +30228,9 @@ index 0000000..47dd071
+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+
-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
-new file mode 100755
-index 0000000..6a9a33f
---- /dev/null
-+++ b/pi-util/rebase_liblinks.py
-@@ -0,0 +1,37 @@
-+#!/usr/bin/env python
-+
-+import os, sys
-+from stat import *
-+
-+def walktree(top, callback, n, prefix):
-+    '''recursively descend the directory tree rooted at top,
-+    calling the callback function for each regular file'''
-+
-+    for f in os.listdir(top):
-+        pathname = os.path.join(top, f)
-+        mode = os.lstat(pathname).st_mode
-+        if S_ISDIR(mode):
-+            # It's a directory, recurse into it
-+            walktree(pathname, callback, n+1, prefix)
-+        elif S_ISLNK(mode):
-+            # It's a file, call the callback function
-+            callback(pathname, os.readlink(pathname), n, prefix)
-+
-+def visitfile(file, linkname, n, prefix):
-+    if (linkname.startswith(prefix + 'lib/')):
-+        newlink = "../" * n + linkname[len(prefix):]
-+        print 'relinking', file, "->", newlink
-+        os.remove(file)
-+        os.symlink(newlink, file)
-+
-+if __name__ == '__main__':
-+    argc = len(sys.argv)
-+    if argc == 2:
-+        walktree(sys.argv[1], visitfile, 0, "/")
-+    elif argc == 3:
-+        walktree(sys.argv[1], visitfile, 0, sys.argv[2])
-+    else:
-+        print "rebase_liblinks.py <dir> [<prefix>]"
-+
-+
-+
-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
-new file mode 100755
-index 0000000..d8bdd91
---- /dev/null
-+++ b/pi-util/syncroot.sh
-@@ -0,0 +1,43 @@
-+set -e
-+
-+if [ "$1" == "" ]; then
-+ echo Usage: $0 \<src_dir\> [\<rootname\>]
-+ echo src_dir is a source for rsync so may contain m/c name.
-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing -+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 -+ exit 1 -+fi -+ -+SYSROOT_NAME=$2 -+if [ "$SYSROOT_NAME" == "" ]; then -+ SYSROOT_NAME=raspian_jessie_pi1 -+fi -+ -+DST_ROOT=`pwd` -+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot -+SRC=$1 -+ -+echo Sync src: $SRC -+echo Sync dest: $DST -+ -+mkdir -p $DST/lib -+mkdir -p $DST/opt/vc/include -+mkdir -p $DST/usr/lib/pkgconfig -+mkdir -p $DST/usr/bin -+mkdir -p $DST/usr/share -+ -+#### MUST NOT include /opt/vc/include/*GL* -+# Creates conflicts with GL includes inside Chrome -+ -+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib -+rsync -rl $SRC/opt/vc/lib $DST/opt/vc -+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include -+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib -+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib -+rsync -rl $SRC/usr/include $DST/usr -+ -+pi-util/rebase_liblinks.py $DST -+ -+ diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py -new file mode 100644 -index 0000000..5935a11 +new file mode 100755 +index 0000000000..5935a11ca5 --- /dev/null +++ b/pi-util/v3dusage.py @@ -0,0 +1,128 @@