diff --git a/packages/graphics/bcm2835-driver/package.mk b/packages/graphics/bcm2835-driver/package.mk index 7ee0435910..3e196fe780 100644 --- a/packages/graphics/bcm2835-driver/package.mk +++ b/packages/graphics/bcm2835-driver/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="bcm2835-driver" -PKG_VERSION="756dd85" +PKG_VERSION="ab5eb99" PKG_ARCH="any" PKG_LICENSE="nonfree" PKG_SITE="http://www.broadcom.com" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.asap/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.asap/package.mk index ee2763dac1..dce3fa3f07 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.asap/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.asap/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.asap" -PKG_VERSION="e56a821" +PKG_VERSION="6c13ee6" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk index 4653bc1231..e12f6841fa 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.upse/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.upse" -PKG_VERSION="23a5430" +PKG_VERSION="de58ded" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk index f5d3354da1..b367a0d677 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.usf/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.usf" -PKG_VERSION="c7fa708" +PKG_VERSION="99c17c9" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/audiodecoder.wsr/package.mk b/packages/mediacenter/kodi-binary-addons/audiodecoder.wsr/package.mk index c236d52311..1212c708a7 100644 --- a/packages/mediacenter/kodi-binary-addons/audiodecoder.wsr/package.mk +++ b/packages/mediacenter/kodi-binary-addons/audiodecoder.wsr/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="audiodecoder.wsr" -PKG_VERSION="746fcbb" +PKG_VERSION="ac3e274" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk index f8e9d75542..4051b21538 100644 --- a/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk +++ b/packages/mediacenter/kodi-binary-addons/pvr.vuplus/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="pvr.vuplus" -PKG_VERSION="25c4883" +PKG_VERSION="c1e6a22" PKG_REV="1" PKG_ARCH="any" PKG_LICENSE="GPL" diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk index edfea75cd2..51c7c2a07f 100644 --- a/packages/mediacenter/kodi/package.mk +++ b/packages/mediacenter/kodi/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="kodi" 
-PKG_VERSION="7fc6da0" +PKG_VERSION="5bd45ab" PKG_ARCH="any" PKG_LICENSE="GPL" PKG_SITE="http://www.kodi.tv" diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk index 044bf59c51..43d136a1b0 100644 --- a/packages/multimedia/ffmpeg/package.mk +++ b/packages/multimedia/ffmpeg/package.mk @@ -18,7 +18,7 @@ PKG_NAME="ffmpeg" # Current branch is: release/3.1-xbmc -PKG_VERSION="f58e5b9" +PKG_VERSION="9702d0d" PKG_ARCH="any" PKG_LICENSE="LGPLv2.1+" PKG_SITE="https://ffmpeg.org" diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch index 96cfa9ae30..abd1499a6d 100644 --- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch +++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-pfcd_hevc_optimisations.patch @@ -1,8 +1,16 @@ diff --git a/.gitignore b/.gitignore -index 524fb73..305632b 100644 +index 524fb73c16..bcc983739f 100644 --- a/.gitignore +++ b/.gitignore -@@ -23,6 +23,7 @@ +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -23,6 +24,7 @@ .\#* /.config /.version @@ -11,7 +19,7 @@ index 524fb73..305632b 100644 /ffplay /ffprobe diff --git a/ffmpeg.c b/ffmpeg.c -index 9ffd833..e2474e5 100644 +index cdded8673f..5eee7dfd40 100644 --- a/ffmpeg.c +++ b/ffmpeg.c @@ -23,6 +23,11 @@ @@ -20,13 +28,21 @@ index 9ffd833..e2474e5 100644 +#ifdef RPI +#define RPI_DISPLAY -+#define RPI_ZERO_COPY ++#define RPI_DISPLAY_ALL 0 +#endif + #include "config.h" #include #include -@@ -66,6 +71,25 @@ +@@ -42,6 +47,7 @@ + #include "libavformat/avformat.h" + #include "libavdevice/avdevice.h" + #include "libswresample/swresample.h" ++#include "libavutil/atomic.h" + #include "libavutil/opt.h" + #include "libavutil/channel_layout.h" + #include "libavutil/parseutils.h" +@@ -66,6 +72,25 @@ # include "libavfilter/buffersrc.h" # include "libavfilter/buffersink.h" @@ -38,21 +54,21 @@ index 9ffd833..e2474e5 100644 +#include +#include +#include ++#include +#include +#include +#include +#include +#pragma GCC diagnostic pop -+#ifdef RPI_ZERO_COPY +#include "libavcodec/rpi_qpu.h" -+#endif ++#include "libavutil/rpi_sand_fns.h" +#include "libavcodec/rpi_zc.h" +#endif + #if HAVE_SYS_RESOURCE_H #include #include -@@ -158,6 +182,182 @@ static int restore_tty; +@@ -158,6 +183,241 @@ static int restore_tty; static void free_input_threads(void); #endif @@ -60,39 +76,36 @@ index 9ffd833..e2474e5 100644 + +#define NUM_BUFFERS 4 + -+static MMAL_COMPONENT_T* rpi_display = NULL; -+static MMAL_POOL_T *rpi_pool = NULL; -+static volatile int rpi_display_count = 0; + -+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) ++typedef struct rpi_display_env_s ++{ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ enum AVPixelFormat avfmt; ++} rpi_display_env_t; ++ ++static rpi_display_env_t * rpi_display_env = NULL; ++ ++ ++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) +{ + MMAL_POOL_T* pool; -+ size_t i; -+ size_t size = (w*h*3)/2; -+#ifdef RPI_ZERO_COPY + mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + assert(pool); -+#else -+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); -+ -+ for (i = 0; i < NUM_BUFFERS; ++i) -+ { -+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; -+ char * bufPtr = buffer->data; -+ memset(bufPtr, i*30, w*h); -+ memset(bufPtr+w*h, 128, (w*h)/2); -+ } -+#endif + + return pool; +} + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { -+#ifdef RPI_ZERO_COPY ++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + av_rpi_zc_unref(buffer->user_data); -+ --rpi_display_count; -+#endif ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + mmal_buffer_header_release(buffer); +} + @@ -100,9 +113,12 @@ index 9ffd833..e2474e5 100644 + mmal_buffer_header_release(buffer); +} + -+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) ++#define DISPLAY_PORT_DEPTH 4 ++ ++static rpi_display_env_t * ++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) +{ -+ MMAL_COMPONENT_T* display; ++ MMAL_STATUS_T err; + MMAL_DISPLAYREGION_T region = + { + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, @@ -111,51 +127,113 @@ index 9ffd833..e2474e5 100644 + .fullscreen = 0, + .dest_rect = {x, y, w, h} + }; ++#if RPI_ZC_SAND_8_IN_10_BUF ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; ++#else ++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; ++#endif + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); ++ rpi_display_env_t * de; ++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); + -+ bcm_host_init(); // TODO is this needed? -+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); -+ assert(display); ++ bcm_host_init(); // Needs to be done by someone... + -+ mmal_port_parameter_set(display->input[0], ®ion.hdr); ++ if ((de = av_mallocz(sizeof(*de))) == NULL) { ++ return NULL; ++ } ++ ++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); ++ av_assert0(de->display); ++ de->port_in = de->display->input[0]; ++ ++ if (isp_req) ++ { ++ mmal_component_create("vc.ril.isp", &de->isp); ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + { -+ MMAL_ES_FORMAT_T* format = display->input[0]->format; -+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; ++ MMAL_PORT_T * const port = de->port_in; ++ MMAL_ES_FORMAT_T* const format = port->format; ++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; ++ port->buffer_num = DISPLAY_PORT_DEPTH; ++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : ++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : ++ MMAL_ENCODING_I420; + format->es->video.width = geo.stride_y; -+ format->es->video.height = geo.height_y; ++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? 
++ (h + 15) & ~15 : geo.height_y; // Magic + format->es->video.crop.x = 0; + format->es->video.crop.y = 0; + format->es->video.crop.width = w; + format->es->video.crop.height = h; -+ mmal_port_format_commit(display->input[0]); ++ mmal_port_format_commit(port); + } + -+ mmal_component_enable(display); ++ de->rpi_pool = display_alloc_pool(de->port_in); ++ mmal_port_enable(de->port_in,display_cb_input); + -+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); ++ if (isp_req) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ mmal_log_dump_port(de->port_in); ++ mmal_format_copy(port_out->format, de->port_in->format); ++ if (fmt == AV_PIX_FMT_SAND64_10) { ++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || ++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); ++ } ++ else ++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + -+ mmal_port_enable(display->input[0],display_cb_input); -+ mmal_port_enable(display->control,display_cb_control); ++ } ++ port_out->format->encoding = MMAL_ENCODING_I420; ++ mmal_log_dump_port(port_out); ++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { ++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ mmal_component_enable(de->display); ++ mmal_port_enable(de->display->control,display_cb_control); ++ de->avfmt = fmt; + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + -+ return display; ++ return de; ++ ++fail: ++ // **** Free stuff ++ return NULL; +} + -+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) ++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) +{ + MMAL_BUFFER_HEADER_T* buf; + -+ if (!display || !rpi_pool) ++ if (de == NULL) + return; + -+ if (rpi_display_count >= 3) { ++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + return; + } + -+ buf = mmal_queue_get(rpi_pool->queue); ++ buf = mmal_queue_get(de->rpi_pool->queue); + if (!buf) { + // Running too fast so drop the frame + printf("Q alloc failure\n"); @@ -165,67 +243,64 @@ index 9ffd833..e2474e5 100644 + buf->cmd = 0; + buf->offset = 0; // Offset to valid data + buf->flags = 0; -+#ifdef RPI_ZERO_COPY -+{ -+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); -+ if (fr_buf == NULL) { -+ mmal_buffer_header_release(buf); -+ return; -+ } -+ -+ buf->user_data = fr_buf; -+ buf->data = av_rpi_zc_vc_handle(fr_buf); -+ buf->offset = av_rpi_zc_offset(fr_buf); -+ buf->length = av_rpi_zc_length(fr_buf); -+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); -+#if 0 + { -+ unsigned int n; -+ for (n = 0; n < fr->width; n += 128) { -+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); ++ const AVRpiZcRefPtr 
fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); ++ if (fr_buf == NULL) { ++ mmal_buffer_header_release(buf); ++ return; + } ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); + } -+#endif -+ ++rpi_display_count; -+} -+#else -+{ -+#error YYY -+ int w = fr->width; -+ int h = fr->height; -+ int w2 = (w+31)&~31; -+ int h2 = (h+15)&~15; -+ -+ buf->length = (w2 * h2 * 3)/2; -+ buf->user_data = NULL; -+ -+ //mmal_buffer_header_mem_lock(buf); -+ memcpy(buf->data, fr->data[0], w2 * h); -+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); -+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); -+ //mmal_buffer_header_mem_unlock(buf); -+} -+#endif -+ -+ while (rpi_display_count >= 3) { ++#if RPI_DISPLAY_ALL ++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + usleep(5000); + } ++#endif + -+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + { -+ printf("** send failed: depth=%d\n", rpi_display_count); -+ display_cb_input(NULL, buf); ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ display_cb_input(de->port_in, buf); + } +} + -+static void display_exit(MMAL_COMPONENT_T* display) ++static void display_exit(rpi_display_env_t ** const pde) +{ ++ rpi_display_env_t * const de = *pde; ++ *pde = NULL; ++ ++ if (de != NULL) { +// sleep(120); -+ if (display) { -+ mmal_component_destroy(display); -+ } -+ if (rpi_pool) { -+ mmal_port_pool_destroy(display->input[0], rpi_pool); ++ ++ if (de->port_in != NULL) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { ++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); ++ } ++ ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ } ++ if (de->rpi_pool != NULL) { ++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); ++ } ++ ++ av_free(de); + } +} + @@ -235,29 +310,29 @@ index 9ffd833..e2474e5 100644 /* sub2video hack: Convert subtitles to video with alpha to insert them in filter graphs. This is a temporary solution until libavfilter gets real subtitles support. 
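The three helpers above are the whole of the render path this patch adds to ffmpeg.c: display_init() builds a video_render pipe (routed through vc.ril.isp when SAND64_10 needs a shift stage), display_frame() hands each decoded frame over by reference, and display_exit() tears the pipe down. A minimal sketch of the intended call sequence, using only names defined in this patch — `frame` and `dec_ctx` stand in for the caller's AVFrame and AVCodecContext, error handling and the #ifdef RPI_DISPLAY guards are elided, and av_rpi_zc_init() is assumed to have been called on the decoder context as the init_input_stream hunk below does:

    static rpi_display_env_t *de = NULL;  // one pipe per process, as in this patch

    // First decoded frame: size the renderer from the picture.
    if (de == NULL)
        de = display_init(frame->format, 0, 0, frame->width, frame->height);

    // Steady state: zero-copy hand-off. display_frame() drops the picture
    // itself when DISPLAY_PORT_DEPTH - 1 buffers are still queued.
    display_frame(dec_ctx, de, frame);

    // Shutdown: disable the input port, expect every buffer back, free.
    display_exit(&de);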
-@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) +@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret) avformat_close_input(&input_files[i]->ctx); av_freep(&input_files[i]); } + +#ifdef RPI_DISPLAY -+ display_exit(rpi_display); ++ display_exit(&rpi_display_env); +#endif + for (i = 0; i < nb_input_streams; i++) { InputStream *ist = input_streams[i]; -@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) +@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret) av_freep(&ist->filters); av_freep(&ist->hwaccel_device); -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + av_rpi_zc_uninit(ist->dec_ctx); +#endif avcodec_free_context(&ist->dec_ctx); av_freep(&input_streams[i]); -@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) +@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret) } term_exit(); ffmpeg_exited = 1; @@ -265,28 +340,28 @@ index 9ffd833..e2474e5 100644 } void remove_avoptions(AVDictionary **a, AVDictionary *b) -@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, +@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s, if (ost->source_index >= 0) ist = input_streams[ost->source_index]; +#ifdef RPI_DISPLAY + if (next_picture && ist != NULL) + { -+ if (!rpi_display) -+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); -+ display_frame(ist->dec_ctx, rpi_display, next_picture); ++ if (rpi_display_env == NULL) ++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); ++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + } +#endif + if (filter->inputs[0]->frame_rate.num > 0 && filter->inputs[0]->frame_rate.den > 0) duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); -@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) +@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ist->dec_ctx->opaque = ist; ist->dec_ctx->get_format = get_format; ist->dec_ctx->get_buffer2 = get_buffer; + -+#ifdef RPI_ZERO_COPY ++#ifdef RPI_DISPLAY + // Overrides the above get_buffer2 + av_rpi_zc_init(ist->dec_ctx); +#endif @@ -295,66 +370,74 @@ index 9ffd833..e2474e5 100644 av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); diff --git a/libavcodec/Makefile b/libavcodec/Makefile -index fd0d1f0..1740768 100644 +index bb28aea1e2..741aa0bdc4 100644 --- a/libavcodec/Makefile +++ b/libavcodec/Makefile -@@ -5,6 +5,12 @@ NAME = avcodec +@@ -5,6 +5,16 @@ NAME = avcodec HEADERS = avcodec.h \ avdct.h \ avfft.h \ ++ rpi_opts.h \ + rpi_qpu.h \ + rpi_shader.h \ -+ rpi_shader_cmd.h \ ++ rpi_shader_cmd.h \ ++ rpi_shader_template.h \ ++ rpi_shader_template_fn.h \ + rpi_mailbox.h \ -+ rpi_hevc_transform.h \ ++ rpi_hevc_transform8.h \ ++ rpi_hevc_transform10.h \ + rpi_zc.h \ d3d11va.h \ dirac.h \ dv_profile.h \ -@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ +@@ -43,6 +53,11 @@ OBJS = allcodecs.o \ resample.o \ resample2.o \ utils.o \ + rpi_qpu.o \ + rpi_shader.o \ ++ rpi_shader_template.o \ + rpi_mailbox.o \ + rpi_zc.o \ vorbis_parser.o \ xiph.o \ -@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h +@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h endif + -+QASM := $(SUBDIR)../pi-util/qasm.py ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std + -+ifneq ("$(wildcard $(QASM))","") ++ifneq ("$(wildcard $(QASM_PY))","") 
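+# Only regenerate the QPU shader C source/header when the qasm assembler
+# is installed under ../local/bin; otherwise the checked-in copies of
+# rpi_shader.c/.h are used as-is.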
+$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm -+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ ++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +endif + -+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h -diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c -index 54efaad..02a89c3 100644 ---- a/libavcodec/allcodecs.c -+++ b/libavcodec/allcodecs.c -@@ -667,6 +667,7 @@ void avcodec_register_all(void) - REGISTER_PARSER(H261, h261); - REGISTER_PARSER(H263, h263); - REGISTER_PARSER(H264, h264); -+ REGISTER_PARSER(H264_MVC, h264_mvc); - REGISTER_PARSER(HEVC, hevc); - REGISTER_PARSER(MJPEG, mjpeg); - REGISTER_PARSER(MLP, mlp); ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++ ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile -index a4ceca7..cafd25d 100644 +index a4ceca7f46..f8229a80e2 100644 --- a/libavcodec/arm/Makefile +++ b/libavcodec/arm/Makefile -@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ +@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ @@ -363,13 +446,15 @@ index a4ceca7..cafd25d 100644 + arm/hevcdsp_epel_neon.o \ arm/hevcdsp_idct_neon.o \ - arm/hevcdsp_qpel_neon.o ++ arm/hevcdsp_cres_neon.o \ ++ arm/hevcdsp_res16_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ arm/rv40dsp_neon.o diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h -index fdbf86b..0a3980a 100644 +index fdbf86b45e..0a3980a1ef 100644 --- a/libavcodec/arm/cabac.h +++ b/libavcodec/arm/cabac.h @@ -26,13 +26,34 @@ @@ -552,7 +637,7 @@ index fdbf86b..0a3980a 100644 #endif /* AVCODEC_ARM_CABAC_H */ diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h new file mode 100644 -index 0000000..31d3c59 +index 0000000000..31d3c59205 --- /dev/null +++ b/libavcodec/arm/hevc_cabac.h @@ -0,0 +1,491 @@ @@ -1047,9 +1132,239 @@ index 0000000..31d3c59 +#endif /* HAVE_ARMV6T2_INLINE */ + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..380d3c8d3b +--- /dev/null ++++ b/libavcodec/arm/hevc_idct_fn_neon.S +@@ -0,0 +1,224 @@ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ 
add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0!, {q8-q15} ++ vstm r0, {q8-q15} ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ mov r3, #16 ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vmov.16 q10, q8 ++ vmov.16 q11, q8 ++ vmov.16 q12, q8 ++ vmov.16 q13, q8 ++ vmov.16 q14, q8 ++ vmov.16 q15, q8 ++1: subs r3, #1 ++ vstm r0!, {q8-q15} ++ bne 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x00240053 // 36 and 83 ++ vmov.32 d0[0], r3 ++ ++ tr4_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vpush {d8-d15} ++ vld1.16 {q14, q15}, [r0] // coeffs ++ ldr r3, =0x4a // 74 ++ vmov.32 d0[0], r3 ++ ldr r3, =0x1d // 29 ++ vmov.32 d0[1], r3 ++ ldr r3, =0x37 // 55 ++ vmov.32 d1[0], r3 ++ ++ tr4_luma_shift d28, d29, d30, d31, #7 ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ ++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) ++ ++ vtrn.16 d28, d29 ++ vtrn.16 d30, d31 ++ vtrn.32 q14, q15 ++ vst1.16 {q14, q15}, [r0] ++ vpop {d8-d15} ++ bx lr ++endfunc ++ ++ ++ ++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 ++ push {r4-r8} ++ vpush {d8-d15} ++ mov r5, #16 ++ ++ adrl r3, tr4f ++ vld1.16 {d0, d1}, [r3] ++ ++ // left half ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #128 ++ //skip right half if col_limit in r1 is less than 4 ++ cmp r1, #4 ++ blt 1f ++ //right half ++ add r0, #8 ++ vld1.16 {d24}, [r0], r5 ++ vld1.16 {d25}, [r0], r5 ++ vld1.16 {d26}, [r0], r5 ++ vld1.16 {d27}, [r0], r5 ++ vld1.16 {d28}, [r0], r5 ++ vld1.16 {d29}, [r0], r5 ++ vld1.16 {d30}, [r0], r5 ++ vld1.16 {d31}, [r0], r5 ++ sub r0, #128 ++ tr8_begin d25, d27, d29, d31 ++ tr4 d24, d26, d28, d30 ++ tr8_end #7 ++ vst1.16 {d2}, [r0], r5 ++ vst1.16 {d3}, [r0], r5 ++ vst1.16 {d4}, [r0], r5 ++ vst1.16 {d5}, [r0], r5 ++ vst1.16 {d6}, [r0], r5 ++ vst1.16 {d7}, [r0], r5 ++ vst1.16 {d8}, [r0], r5 ++ vst1.16 {d9}, [r0], r5 ++ sub r0, #136 
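++ @ First transform pass complete (the right half is skipped above when
++ @ col_limit shows those columns are all zero). The second pass below
++ @ transposes each 8x4 half, transforms it again, and transposes back.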
++1: ++ // top half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ vstm r0!, {q1-q4} ++ ++ // bottom half ++ vldm r0, {q12-q15} // coeffs ++ transpose_16b_4x4 d24, d26, d28, d30 ++ transpose_16b_4x4 d25, d27, d29, d31 ++ tr8_begin d26, d30, d27, d31 ++ tr4 d24, d28, d25, d29 ++ tr8_end #(TRN_SHIFT) ++ transpose_16b_4x4 d2, d3, d4, d5 ++ transpose_16b_4x4 d6, d7, d8, d9 ++ vswp d7, d5 ++ vswp d7, d8 ++ vswp d3, d6 ++ vswp d6, d4 ++ //vstm r0, {q1-q4} ++ vst1.16 {q1-q2}, [r0] ++ add r0, #32 ++ vst1.16 {q3-q4}, [r0] ++ sub r0, #32 ++ vpop {d8-d15} ++ pop {r4-r8} ++ bx lr ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S new file mode 100644 -index 0000000..373576b +index 0000000000..373576b4cb --- /dev/null +++ b/libavcodec/arm/hevc_misc_neon.S @@ -0,0 +1,62 @@ @@ -1115,8 +1430,310 @@ index 0000000..373576b + +endfunc + +diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S +new file mode 100644 +index 0000000000..bafefd4318 +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_cres_neon.S +@@ -0,0 +1,296 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ General notes: ++@ ++@ Residual is only guaranteed to be cliped to 16 bits ++@ This means that we do need to do movul, qadd, qmovun ++@ rather than addw, qmovun (if we were clipped to 15 then we could get away ++@ with this) ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_4x4_u_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q0, q1}, [r1] ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_8x8_u_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! 
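++ @ vld2 has deinterleaved the rows: d16/d18 hold U, d17/d19 hold V.
++ @ U takes the real residual from q0/q1; V only picks up the dc_v
++ @ offset replicated in q15 (vaddw below).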
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ sub r0, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q2 ++ vqmovun.s16 d18, q1 ++ vqmovun.s16 d19, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_add_residual_16x16_u_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d16, q0 ++ vqmovun.s16 d17, q1 ++ vqmovun.s16 d18, q2 ++ vqmovun.s16 d19, q3 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_v_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vld1.16 {q2, q3}, [r1] ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_v_neon_8, export=1 ++ mov r12, #4 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {d16, d17}, [r0, :128], r2 ++ vld2.8 {d18, d19}, [r0, :128] ++ vld1.16 {q0, q1}, [r1, :256]! ++ subs r12, #1 ++ vmovl.u8 q10, d17 ++ sub r0, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d18 ++ vqmovun.s16 d17, q0 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d19, q1 ++ vqmovun.s16 d18, q3 ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ vst2.8 {d18, d19}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_v_neon_8, export=1 ++ mov r12, #16 ++ vdup.16 q15, r3 ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! 
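++ @ Mirror of the U variant: the residual in q0/q1 applies to V (q9)
++ @ while the U bytes in q8 only pick up the dc_v offset from q15.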
++ subs r12, #1 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q11, d19 ++ vaddw.u8 q2, q15, d16 ++ vaddw.u8 q3, q15, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d16, q2 ++ vqmovun.s16 d17, q3 ++ vqmovun.s16 d18, q0 ++ vqmovun.s16 d19, q1 ++ vst2.8 {q8, q9}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_4x4_c_neon_8, export=1 ++ vld1.8 {d16}, [r0, :64], r2 ++ vld1.8 {d17}, [r0, :64], r2 ++ vld1.8 {d18}, [r0, :64], r2 ++ vld1.8 {d19}, [r0, :64], r2 ++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V ++ vmovl.u8 q10, d16 ++ sub r0, r0, r2, lsl #2 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ vst1.8 {d2}, [r0, :64], r2 ++ vst1.8 {d3}, [r0, :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_8x8_c_neon_8, export=1 ++ mov r12, #8 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.8 {d16, d17}, [r0, :128] ++ vld1.16 {q0}, [r1, :128]! ++ vld1.16 {q1}, [r3, :128]! ++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst2.8 {d0, d1}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_add_residual_16x16_c_neon_8, export=1 ++ mov r12, #16 ++ add r3, r1, #(16*16*2) @ Offset to V ++1: ++ vld2.8 {q8, q9}, [r0, :256] ++ vld1.16 {q0, q1}, [r1, :256]! ++ vld1.16 {q2, q3}, [r3, :256]! 
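++ @ Both planes carry real residuals here: U rows stream from r1 and
++ @ the V block, laid out 16*16*2 bytes further on, streams from r3.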
++ subs r12, #1 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst2.8 {q0, q1}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S -index 166bddb..9bd0a42 100644 +index 166bddb104..15c4329cdb 100644 --- a/libavcodec/arm/hevcdsp_deblock_neon.S +++ b/libavcodec/arm/hevcdsp_deblock_neon.S @@ -15,7 +15,7 @@ @@ -1128,66 +1745,204 @@ index 166bddb..9bd0a42 100644 */ -@@ -31,6 +31,9 @@ +@@ -24,70 +24,238 @@ + + .macro hevc_loop_filter_chroma_start + ldr r12, [r2] +- ldr r3, [r2, #4] +- add r2, r3, r12 +- cmp r2, #0 ++ ldr r2, [r2, #4] ++ orrs r2, r12, r2, lsl #16 + it eq bxeq lr .endm +-.macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +- vshl.i16 q3, #2 +- vadd.i16 q11, q3 +- vdup.16 d0, r12 +- vdup.16 d1, r3 +- vrshr.s16 q11, q11, #3 +- vneg.s16 q12, q0 +@ Uses: d2, d4, d18, d19 +@ Returns: d2, d4 -+@ Modifies: d0-d7, d22-d25 - .macro hevc_loop_filter_chroma_body - vsubl.u8 q3, d4, d2 - vsubl.u8 q11, d18, d19 -@@ -49,6 +52,33 @@ - vqmovun.s16 d4, q2 - .endm - ++@ Modifies: d0-d7, d22-d25, r12 + -+@ Uses r2[0:7], r2[8:15] -+@ Modifies: d0-d7, d22-d25 -+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 -+ vsubl.u8 q3, \Q0, \P0 -+ vsubl.u8 q11, \P1, \Q1 -+ vshl.i16 q3, #2 -+ vadd.i16 q11, q3 ++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 ++ vsubl.u8 q0, \Q0, \P0 ++ vsubl.u8 q1, \P1, \Q1 ++ vdup.16 d4, r2 ++ lsr r2, r2, #16 ++ vshl.i16 q0, #2 ++ ldr r12, [sp, #0] @ r12 = &no_q ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] ++ vdup.16 d5, r2 + -+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) -+ vdup.16 d0, r2 -+ vmovl.u8 q0, d0 -+ vuzp.16 d0, d1 -+ -+ vrshr.s16 q11, q11, #3 -+ vneg.s16 q12, q0 ++ vrshr.s16 q0, q0, #3 ++ ldrh r12, [r12] ++ vneg.s16 q3, q2 ++ vmin.s16 q0, q0, q2 + vmovl.u8 q2, \Q0 -+ vmin.s16 q11, q11, q0 -+ vmax.s16 q11, q11, q12 -+ vaddw.u8 q1, q11, \P0 -+ vsub.i16 q2, q11 ++ vmax.s16 q0, q0, q3 ++ vaddw.u8 q1, q0, \P0 ++ vsub.i16 q2, q0 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + vqmovun.s16 \P0, q1 + vqmovun.s16 \Q0, q2 +.endm + ++@ Uses r2 (tc a;b) ++@ Modifies: q0-q3 ++@ On exit ++@ r12 (and flags) contain no_p;no_q ++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth ++ vsub.i16 q0, \Q0, \P0 ++ lsl r12, r2, #(\bit_depth - 8) ++ vsub.i16 q1, \P1, \Q1 ++ vshl.i16 q0, #2 ++ vdup.16 d4, r12 ++ lsr r12, r12, #16 ++ vadd.i16 q0, q1 ++ ldrh r3, [r3] ++ vdup.16 d5, r12 ++ ++ vrshr.s16 q0, q0, #3 ++ vneg.s16 q3, q2 ++ movw r12, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q0, q2 ++ vmax.s16 q0, q0, q3 ++ vdup.i16 q3, r12 ++ ldr r12, [sp, #0] ++ ++ vadd.i16 \P0, q0, \P0 ++ vsub.i16 \Q0, q0 ++ ++ vmov.i64 q2, #0 ++ ldrh r12, [r12] ++ vmin.s16 \P0, q3 ++ vmin.s16 \Q0, q3 ++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] ++ vmax.s16 \P0, q2 ++ vmax.s16 \Q0, q2 ++.endm ++ ++ ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v ++ vsubl.u8 q0, \Q0u, \P0u ++ vsubl.u8 q1, \Q0v, \P0v ++ vsubl.u8 q2, \P1u, \Q1u ++ vsubl.u8 q3, \P1v, \Q1v ++ vshl.i16 q0, 
#2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 + vmovl.u8 q2, d4 +- vmin.s16 q11, q11, q0 +- vmax.s16 q11, q11, q12 +- vaddw.u8 q1, q11, d2 +- vsub.i16 q2, q11 +- vqmovun.s16 d2, q1 +- vqmovun.s16 d4, q2 ++ vmovl.u8 q3, d6 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vaddw.u8 q2, q0, \P0u ++ vmax.s16 q1, q3 ++ vaddw.u8 q3, q1, \P0v ++ ++ vqmovun.s16 \P0u, q2 ++ vmovl.u8 q2, \Q0u ++ vqmovun.s16 \P0v, q3 ++ vmovl.u8 q3, \Q0v ++ vsub.i16 q2, q0 ++ vsub.i16 q3, q1 ++ ++ vqmovun.s16 \Q0u, q2 ++ vqmovun.s16 \Q0v, q3 + .endm + ++@ Preserves r12 ++@ Clobbers r2 ++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth ++ vsub.i16 q0, \Q0u, \P0u ++ vsub.i16 q1, \Q0v, \P0v ++ vsub.i16 q2, \P1u, \Q1u ++ vsub.i16 q3, \P1v, \Q1v ++ vshl.i16 q0, #2 ++ vshl.i16 q1, #2 ++ vadd.i16 q0, q2 ++ vdup.16 d4, r2 ++ lsr r2, #16 ++ vadd.i16 q1, q3 ++ ++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) ++ vrshr.s16 q0, #3 ++ vdup.16 d6, r2 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ vshll.u8 q3, d6, #\bit_depth - 8 ++ vuzp.16 d4, d5 ++ vrshr.s16 q1, #3 ++ vuzp.16 d6, d7 ++ ++ movw r2, #(1 << \bit_depth) - 1 ++ vmin.s16 q0, q2 ++ vneg.s16 q2, q2 ++ vmin.s16 q1, q3 ++ vneg.s16 q3, q3 ++ vmax.s16 q0, q2 ++ vmov.i64 q2, #0 ++ vmax.s16 q1, q3 ++ vdup.i16 q3, r2 ++ vadd.i16 \P0u, q0 ++ vsub.i16 \Q0u, q0 ++ vadd.i16 \P0v, q1 ++ vsub.i16 \Q0v, q1 ++ ++ vmax.s16 \P0u, q2 ++ vmax.s16 \Q0u, q2 ++ vmax.s16 \P0v, q2 ++ vmax.s16 \Q0v, q2 ++ vmin.s16 \P0u, q3 ++ vmin.s16 \Q0u, q3 ++ vmin.s16 \P0v, q3 ++ vmin.s16 \Q0v, q3 ++.endm ++ + + .macro hevc_loop_filter_luma_start ldr r12, [r3] ldr r3, [r3, #4] -@@ -60,15 +90,17 @@ - lsr r3, #16 +- lsl r3, #16 +- orr r3, r12 +- cmp r3, #0 ++ orrs r3, r12, r3, lsl #16 + it eq + bxeq lr +- lsr r3, #16 .endm -.macro hevc_loop_filter_luma_body -+@ Uses: r2, r3, r12 -+@ Modifies: r5, r6, r7, r8, r9 -+function hevc_loop_filter_luma_body -+ vmovl.u8 q15, d23 -+ vmovl.u8 q14, d22 -+ vmovl.u8 q13, d21 -+ vmovl.u8 q12, d20 -+ vmovl.u8 q11, d19 -+ vmovl.u8 q10, d18 -+ vmovl.u8 q9, d17 - vmovl.u8 q8, d16 +- vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 @@ -1195,46 +1950,103 @@ index 166bddb..9bd0a42 100644 - vmovl.u8 q13, d26 - vmovl.u8 q14, d28 - vmovl.u8 q15, d30 ++@ Uses: r2, r3, r12 ++@ Modifies: r5, r6, r7, r8, r9 ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ [sp,#96] &no_p[0] ++@ [sp,#100] &no_q[0] ++@ ++@ Input & output ++@ 8-bit: d16-d23 ++@ 16-bit: q8-q15 ++@ ++@ Output ++@ Z r10==0 ++@ r10[ 0:7 ] no_p[0] ++@ r10[ 8:15] no_p[1] ++@ r10[16:23] no_q[0] ++@ r10[24:31] no_q[1] ++ ++.macro m_filter_luma bit_depth ++.if \bit_depth == 8 ++ vmovl.u8 q15, d23 ++ vmovl.u8 q14, d22 ++ vmovl.u8 q13, d21 ++ vmovl.u8 q12, d20 ++ vmovl.u8 q11, d19 ++ vmovl.u8 q10, d18 ++ vmovl.u8 q9, d17 ++ vmovl.u8 q8, d16 ++.endif vadd.i16 q7, q9, q11 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif vadd.i16 q6, q14, q12 -@@ -77,7 +109,6 @@ ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif + vsub.i16 q7, q10 ++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 + vsub.i16 q6, q13 vabd.s16 q7, q7, q10 vabd.s16 q6, q6, q13 - - ++ ldrh r10, [r5] + vdup.16 q0, r2 
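+ @ q0 = beta in every lane; q7/q6 now hold the second-derivative
+ @ measures |p2 - 2*p1 + p0| and |q2 - 2*q1 + q0| that drive the
+ @ strong/weak filter decision below.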
vmov q4, q7 vmov q5, q6 -@@ -152,7 +183,7 @@ +- vdup.16 d4, r12 ++ ldr r5, [sp, #100] ++ vdup.16 d4, r3 ++ lsr r3, r3, #16 + vtrn.16 q7, q4 ++ ldrh r5, [r5] + vtrn.16 q6, q5 + + vshl.u64 q7, #32 + vshr.u64 q4, #32 + vshl.u64 q6, #32 ++ orr r10, r10, r5, lsl #16 + vshr.u64 q5, #32 + vshr.u64 q7, #32 + vshr.u64 q6, #32 +@@ -152,7 +320,7 @@ and r9, r8, r7 cmp r9, #0 - beq weakfilter_\@ -+ beq weakfilter_ ++ beq 1f vadd.i16 q2, q11, q12 vadd.i16 q4, q9, q8 -@@ -210,11 +241,11 @@ +@@ -210,11 +378,11 @@ vbit q13, q3, q5 vbit q14, q2, q5 -weakfilter_\@: -+weakfilter_: ++1: mvn r8, r8 and r9, r8, r7 cmp r9, #0 - beq ready_\@ -+ beq ready_ ++ beq 2f vdup.16 q4, r2 -@@ -275,75 +306,345 @@ weakfilter_\@: +@@ -275,111 +443,1041 @@ weakfilter_\@: vbit q11, q0, q5 vbit q12, q4, q5 -ready_\@: -+ready_: ++2: ++.if \bit_depth == 8 vqmovun.s16 d16, q8 - vqmovun.s16 d18, q9 - vqmovun.s16 d20, q10 @@ -1243,7 +2055,7 @@ index 166bddb..9bd0a42 100644 - vqmovun.s16 d26, q13 - vqmovun.s16 d28, q14 - vqmovun.s16 d30, q15 --.endm ++ cmp r10, #0 + vqmovun.s16 d17, q9 + vqmovun.s16 d18, q10 + vqmovun.s16 d19, q11 @@ -1251,7 +2063,30 @@ index 166bddb..9bd0a42 100644 + vqmovun.s16 d21, q13 + vqmovun.s16 d22, q14 + vqmovun.s16 d23, q15 ++.else ++ movw r12, #(1 << \bit_depth - 1) ++ vmov.i64 q0, #0 ++ vdup.i16 q1, r12 ++ @ q8 & q15 should be unaltered and so don't require clipping ++ vmax.s16 q9, q0 ++ cmp r10, #0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif + mov pc, lr + .endm + ++function hevc_loop_filter_luma_body ++ m_filter_luma 8 +endfunc + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) @@ -1263,7 +2098,16 @@ index 166bddb..9bd0a42 100644 + b v_loop_luma_common +endfunc + - ++ ++@ void ff_hevc_v_loop_filter_luma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++ function ff_hevc_v_loop_filter_luma_neon, export=1 hevc_loop_filter_luma_start - push {r5-r11} @@ -1271,14 +2115,6 @@ index 166bddb..9bd0a42 100644 + + sub r4, r0, #4 +v_loop_luma_common: -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] -+ vpush {d8-d15} - sub r0, #4 - vld1.8 {d16}, [r0], r1 @@ -1335,44 +2171,38 @@ index 166bddb..9bd0a42 100644 + + @ no_p[1] + tst r10, #0xff00 -+ itt ne -+ addne r4, r4, r1, lsl #2 ++ add r2, r4, r1, lsl #2 + bne 1f + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 -+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 -+ ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] ++1: ++ @ no_p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] +1: + @ no_q[1] + tst r10, #0xff000000 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f ++ add r2, r0, r1, lsl #2 ++ bne 1f + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 -+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 -+ -+2: -+ @ no_p[0] -+ tst r10, #0xff -+ bne 3f -+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 -+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 -+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 -+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] -+ -+3: ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] ++1: + @ no_q[0] + tst r10, #0xff0000 -+ bne 4f -+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 -+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 -+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 -+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] -+ -+4: ++ bne 1f ++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: +bypasswrite: vpop {d8-d15} - pop {r5-r11} @@ -1380,6 +2210,81 @@ index 166bddb..9bd0a42 100644 + pop {r4-r10,pc} endfunc ++.macro m_filter_v_luma_common_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. 
This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ neg r1, r1 ++ ++ @ p[1] ++ tst r10, #0xff00 ++ add r2, r4, r1, lsl #2 ++ bne 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] ++1: ++ @ p[0] ++ tst r10, #0xff ++ bne 1f ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] ++1: ++ @ q[1] ++ tst r10, #0xff000000 ++ add r2, r0, r1, lsl #2 ++ bne 1f ++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] ++1: ++ @ q[0] ++ tst r10, #0xff0000 ++ bne 1f ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] +@ ptrdiff_t stride, [r1] +@ int beta, [r2] @@ -1429,13 +2334,6 @@ index 166bddb..9bd0a42 100644 + neg r1, r1 + add r0, r0, r1 + -+ @ Why this isn't a bitmask to start with I have no idea... 
-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 -+ ldr r5, [sp, #32] -+ ldrh r10, [r5] -+ ldr r5, [sp, #36] -+ ldrh r5, [r5] -+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + bne 1f + + vst1.8 {d22}, [r0], r1 @@ -1486,8 +2384,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + - endfunc - ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ vpush {d8-d15} ++ sub r0, r0, r1, lsl #2 ++ ++ vld1.16 { q8}, [r0], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r0], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r0], r1 ++ vld1.16 {q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ vpop {d8-d15} ++ ++ sub r0, r1 ++ neg r1, r1 ++ bne 1f ++ ++ vst1.16 {q14}, [r0], r1 ++ vst1.16 {q13}, [r0], r1 ++ vst1.16 {q12}, [r0], r1 ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r0], r1 ++ vst1.16 { q9}, [r0] ++ pop {r4-r10,pc} ++ ++@ Partial write ++1: ++ tst r10, #0xff0000 ++ mov r2, r0 ++ bne 1f ++ vst1.16 {d28}, [r2], r1 ++ vst1.16 {d26}, [r2], r1 ++ vst1.16 {d24}, [r2] ++ ++1: ++ tst r10, #0xff000000 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d29}, [r2], r1 ++ vst1.16 {d27}, [r2], r1 ++ vst1.16 {d25}, [r2] ++ ++1: ++ tst r10, #0xff ++ @ r0 = r0 + r1 * 3 ++ add r0, r0, r1 ++ add r0, r0, r1, lsl # 1 ++ add r2, r0, #8 ++ bne 1f ++ vst1.16 {d22}, [r0], r1 ++ vst1.16 {d20}, [r0], r1 ++ vst1.16 {d18}, [r0] ++ ++1: ++ tst r10, #0xff00 ++ bne 1f ++ vst1.16 {d23}, [r2], r1 ++ vst1.16 {d21}, [r2], r1 ++ vst1.16 {d19}, [r2] ++ ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 @@ -1501,9 +2472,7 @@ index 166bddb..9bd0a42 100644 + vld2.8 {d26,d27}, [r0], r1 + vld2.8 {d28,d29}, [r0] + sub r0, r0, r1, lsl #1 -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + cmp r3, #0 + bne 1f + vst2.8 {d18,d19}, [r0], r1 @@ -1513,122 +2482,509 @@ index 166bddb..9bd0a42 100644 + @ At least one no_f bit is set + @ Which means we need to break this apart in an ugly fashion +1: vzip.8 d18, d19 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + vzip.8 d26, d27 + sub r1, r1, #8 + -+ tst r3, #1 -+ bne 1f ++ bmi 1f + vst1.8 {d18}, [r0] +1: add r0, r0, #8 -+ tst r3, #2 -+ bne 2f ++ bcs 2f + vst1.8 {d19}, [r0] -+2: add r0, r0, r1 ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 + -+ tst r3, #4 -+ bne 1f ++ bmi 1f + vst1.8 {d26}, [r0] -+1: add r0, r0, #8 -+ tst r3, #8 -+ it ne -+ bxne lr ++1: it cs ++ bxcs lr ++ add r0, r0, #8 + vst1.8 {d27}, [r0] + bx lr + +endfunc + + ++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r0, r0, r1, lsl #1 ++ vld2.16 {q8, q9 }, [r0], r1 ++ vld2.16 {q10, q11}, [r0], r1 ++ vld2.16 {q12, q13}, [r0], r1 ++ vld2.16 {q14, q15}, [r0] ++ sub r0, r0, r1, lsl #1 ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ ++ cmp r3, #0 ++ bne 1f ++ vst2.16 {q10, q11}, [r0], r1 ++ vst2.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: vzip.16 
q10, q11 ++ lsls r2, r3, #31 @ b0 -> N, b1 -> C ++ vzip.16 q12, q13 ++ sub r1, r1, #16 ++ ++ bmi 1f ++ vst1.16 {q10}, [r0] ++1: add r0, r0, #16 ++ bcs 2f ++ vst1.16 {q11}, [r0] ++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C ++ add r0, r0, r1 ++ ++ bmi 1f ++ vst1.16 {q12}, [r0] ++1: it cs ++ bxcs lr ++ add r0, r0, #16 ++ vst1.16 {q13}, [r0] ++ bx lr ++.endm ++ ++ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +@ unsigned int stride, // r1 +@ uint32_t tc4, // r2 +@ uint8_t * src_l, // r3 +@ unsigned int no_f); // sp[0] +@ -+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 -+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 ++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 ++ sub r12, r0, r3 + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 ++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 ++ cmp r12, #4 + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 ++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 ++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 -+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 ++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 ++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] -+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] -+ -+ hevc_loop_filter_uv_body d16, d18, d26, d28 -+ lsr r2, r2, #16 -+ hevc_loop_filter_uv_body d17, d19, d27, d29 ++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] + ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 ++ cmp r12, #0 ++ add r3, #2 + neg r1, r1 -+ -+ ldr r2, [sp, #0] -+ -+ @ p[1] -+ tst r2, #2 -+ itt ne -+ addne r3, r3, r1, lsl #2 + bne 1f -+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 -+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 -+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 -+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 + ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 ++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 ++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 ++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 ++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 ++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 ++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 ++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] ++ bx lr ++ ++@ Either split or partial +1: -+ @ q[1] -+ tst r2, #8 -+ itt ne -+ addne r0, r0, r1, lsl #2 -+ bne 2f -+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 -+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 -+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 -+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.8 {d20[7], d21[7]}, [r0], r1 
++ vst2.8 {d20[6], d21[6]}, [r0], r1 ++ vst2.8 {d20[5], d21[5]}, [r0], r1 ++ vst2.8 {d20[4], d21[4]}, [r0] ++1: ++ bmi 2f ++ vst2.8 {d20[3], d21[3]}, [r2], r1 ++ vst2.8 {d20[2], d21[2]}, [r2], r1 ++ vst2.8 {d20[1], d21[1]}, [r2], r1 ++ vst2.8 {d20[0], d21[0]}, [r2] + +2: -+ @ p[0] -+ tst r2, #1 -+ bne 3f -+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 -+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 -+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 -+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] -+ ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ bcs 3f ++ vst2.8 {d18[7], d19[7]}, [r3], r1 ++ vst2.8 {d18[6], d19[6]}, [r3], r1 ++ vst2.8 {d18[5], d19[5]}, [r3], r1 ++ vst2.8 {d18[4], d19[4]}, [r3] +3: -+ @ q[0] -+ tst r2, #4 -+ it ne -+ bxne lr -+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 -+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 -+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 -+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] -+ ++ it mi ++ bxmi lr ++ vst2.8 {d18[3], d19[3]}, [r2], r1 ++ vst2.8 {d18[2], d19[2]}, [r2], r1 ++ vst2.8 {d18[1], d19[1]}, [r2], r1 ++ vst2.8 {d18[0], d19[0]}, [r2] + bx lr -+endfunc + endfunc + ++ ++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++.macro m_filter_v_uv2_16 bit_depth ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ sub r12, r0, r3 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ cmp r12, #8 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ it eq ++ ldreq r12, [sp, #0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth ++ cmp r12, #0 ++ add r3, #4 ++ neg r1, r1 ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth having this special case ++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 ++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 ++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 ++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 ++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 ++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 ++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 ++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 ++ bx lr ++ ++@ Either split or partial ++1: ++ ldr r12, [sp, #0] ++ lsls r12, #29 @ b2 -> N, b3 -> C ++ add r2, r0, r1, lsl #2 ++ bcs 1f ++ vst2.16 {d25[3], d27[3]}, [r0], r1 ++ vst2.16 {d25[2], d27[2]}, [r0], r1 ++ vst2.16 {d25[1], d27[1]}, [r0], r1 ++ vst2.16 {d25[0], d27[0]}, [r0] ++1: ++ bmi 2f ++ vst2.16 {d24[3], d26[3]}, [r2], r1 ++ vst2.16 {d24[2], d26[2]}, [r2], r1 ++ vst2.16 {d24[1], d26[1]}, [r2], r1 ++ vst2.16 {d24[0], d26[0]}, [r2] ++ ++2: ++ lsls r12, #2 ++ add r2, r3, r1, lsl #2 ++ 
bcs 3f ++ vst2.16 {d21[3], d23[3]}, [r3], r1 ++ vst2.16 {d21[2], d23[2]}, [r3], r1 ++ vst2.16 {d21[1], d23[1]}, [r3], r1 ++ vst2.16 {d21[0], d23[0]}, [r3] ++3: ++ it mi ++ bxmi lr ++ vst2.16 {d20[3], d22[3]}, [r2], r1 ++ vst2.16 {d20[2], d22[2]}, [r2], r1 ++ vst2.16 {d20[1], d22[1]}, [r2], r1 ++ vst2.16 {d20[0], d22[0]}, [r2] ++ bx lr ++.endm ++ + + function ff_hevc_v_loop_filter_chroma_neon, export=1 hevc_loop_filter_chroma_start ++ ++ sub r0, #2 ++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 ++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1 ++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 ++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 ++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 ++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 ++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 ++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f ++ ++ vst2.8 {d17[0], d18[0]}, [r0], r1 ++ vst2.8 {d17[1], d18[1]}, [r0], r1 ++ vst2.8 {d17[2], d18[2]}, [r0], r1 ++ vst2.8 {d17[3], d18[3]}, [r0], r1 ++ vst2.8 {d17[4], d18[4]}, [r0], r1 ++ vst2.8 {d17[5], d18[5]}, [r0], r1 ++ vst2.8 {d17[6], d18[6]}, [r0], r1 ++ vst2.8 {d17[7], d18[7]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.8 {d17[0]}, [r0], r1 ++ vst1.8 {d17[1]}, [r0], r1 ++ vst1.8 {d17[2]}, [r0], r1 ++ vst1.8 {d17[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.8 {d18[0]}, [r0], r1 ++ vst1.8 {d18[1]}, [r0], r1 ++ vst1.8 {d18[2]}, [r0], r1 ++ vst1.8 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.8 {d18[4]}, [r0], r1 ++ vst1.8 {d18[5]}, [r0], r1 ++ vst1.8 {d18[6]}, [r0], r1 ++ vst1.8 {d18[7]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.8 {d17[4]}, [r0], r1 ++ vst1.8 {d17[5]}, [r0], r1 ++ vst1.8 {d17[6]}, [r0], r1 ++ vst1.8 {d17[7]}, [r0], r1 ++ bx lr ++ ++endfunc ++ ++ ++.macro m_filter_v_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, #4 -@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 - vst1.8 {d4}, [r0] ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 ++ ++ sub r0, r0, r1, lsl #3 ++ add r0, r0, #2 ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f ++ ++ vst2.16 {d18[0], d20[0]}, [r0], r1 ++ vst2.16 {d18[1], d20[1]}, [r0], r1 ++ vst2.16 {d18[2], d20[2]}, [r0], r1 ++ vst2.16 {d18[3], d20[3]}, [r0], r1 ++ vst2.16 {d19[0], d21[0]}, [r0], r1 ++ vst2.16 {d19[1], d21[1]}, [r0], r1 ++ vst2.16 {d19[2], d21[2]}, [r0], r1 ++ vst2.16 {d19[3], d21[3]}, [r0], r1 ++ bx lr ++ ++1: ++ tst r12, #0xff @ P0a ++ bne 2f ++ ++ vst1.16 {d18[0]}, [r0], r1 ++ vst1.16 {d18[1]}, [r0], r1 ++ vst1.16 {d18[2]}, [r0], r1 ++ vst1.16 {d18[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++2: ++ tst r12, #0xff0000 @ Q0a ++ add r0, #1 ++ bne 3f ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r0], r1 ++ vst1.16 {d20[2]}, [r0], r1 ++ vst1.16 {d20[3]}, [r0], r1 ++ sub r0, r0, r1, 
lsl #2 ++ ++3: ++ tst r12, #0xff000000 @ Q0b ++ add r0, r0, r1, lsl #2 ++ bne 4f ++ vst1.16 {d21[0]}, [r0], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[2]}, [r0], r1 ++ vst1.16 {d21[3]}, [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ ++4: ++ tst r12, #0xff00 @ P0b ++ it ne ++ bxne lr ++ ++ sub r0, #1 ++ vst1.16 {d19[0]}, [r0], r1 ++ vst1.16 {d19[1]}, [r0], r1 ++ vst1.16 {d19[2]}, [r0], r1 ++ vst1.16 {d19[3]}, [r0], r1 ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_h_loop_filter_chroma_neon( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int *_tc, [r2] ++@ uint8_t *_no_p, [r3] ++@ uint8_t *_no_q); [sp+0] ++ ++function ff_hevc_h_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 + vld1.8 {d16}, [r0], r1 + vld1.8 {d17}, [r0], r1 + vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0], r1 +- vld1.8 {d20}, [r0], r1 +- vld1.8 {d21}, [r0], r1 +- sub r0, r0, r1, lsl #3 +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- hevc_loop_filter_chroma_body +- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 +- vst1.8 {d16}, [r0], r1 ++ vld1.8 {d19}, [r0] ++ sub r0, r0, r1, lsl #1 ++ hevc_loop_filter_chroma_body d16, d17, d18, d19 ++ bne 1f @ Partial write + vst1.8 {d17}, [r0], r1 +- vst1.8 {d18}, [r0], r1 +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0], r1 +- vst1.8 {d19}, [r0], r1 +- vst1.8 {d20}, [r0], r1 +- vst1.8 {d21}, [r0] ++ vst1.8 {d18}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ vmov r2, r3, d17 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff00 ++ it eq ++ streq r3, [r0, #4] ++ ++ add r0, r1 ++ tst r12, #0xff0000 ++ vmov r2, r3, d18 ++ it eq ++ streq r2, [r0] ++ tst r12, #0xff000000 ++ it eq ++ streq r3, [r0, #4] ++ bx lr endfunc + +-function ff_hevc_h_loop_filter_chroma_neon, export=1 ++.macro m_filter_h_chroma_16 bit_depth + hevc_loop_filter_chroma_start + sub r0, r0, r1, lsl #1 +- vld1.8 {d18}, [r0], r1 +- vld1.8 {d2}, [r0], r1 +- vld1.8 {d4}, [r0], r1 +- vld1.8 {d19}, [r0] ++ vld1.16 {q8}, [r0], r1 ++ vld1.16 {q9}, [r0], r1 ++ vld1.16 {q10}, [r0], r1 ++ vld1.16 {q11}, [r0] + sub r0, r0, r1, lsl #1 +- hevc_loop_filter_chroma_body +- vst1.8 {d2}, [r0], r1 +- vst1.8 {d4}, [r0] ++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth ++ bne 1f @ Partial write ++ vst1.16 {q9}, [r0], r1 ++ vst1.16 {q10}, [r0] ++ bx lr ++1: ++ tst r12, #0xff ++ bne 2f ++ vst1.16 {d18}, [r0] ++2: ++ tst r12, #0xff00 ++ bne 3f ++ add r0, #8 ++ vst1.16 {d19}, [r0] ++ sub r0, #8 ++3: ++ tst r12, #0xff0000 ++ add r0, r1 ++ bne 4f ++ vst1.16 {d20}, [r0] ++4: ++ tst r12, #0xff000000 ++ it ne ++ bxne lr ++ add r0, #8 ++ vst1.16 {d21}, [r0] ++ + bx lr ++.endm ++ + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + * int *curr_rpl0, int *curr_ @@ -1754,9 +3110,54 @@ index 166bddb..9bd0a42 100644 + b 11b +endfunc + ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_luma_neon_10, export=1 ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} @ 8 regs = 32 bytes ++ ++ ldr r4, [sp, #40] ++ b v_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ push {r4-r10,lr} ++ ++ sub r4, r0, #8 ++v_loop_luma_common_10: ++ m_filter_v_luma_common_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_uv_neon_10, export=1 ++ 
m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ ++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 ++ m_filter_h_chroma_16 10 ++endfunc ++ ++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 ++ m_filter_v_chroma_16 10 + endfunc ++ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S new file mode 100644 -index 0000000..00eab9e +index 0000000000..00eab9eeee --- /dev/null +++ b/libavcodec/arm/hevcdsp_epel_neon.S @@ -0,0 +1,337 @@ @@ -2097,11 +3498,399 @@ index 0000000..00eab9e + .byte 4, 28, 46, 6 + .byte 2, 16, 54, 4 + .byte 2, 10, 58, 2 +diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S +index 13d540e5ff..9b6d745556 100644 +--- a/libavcodec/arm/hevcdsp_idct_neon.S ++++ b/libavcodec/arm/hevcdsp_idct_neon.S +@@ -21,82 +21,6 @@ + #include "libavutil/arm/asm.S" + #include "neon.S" + +-function ff_hevc_idct_4x4_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q0, r1 +- vdup.16 q1, r1 +- vst1.16 {q0, q1}, [r0] +- bx lr +-endfunc +- +-function ff_hevc_idct_8x8_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_16x16_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0!, {q8-q15} +- vstm r0, {q8-q15} +- bx lr +-endfunc +- +-function ff_hevc_idct_32x32_dc_neon_8, export=1 +- ldrsh r1, [r0] +- ldr r2, =0x20 +- add r1, #1 +- asr r1, #1 +- add r1, r2 +- asr r1, #6 +- mov r3, #16 +- vdup.16 q8, r1 +- vdup.16 q9, r1 +- vmov.16 q10, q8 +- vmov.16 q11, q8 +- vmov.16 q12, q8 +- vmov.16 q13, q8 +- vmov.16 q14, q8 +- vmov.16 q15, q8 +-1: subs r3, #1 +- vstm r0!, {q8-q15} +- bne 1b +- bx lr +-endfunc +- + function ff_hevc_transform_add_4x4_neon_8, export=1 + vldm r1, {q0-q1} + vld1.32 d4[0], [r0], r2 +@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1 + bx lr + endfunc + ++ ++@ ff_hevc_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ ++ vld1.32 d4[0], [r0], r1 ++ vld1.32 d4[1], [r0], r1 ++ vld1.32 d5[0], [r0], r1 ++ vld1.32 d5[1], [r0], r1 ++ sub r0, r0, r1, lsl #2 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [r0], r1 ++ vst1.32 d1[0], [r0], r1 ++ vst1.32 d1[1], [r0], r1 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #4 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++ ++1: subs r3, #1 ++ vld1.8 d16, [r0] ++ vaddw.u8 q0, q15, d16 ++ 
vqmovun.s16 d0, q0 ++ vst1.32 d0, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #8 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16 ++ ++1: subs r3, #1 ++ vld1.8 {q8}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.8 {q0}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 ++ vdup.32 q15, r2 ++ mov r3, #16 ++ b 1f ++endfunc ++ ++@ ff_hevc_add_residual_32x32_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++ ++1: subs r3, #1 ++ vld1.8 {q8, q9}, [r0] ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vqmovun.s16 d2, q2 ++ vqmovun.s16 d3, q3 ++ vst1.8 {q0, q1}, [r0], r1 ++ bne 1b ++ bx lr ++endfunc ++ ++ ++ + .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 + vtrn.64 \r0, \r4 + vtrn.64 \r1, \r5 +@@ -263,55 +312,6 @@ endfunc + vqrshrn.s32 \r3, q3, \shift + .endm + +-function ff_hevc_transform_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x00240053 // 36 and 83 +- vmov.32 d0[0], r3 +- +- tr4_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- +-function ff_hevc_transform_luma_4x4_neon_8, export=1 +- vpush {d8-d15} +- vld1.16 {q14, q15}, [r0] // coeffs +- ldr r3, =0x4a // 74 +- vmov.32 d0[0], r3 +- ldr r3, =0x1d // 29 +- vmov.32 d0[1], r3 +- ldr r3, =0x37 // 55 +- vmov.32 d1[0], r3 +- +- tr4_luma_shift d28, d29, d30, d31, #7 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- +- tr4_luma_shift d28, d29, d30, d31, #12 +- +- vtrn.16 d28, d29 +- vtrn.16 d30, d31 +- vtrn.32 q14, q15 +- vst1.16 {q14, q15}, [r0] +- vpop {d8-d15} +- bx lr +-endfunc +- + .macro tr8_begin in0, in1, in2, in3 + vmull.s16 q7, \in0, d1[1] // 89 * src1 + vmull.s16 q8, \in0, d1[0] // 75 * src1 +@@ -356,100 +356,6 @@ endfunc + vqrshrn.s32 d8, q5, \shift + .endm + +-function ff_hevc_transform_8x8_neon_8, export=1 +- push {r4-r8} +- vpush {d8-d15} +- mov r5, #16 +- +- adr r3, tr4f +- vld1.16 {d0, d1}, [r3] +- +- // left half +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #128 +- //skip right half if col_limit in r1 is less than 4 +- 
cmp r1, #4 +- blt 1f +- //right half +- add r0, #8 +- vld1.16 {d24}, [r0], r5 +- vld1.16 {d25}, [r0], r5 +- vld1.16 {d26}, [r0], r5 +- vld1.16 {d27}, [r0], r5 +- vld1.16 {d28}, [r0], r5 +- vld1.16 {d29}, [r0], r5 +- vld1.16 {d30}, [r0], r5 +- vld1.16 {d31}, [r0], r5 +- sub r0, #128 +- tr8_begin d25, d27, d29, d31 +- tr4 d24, d26, d28, d30 +- tr8_end #7 +- vst1.16 {d2}, [r0], r5 +- vst1.16 {d3}, [r0], r5 +- vst1.16 {d4}, [r0], r5 +- vst1.16 {d5}, [r0], r5 +- vst1.16 {d6}, [r0], r5 +- vst1.16 {d7}, [r0], r5 +- vst1.16 {d8}, [r0], r5 +- vst1.16 {d9}, [r0], r5 +- sub r0, #136 +-1: +- // top half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- vstm r0!, {q1-q4} +- +- // bottom half +- vldm r0, {q12-q15} // coeffs +- transpose_16b_4x4 d24, d26, d28, d30 +- transpose_16b_4x4 d25, d27, d29, d31 +- tr8_begin d26, d30, d27, d31 +- tr4 d24, d28, d25, d29 +- tr8_end #12 +- transpose_16b_4x4 d2, d3, d4, d5 +- transpose_16b_4x4 d6, d7, d8, d9 +- vswp d7, d5 +- vswp d7, d8 +- vswp d3, d6 +- vswp d6, d4 +- //vstm r0, {q1-q4} +- vst1.16 {q1-q2}, [r0] +- add r0, #32 +- vst1.16 {q3-q4}, [r0] +- sub r0, #32 +- vpop {d8-d15} +- pop {r4-r8} +- bx lr +-endfunc + + .align 4 + tr4f: +@@ -463,3 +369,11 @@ tr16: + .word 0x00500046 // 80, d2[2] = 70 + .word 0x0039002b // 57, d2[0] = 43 + .word 0x00190009 // 25, d2[2] = 9 ++ ++#define BIT_DEPTH 8 ++#include "hevc_idct_fn_neon.S" ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "hevc_idct_fn_neon.S" ++ diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c -index 5591807..b6c48ee 100644 +index 55918077e2..e708b7c074 100644 --- a/libavcodec/arm/hevcdsp_init_neon.c +++ b/libavcodec/arm/hevcdsp_init_neon.c -@@ -22,11 +22,26 @@ +@@ -22,11 +22,41 @@ #include "libavutil/arm/cpu.h" #include "libavcodec/hevcdsp.h" #include "hevcdsp_arm.h" @@ -2113,6 +3902,11 @@ index 5591807..b6c48ee 100644 void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + ++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ +#ifdef RPI +void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, + unsigned int _stride, unsigned int beta, const int32_t tc[2], @@ -2123,44 +3917,201 @@ index 5591807..b6c48ee 100644 +void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, + uint8_t * src_l, + unsigned int no_f); ++ ++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, const int32_t tc[2], ++ const uint8_t no_p[2], const uint8_t no_q[2], ++ uint8_t * _pix_l); ++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void 
ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); +#endif + void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); -@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); + void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); + void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); ++ + void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); + void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); ++ ptrdiff_t stride); void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride); - -+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); -+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +- ptrdiff_t stride); ++ ptrdiff_t stride); + -+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); ++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); + -+void ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); -+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); 
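The width-specialised SAO entry points declared in this hunk replace the generic scalar wrappers the old revision of the patch carried (removed below). For reference, this is the band-filter computation all of them implement, as a scalar sketch of the 8-bit case mirroring the deleted wrapper; the helper names here are illustrative, not part of the patch:

/* Scalar sketch of the SAO band filter (8-bit case), mirroring the
 * generic wrapper this hunk removes; helper names are illustrative. */
#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

static void sao_band_ref_8(uint8_t *dst, const uint8_t *src,
                           ptrdiff_t stride_dst, ptrdiff_t stride_src,
                           const int16_t *sao_offset_val, int sao_left_class,
                           int width, int height)
{
    int8_t offset_table[32] = { 0 };
    int k, x, y;

    /* Four signed offsets cover four consecutive bands, wrapping mod 32. */
    for (k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];

    /* Each pixel selects its band with its top five bits (src >> (8 - 5)). */
    for (y = 0; y < height; y++, dst += stride_dst, src += stride_src)
        for (x = 0; x < width; x++)
            dst[x] = clip_u8(src[x] + offset_table[src[x] >> 3]);
}

The per-width ff_hevc_sao_band_*_neon_8 kernels vectorise exactly this table lookup, and the 48- and 24-wide cases are composed from the 32+16 and 16+8 kernels in the wrapper functions further down.
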
-+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); + -+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, -+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); ++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); + -+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, ++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++#if RPI_HEVC_SAND ++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, 
int32_t dc); ++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++#endif ++ ++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++#if RPI_HEVC_SAND ++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); + ++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int 
sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++#endif + ++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ + #define PUT_PIXELS(name) \ void name(int16_t *dst, uint8_t *src, \ - ptrdiff_t srcstride, int height, \ -@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); +@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); #undef PUT_PIXELS @@ -2176,227 +4127,110 @@ index 5591807..b6c48ee 100644 static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, int height, int width); -@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t +@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); } -+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, ++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, ++ MvField *curr, MvField *neigh, uint8_t *bs); ++ ++ ++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +{ -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t 
offset_table[32] = { 0 }; -+ int k, y, x; -+ int shift = 3; // BIT_DEPTH - 5 -+ int cwidth = 0; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ switch(cwidth){ -+ case 8: -+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 16: -+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 32: -+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ case 64: -+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); -+ break; -+ default: -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) -+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); -+ dst += stride_dst; -+ src += stride_src; -+ } -+ } ++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +} + -+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, 
sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++ ++#if RPI_HEVC_SAND ++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height) +{ -+ // Width 32 already dealt with -+ // width 16 code works in double lines -+ if (width == 16 && (height & 1) == 0) { -+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, -+ sao_offset_val_u, sao_left_class_u, -+ sao_offset_val_v, sao_left_class_v, -+ width, height); -+ } -+ else -+ { -+ const int shift = 3; // BIT_DEPTH - 5 -+ int k, y, x; -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int8_t offset_table_u[32] = { 0 }; -+ int8_t offset_table_v[32] = { 0 }; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ for (k = 0; k < 4; k++) -+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; -+ for (k = 0; k < 4; k++) -+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; -+ -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) -+ { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); -+ } -+ dst += stride_dst; -+ src += stride_src; -+ -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} -+ -+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) -+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, -+ int16_t *_sao_offset_val, int eo, int width, int height) ++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) +{ -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val[8]; // padding of 3 for vld -+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ int cwidth = 0; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; -+ } -+ -+ if (height % 8 == 0) -+ cwidth = width; -+ -+ stride_src /= sizeof(pixel); -+ stride_dst /= sizeof(pixel); -+ -+ switch (cwidth) { -+ case 32: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ case 64: -+ switch(eo) { -+ case 0: -+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 1: -+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 2: -+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ case 3: -+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); -+ break; -+ } -+ break; -+ default: -+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width; x++) { -+ int diff0 = CMP(src[x], src[x + a_stride]); -+ int diff1 = CMP(src[x], src[x + b_stride]); -+ int idx = diff0 + diff1; -+ if (idx) -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } ++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); +} ++#endif ++#endif + + -+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, -+ int eo, int width, int height) -+{ -+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); + -+ if (width == 32 && (height & 7) == 0) { -+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, _sao_offset_val_u, _sao_offset_val_v, eo); -+ } -+ else -+ { -+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 
4 }; -+ static const int8_t pos[4][2][2] = { -+ { { -1, 0 }, { 1, 0 } }, // horizontal -+ { { 0, -1 }, { 0, 1 } }, // vertical -+ { { -1, -1 }, { 1, 1 } }, // 45 degree -+ { { 1, -1 }, { -1, 1 } }, // 135 degree -+ }; -+ int8_t sao_offset_val_u[8]; // padding of 3 for vld -+ int8_t sao_offset_val_v[8]; // padding of 3 for vld -+ pixel *dst = (pixel *)_dst; -+ pixel *src = (pixel *)_src; -+ int a_stride, b_stride; -+ int x, y; -+ -+ for (x = 0; x < 5; x++) { -+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; -+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; -+ } -+ -+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; -+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; -+ for (y = 0; y < height; y++) { -+ for (x = 0; x < width * 2; x += 2) { -+ int diff0u = CMP(src[x], src[x + a_stride]); -+ int diff1u = CMP(src[x], src[x + b_stride]); -+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); -+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); -+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); -+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); -+ } -+ src += stride_src; -+ dst += stride_dst; -+ } -+ } -+} -+#undef CMP -+ -+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, -+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, -+ MvField *curr, MvField *neigh, uint8_t *bs); ++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) { @@ -2407,7 +4241,9 @@ index 5591807..b6c48ee 100644 c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; +#ifdef RPI + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; @@ -2416,21 +4252,68 @@ index 5591807..b6c48ee 100644 c->idct[0] = ff_hevc_transform_4x4_neon_8; c->idct[1] = ff_hevc_transform_8x8_neon_8; c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; -@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) - c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; - c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; + c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8; + c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8; + c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8; +- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; +- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; +- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; ++ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; ++ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; ++ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = 
ff_hevc_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; ++#endif c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; -+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { -+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; -+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; -+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; -+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; -+ } -+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; ++#endif ++#endif put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; -@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; @@ -2452,22 +4335,711 @@ index 5591807..b6c48ee 100644 c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; -@@ -221,4 +516,9 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; } ++ else if 
(bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; ++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; ++#ifdef RPI ++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; ++#endif ++ c->idct[0] = ff_hevc_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; ++ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10; ++ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10; ++ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10; ++ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; ++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; ++#if RPI_HEVC_SAND ++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; ++#endif ++ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10; ++#endif ++#if RPI_HEVC_SAND ++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10; ++ ++#if 
SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10; ++#endif ++#endif ++ } + + assert(offsetof(MvField, mv) == 0); + assert(offsetof(MvField, ref_idx) == 8); + assert(offsetof(MvField, pred_flag) == 10); + c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; } +diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..7cc5cd5e5c +--- /dev/null ++++ b/libavcodec/arm/hevcdsp_res16_neon.S +@@ -0,0 +1,610 @@ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++#define BIT_DEPTH 10 ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1] ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vld1.16 {d0}, [r0, :64], r2 ++ vld1.16 {d1}, [r0, :64], r2 ++ vld1.16 {d2}, [r0, :64], r2 ++ vld1.16 {d3}, [r0, :64], r2 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r2 ++ vst1.16 {d1}, [r0, :64], r2 ++ vst1.16 {d2}, [r0, :64], r2 ++ vst1.16 {d3}, [r0, :64], r2 ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vdup.i16 q9, r3 ++ vld1.16 {d0}, [r0, :64], r1 ++ vld1.16 {d1}, [r0, :64], r1 ++ vdup.16 q15, r2 ++ vld1.16 {d2}, [r0, :64], r1 ++ vld1.16 {d3}, [r0, :64], r1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0, :64], r1 ++ vst1.16 {d1}, [r0, :64], r1 ++ vst1.16 {d2}, [r0, :64], r1 ++ vst1.16 {d3}, [r0, :64], r1 ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #2 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r2 ++ vqadd.s16 q0, q10 ++ vld1.16 {q2}, [r0, :128], r2 ++ vqadd.s16 q1, q11 ++ vld1.16 {q3}, [r0, :128], r2 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {q0}, [r0, :128], r2 ++ vmin.s16 q2, q2, q9 ++ vst1.16 {q1}, [r0, :128], r2 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q2}, [r0, :128], r2 ++ vst1.16 {q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #1 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function 
JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ mov r12, #2 ++ vdup.16 q15, r2 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vld1.16 {q0}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q1}, [r0, :128], r1 ++ vqadd.s16 q0, q15 ++ vld1.16 {q2}, [r0, :128], r1 ++ vqadd.s16 q1, q15 ++ vld1.16 {q3}, [r0, :128], r1 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r1, lsl #2 ++ vmax.s16 q0, q8 ++ vmax.s16 q1, q8 ++ vmax.s16 q2, q8 ++ vmax.s16 q3, q8 ++ vmin.s16 q0, q9 ++ vmin.s16 q1, q9 ++ vst1.16 {q0}, [r0, :128], r1 ++ vmin.s16 q2, q9 ++ vst1.16 {q1}, [r0, :128], r1 ++ vmin.s16 q3, q9 ++ vst1.16 {q2}, [r0, :128], r1 ++ vst1.16 {q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #8 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r2 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst1.16 {q0, q1}, [r0, :128], r2 ++ vst1.16 {q2, q3}, [r0, :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #4 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #8 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0, :128], r1 ++ subs r12, #1 ++ vld1.16 {q2, q3}, [r0, :128] ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ sub r0, r1 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0, :128], r1 ++ vst1.16 {q2, q3}, [r0, :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint8_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ mov r12, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r12, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint8_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r12, #32 ++9: ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++1: ++ vldm r0, {q0-q3} ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vstm r0, {q0-q3} ++ add r0, r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! 
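An aside on this family of loops, since the intent is easy to lose in the register shuffling: the `_u`/`_v` variants walk rows of interleaved CbCr, where `vld2.16` de-interleaves the pairs (U samples land in q0/q2, V samples in q1/q3) so that the planar residual can be added to one plane while the other only receives the dc bias broadcast into q15. A minimal C model of the whole add_residual family, assuming a stride counted in elements and purely illustrative names:

```c
#include <stddef.h>
#include <stdint.h>

/* Sketch of what the add_residual loops compute: add a residual block
 * (or a constant dc) to 16-bit samples, then clip to the legal sample
 * range.  The NEON code reaches the same result with vqadd.s16
 * followed by a vmax/vmin (clip16_4) pair. */
static void add_residual_ref(uint16_t *dst, const int16_t *res, int dc,
                             ptrdiff_t stride, int size, int bit_depth)
{
    const int max = (1 << bit_depth) - 1;

    for (int y = 0; y < size; y++, dst += stride)
        for (int x = 0; x < size; x++) {
            int v = dst[x] + (res ? res[y * size + x] : dc);
            dst[x] = v < 0 ? 0 : v > max ? max : v;
        }
}
```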
++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vld1.16 {q10, q11}, [r1, :256] ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r0, r2, lsl #2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, r2 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ sub r0, #32 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
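(The `_v` functions are the exact mirror of the `_u` ones: after the `vld2.16` de-interleave, the broadcast dc in q15 goes to the U lanes and the planar residual to the V lanes, and `vst2.16` re-interleaves on the way out.)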
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vldm r1, {q10-q13} ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ vdup.i16 q9, r3 ++ ++ vld2.16 {d0, d2}, [r0, :128], r2 ++ vld2.16 {d1, d3}, [r0, :128], r2 ++ vld2.16 {d4, d6}, [r0, :128], r2 ++ vld2.16 {d5, d7}, [r0, :128], r2 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r0, r2, lsl #2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ ++ vst2.16 {d0, d2}, [r0, :128], r2 ++ vst2.16 {d1, d3}, [r0, :128], r2 ++ vst2.16 {d4, d6}, [r0, :128], r2 ++ vst2.16 {d5, d7}, [r0, :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #4 ++ vdup.i16 q9, r3 ++ add r3, r1, #(8*8*2) @ Offset to V ++1: ++ vld2.16 {q0, q1}, [r0, :256], r2 ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, r2 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256], r2 ++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ movw r3, #(1 << BIT_DEPTH) - 1 ++ vmov.i64 q8, #0 ++ mov r12, #16 ++ vdup.i16 q9, r3 ++ add r3, r1, #(16*16*2) @ Offset to V ++ sub r2, #32 ++1: ++ vld2.16 {q0, q1}, [r0, :256]! ++ vld2.16 {q2, q3}, [r0, :256] ++ vld1.16 {q10, q11}, [r1, :256]! ++ vld1.16 {q12, q13}, [r3, :256]! ++ subs r12, #1 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ sub r0, #32 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmax.s16 q2, q2, q8 ++ vmax.s16 q3, q3, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vmin.s16 q2, q2, q9 ++ vmin.s16 q3, q3, q9 ++ vst2.16 {q0, q1}, [r0, :256]! 
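The `_c` variants apply independent residuals to both chroma planes at once: r1 walks the U residual block while r3, preset to `res + size * size` elements (the `@ Offset to V` adds above), walks the V block. A hedged per-row model with illustrative names:

```c
#include <stdint.h>

/* One row of the _c variants: dst holds interleaved CbCr and the V
 * residual sits size*size elements after the U residual. */
static void add_residual_c_row_ref(uint16_t *dst, const int16_t *res_u,
                                   const int16_t *res_v, int n, int max)
{
    for (int x = 0; x < n; x++) {
        int u = dst[2 * x + 0] + res_u[x];
        int v = dst[2 * x + 1] + res_v[x];
        dst[2 * x + 0] = u < 0 ? 0 : u > max ? max : u;
        dst[2 * x + 1] = v < 0 ? 0 : v > max ? max : v;
    }
}
```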
++ vst2.16 {q2, q3}, [r0, :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S new file mode 100644 -index 0000000..08a021d +index 0000000000..30113d9c93 --- /dev/null +++ b/libavcodec/arm/hevcdsp_sao_neon.S -@@ -0,0 +1,862 @@ +@@ -0,0 +1,1882 @@ +/* + * Copyright (c) 2014 - 2015 Seppo Tomperi + * @@ -2491,124 +5063,211 @@ index 0000000..08a021d +#include "libavutil/arm/asm.S" +#include "neon.S" + -+.macro init_sao_band -+ pld [r1] -+ vld1.8 {q0, q1}, [r2] // offset table -+ ldr r2, [sp, #0] // stride_dst -+ ldr r12, [sp, #4] // height -+ vmov.u8 q3, #128 -+.endm ++.set EDGE_SRC_STRIDE, 160 ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 ++ vshr.u8 q13, q9, #3 ++ vadd.s8 q9, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 + -+// 128 in q3 -+// input q8 - q11 -+.macro sao_band_64 -+ vtbl.8 d24, {d0, d1, d2, d3}, d24 -+ vadd.s8 q8, q3 -+ vtbl.8 d25, {d0, d1, d2, d3}, d25 -+ vadd.s8 q9, q3 -+ vtbl.8 d26, {d0, d1, d2, d3}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d0, d1, d2, d3}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0, d1, d2, d3}, d29 ++ vshr.u8 q12, q10, #3 ++ vadd.s8 q10, \Q_K128 + vqadd.s8 q9, q13 -+ vtbl.8 d30, {d0, d1, d2, d3}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d0, d1, d2, d3}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 ++ vshr.u8 q13, q11, #3 ++ vadd.s8 q11, \Q_K128 ++ ++ vsub.s8 q8, \Q_K128 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vsub.s8 q9, \Q_K128 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vqadd.s8 q11, q13 ++ vsub.s8 q10, \Q_K128 ++ vsub.s8 q11, \Q_K128 +.endm + -+function ff_hevc_sao_band_w8_neon_8, export=1 -+ init_sao_band -+1: subs r12, #8 -+ vld1.8 {d16}, [r1, :64], r3 -+ vld1.8 {d17}, [r1, :64], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {d18}, [r1, :64], r3 -+ vld1.8 {d19}, [r1, :64], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {d20}, [r1, :64], r3 -+ vld1.8 {d21}, [r1, :64], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {d22}, [r1, :64], r3 -+ vld1.8 {d23}, [r1, :64], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {d16}, [r0, :64], r2 -+ vst1.8 {d17}, [r0, :64], r2 -+ vst1.8 {d18}, [r0, :64], r2 -+ vst1.8 {d19}, [r0, :64], r2 -+ vst1.8 {d20}, [r0, :64], r2 -+ vst1.8 {d21}, [r0, :64], r2 -+ vst1.8 {d22}, [r0, :64], r2 -+ vst1.8 {d23}, [r0, :64], r2 -+ bne 1b ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 ++ vshr.u8 q12, q8, #3 ++ vadd.s8 q8, \Q_K128 + -+ bx lr ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ ++ vqadd.s8 q8, q12 ++ vsub.s8 q8, \Q_K128 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q12 
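Before the width-specific entry points, a summary of what these band macros compute: SAO band offset buckets each sample by its top five bits (the `vshr`/`vshrn` by `bit_depth - 5`, giving 32 bands), looks the bucket up in a 32-entry signed-offset table and adds the result, clipping to the sample range. In the 8-bit path the `vadd.s8 ... Q_K128` / `vsub.s8` pair flips the sign bit so unsigned samples survive the saturating signed add; in the 16-bit path `vshrn` narrows the band index to bytes so the same byte-wide `vtbl` tables can be reused, and `vaddw.s8` widens the looked-up offsets back before the clip. A hedged C model (illustrative names; the asm builds `table` in `band_load_y`/`band_load_c` below):

```c
#include <stdint.h>

static void sao_band_row_ref(uint16_t *dst, const uint16_t *src, int width,
                             const int16_t *sao_offset_val,
                             int sao_left_class, int bit_depth)
{
    const int max = (1 << bit_depth) - 1;
    int8_t table[32] = { 0 };

    /* Only four consecutive bands carry an offset, and sao_left_class + i
     * may wrap past 31 - hence the byte-at-a-time pokes on an aligned
     * stack block in band_load_y/_c.  Offsets fit a byte (see the
     * sao_offset_abs note below). */
    for (int i = 0; i < 4; i++)
        table[(sao_left_class + i) & 31] = sao_offset_val[i + 1];

    for (int x = 0; x < width; x++) {
        int v = src[x] + table[src[x] >> (bit_depth - 5)];
        dst[x] = v < 0 ? 0 : v > max ? max : v;
    }
}
```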
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++function band_load_y ++ vmov.i64 q0, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q1, #0 ++ ldr r12, [sp, #12] @ sao_left_class ++ ++ mov r4, sp ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND ++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array ++ mov sp, r4 ++ ++ ldr r12, [sp, #20] @ height ++ pld [r1] ++ ++ sub r12, #1 ++ add r4, r1, r3 ++ bx lr +endfunc + -+function ff_hevc_sao_band_w16_neon_8, export=1 -+ init_sao_band -+1: subs r12, #4 -+ vld1.8 {q8}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vld1.8 {q9}, [r1, :128], r3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vld1.8 {q11}, [r1, :128], r3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8}, [r0, :128], r2 -+ vst1.8 {q9}, [r0, :128], r2 -+ vst1.8 {q10}, [r0, :128], r2 -+ vst1.8 {q11}, [r0, :128], r2 -+ bne 1b + -+ bx lr -+endfunc ++function band_load_c ++ vmov.i64 q2, #0 ++ ldr r12, [sp, #8] @ &sao_offset_val1[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ vmov.i64 q3, #0 ++ ldr r12, [sp, #12] @ sao_left_class + -+function ff_hevc_sao_band_w32_neon_8, export=1 -+ init_sao_band -+1: subs r12, #2 -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 -+ vst1.8 {q8-q9}, [r0, :128], r2 -+ vst1.8 {q10-q11}, [r0, :128], r2 -+ bne 1b ++ mov r4, sp @ Remember SP ++ sub sp, #32 ++ and sp, #~63 @ Align stack so we can wrap with a simple AND + -+ bx lr -+endfunc ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array + -+function ff_hevc_sao_band_w64_neon_8, export=1 -+ init_sao_band ++ @ And again for the 2nd set ++ ldr r12, [r4, #16] @ &sao_offset_val2[0] ++ add r12, #2 @ 1st interesting val is [1] ++ vld1.16 {d16}, [r12] @ Unaligned ++ ldr r12, [r4, #20] @ sao_left_class2 ++ ++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) ++ add r12, sp ++ vst1.8 {d16[0]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[2]}, [r12]! ++ and r12, #~32 ++ vst1.8 {d16[4]}, [r12]! 
++ and r12, #~32 ++ vst1.8 {d16[6]}, [r12] ++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array ++ ++ mov sp, r4 ++ ++ ldr r12, [sp, #28] @ height ++ pld [r1] + -+ push {r4, lr} + subs r12, #1 -+ mov r4, r1 -+ it ne -+ addne r4, r3 ++ add r4, r1, r3 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_64_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 + +1: subs r12, #1 + vldm r1, {q8-q11} + pld [r4] -+ vshr.u8 q12, q8, #3 -+ vshr.u8 q13, q9, #3 + add r1, r3 -+ vshr.u8 q14, q10, #3 -+ vshr.u8 q15, q11, #3 -+ sao_band_64 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ + it ne + addne r4, r3 + vstm r0, {q8-q11} @@ -2618,8 +5277,113 @@ index 0000000..08a021d + pop {r4, pc} +endfunc + ++@ ff_hevc_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+@ ff_hevc_sao_band_c_w64_neon_8( ++function ff_hevc_sao_band_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #2 ++ vld1.8 { q8, q9 }, [r1, :128], r3 ++ vld1.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ vmov.u8 q15, #128 ++ ++1: subs r12, #4 ++ vld1.8 { q8}, [r1, :128], r3 ++ vld1.8 { q9}, [r1, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 { q9}, [r0, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_band_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_y ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.32 {d16[0]}, [r1, :32], r3 ++ vld1.32 {d16[1]}, [r1, :32], r3 ++ vld1.32 {d17[0]}, [r1, :32], r3 ++ vld1.32 {d17[1]}, [r1, :32], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 ++ ++ vst1.32 {d16[0]}, [r0, :32], r2 ++ vst1.32 {d16[1]}, [r0, :32], r2 ++ vst1.32 {d17[0]}, [r0, :32], r2 ++ vst1.32 {d17[1]}, [r0, :32], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_32_neon_8( +@ uint8_t * dst [r0] +@ uint8_t 
* src [r1] +@ uint32_t dst_stride [r2] @@ -2631,707 +5395,1535 @@ index 0000000..08a021d +@ int width sp[16] +@ int height sp[20] + -+@ As this is often done in-place on the frame buffer it is worth preloading -+@ the pixel values but we want to beware of loading ouside our buffer to avoid -+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) ++function ff_hevc_sao_band_c_32_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c + -+function ff_hevc_sao_band_c_neon_8, export=1 -+ mov r12, sp -+ push {r4-r8, lr} // 24 bytes ++ vmov.i8 q15, #128 ++ sub r3, #32 ++ sub r2, #32 + -+ ldm r12, {r4-r7} ++1: subs r12, #1 ++ vld2.8 { q8, q9 }, [r1, :128]! ++ vld2.8 {q10, q11}, [r1, :128], r3 + -+ add r4, #2 -+ add r6, #2 -+ vld1.16 {d16}, [r4] @ Unaligned -+ lsl r5, r5, #3 -+ vld1.16 {d18}, [r6] -+ pld [r1] -+ vmov.i8 d17, #0 -+ mov r4, r1 -+ vmov.i8 d19, #0 -+ lsl r7, r7, #3 -+ vdup.8 q1, r5 -+ ldr r5, [r12, #16] @ width -+ vdup.8 q2, r7 -+ ldr r12, [r12, #20] -+ vqmovn.s16 d0, q8 -+ cmp r5, #16 @ At some point we may want a table lookup -+ vqmovn.s16 d1, q9 -+ vmov.i8 q3, #128 -+ beq 16f ++ pld [r4] + -+ @ d0 U lookup -+ @ d1 V lookup -+ @ q1 U raw offset -+ @ q2 V raw offset -+ @ q3 #128 ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + -+ @ r4 = r1 = src - Inteded for preload pointer -+ @ r12 = height ++ vst2.8 { q8, q9 }, [r0, :128]! ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ itt ne ++ addne r4, r3 ++ addne r4, #32 ++ ++ bpl 1b ++ ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_16_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ vmov.i8 q15, #128 ++ ++1: subs r12, #2 ++ vld2.8 { q8, q9 }, [r1, :128], r3 ++ vld2.8 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++@ ff_hevc_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_sao_band_c_8_neon_8, export=1 ++ push {r4, lr} ++ bl band_load_c ++ ldr lr, [sp, #16] @ width ++ vmov.u8 q15, #128 ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.8 {d16, d17}, [r1, :128], r3 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vst2.8 {d16, d17}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #1 ++ vld1.8 {d16}, [r1, :64], r3 ++ vld1.8 {d17}, [r1, :64], r3 ++ vuzp.8 d16, d17 ++ ++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 ++ ++ vzip.8 d16, d17 ++ vst1.8 {d16}, [r0, :64], r2 ++ vst1.8 {d17}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ vpush 
{q4-q7} ++ ++1: subs r12, #1 ++ vldm r1, {q4-q11} ++ add r1, r3 ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q2, #0 ++ vdup.i16 q3, lr ++ bl band_load_y ++ ++1: subs r12, #1 ++ vldm r1, {q8-q11} ++ add r1, r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ++1: subs r12, #2 ++ vld1.16 { q8, q9 }, [r1, :128], r3 ++ vld1.16 {q10, q11}, [r1, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8, q9 }, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r0, :128], r2 ++ bpl 1b ++ ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ push {r4, lr} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ bl band_load_y ++ ldr lr, [sp, #16] ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #2 ++ vld1.16 { q8}, [r1, :128], r3 ++ vld1.16 { q9}, [r1, :128], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 { q8}, [r0, :128], r2 ++ vst1.16 { q9}, [r0, :128], r2 ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #4 ++ vld1.16 {d16}, [r1, :64], r3 ++ vld1.16 {d17}, [r1, :64], r3 ++ vld1.16 {d18}, [r1, :64], r3 ++ vld1.16 {d19}, [r1, :64], r3 ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth ++ vst1.16 {d16}, [r0, :64], r2 ++ vst1.16 {d17}, [r0, :64], r2 ++ vst1.16 {d18}, [r0, :64], r2 ++ vst1.16 {d19}, [r0, :64], r2 ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ vpush {q4-q7} ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ 
vdup.i16 q15, lr ++ sub r2, #96 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q4, q5 }, [r1, :128]! ++ vld2.16 { q6, q7 }, [r1, :128]! ++ vld2.16 { q8, q9 }, [r1, :128]! ++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ pld [r4] ++ sub r1, #96 ++ ++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + -+ @ Might (unlikely) be called with height == 1 -+ subs r12, #1 + it ne + addne r4, r3 + -+1: -+ subs r12, #1 -+ vld2.8 {q8-q9}, [r1, :128]! -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ sub r1, #32 -+ vsub.u8 q15, q11, q2 -+ pld [r4] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 ++ vst2.16 { q4, q5 }, [r0, :128]! ++ vst2.16 { q6, q7 }, [r0, :128]! ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 + -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r3 @ Do not inc on final pass -+ vst2.8 {q8-q9}, [r0, :128]! -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ sub r0, #32 + bpl 1b + -+ pop {r4-r8, pc} -+ -+@ -- width 16 (UV pairs) -- -+16: -+ subs r12, #2 -+ it ne -+ addne r4, r4, r3, lsl #1 -+ -+1: -+ subs r12, #2 -+ vld2.8 {q8-q9}, [r1, :128], r3 -+ vsub.u8 q12, q8, q1 -+ vld2.8 {q10-q11}, [r1, :128], r3 -+ vsub.u8 q14, q10, q1 -+ vsub.u8 q13, q9, q2 -+ pld [r4] -+ vsub.u8 q15, q11, q2 -+ pld [r4, r3] -+ vshr.u8 q12, #3 -+ vadd.s8 q8, q3 -+ vshr.u8 q13, #3 -+ vadd.s8 q9, q3 -+ -+ vtbl.8 d24, {d0}, d24 -+ vshr.u8 q14, #3 -+ vtbl.8 d25, {d0}, d25 -+ vshr.u8 q15, #3 -+ vtbl.8 d26, {d1}, d26 -+ vadd.s8 q10, q3 -+ vtbl.8 d27, {d1}, d27 -+ vadd.s8 q11, q3 -+ vtbl.8 d28, {d0}, d28 -+ vqadd.s8 q8, q12 -+ vtbl.8 d29, {d0}, d29 -+ vqadd.s8 q9, q13 -+ vtbl.8 d30, {d1}, d30 -+ vqadd.s8 q10, q14 -+ vtbl.8 d31, {d1}, d31 -+ vsub.s8 q8, q3 -+ vqadd.s8 q11, q15 -+ vsub.s8 q9, q3 -+ vsub.s8 q10, q3 -+ vsub.s8 q11, q3 -+ -+ it ne -+ addne r4, r4, r3, lsl #1 -+ vst2.8 {q8-q9}, [r0, :128], r2 -+ vst2.8 {q10-q11}, [r0, :128], r2 -+ bpl 1b -+ -+ pop {r4-r8, pc} ++ vpop {q4-q7} ++ pop {r4, pc} ++.endm + ++function ff_hevc_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 +endfunc + + -+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 -+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 -+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 -+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 -+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 ++@ ff_hevc_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ sub r2, #32 ++ sub r3, #32 ++ ++1: subs r12, #1 ++ ++ vld2.16 { q8, q9 }, [r1, :128]! 
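In these chroma `band_c_*` paths the `vld2.16` de-interleave leaves Cb in the even q registers and Cr in the odd ones, and `sao_band_64b_16` is handed two different translate tables: `{d0-d3}` for U and `{d4-d7}` for V, both built by `band_load_c`. Per CbCr pair that amounts to (a hedged sketch, names illustrative):

```c
#include <stdint.h>

static void sao_band_c_pair_ref(uint16_t *dst, const uint16_t *src,
                                int pairs, const int8_t table_u[32],
                                const int8_t table_v[32], int bit_depth)
{
    const int max = (1 << bit_depth) - 1;

    for (int x = 0; x < pairs; x++) {
        int u = src[2 * x + 0] + table_u[src[2 * x + 0] >> (bit_depth - 5)];
        int v = src[2 * x + 1] + table_v[src[2 * x + 1] >> (bit_depth - 5)];
        dst[2 * x + 0] = u < 0 ? 0 : u > max ? max : u;
        dst[2 * x + 1] = v < 0 ? 0 : v > max ? max : v;
    }
}
```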
++ vld2.16 {q10, q11}, [r1, :128], r3 ++ ++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128]! ++ vst2.16 {q10, q11}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} +.endm + ++function ff_hevc_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc + -+// input -+// a in q0 - q3 -+// c in q4 - q7 -+// b in q8 - q11 -+// offset table r4,r5 and r6,r7 -+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C -+// output in q0 - q3 -+// clobbers q12 - q15 + -+@ a <- c <- b ++@ ff_hevc_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ push {r4, lr} ++ bl band_load_c ++ movw lr, #(1 << \bit_depth) - 1 ++ vmov.i64 q14, #0 ++ vdup.i16 q15, lr ++ ldr lr, [sp, #24] @ width ++ cmp lr, #8 ++ blt 4f ++ ++1: subs r12, #1 ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++ ++4: ++1: subs r12, #2 ++ vld2.16 {d16, d17}, [r1, :128], r3 ++ vld2.16 {d18, d19}, [r1, :128], r3 ++ ++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth ++ ++ vst2.16 {d16, d17}, [r0, :128], r2 ++ vst2.16 {d18, d19}, [r0, :128], r2 ++ ++ bpl 1b ++ pop {r4, pc} ++.endm ++ ++function ff_hevc_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values +@ -+@ It appears that Neon can stall if you try and use results too soon so we try to -+@ spread our instruction out ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + -+.macro edgeidx64 ++function edge_64b_body_8 + -+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q1 -+ vcgt.u8 q14, q6, q2 -+ vcgt.u8 q15, q7, q3 ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 + -+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 -+ vcgt.u8 q1, q1, q5 -+ vcgt.u8 q2, q2, q6 -+ vcgt.u8 q3, q3, q7 ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 + -+ vsub.s8 q0, q0, q12 // a = sign(c-a) -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 -+ vcgt.u8 q13, q5, q9 -+ vcgt.u8 q14, q6, q10 -+ vcgt.u8 q15, q7, q11 ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 + -+ vsub.s8 q0, q0, q12 -+ vsub.s8 q1, q1, q13 -+ vsub.s8 q2, q2, q14 -+ vsub.s8 q3, q3, q15 ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 + -+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 -+ vcgt.u8 q13, q9, q5 -+ vcgt.u8 q14, q10, q6 -+ vcgt.u8 q15, q11, q7 ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 + -+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) -+ vadd.s8 q1, q1, q13 -+ vmov.u8 q12, #2 -+ vadd.s8 q2, q2, q14 -+ vadd.s8 q3, q3, q15 ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 + -+ vadd.s8 q0, q0, q12 -+ vadd.s8 q1, q1, q12 -+ @ whilst vmov dn, rm, rn exists it is a vfp instruction -+ @ and causes a stall till neon pipe empty - so don't do that! 
-+ vmov d26[0], r4 -+ vmov d26[1], r5 -+ vmov d27[0], r6 -+ vmov d27[1], r7 -+ vadd.s8 q2, q2, q12 -+ vuzp.8 q0, q1 -+ vmov.u8 q15, #128 -+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 + -+ vtbl.8 d0, {d26}, d0 -+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add ++ vld1.8 {d26, d27}, [r5] + -+ vtbl.8 d1, {d26}, d1 -+ vadd.s8 q14, q5, q15 ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + -+ vtbl.8 d2, {d27}, d2 -+ vuzp.8 q2, q3 ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + -+ vtbl.8 d3, {d27}, d3 ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 + -+ vtbl.8 d4, {d26}, d4 -+ vzip.8 q0, q1 ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 + -+ vtbl.8 d5, {d26}, d5 -+ vqadd.s8 q0, q0, q12 -+ vqadd.s8 q1, q1, q14 -+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d3, {d27}, d3 + -+ vtbl.8 d6, {d27}, d6 -+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 + -+ vtbl.8 d7, {d27}, d7 -+ vzip.8 q2, q3 ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + -+ vsub.s8 q0, q0, q15 -+ vqadd.s8 q2, q2, q12 -+ vqadd.s8 q3, q3, q14 -+ vsub.s8 q1, q1, q15 -+ vsub.s8 q2, q2, q15 -+ vsub.s8 q3, q3, q15 ++ vtbl.8 d6, {d27}, d6 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + -+.endm ++ vtbl.8 d7, {d27}, d7 ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vuzp.8 q0, q1 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vmov.i64 q12, #0 ++ ++ vzip.8 q0, q1 ++ ++ vdup.i16 q13, r4 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 + -+function edge_w64_body -+ edgeidx64 -+ vstm r0, {q0-q3} -+ add r0, r0, r2 + bx lr +endfunc + -+.macro init_edge_64 -+ push {r4-r8,lr} -+ ldr r12, [sp, #24] // height -+ ldr r5, [sp, #28] // sao_offset_val_table -+ ldrd r4, r5, [r5] -+ mov r6, r4 -+ mov r7, r5 -+.endm + -+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #15 -+ vext.8 q1, q4, q5, #15 -+ vext.8 q2, q5, q6, #15 -+ vext.8 q3, q6, q7, #15 -+ // load b -+ vext.8 q8, q4, q5, #1 -+ vext.8 q9, q5, q6, #1 -+ vext.8 q10, q6, q7, #1 -+ vext.8 q11, q7, q12, #1 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 ++ ++ vsub.s8 q0, q3 ++ vsub.s8 q10, q9 ++ vadd.s8 q0, q10 @ a = sign(c-a) ++ ++ vadd.s8 q0, q14 ++ vuzp.8 d0, d1 ++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr +endfunc + -+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+ sub r1, r3 ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u8 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 ++ vsub.s16 q0, q3 @ a = sign(c-a) ++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.s16 q0, q3 ++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vadd.s8 q0, q0, q14 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 ++ push {r4-r6, lr} @ 16 bytes ++.set sp_base, 16 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++ mov r12, #2 ++.if \is_chroma ++ ldr r4, [sp, #16] ++.set sp_base, sp_base + 4 ++.endif ++ vld1.8 {d16[2]}, [r3], r12 ++ vld1.8 {d16[0]}, [r3], r12 ++ vld1.8 {d16[1]}, [r3], r12 ++ vld1.8 {d16[3]}, [r3], r12 ++ vld1.8 {d16[4]}, [r3] ++.if \is_chroma ++ vld1.8 {d17[2]}, [r4], r12 ++ vld1.8 {d17[0]}, [r4], r12 ++ vld1.8 {d17[1]}, [r4], r12 ++ vld1.8 {d17[3]}, [r4], r12 ++ vld1.8 {d17[4]}, [r4] ++.else ++ vmov d17, d16 ++.endif ++ ++@ Setup constant registers ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++.else ++ vmov.u8 q15, #128 ++.endif ++ vmov.u8 q14, #2 ++.endif ++ movw r3, EDGE_SRC_STRIDE ++ ++@ If setup_64b we need the xlat table on the stack and q4-q7 saved ++.if \setup_64b ++ sub r5, sp, #16 ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may 
have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ cmp r12, #8 ++.endif ++ ldr r12, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.if \check_w4 ++ it lt ++ addlt r6, #16 ++.endif ++ ldr r6, [r6, r12, lsl #2] ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++ blx r6 ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++ blx r6 ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #8 ++1: vldm r1, {d7-d16} ++ subs r12, #1 ++ add r1, r3 + // load a -+ vld1.8 {q0-q1}, [r1, :128]! -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ sub r1, #32 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+1: subs r12, #1 ++ vext.8 q0, q3, q4, #(16 - \pb) ++ vext.8 q1, q4, q5, #(16 - \pb) ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) + // load b -+ vld1.8 {q8-q9}, [r1, :128]! -+ vld1.8 {q10-q11}, [r1, :128], r3 -+ sub r1, #32 -+ bl edge_w64_body ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {q4-q5}, [r1] ++ sub r1, #\pb ++ vld1.8 {q0-q1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q8-q9}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {q6-q7}, [r1] ++ sub r1, #\pb ++ vld1.8 {q2-q3}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {q10-q11}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ sub r3, #\pb * 2 ++ ++1: subs r12, #1 ++ ++ vld1.64 {q0}, [r1] @ load a ++ add r1, #\pb ++ vld1.64 {q1}, [r1, :128] @ load c ++ add r1, #\pb ++ vld1.64 {q2}, [r1], r3 @ load b ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #2 ++ ++ vld1.8 {d2}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d0}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d4}, [r1], r3 ++ sub r1, #\pb ++ vld1.8 {d3}, [r1, :64] ++ sub r1, #\pb ++ vld1.8 {d1}, [r1] ++ add r1, #(\pb * 2) ++ vld1.8 {d5}, [r1], r3 ++ sub r1, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r0, :64], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ mov r6, lr ++ ++1: subs r12, #4 ++ ++ vld1.32 {d2[0]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[0]}, [r1] ++ add r1, #(\pb * 2) ++ vld1.32 {d4[0]}, [r1], r3 @ R ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d0[1]}, [r1], r3 @ L ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 @ R ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[1]}, [r1], r3 @ M ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], 
r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ bgt 1b ++ bx r6 ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0-q1}, [r1, :128]! ++ vld1.8 {q2-q3}, [r1, :128], r3 ++ sub r1, #32 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ sub r1, #32 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q8-q9}, [r1, :128]! ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ sub r1, #32 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q4 ++ vmov.64 q1, q5 ++ vmov.64 q2, q6 ++ vmov.64 q3, q7 + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ bgt 1b ++ bx r6 ++.endm + -+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 ++.macro edge_32bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 + // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ sub r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #31 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #1 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #33 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++ vld1.8 {q0-q1}, [r1, :128], r3 ++ vld1.8 {q4-q5}, [r1, :128], r3 + -+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 -+ init_edge_64 -+ vpush {d8-d15} -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #1 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #33 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #1 -+ vld1.8 {q8-q9}, [r1]! 
-+ vld1.8 {q10-q11}, [r1] -+ sub r1, #31 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov q2, q4 ++ vmov q3, q5 ++ vld1.8 {q8-q9}, [r1, :128], r3 ++ vld1.8 {q10-q11}, [r1, :128], r3 ++ vmov q6, q8 ++ vmov q7, q9 + ++ bl \body_fn + -+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( -+@ uint8_t *_dst, r0 -+@ uint8_t *_src, r1 -+@ ptrdiff_t stride_dst, r2 -+@ ptrdiff_t stride_src, r3 -+@ int height, sp[0] -+@ int16_t *sao_offset_table_u, sp[4] -+@ int16_t *sao_offset_table_v); sp[8] -+@ int eo sp[12] ++ vst1.8 {q0,q1}, [r0], r2 ++ vst1.8 {q2,q3}, [r0], r2 + -+function ff_hevc_sao_edge_c_w64_neon_8, export=1 -+ push {r4-r8,lr} // 6 reg = 24 -+ ldr r5, [sp, #28] // sao_offset_val_table_u -+ ldr r7, [sp, #32] // sao_offset_val_table_v -+ -+ @ Load and rearrange offsets -+ @ Also "convert" from 16bit to 8bit -+ ldrb r4, [r5, #2] -+ ldrb r8, [r5, #4] -+ ldrb r6, [r7, #2] -+ ldrb r12, [r7, #4] -+ orr r4, r4, r8, lsl #8 -+ orr r6, r6, r12, lsl #8 -+ ldrb r8, [r5, #6] -+ ldrb r12, [r7, #6] -+ orr r4, r4, r8, lsl #24 -+ orr r6, r6, r12, lsl #24 -+ ldrb r5, [r5, #8] -+ ldrb r7, [r7, #8] -+ -+ ldr r12, [sp, #36] // e0 -+ adr r8, edge_c_tbl_w64 -+ ldr r8, [r8, r12, lsl #2] -+ -+ ldr r12, [sp, #24] // height -+ vpush {d8-d15} -+ mov pc, r8 -+ -+edge_c_tbl_w64: -+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 -+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 -+ -+ff_hevc_sao_edge_c_eo0_w64_neon_8: -+ sub r1, #8 -+1: subs r12, #1 -+ vld1.64 {d7}, [r1, :64]! -+ vld1.64 {q4-q5}, [r1, :128]! // load c -+ vld1.64 {q6-q7}, [r1, :128]! 
-+ vld1.64 {d24}, [r1, :64], r3 -+ sub r1, #72 -+ // load a -+ vext.8 q0, q3, q4, #14 -+ vext.8 q1, q4, q5, #14 -+ vext.8 q2, q5, q6, #14 -+ vext.8 q3, q6, q7, #14 -+ // load b -+ vext.8 q8, q4, q5, #2 -+ vext.8 q9, q5, q6, #2 -+ vext.8 q10, q6, q7, #2 -+ vext.8 q11, q7, q12, #2 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+ -+ff_hevc_sao_edge_c_eo1_w64_neon_8: -+ sub r1, r3 -+ // load a -+ vldm r1, {q0-q3} -+ add r1, r3 -+ // load c -+ vldm r1, {q4-q7} -+ add r1, r3 -+1: subs r12, #1 -+ // load b -+ vldm r1, {q8-q11} -+ add r1, r3 -+ bl edge_w64_body + // copy c to a -+ vmov.64 q0, q4 -+ vmov.64 q1, q5 -+ vmov.64 q2, q6 -+ vmov.64 q3, q7 ++ vmov.64 q0, q8 ++ vmov.64 q1, q9 ++ + // copy b to c -+ vmov.64 q4, q8 -+ vmov.64 q5, q9 -+ vmov.64 q6, q10 -+ vmov.64 q7, q11 -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vmov.64 q4, q10 ++ vmov.64 q5, q11 ++ bgt 1b ++ bx r6 ++.endm + -+ff_hevc_sao_edge_c_eo2_w64_neon_8: -+1: sub r1, r3 ++.macro edge_16b_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1, :128], r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++1: subs r12, #1 ++ // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ // copy c to a ++ vmov.64 q0, q1 ++ // copy b to c ++ vmov.64 q1, q2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ mov r6, lr ++ sub r1, r3 ++ // load a ++ vld1.8 {d0}, [r1, :64], r3 ++ vld1.8 {d2}, [r1, :64], r3 ++ ++1: subs r12, #2 ++ @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vmov.64 d1, d2 ++ vld1.8 {d4}, [r1, :64], r3 ++ vld1.8 {d5}, [r1, :64], r3 ++ vmov.64 d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ ++ // copy c to a ++ vmov.64 d0, d4 ++ // copy b to c ++ vmov.64 d2, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ mov r6, lr ++debug_me: ++ sub r1, r3 ++ // load a ++ vld1.32 {d0[0]}, [r1], r3 ++ vld1.32 {d0[1]}, [r1], r3 ++ ++1: subs r12, #4 ++ @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.32 {d4[0]}, [r1], r3 ++ vld1.32 {d4[1]}, [r1], r3 ++ vld1.32 {d5[0]}, [r1], r3 ++ vld1.32 {d5[1]}, [r1], r3 ++ ++ vmov.32 d1, d4 ++ vext.32 d2, d0, d4, #1 ++ vext.32 d3, d4, d5, #1 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ ++ vmov.32 d0, d5 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #32 ++ sub r3, #(32 - \pb) ++ ++1: sub r1, r3 + // load a + // TODO: fix unaligned load + // don't reload a like in eo1 -+ sub r1, #2 -+ vld1.8 {q0-q1}, [r1]! -+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #30 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ add r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #34 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} ++ vld1.8 {q0-q1}, [r1]! ++ vld1.8 {q2-q3}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q4-q5}, [r1, :128]! ++ vld1.8 {q6-q7}, [r1, :128], r3 ++ // load b ++ vld1.8 {q8-q9}, [r1]! ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(64 + \pb) ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b + -+ff_hevc_sao_edge_c_eo3_w64_neon_8: -+1: sub r1, r3 -+ // load a -+ // TODO: fix unaligned load -+ // don't reload a like in eo1 -+ add r1, #2 -+ vld1.8 {q0-q1}, [r1]! 
-+ vld1.8 {q2-q3}, [r1], r3 -+ sub r1, #34 -+ subs r12, #1 -+ // load c -+ vld1.8 {q4-q5}, [r1, :128]! -+ vld1.8 {q6-q7}, [r1, :128], r3 -+ sub r1, #32 -+ // load b -+ sub r1, #2 -+ vld1.8 {q8-q9}, [r1]! -+ vld1.8 {q10-q11}, [r1] -+ sub r1, #30 -+ bl edge_w64_body -+ bne 1b -+ vpop {d8-d15} -+ pop {r4-r8,pc} -+endfunc -+ -+ -+.macro init_edge_32 -+ ldr r12, [sp, #4] // sao_offset_val_table -+ vld1.32 {d31}, [r12] -+ ldr r12, [sp] // height ++ add r3, #(32 - \pb) ++ bx r6 +.endm + -+.macro diff out0, tmp0, in0, in1 -+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 -+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 -+ vsub.s8 \out0, \tmp0, \out0 // diff0 ++.macro edge_32bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ vld1.8 {q0-q1}, [r1], r3 ++ vld1.8 {q2-q3}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {q4-q5}, [r1, :128], r3 ++ vld1.8 {q6-q7}, [r1, :128] ++ // load b ++ add r1, #\pb ++ vld1.8 {q8-q9}, [r1], r3 ++ vld1.8 {q10-q11}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0], r2 ++ vst1.8 {q2-q3}, [r0], r2 ++ bgt 1b ++ ++ bx r6 +.endm + -+.macro table32 -+ vmov.s8 q10, #2 -+ vadd.s8 q0, q10 -+ vadd.s8 q1, q10 -+ vmov.s8 q10, #128 -+ vtbl.8 d0, {d31}, d0 -+ vadd.s8 q11, q2, q10 -+ vtbl.8 d1, {d31}, d1 -+ vadd.s8 q12, q3, q10 -+ vtbl.8 d2, {d31}, d2 -+ vqadd.s8 q11, q0 -+ vtbl.8 d3, {d31}, d3 -+ vqadd.s8 q12, q1 -+ vsub.s8 q0, q11, q10 -+ vsub.s8 q1, q12, q10 -+ vst1.8 {q0-q1}, [r0, :128], r2 ++.macro edge_16b_e2, body_fn, pb ++ mov r6, lr ++ add r3, #\pb ++ ++1: sub r1, r3 ++ // load a ++ vld1.8 {q0}, [r1], r3 ++ subs r12, #1 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load b ++ vld1.8 {q2}, [r1] ++ sub r1, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0], r2 ++ bgt 1b ++ bx r6 +.endm + -+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ sub r1, #4 -+1: subs r12, #1 -+ vld1.8 {q13-q14}, [r1]! 
-+ vld1.32 d30, [r1], r3 -+ sub r1, #32 -+ // a -+ vext.8 q0, q13, q14, #3 -+ vext.8 q1, q14, q15, #3 -+ vshr.u64 d24, d30, #24 -+ // c -+ vext.8 q2, q13, q14, #4 -+ vext.8 q3, q14, q15, #4 -+ vshr.u64 d16, d30, #32 -+ // diff0 -+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 -+ diff d18, d25, d24, d16 -+ // -diff1 -+ vext.s8 q0, q13, q14, #1 -+ vext.s8 q1, q14, q9, #1 ++.macro edge_8bx2_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb + -+ vsub.s8 q0, q13, q0 //diff0 + diff1 -+ vsub.s8 q1, q14, q1 -+ table32 -+ bne 1b -+ vpop {q4-q7} ++1: sub r1, r3 ++ vld1.8 {d0}, [r1], r3 ++ vld1.8 {d1}, [r1] ++ subs r12, #2 ++ // load c ++ add r1, #\pb ++ vld1.8 {d2}, [r1, :64], r3 ++ vld1.8 {d3}, [r1, :64] ++ // load b ++ add r1, #\pb ++ vld1.8 {d4}, [r1], r3 ++ vld1.8 {d5}, [r1] ++ sub r1, #(\pb * 2) + -+ bx lr ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0], r2 ++ vst1.8 {d1}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ mov r6, lr ++ sub r1, #\pb ++ ++1: sub r1, r3 ++ @ line 0 {d0[0], -, - } r1 lo ++ vld1.32 {d0[0]}, [r1], r3 ++ subs r12, #4 ++ @ Line 1 {d0[1], d2[0], - } r1 lo ++ vld1.32 {d0[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d2[0]}, [r1], r3 ++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid ++ vld1.32 {d2[1]}, [r1] ++ sub r1, #\pb ++ vld1.32 {d1[0]}, [r1] ++ add r1, #\pb * 2 ++ vld1.32 {d4[0]}, [r1], r3 ++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi ++ vld1.32 {d4[1]}, [r1] ++ sub r1, #\pb * 2 ++ vld1.32 {d1[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d3[0]}, [r1], r3 ++ @ Line 3 {-, d3[1], d5[0]} r1 mid ++ vld1.32 {d3[1]}, [r1] ++ add r1, #\pb ++ vld1.32 {d5[0]}, [r1], r3 ++ @ Line 4 {-, -, d5[1]} r1 hi ++ vld1.32 {d5[1]}, [r1] ++ sub r1, #(\pb * 2) ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0], r2 ++ vst1.32 {d0[1]}, [r0], r2 ++ vst1.32 {d1[0]}, [r0], r2 ++ vst1.32 {d1[1]}, [r0], r2 ++ bgt 1b ++ ++ bx r6 ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_64b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_32bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_16b_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_8bx2_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb 
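A note on the bodies these tables dispatch to: the second set of four entries (the `5:`/`15:`/`25:`/`35:` labels) exists because `edge_xxb_init`'s `check_w4` path adds 16 to the table pointer when width < 8, selecting the narrow-width bodies. Whatever the width, each `edge_*_body_*` computes the standard SAO edge classification; in hedged C, with `a`, `c`, `b` being the previous, current and next samples along the chosen direction (the 5-byte shuffle in `edge_xxb_init` builds the same remapped table in d16/d17):

```c
#include <stdint.h>

static int sao_edge_one_ref(int a, int c, int b,
                            const int16_t *sao_offset_val, int bit_depth)
{
    static const uint8_t edge_idx[5] = { 1, 2, 0, 3, 4 };
    const int max = (1 << bit_depth) - 1;

    /* 2 + sign(c - a) + sign(c - b), i.e. the vcgt/vsub/vadd chains */
    int idx = 2 + (c > a) - (c < a) + (c > b) - (c < b);
    int v   = c + sao_offset_val[edge_idx[idx]];

    return v < 0 ? 0 : v > max ? max : v;
}
```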
++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ .word 0f ++ .word 10f ++ .word 20f ++ .word 30f ++ .word 5f ++ .word 15f ++ .word 25f ++ .word 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {q4-q7} -+ // load a -+ sub r1, r3 -+ vld1.8 {q0-q1}, [r1, :128], r3 -+ // load c -+ vld1.8 {q2-q3}, [r1, :128], r3 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) -+1: subs r12, #1 -+ // load b -+ vld1.8 {q8-q9}, [r1, :128], r3 -+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) -+ vadd.s8 q0, q4, q12 //diff0 + diff1 -+ vadd.s8 q1, q5, q13 -+ table32 -+ // CMP ( c, a ) -+ vneg.s8 q12, q4 -+ vneg.s8 q13, q5 -+ // c -+ vmov.64 q2, q8 -+ vmov.64 q3, q9 -+ bne 1b -+ vpop {q4-q7} -+ bx lr ++@ void ff_hevc_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 -+ init_edge_32 -+ vpush {d8-d15} -+ // load a -+ sub r1, r3 -+ sub r1, #8 -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #7 -+ vext.8 q1, q11, q12, #7 -+ // load c -+ vld1.8 {d9}, [r1, :64]! -+ vld1.8 {q2-q3}, [r1, :64], r3 -+ sub r1, #8 -+ vext.8 q4, q4, q2, #15 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! 
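++@ Dispatch note: each edge_*_bodies macro emits a table of four (or, for
++@ the combined variants, eight) .word entries followed by the e0..e3
++@ handlers themselves. The edge_*_init macros (defined earlier in this
++@ file, not in this hunk) enter through that table, so in rough C terms -
++@ illustrative names only - the per-call selection is:
++@
++@     typedef void (*edge_fn_t)(void);
++@     // 8-entry tables: [eo] is the wide handler, [4 + eo] the narrow one
++@     static edge_fn_t pick_edge_fn(const edge_fn_t tbl[8], int eo, int narrow)
++@     {
++@         return tbl[(narrow ? 4 : 0) + eo];
++@     }
++@
++@ That is how a single entry point such as ff_hevc_sao_edge_8_neon_8 above
++@ can route small widths to the 8bx2/4bx4 bodies and larger ones to 16b.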
-+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #9 -+ vext.8 q9, q11, q12, #9 -+ vext.8 q6, q10, q11, #8 -+ vext.8 q7, q11, q12, #8 -+ vext.8 q5, q10, q11, #7 -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 -+ // inputs for next loop iteration -+ // a -+ vmov.8 q0, q4 -+ vext.8 q1, q2, q3, #15 -+ // c -+ vmov.8 q2, q6 -+ vmov.8 q3, q7 -+ vmov.8 q4, q5 -+ bne 1b -+ vpop {d8-d15} -+ bx lr ++@ void ff_hevc_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 +endfunc + -+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 -+ init_edge_32 -+ sub r1, r3 -+ // load a -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {d24}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q0, q10, q11, #1 -+ vext.8 q1, q11, q12, #1 -+ // load c -+ vld1.8 {q2-q3}, [r1, :64]! -+ vld1.8 {d30}, [r1, :64], r3 -+ sub r1, #40 -+1: subs r12, #1 -+ // load b -+ vld1.8 {q10-q11}, [r1, :64]! -+ vld1.8 {q12}, [r1, :64], r3 -+ sub r1, #32 -+ vext.8 q8, q10, q11, #7 -+ vext.8 q9, q11, q12, #7 -+ vext.8 q14, q12, q10, #7 ++@ void ff_hevc_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] + -+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 -+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 ++function ff_hevc_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc + -+ vadd.s8 q0, q12 //diff0 + diff1 -+ vadd.s8 q1, q13 -+ table32 ++@ ff_hevc_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] + -+ // inputs for next loop iteration -+ // a -+ vext.8 q0, q2, q3, #1 -+ vext.8 q1, q3, q15, #1 -+ // c -+ vext.8 q2, q8, q9, #1 -+ vext.8 q3, q9, q14, #1 -+ vext.8 d30, d28, d2, #1 -+ bne 1b -+ bx lr ++function ff_hevc_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, 
#8]
++
++function ff_hevc_sao_edge_8_neon_10, export=1
++ edge_16b_init 10, 0, 1, 99f
++99:
++ edge_16b_8bx2_bodies edge_16b_body_16, 2
++endfunc
++
++@ void ff_hevc_sao_edge_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_sao_edge_16_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99:
++ edge_32bx2_bodies edge_64b_body_16, 2
++endfunc
++
++@ void ff_hevc_sao_edge_64_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++@ We simply split the 64 case into 2 vertical stripes
++@ and call the fns for w32
++@
++@ Calling code will always have src != dst so we don't have to worry
++@ about edge effects
++
++function ff_hevc_sao_edge_64_neon_10, export=1
++ edge_64b_init 10, 0, 1, 99f
++endfunc
++
++@ void ff_hevc_sao_edge_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ uint8_t *_src, [r1]
++@ int stride_dst, [r2]
++@ int16_t *_sao_offset_val, [r3]
++@ int eo, [sp, #0]
++@ int width, [sp, #4]
++@ int height) [sp, #8]
++
++function ff_hevc_sao_edge_32_neon_10, export=1
++ edge_64b_init 10, 0, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_16, 2
++endfunc
++
++@ ff_hevc_sao_edge_c_8_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_sao_edge_c_8_neon_10, export=1
++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
++99:
++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
++endfunc
++
++@ ff_hevc_sao_edge_c_32_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_sao_edge_c_32_neon_10, export=1
++ edge_64b_init 10, 1, 1, 99f
++endfunc
++
++
++@ ff_hevc_sao_edge_c_16_neon_10(
++@ uint8_t *_dst, [r0]
++@ const uint8_t *_src, [r1]
++@ ptrdiff_t stride_dst, [r2]
++@ const int16_t *_sao_offset_val_u, [r3]
++@ const int16_t *_sao_offset_val_v, [sp, #0]
++@ int eo, [sp, #4]
++@ int width, [sp, #8]
++@ int height) [sp, #12]
++
++function ff_hevc_sao_edge_c_16_neon_10, export=1
++ edge_64b_init 10, 1, 0, 99f
++99:
++ edge_64b_bodies edge_64b_body_16, 4
+endfunc
+
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
index 1be52e7a12..bae5df4bc6 100644
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -410,6 +410,8 @@ enum AVCodecID {
 /* various PCM "codecs" */
 AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
 AV_CODEC_ID_PCM_S16LE = 0x10000,
-@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext {
- #define FF_BUG_DC_CLIP 4096
- #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders.
- #define FF_BUG_TRUNCATED 16384
-+#define FF_BUG_GMC_UNSUPPORTED 32768
-
- /**
- * strictly follow the standard (MPEG-4, ...). 
-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext { #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) #define FF_PROFILE_H264_CAVLC_444 44 @@ -3361,7 +6945,7 @@ index 39713ed..25eb52b 100644 #define FF_PROFILE_VC1_SIMPLE 0 #define FF_PROFILE_VC1_MAIN 1 -@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { +@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext { #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 #endif @@ -3371,11 +6955,12 @@ index 39713ed..25eb52b 100644 + * @author jc (08/02/2016) + */ + void * get_buffer_context; ++ } AVCodecContext; AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h -index 1bf1c62..ccfa991 100644 +index 1bf1c620d6..ccfa991f60 100644 --- a/libavcodec/cabac.h +++ b/libavcodec/cabac.h @@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; @@ -3394,140 +6979,11 @@ index 1bf1c62..ccfa991 100644 const uint8_t *bytestream_start; const uint8_t *bytestream; const uint8_t *bytestream_end; -diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c -index 9d94b72..535ebf0 100644 ---- a/libavcodec/codec_desc.c -+++ b/libavcodec/codec_desc.c -@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { - .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), - .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, - }, -+ { -+ .id = AV_CODEC_ID_H264_MVC, -+ .type = AVMEDIA_TYPE_VIDEO, -+ .name = "h264_mvc", -+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), -+ .props = AV_CODEC_PROP_LOSSY, -+ }, - - /* various PCM "codecs" */ - { -diff --git a/libavcodec/h264.h b/libavcodec/h264.h -index efe3555..16358aa 100644 ---- a/libavcodec/h264.h -+++ b/libavcodec/h264.h -@@ -126,7 +126,9 @@ enum { - NAL_END_STREAM = 11, - NAL_FILLER_DATA = 12, - NAL_SPS_EXT = 13, -+ NAL_SPS_SUBSET = 15, - NAL_AUXILIARY_SLICE = 19, -+ NAL_SLICE_EXT = 20, - NAL_FF_IGNORE = 0xff0f001, - }; - -diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c -index ce4bab2..b9b0c78 100644 ---- a/libavcodec/h264_parser.c -+++ b/libavcodec/h264_parser.c -@@ -58,6 +58,8 @@ typedef struct H264ParseContext { - uint8_t parse_history[6]; - int parse_history_count; - int parse_last_mb; -+ int is_mvc; -+ int slice_ext; - } H264ParseContext; - - -@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, - } else if (state <= 5) { - int nalu_type = buf[i] & 0x1F; - if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || -- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { -+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || -+ nalu_type == NAL_SPS_SUBSET) { - if (pc->frame_start_found) { - i++; - goto found; - } - } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || -- nalu_type == NAL_IDR_SLICE) { -+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { - state += 8; -+ -+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); - continue; - } - state = 7; - } else { - p->parse_history[p->parse_history_count++] = buf[i]; -- if (p->parse_history_count > 5) { -+ if (p->parse_history_count > 8) { - unsigned int mb, last_mb = p->parse_last_mb; - GetBitContext gb; - -- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); -+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); - p->parse_history_count = 0; - mb= get_ue_golomb_long(&gb); - p->parse_last_mb = mb; -@@ -145,7 
+150,7 @@ found: - pc->frame_start_found = 0; - if (p->is_avc) - return next_avc; -- return i - (state & 5) - 5 * (state > 7); -+ return i - (state & 5) - 8 * (state > 7); - } - - static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, -@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, - } - } - -- parse_nal_units(s, avctx, buf, buf_size); -+ if (!p->is_mvc) -+ parse_nal_units(s, avctx, buf, buf_size); - - if (avctx->framerate.num) - avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); -@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, - if ((state & 0xFFFFFF00) != 0x100) - break; - nalu_type = state & 0x1F; -- if (nalu_type == NAL_SPS) { -+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { - has_sps = 1; - } else if (nalu_type == NAL_PPS) - has_pps = 1; -@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { - .parser_close = h264_close, - .split = h264_split, - }; -+ -+static av_cold int init_mvc(AVCodecParserContext *s) -+{ -+ H264ParseContext *p = s->priv_data; -+ int ret = init(s); -+ if (ret < 0) -+ return ret; -+ -+ p->is_mvc = 1; -+ return 0; -+} -+ -+AVCodecParser ff_h264_mvc_parser = { -+ .codec_ids = { AV_CODEC_ID_H264_MVC }, -+ .priv_data_size = sizeof(H264ParseContext), -+ .parser_init = init_mvc, -+ .parser_parse = h264_parse, -+ .parser_close = h264_close, -+ .split = h264_split, -+}; diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c -index b478065..955e426 100644 +index c1fa67f67b..6f99021339 100644 --- a/libavcodec/hevc.c +++ b/libavcodec/hevc.c -@@ -41,8 +41,196 @@ +@@ -41,8 +41,346 @@ #include "hevc.h" #include "profiles.h" @@ -3535,33 +6991,19 @@ index b478065..955e426 100644 + #include "rpi_qpu.h" + #include "rpi_shader.h" + #include "rpi_shader_cmd.h" ++ #include "rpi_shader_template.h" + #include "rpi_zc.h" ++ #include "libavutil/rpi_sand_fns.h" + + // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory + #define RPI_CACHE_UNIF_MVS 1 + -+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) -+ //#define RPI_SIMULATE_QPUS -+ #ifdef RPI_WORKER -+ #include "pthread.h" -+ #endif ++ #include "pthread.h" ++ #include "libavutil/atomic.h" + + static void worker_core(HEVCContext * const s); -+ -+ // We can pred any block height but annoyingly if we we do then the TMU cache -+ // explodes and it goes even slower :-( -+ #if 0 -+ #define Y_P_MAX_H 16 -+ #define Y_B_MAX_H 16 -+ #else -+ #define Y_P_MAX_H 64 -+ #define Y_B_MAX_H 64 -+ #endif +#endif + -+// #define DISABLE_MC -+ -+#define DISABLE_CHROMA 0 +#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + +#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) @@ -3573,8 +7015,6 @@ index b478065..955e426 100644 +} +# define av_mod_uintp2 av_mod_uintp2_c +#endif -+ -+#define Y_B_ONLY 0 + const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; @@ -3584,18 +7024,23 @@ index b478065..955e426 100644 +#define MC_DUMMY_X (-32) +#define MC_DUMMY_Y (-32) + -+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks -+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks -+// For each block of 64*64 the smallest block size is 8x4 -+// We also need an extra command for the setup information ++// UV still has min 4x4 pred ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have 
to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) ++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) ++ ++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) ++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + -+#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +// The QPU code for UV blocks only works up to a block width of 8 +#define RPI_CHROMA_BLOCK_WIDTH 8 + +#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + -+// TODO Chroma only needs 4 taps + +// Actual filter goes -ve, +ve, +ve, -ve using these values +static const uint32_t rpi_filter_coefs[8] = { @@ -3609,29 +7054,135 @@ index b478065..955e426 100644 + ENCODE_COEFFS( 2, 10, 58, 2) +}; + -+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ 
const unsigned int n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) ++{ ++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; ++ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++ ++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob *const jb = s->jobs + i; ++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); ++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); ++ } ++} ++ + +#endif + + -+#ifdef RPI_WORKER -+ -+typedef struct worker_global_env_s -+{ -+ volatile int arm_load; -+ pthread_mutex_t lock; -+ -+ unsigned int arm_y; -+ unsigned int arm_c; -+ unsigned int gpu_y; -+ unsigned int gpu_c; -+} worker_global_env_t; -+ -+static worker_global_env_t worker_global_env = -+{ -+ .lock = PTHREAD_MUTEX_INITIALIZER -+}; -+ ++#ifdef RPI + +//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); @@ -3639,108 +7190,154 @@ index b478065..955e426 100644 +#define LOG_ENTER +#define LOG_EXIT + ++#define USE_SEM 1 ++ +// Call this when we have completed pass0 and wish to trigger pass1 for the current job -+static void worker_submit_job(HEVCContext *s) ++static void worker_submit_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ s->worker_tail++; -+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_post(&s->jb0->sem_in); ++ s->jb0->pending = 1; ++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb0 = s->jobs + s->pass0_job; ++ LOG_EXIT +} + +// Call this to say we have completed pass1 -+static void worker_complete_job(HEVCContext *s) ++static void worker_complete_job(HEVCContext * const s) +{ -+ LOG_ENTER -+ 
pthread_mutex_lock(&s->worker_mutex); -+ s->worker_head++; -+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot -+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_ENTER ++ sem_t * const sem = &s->jb1->sem_out; ++ // Must set job no before signalling as otherwise rpi_do_all_passes ++ // may call worker_core from the main thread with a bad job number ++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot ++ s->jb1 = s->jobs + s->pass1_job; ++ sem_post(sem); ++ LOG_EXIT +} + -+// Call this to wait for all jobs to have completed at the end of a frame -+static void worker_wait(HEVCContext *s) -+{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ while( s->worker_head !=s->worker_tail) -+ { -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT -+} + +// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +// available to receive the next job. +static void worker_pass0_ready(HEVCContext *s) +{ -+ LOG_ENTER -+ pthread_mutex_lock(&s->worker_mutex); -+ // tail is number of submitted jobs -+ // head is number of completed jobs -+ // tail-head is number of outstanding jobs in the queue -+ // we need to ensure there is at least 1 space left for us to use -+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) -+ { -+ // Wait until another job is completed -+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); ++ LOG_ENTER ++ HEVCRpiJob * const jb = s->jb0; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; + } -+ pthread_mutex_unlock(&s->worker_mutex); -+ LOG_EXIT ++ LOG_EXIT ++} ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++static void worker_wait(HEVCContext * const s) ++{ ++ LOG_ENTER ++ unsigned int i; ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ HEVCRpiJob * const jb = s->jobs + i; ++ if (jb->pending) { ++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) ++ /* Loop */; ++ jb->pending = 0; ++ } ++ } ++ LOG_EXIT +} + +static void *worker_start(void *arg) +{ -+ HEVCContext *s = (HEVCContext *)arg; -+ while(1) { -+ pthread_mutex_lock(&s->worker_mutex); ++ HEVCContext * const s = (HEVCContext *)arg; + -+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) ++ for (;;) + { -+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); -+ } -+ pthread_mutex_unlock(&s->worker_mutex); ++ HEVCRpiJob * const jb = s->jb1; ++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) ++ /* Loop */; ++ if (jb->terminate) ++ break; + -+ if (s->kill_worker) { -+ break; ++ LOG_ENTER ++ worker_core(s); ++ worker_complete_job(s); ++ LOG_EXIT + } -+ LOG_ENTER -+ worker_core(s); -+ -+ worker_complete_job(s); -+ LOG_EXIT -+ } -+ return NULL; ++ return NULL; +} + ++static void worker_pic_free_all(HEVCContext * const s) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if (cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++ } ++} ++ ++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) ++{ ++ unsigned int i; ++ ++ // Free coeff stuff - allocation not the same for all buffers ++ for(i = 0; i < RPI_MAX_JOBS; 
i++) ++ { ++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; ++ ++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL); ++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); ++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); ++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero apping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ } ++ return 0; ++ ++fail: ++ printf("%s: **** Failed\n", __func__); ++ worker_pic_free_all(s); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++ } ++} +#endif ++ + /** * NOTE: Each function hls_foo correspond to the function foo in the * specification (HLS stands for High Level Syntax). -@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 /* free everything allocated by pic_arrays_init() */ static void pic_arrays_free(HEVCContext *s) { +#ifdef RPI -+ int job; -+ for(job=0;jobcoeffs_buf_arm[job][0]) { -+ gpu_free(&s->coeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = 0; -+ } -+ if (s->coeffs_buf_arm[job][2]) { -+ gpu_free(&s->coeffs_buf_accelerated[job]); -+ s->coeffs_buf_arm[job][2] = 0; -+ } -+ } ++ worker_pic_free_all(s); +#endif ++ +#ifdef RPI_DEBLOCK_VPU + { + int i; @@ -3757,7 +7354,7 @@ index b478065..955e426 100644 av_freep(&s->sao); av_freep(&s->deblock); -@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) int ctb_count = sps->ctb_width * sps->ctb_height; int min_pu_size = sps->min_pu_width * sps->min_pu_height; @@ -3766,32 +7363,17 @@ index b478065..955e426 100644 + const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; + const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; + const int coefs_per_row = coefs_per_luma + coefs_per_chroma; -+ int job; + + av_assert0(sps); -+// s->max_ctu_count = sps->ctb_width; -+// printf("CTB with=%d\n", sps->ctb_width); -+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; -+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); -+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; -+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; ++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; ++#if RPI_ROUND_TO_LINES ++ // Round down to an integral quantity of lines ++ if (s->max_ctu_count > sps->ctb_width) ++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; ++#endif + -+ for(job=0;jobcoeffs_buf_default[job]); -+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; -+ if (!s->coeffs_buf_arm[job][0]) -+ goto fail; -+ -+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data -+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; -+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; -+ if (!s->coeffs_buf_arm[job][2]) -+ goto fail; -+ 
s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. -+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; -+ } -+ } ++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) ++ goto fail; +#endif +#ifdef RPI_DEBLOCK_VPU + { @@ -3847,7 +7429,7 @@ index b478065..955e426 100644 s->bs_width = (width >> 2) + 1; s->bs_height = (height >> 2) + 1; -@@ -137,6 +434,29 @@ fail: +@@ -137,6 +560,29 @@ fail: return AVERROR(ENOMEM); } @@ -3877,16 +7459,18 @@ index b478065..955e426 100644 static void pred_weight_table(HEVCContext *s, GetBitContext *gb) { int i = 0; -@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) { #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) - enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +- int ret, i; + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; - int ret, i; ++ int ret; pic_arrays_free(s); -@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + s->ps.sps = NULL; +@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm switch (sps->pix_fmt) { case AV_PIX_FMT_YUV420P: case AV_PIX_FMT_YUVJ420P: @@ -3899,7 +7483,20 @@ index b478065..955e426 100644 #if CONFIG_HEVC_DXVA2_HWACCEL *fmt++ = AV_PIX_FMT_DXVA2_VLD; #endif -@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if RPI_HEVC_SAND ++ // Currently geometry calc is stuffed for big sizes ++ if (sps->width < 2048 && sps->height <= 1088) { ++ *fmt++ = AV_PIX_FMT_SAND64_10; ++ } ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ret = ff_thread_get_format(s->avctx, pix_fmts); if (ret < 0) goto fail; @@ -3907,22 +7504,56 @@ index b478065..955e426 100644 s->avctx->pix_fmt = ret; } else { -@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm + ff_hevc_pred_init(&s->hpc, sps->bit_depth); + ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); + ff_videodsp_init (&s->vdsp, sps->bit_depth); ++#ifdef RPI ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++#endif + +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +- } ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); + + if (sps->sao_enabled && !s->avctx->hwaccel) { +- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +- int c_idx; ++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 
3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; + for(c_idx = 0; c_idx < c_count; c_idx++) { int w = sps->width >> sps->hshift[c_idx]; int h = sps->height >> sps->vshift[c_idx]; -+ // ******** Very very nasty allocation kludge for plaited Chroma - s->sao_pixel_buffer_h[c_idx] = +- s->sao_pixel_buffer_h[c_idx] = - av_malloc((w * 2 * sps->ctb_height) << -+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << - sps->pixel_shift); - s->sao_pixel_buffer_v[c_idx] = +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = - av_malloc((h * 2 * sps->ctb_width) << -+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << - sps->pixel_shift); +- sps->pixel_shift); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! ++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; } -@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) + + s->ps.sps = sps; +@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s) (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { pred_weight_table(s, gb); } @@ -3934,20 +7565,25 @@ index b478065..955e426 100644 sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { -@@ -931,6 +1264,34 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { +@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { return 0; } +#ifdef RPI ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) ++{ ++ return s->jb0->intra.cmds + s->jb0->intra.n++; ++} ++ +static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +{ + // U & V done on U call in the case of sliced frames -+ if (rpi_sliced_frame(s->frame) && c_idx > 1) ++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) + return; + + if (s->enable_rpi) { + HEVCLocalContext *lc = s->HEVClc; -+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_INTRA; + cmd->size = log2_trafo_size; + cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; @@ -3956,7 +7592,7 @@ index b478065..955e426 100644 + cmd->i_pred.y = y0; + cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; + } -+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { ++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { + s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); + } + else { @@ -3969,7 +7605,7 @@ index b478065..955e426 100644 static int hls_transform_unit(HEVCContext *s, int x0, int y0, int xBase, int yBase, int cb_xBase, int cb_yBase, int log2_cb_size, int log2_trafo_size, -@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { int trafo_size = 1 << log2_trafo_size; ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); @@ -3982,7 +7618,7 @@ index b478065..955e426 100644 } if (cbf_luma || cbf_cb[0] || cbf_cr[0] || -@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -3994,7 +7630,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); @@ -4006,7 +7642,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), -@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4018,7 +7654,7 @@ index b478065..955e426 100644 } if (cbf_cb[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, if (lc->cu.pred_mode == MODE_INTRA) { ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), trafo_size_h, trafo_size_v); @@ -4030,7 +7666,7 @@ index b478065..955e426 100644 } if (cbf_cr[i]) ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), -@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); @@ -4077,7 +7713,7 @@ index b478065..955e426 100644 } } } -@@ -1275,47 +1675,120 @@ do { +@@ -1281,47 +1827,119 @@ do { return 0; } @@ -4112,13 +7748,13 @@ index b478065..955e426 100644 - s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); - if (s->ps.sps->chroma_format_idc) { - s->hevcdsp.put_pcm(dst1, stride1, -+#ifdef RPI -+ if 
(rpi_sliced_frame(s->frame)) { -+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), + s->frame->linesize[0], + cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + -+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), + s->frame->linesize[1], cb_size >> s->ps.sps->hshift[1], cb_size >> s->ps.sps->vshift[1], @@ -4157,10 +7793,9 @@ index b478065..955e426 100644 +#ifdef RPI +int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +{ -+ int16_t * const coeffs = (buf_no != 3) ? -+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : -+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; -+ s->num_coeffs[s->pass0_job][buf_no] += n; ++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; + return coeffs; +} +#endif @@ -4205,7 +7840,7 @@ index b478065..955e426 100644 + + // Add command + { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); + cmd->type = RPI_PRED_I_PCM; + cmd->size = log2_cb_size; + cmd->i_pcm.src = coeffs; @@ -4223,99 +7858,7 @@ index b478065..955e426 100644 /** * 8.5.3.2.2.1 Luma sample unidirectional interpolation process * -@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) - * @param luma_offset additive offset applied to the luma prediction value - */ - -+#if RPI_INTER -+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref, const Mv *mv, int x_off, int y_off, -+ int block_w, int block_h, int luma_weight, int luma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_UNI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref->data[0]; -+ cmd->srcstride = ref->linesize[0]; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = luma_weight; -+ cmd->offset = luma_offset; -+} -+ -+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, -+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, -+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, -+ const struct MvField * const current_mv) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_LUMA_BI; -+ cmd->dst = dst; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[0]; -+ cmd->srcstride = ref0->linesize[0]; -+ cmd->mv = *mv0; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[0]; -+ cmd->srcstride1 = ref1->linesize[0]; -+ cmd->mv1 = *mv1; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, -+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, -+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] 
+ s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_UNI; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = src0; -+ cmd->srcstride = srcstride; -+ cmd->mv = *mv; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->weight = chroma_weight; -+ cmd->offset = chroma_offset; -+} -+ -+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, -+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) -+{ -+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; -+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; -+ cmd->dst = dst0; -+ cmd->dststride = dststride; -+ cmd->src = ref0->data[cidx+1]; -+ cmd->srcstride = ref0->linesize[cidx+1]; -+ cmd->mv = current_mv->mv[0]; -+ cmd->mv1 = current_mv->mv[1]; -+ cmd->x_off = x_off; -+ cmd->y_off = y_off; -+ cmd->block_w = block_w; -+ cmd->block_h = block_h; -+ cmd->src1 = ref1->data[cidx+1]; -+ cmd->srcstride1 = ref1->linesize[cidx+1]; -+ cmd->ref_idx[0] = current_mv->ref_idx[0]; -+ cmd->ref_idx[1] = current_mv->ref_idx[1]; -+} -+ -+#endif -+ - static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, - AVFrame *ref, const Mv *mv, int x_off, int y_off, - int block_w, int block_h, int luma_weight, int luma_offset) -@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); int idx = ff_hevc_pel_weight[block_w]; @@ -4326,7 +7869,7 @@ index b478065..955e426 100644 x_off += mv->x >> 2; y_off += mv->y >> 2; src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, * @param mv1 motion vector1 (relative to block position) to get pixel data from * @param current_mv current motion vector structure */ @@ -4335,7 +7878,7 @@ index b478065..955e426 100644 AVFrame *ref0, const Mv *mv0, int x_off, int y_off, int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) { -@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); @@ -4346,7 +7889,7 @@ index b478065..955e426 100644 if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { -@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, intptr_t _mx = mx << (1 - hshift); intptr_t _my = my << (1 - vshift); @@ -4357,7 +7900,7 @@ index b478065..955e426 100644 x_off += mv->x >> (2 + hshift); y_off += mv->y >> (2 + vshift); src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); -@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +@@ -1572,6 +2202,10 @@ static 
void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF int hshift = s->ps.sps->hshift[1]; int vshift = s->ps.sps->vshift[1]; @@ -4368,7 +7911,125 @@ index b478065..955e426 100644 intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); -@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF + _mx1, _my1, block_w); + } + +-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, +- const Mv *mv, int y0, int height) ++#ifdef RPI ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field) + { +- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCContext *const fs = ref->tf.owner->priv_data; ++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert0(pwait->req == -1 && pwait->next == NULL); + +- if (s->threads_type == FF_THREAD_FRAME ) +- ff_thread_await_progress(&ref->tf, y, 0); ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ while (sem_wait(sem) != 0) ++ av_assert0(errno == EINTR); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) ++{ ++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRPiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} ++ ++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) ++{ ++ av_assert0(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++#endif ++ ++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, ++ const Mv * const mv, const int y0, const int height) ++{ ++ if (s->threads_type == FF_THREAD_FRAME) { ++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); ++ ++#ifdef RPI ++ if (s->enable_rpi) { ++ int16_t *const pr = s->jb0->progress + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++ 
else ++#endif ++ // It is a const ThreadFrame but the prototype isn't ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ } + } + + static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, } } @@ -4378,21 +8039,106 @@ index b478065..955e426 100644 + +#if RPI_INTER + -+static HEVCRpiLumaPred * -+rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) +{ -+ HEVCRpiLumaPred * yp = s->curr_pred_y; -+ HEVCRpiLumaPred * ypt = yp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { ++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; ++ HEVCRpiInterPredQ * ypt = yp + 1; ++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { + if (ypt->load < yp->load) + yp = ypt; + } + -+// yp->load += load_val; -+ ++yp->load; ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd ++ + return yp; +} + ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr->data[-1] = q->code_sync; ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); ++ q->load = 0; ++ } ++} ++ ++// Returns 0 on success, -1 if Q is dangerously full ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; ++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { ++ return -1; ++ } ++ } ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ memset(ipe, 0, sizeof(*ipe)); ++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++#if RPI_CACHE_UNIF_MVS ++ gpu_malloc_cached(total_size, &ipe->gptr); ++#else ++ gpu_malloc_uncached(total_size, &ipe->gptr); ++#endif ++} ++ ++ ++#if RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline int offset_depth_adj(const HEVCContext *const s, const int wt) ++{ ++ return s->ps.sps->high_precision_offsets_enabled_flag ? 
wt : ++ wt << (s->ps.sps->bit_depth - 8); ++} ++ +static void +rpi_pred_y(HEVCContext *const s, const int x0, const int y0, + const int nPbW, const int nPbH, @@ -4401,116 +8147,155 @@ index b478065..955e426 100644 + const int weight_offset, + AVFrame *const src_frame) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); -+ -+// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// weight_mul, weight_offset); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); + ++ if (my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my << 8) | mx; -+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; -+ const int x1_m3 = x0 + (mv->x >> 2) - 3; -+ const int y1_m3 = y0 + (mv->y >> 2) - 3; -+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); -+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; -+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int bh = nPbH; + -+ // Potentially we could change the assembly code to support taller sizes in one go -+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) ++ for (int start_x = 0; start_x < nPbW; start_x += 16) + { -+ const uint32_t src_yx_y = y1_m3 + start_y; -+ int start_x = 0; -+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; + -+#if 1 -+ // As Y-pred operates on two independant 8-wide src blocks we can merge -+ // this pred with the previous one if it the previous one is 8 pel wide, -+ // the same height as the current block, immediately to the left of our -+ // current dest block and mono-pred. 
-+ -+ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; -+ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) -+ { -+ const int bw = FFMIN(nPbW, 8); -+ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; -+ -+ last_y8_lx->next_src2_x = x1_m3; -+ last_y8_lx->next_src2_y = src_yx_y; -+ last_y8_lx->next_src2_base = src_vc_address_y; -+ last_y8_p->p.w += bw; -+ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); -+ last_y8_p->p.wo2 = wo; -+ -+ s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; -+ start_x = bw; +#if RPI_TSTATS -+ ++s->tstats.y_pred1_y8_merge; -+#endif ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; + } +#endif + -+ for (; start_x < nPbW; start_x += 16) -+ { -+ const int bw = FFMIN(nPbW - start_x, 16); -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (mv->x >> 2) - 3; ++ const int y1_m3 = y0 + (mv->y >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. 
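The merge test implementing the comment above is small enough to show in isolation; a hedged sketch with stand-in fields (the patch's actual code uses qpu_mc_pred_y_p_t and also rewrites the source-2 fetch and the weight/offset words):

    /* Sketch only: widen a still-open 8-wide mono-pred command when the
     * next block has the same height and lands exactly 8 pels to its
     * right. Field names are stand-ins, not the patch's actual struct. */
    #include <stdint.h>

    typedef struct y_cmd {
        unsigned int w, h;  /* Block width and height in pels */
        uint32_t dst_addr;  /* Destination address of the block */
    } y_cmd_t;

    static int try_merge_y8(y_cmd_t *const prev, const unsigned int bw,
                            const unsigned int bh, const uint32_t dst_addr,
                            const unsigned int xshl)
    {
        if (prev == NULL || prev->h != bh ||
            prev->dst_addr + (8u << xshl) != dst_addr)
            return 0;   /* Not mergeable: caller emits a fresh command */
        prev->w += bw;  /* Second 8-wide source pipe now does this block */
        return 1;
    }

Since the QPU filter always runs two independent 8-wide source fetches, pairing two neighbouring mono-pred blocks this way fills the otherwise-dummy second pipe for free, which is why the code below tracks last_y8_p / last_y8_l1 across calls.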
++ ++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ s->last_y8_p = NULL; ++ s->last_y8_l1 = NULL; ++ start_x = bw; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ if (mx == 0 && my == 0) -+ ++ts->y_pred1_x0y0; -+ else if (mx == 0) -+ ++ts->y_pred1_x0; -+ else if (my == 0) -+ ++ts->y_pred1_y0; -+ else -+ ++ts->y_pred1_xy; -+ -+ if (nPbW > 8) -+ ++ts->y_pred1_wgt8; -+ else -+ ++ts->y_pred1_wle8; -+ -+ if (nPbH > 16) -+ ++ts->y_pred1_hgt16; -+ else -+ ++ts->y_pred1_hle16; -+ } ++ ++s->tstats.y_pred1_y8_merge; ++#endif ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter; -+ cmd_lx->next_src1_x = x1_m3 + start_x; -+ cmd_lx->next_src1_y = src_yx_y; -+ cmd_lx->next_src1_base = src_vc_address_y; -+ if (bw <= 8) -+ { -+ cmd_lx->next_src2_x = MC_DUMMY_X; -+ cmd_lx->next_src2_y = MC_DUMMY_Y; -+ cmd_lx->next_src2_base = s->qpu_dummy_frame; -+ } -+ else -+ { -+ cmd_lx->next_src2_x = x1_m3 + start_x + 8; -+ cmd_lx->next_src2_y = src_yx_y; -+ cmd_lx->next_src2_base = src_vc_address_y; -+ } -+ cmd_y->p.w = bw; -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo; -+ cmd_y->p.wo2 = wo; -+ cmd_y->p.dst_addr = dst_addr + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; + -+ if (bw == 8) { -+ s->last_y8_lx = cmd_lx; -+ s->last_y8_p = cmd_y; -+ } ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ s->last_y8_l1 = src2; ++ s->last_y8_p = cmd_y; + } + } + } @@ -4524,168 +8309,180 @@ index b478065..955e426 100644 + AVFrame *const src_frame, + AVFrame *const src_frame2) +{ -+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); + const Mv * const mv = mv_field->mv + 0; + const Mv * const mv2 = mv_field->mv + 1; + -+// rpi_luma_mc_bi(s, s->frame->data[0] 
+ y_off, s->frame->linesize[0], src_frame, -+// mv, x0, y0, nPbW, nPbH, -+// src_frame2, mv2, mv_field); ++ const unsigned int mx = mv->x & 3; ++ const unsigned int my = mv->y & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = mv2->x & 3; ++ const unsigned int my2 = mv2->y & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wt_offset = ++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; ++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) + { -+ const unsigned int mx = mv->x & 3; -+ const unsigned int my = mv->y & 3; -+ const unsigned int my_mx = (my<<8) | mx; -+ const unsigned int mx2 = mv2->x & 3; -+ const unsigned int my2 = mv2->y & 3; -+ const unsigned int my2_mx2 = (my2<<8) | mx2; -+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const int x1 = x0 + (mv->x >> 2); ++ const int y1 = y0 + (mv->y >> 2); ++ const int x2 = x0 + (mv2->x >> 2); ++ const int y2 = y0 + (mv2->y >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 + const int x1 = x0 + (mv->x >> 2) - 3; + const int y1 = y0 + (mv->y >> 2) - 3; + const int x2 = x0 + (mv2->x >> 2) - 3; + const int y2 = y0 + (mv2->y >> 2) - 3; -+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; -+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; -+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + -+ s->sh.luma_offset_l1[ref_idx1] + 1; -+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); -+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); ++ const int bh = nPbH; + -+ uint32_t dst = get_vc_address_y(s->frame) + y_off; -+ const uint32_t src1_base = get_vc_address_y(src_frame); -+ const uint32_t src2_base = get_vc_address_y(src_frame2); -+ -+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) -+ { -+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); -+ -+ for (int start_x=0; start_x < nPbW; start_x += 8) -+ 
{ // B blocks work 8 at a time -+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); -+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; -+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +#if RPI_TSTATS -+ { -+ HEVCRpiStats *const ts = &s->tstats; -+ const unsigned int mmx = mx | mx2; -+ const unsigned int mmy = my | my2; -+ if (mmx == 0 && mmy == 0) -+ ++ts->y_pred2_x0y0; -+ else if (mmx == 0) -+ ++ts->y_pred2_x0; -+ else if (mmy == 0) -+ ++ts->y_pred2_y0; -+ else -+ ++ts->y_pred2_xy; ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; + -+ if (nPbH > 16) -+ ++ts->y_pred2_hgt16; -+ else -+ ++ts->y_pred2_hle16; -+ } ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } +#endif -+ cmd_y[-1].next_fn = s->qpu_filter_b; -+ cmd_lx->next_src1_x = x1 + start_x; -+ cmd_lx->next_src1_y = y1 + start_y; -+ cmd_lx->next_src1_base = src1_base; -+ cmd_lx->next_src2_x = x2 + start_x; -+ cmd_lx->next_src2_y = y2 + start_y; -+ cmd_lx->next_src2_base = src2_base; -+ cmd_y->p.w = FFMIN(nPbW - start_x, 8); -+ cmd_y->p.h = bh; -+ cmd_y->p.mymx21 = my2_mx2_my_mx; -+ cmd_y->p.wo1 = wo1; -+ cmd_y->p.wo2 = wo2; -+ cmd_y->p.dst_addr = dst + start_x; -+ yp->last_lx = cmd_y; -+ yp->qpu_mc_curr = cmd_y + 1; -+ } -+ dst += s->frame->linesize[0] * 16; ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); + } + } +} + -+ -+static HEVCRpiChromaPred * -+rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) -+{ -+ HEVCRpiChromaPred * cp = s->curr_pred_c; -+ HEVCRpiChromaPred * cpt = cp + 1; -+ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { -+ if (cpt->load < cp->load) -+ cp = cpt; -+ } -+ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet -+ ++cp->load; -+// cp->load += load_val; -+ return cp; -+} -+ ++// h/v shifts fixed at one as that is all the qasm copes with +static void -+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, ++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, + const Mv * const mv, + const int16_t * const c_weights, + const int16_t * const c_offsets, + AVFrame * const src_frame) +{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; + -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); ++ 
const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; ++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); ++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; + -+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[0], c_offsets[0]); -+ -+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], -+ x0_c, y0_c, nPbW_c, nPbH_c, mv, -+ c_weights[1], c_offsets[1]); -+#endif ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; -+ const uint32_t src_base_u = get_vc_address_u(src_frame); -+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; -+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; -+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); -+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for(int start_y=0;start_y < nPbH_c;start_y+=16) -+ { -+ const int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) -+ { -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ u[-1].next_fn = s->qpu_filter_uv; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = src_base_u; -+ u[0].p.h = bh; -+ u[0].p.w = bw; -+ u[0].p.coeffs_x = x_coeffs; -+ u[0].p.coeffs_y = y_coeffs; -+ u[0].p.wo_u = wo_u; -+ u[0].p.wo_v = wo_v; -+ u[0].p.dst_addr_c = dst_base_u + start_x * 2; -+ cp->last_l0 = u; -+ cp->qpu_mc_curr = u + 1; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); + } -+ return; ++ return; +} + ++// 
h/v shifts fixed at one as that is all the qasm copes with +static void +rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, + const int nPbW_c, const int nPbH_c, @@ -4697,89 +8494,72 @@ index b478065..955e426 100644 + AVFrame * const src_frame, + AVFrame * const src_frame2) +{ -+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); -+#if 0 -+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 0); ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const Mv * const mv = mv_field->mv + 0; ++ const Mv * const mv2 = mv_field->mv + 1; + -+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, -+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); -+#endif ++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); ++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); ++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) + { -+ const int hshift = s->ps.sps->hshift[1]; -+ const int vshift = s->ps.sps->vshift[1]; -+ const Mv * const mv = mv_field->mv + 0; -+ const Mv * const mv2 = mv_field->mv + 1; ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); + -+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); -+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); -+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; -+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector -+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; -+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; + -+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); -+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); -+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; -+ const uint32_t coefs1_y = rpi_filter_coefs[my2 
<< (1 - vshift)]; // Fractional part of motion vector ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; + -+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; -+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); + -+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; -+ -+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { -+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); -+ -+ // We are allowed 3/4 powers of two as well as powers of 2 -+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); -+ -+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { -+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); -+ -+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); -+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; -+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; -+ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; -+ -+ u[-1].next_fn = s->qpu_filter_uv_b0; -+ last_l0->next_src_x = x1_c + start_x; -+ last_l0->next_src_y = y1_c + start_y; -+ last_l0->next_src_base_c = get_vc_address_u(src_frame); -+ -+ u[0].next_fn = 0; // Ignored - 2 block cmd -+ u[0].next_src_x = x2_c + start_x; -+ u[0].next_src_y = y2_c + start_y; -+ u[0].next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[0].b0.h = (bh<16 ? bh : 16); -+ u[0].b0.w = (bwnext_src_x = x2_c + start_x; -+ last_l1->next_src_y = y2_c + start_y; -+ last_l1->next_src_base_c = get_vc_address_u(src_frame2); -+ -+ u[1].b1.dummy0 = 0; // w,h inherited from b0 -+ u[1].b1.coeffs_x = coefs1_x; -+ u[1].b1.coeffs_y = coefs1_y; -+ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); -+ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); -+ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; -+ -+ cp->last_l0 = u; -+ cp->last_l1 = u + 1; -+ cp->qpu_mc_curr = u + 2; -+ } -+ -+ dst_base_u += s->frame->linesize[1] * 16; -+ } ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } +} ++ ++ +#endif + + @@ -4796,7 +8576,7 @@ index b478065..955e426 100644 int merge_idx = 0; struct MvField current_mv = {{{ 0 }}}; -@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int y_cb = y0 >> log2_min_cb_size; int x_pu, y_pu; int i, j; @@ -4806,7 +8586,7 @@ index b478065..955e426 100644 if (!skip_flag) lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); -@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4828,7 +8608,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, ++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, + s->sh.chroma_weight_l0[current_mv.ref_idx[0]], 
s->sh.chroma_offset_l0[current_mv.ref_idx[0]], + ref0->frame); + return; @@ -4837,7 +8617,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); -@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4859,7 +8639,7 @@ index b478065..955e426 100644 if (s->ps.sps->chroma_format_idc) { +#if RPI_INTER + if (s->enable_rpi) { -+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, ++ rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, + s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], + ref1->frame); + return; @@ -4868,7 +8648,7 @@ index b478065..955e426 100644 chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); -@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, int nPbW_c = nPbW >> s->ps.sps->hshift[1]; int nPbH_c = nPbH >> s->ps.sps->vshift[1]; @@ -4901,7 +8681,7 @@ index b478065..955e426 100644 chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); -@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ret = hls_pcm_sample(s, x0, y0, log2_cb_size); if (s->ps.sps->pcm.loop_filter_disable_flag) @@ -4911,21 +8691,22 @@ index b478065..955e426 100644 if (ret < 0) return ret; -@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); } +#ifdef RPI +static void rpi_execute_dblk_cmds(HEVCContext *s) +{ -+ int n; -+ int job = s->pass1_job; -+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; -+ int (*p)[2] = s->dblk_cmds[job]; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { -+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; ++ unsigned int i; ++ ++ for (i = 0; i != de->n; ++i) ++ { ++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); + } -+ s->num_dblk_cmds[job] = 0; ++ de->n = 0; +} + +#if 0 @@ -4958,21 +8739,33 @@ index b478065..955e426 100644 +#endif + + ++#define RPI_OPT_SEP_PRED 0 ++ ++ +// I-pred, transform_and_add for all blocks types done here +// All ARM ++#if RPI_OPT_SEP_PRED ++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) ++#else +static void rpi_execute_pred_cmds(HEVCContext * const s) ++#endif +{ + int i; -+ 
int job = s->pass1_job; -+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; -+#ifdef RPI_WORKER ++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++#ifdef RPI + HEVCLocalContextIntra *lc = &s->HEVClcIntra; +#else + HEVCLocalContext *lc = s->HEVClc; +#endif + -+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { ++ for(i = iap->n; i > 0; i--, cmd++) { +// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); ++#if RPI_OPT_SEP_PRED ++ if (!(cmd->c_idx == 0 ? do_luma : do_chroma)) { ++ continue; ++ } ++#endif + + switch (cmd->type) + { @@ -4983,7 +8776,7 @@ index b478065..955e426 100644 + lc->na.cand_up_left = (cmd->na >> 2) & 1; + lc->na.cand_up = (cmd->na >> 1) & 1; + lc->na.cand_up_right = (cmd->na >> 0) & 1; -+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) ++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) + s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); + else + s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); @@ -4991,16 +8784,25 @@ index b478065..955e426 100644 + + case RPI_PRED_ADD_RESIDUAL: + s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); -+#ifdef RPI_PRECLEAR -+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache -+#endif + break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#if RPI_HEVC_SAND + case RPI_PRED_ADD_RESIDUAL_U: -+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; + case RPI_PRED_ADD_RESIDUAL_V: -+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); + break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++#endif + + case RPI_PRED_I_PCM: + pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); @@ -5011,88 +8813,27 @@ index b478065..955e426 100644 + abort(); + } + } -+ s->num_pred_cmds[job] = 0; ++#if RPI_OPT_SEP_PRED ++ if (do_luma) ++#endif ++ { ++ iap->n = 0; ++ } +} + -+// Do any inter-pred that we want to do in software -+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here -+// All ARM -+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) -+{ -+ unsigned int cidx; -+ AVFrame myref; -+ AVFrame myref1; -+ struct MvField mymv; -+ -+ for(; n>0 ; n--, cmd++) { -+ av_assert0(0); -+ -+ switch(cmd->cmd) { -+ case RPI_CMD_LUMA_UNI: -+ if (b_only) -+ break; -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_LUMA_BI: -+ myref.data[0] = cmd->src; -+ myref.linesize[0] = cmd->srcstride; -+ myref1.data[0] = cmd->src1; -+ myref1.linesize[0] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] 
= cmd->ref_idx[1]; -+ luma_mc_bi(s, cmd->dst, cmd->dststride, -+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, -+ &myref1, &cmd->mv1, &mymv); -+ break; -+ case RPI_CMD_CHROMA_UNI: -+ if (b_only) -+ break; -+ mymv.mv[0] = cmd->mv; -+ chroma_mc_uni(s, cmd->dst, -+ cmd->dststride, cmd->src, cmd->srcstride, 0, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); -+ break; -+ case RPI_CMD_CHROMA_BI: -+ case RPI_CMD_CHROMA_BI+1: -+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; -+ myref.data[cidx+1] = cmd->src; -+ myref.linesize[cidx+1] = cmd->srcstride; -+ myref1.data[cidx+1] = cmd->src1; -+ myref1.linesize[cidx+1] = cmd->srcstride1; -+ mymv.ref_idx[0] = cmd->ref_idx[0]; -+ mymv.ref_idx[1] = cmd->ref_idx[1]; -+ mymv.mv[0] = cmd->mv; -+ mymv.mv[1] = cmd->mv1; -+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, -+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); -+ break; -+ } -+ } -+} -+ -+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) -+{ -+ const int job = s->pass1_job; -+ -+ if (!qpu_luma || luma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); -+ s->num_mv_cmds_y[job] = 0; -+ if (!qpu_chroma || chroma_b_only) -+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); -+ s->num_mv_cmds_c[job] = 0; -+} + +#endif + +#ifdef RPI ++ +// Set initial uniform job values & zero ctu_count +static void rpi_begin(HEVCContext *s) +{ +#if RPI_INTER -+ int job = s->pass0_job; -+ int i; ++ unsigned int i; ++ HEVCRpiJob * const jb = s->jb0; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; + + const uint16_t pic_width_y = s->ps.sps->width; + const uint16_t pic_height_y = s->ps.sps->height; @@ -5100,73 +8841,60 @@ index b478065..955e426 100644 + const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; + const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; + -+ for(i=0; i < QPU_N_UV;i++) { -+ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; -+ qpu_mc_pred_c_t * u = cp->qpu_mc_base; ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; + -+ // Chroma setup is a double block with L0 fetch -+ // and other stuff in the 1st block and L1 fetch -+ // in the 2nd along with a lot of dummy vars -+ // This could be packed a lot tighter but it would make -+ // L0, L1 management a lot harder ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ u->wdenom = s->sh.chroma_log2_weight_denom; ++ cp->last_l0 = &u->next_src1; + + u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s0.pic_cw = pic_width_c; -+ u->s0.pic_ch = pic_height_c; -+ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); -+ u->s0.stride1 = s->frame->linesize[1]; -+ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; -+ u->s0.dummy0 = 0; -+ cp->last_l0 = u; -+ ++u; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; + -+ u->next_fn = 0; -+ u->next_src_x = 0; -+ u->next_src_y = 0; -+ u->next_src_base_c = 0; -+ u->s1.dummy0 = 0; -+ u->s1.dummy1 = 0; -+ 
u->s1.dummy2 = 0; -+ u->s1.dummy3 = 0; -+ u->s1.dummy4 = 0; -+ u->s1.dummy5 = 0; -+ cp->last_l1 = u; -+ ++u; -+ -+ cp->load = 0; -+ cp->qpu_mc_curr = u; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + } -+ s->curr_pred_c = NULL; + -+ for(i=0;i < QPU_N_Y;i++) { -+ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; -+ qpu_mc_pred_y_t * y = yp->qpu_mc_base; ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; + -+ y->next_src1_x = 0; -+ y->next_src1_y = 0; -+ y->next_src1_base = 0; -+ y->next_src2_x = 0; -+ y->next_src2_y = 0; -+ y->next_src2_base = 0; -+ y->s.pic_h = pic_height_y; -+ y->s.pic_w = pic_width_y; -+ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); -+ y->s.stride1 = s->frame->linesize[0]; -+ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; -+ y->s.dummy0 = 0; ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->wdenom = s->sh.luma_log2_weight_denom; + y->next_fn = 0; -+ yp->last_lx = y; -+ ++y; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; + -+ yp->load = 0; -+ yp->qpu_mc_curr = y; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); + } -+ s->curr_pred_y = NULL; ++ + s->last_y8_p = NULL; -+ s->last_y8_lx = NULL; ++ s->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ jb->progress[i] = -1; ++ } ++ +#endif + s->ctu_count = 0; +} @@ -5174,78 +8902,122 @@ index b478065..955e426 100644 + + +#if RPI_INTER -+static unsigned int mc_terminate_y(HEVCContext * const s, const int job) ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit); -+ const uint32_t exit_fn2 = qpu_fn(mc_interrupt_exit12); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; + -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_Y; ++i) { -+ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; -+ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (px != yp->qpu_mc_base) -+ tc = 1; -+ -+ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ px->next_src1_x = MC_DUMMY_X; -+ px->next_src1_y = MC_DUMMY_Y; -+ px->next_src1_base = s->qpu_dummy_frame; -+ px->next_src2_x = MC_DUMMY_X; -+ px->next_src2_y = MC_DUMMY_Y; -+ px->next_src2_base = s->qpu_dummy_frame; -+ -+ yp->last_lx = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++#if RPI_CACHE_UNIF_MVS ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++#endif ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; +} ++#endif + -+#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c -+#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) -+ -+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(HEVCContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) +{ + unsigned int i; -+ const uint32_t exit_fn = qpu_fn(mc_exit_c); -+ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); -+ unsigned int tc = 0; -+ HEVCRpiJob * const jb = s->jobs + job; -+ -+ // Add final commands to Q -+ for(i = 0; i != QPU_N_UV; ++i) { -+ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; -+ qpu_mc_pred_c_t *const p0 = cp->last_l0; -+ qpu_mc_pred_c_t *const p1 = cp->last_l1; -+ -+ // We will always have had L0 if we have L1 so only test L0 -+ if (p0 != cp->qpu_mc_base) -+ tc = 1; -+ -+ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? 
exit_fn : exit_fn2; // Actual fn ptr -+ -+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched -+ p0->next_src_x = MC_DUMMY_X; -+ p0->next_src_y = MC_DUMMY_Y; -+ p0->next_src_base_c = s->qpu_dummy_frame; -+ p1->next_src_x = MC_DUMMY_X; -+ p1->next_src_y = MC_DUMMY_Y; -+ p1->next_src_base_c = s->qpu_dummy_frame;; -+ -+ cp->last_l0 = NULL; -+ cp->last_l1 = NULL; ++ if (!ipe->used) { ++ return 0; + } + -+ return tc; ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 1; +} +#endif + ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++#endif ++ +#ifdef RPI + + @@ -5260,174 +9032,178 @@ index b478065..955e426 100644 +// Core execution tasks +static void worker_core(HEVCContext * const s) +{ -+ worker_global_env_t * const wg = &worker_global_env; -+ int arm_cost = 0; -+// vpu_qpu_wait_h sync_c; ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_wait_h sync_c; ++#endif + vpu_qpu_wait_h sync_y; -+ int qpu_luma = 0; -+ int qpu_chroma = 0; -+ int gpu_load; -+ int arm_load; -+ static const int arm_const_cost = 2; + -+// static int z = 0; -+ -+ const int job = s->pass1_job; -+ unsigned int flush_start = 0; -+ unsigned int flush_count = 0; ++ HEVCRpiJob * const jb = s->jb1; ++ int pred_y, pred_c; + + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); + -+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { -+ vpu_qpu_job_add_vpu(vqj, -+ vpu_get_fn(), -+ vpu_get_constants(), -+ s->coeffs_buf_vc[job][2], -+ s->num_coeffs[job][2] >> 8, -+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], -+ s->num_coeffs[job][3] >> 10, -+ 0); -+ -+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+ } -+ -+ -+#if RPI_INTER -+ pthread_mutex_lock(&wg->lock); -+ -+// ++z; -+ gpu_load = vpu_qpu_current_load(); -+ arm_load = avpriv_atomic_int_get(&wg->arm_load); -+#if 0 // Y_B_ONLY -+ qpu_luma = gpu_load + 2 < arm_load; -+ qpu_chroma = gpu_load < arm_load + 8; -+#elif 0 -+ qpu_luma = gpu_load < arm_load + 2; -+ qpu_chroma = gpu_load < arm_load + 8; -+#else -+ qpu_chroma = 1; -+ qpu_luma = 1; -+#endif -+ -+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); -+ -+ wg->gpu_c += qpu_chroma; -+ wg->gpu_y += qpu_luma; -+ wg->arm_c += !qpu_chroma; -+ wg->arm_y += !qpu_luma; -+ -+ -+// if ((z & 511) == 0) { -+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); -+// } -+ -+ + { -+ int (*d)[2] = s->dblk_cmds[job]; -+ unsigned int high=(*d)[1]; -+ int n; ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if (cf->s[3].n + cf->s[2].n != 0) ++ { ++ const 
unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ cf->s[2].n >> 8, ++ cf->gptr.vc + offset32, ++ cf->s[3].n >> 10, ++ 0); + -+ flush_start = high; -+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { -+ unsigned int y = (*d)[1]; -+ flush_start = FFMIN(flush_start, y); -+ high=FFMAX(high,y); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + } -+ // Avoid flushing past end of frame -+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; + } + -+#if !DISABLE_CHROMA -+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup_c); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { -+ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); -+#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); -+ } -+#endif ++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + +// We can take a sync here and try to locally overlap QPU processing with ARM +// but testing showed a slightly negative benefit with noticable extra complexity -+// vpu_qpu_job_add_sync_this(vqj, &sync_c); -+ -+ if (qpu_luma && mc_terminate_y(s, job) != 0) -+ { -+ HEVCRpiJob * const jb = s->jobs + job; -+ const uint32_t code = qpu_fn(mc_setup); -+ uint32_t * p; -+ unsigned int i; -+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; -+ -+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { -+ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm); -+ *p++ = code; -+ } -+ -+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); -+ -+#if RPI_CACHE_UNIF_MVS -+ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++#if RPI_OPT_SEP_PRED ++ vpu_qpu_job_add_sync_this(vqj, &sync_c); +#endif -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); -+ } + -+ pthread_mutex_unlock(&wg->lock); -+ -+#endif ++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip); + + vpu_qpu_job_add_sync_this(vqj, &sync_y); + ++ ++ // We are expecting a contiguous Z-shaped set of blocks ++ // So generate up to 3 blocks: ++ // 1st line ++ // body ++ // last line ++ // This will work even if we don't have the expected geometry ++ if (pred_y || pred_c) ++ { ++ const HEVCRpiDeblkEnv *const de = &jb->deblk; ++ const HEVCRpiDeblkBlk * db = de->blks + 0; ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ unsigned int x0 = db->x_ctb; ++ unsigned int xx = x0 + ctb_size; ++ unsigned int y0 = db->y_ctb; ++ ++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}}; ++ unsigned int b = 0; ++ unsigned int i; ++ ++ for (i = 1, ++db; i < de->n; ++i, ++db) ++ { ++ if (db->x_ctb == xx && db->y_ctb 
== y0) { ++ xx += ctb_size; ++ } ++ else ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ if (tlbr[0] > y0) ++ tlbr[0] = y0; ++ if (tlbr[1] > x0) ++ tlbr[1] = x0; ++ if (tlbr[2] < y0 + ctb_size) ++ tlbr[2] = y0 + ctb_size; ++ if (tlbr[3] < xx) ++ tlbr[3] = xx; ++ x0 = db->x_ctb; ++ xx = x0 + ctb_size; ++ y0 = db->y_ctb; ++ b = 1; ++ } ++ } ++ ++ if (blks_tlbr[b][0] != ~0U) ++ ++b; ++ ++ { ++ unsigned int * const tlbr = blks_tlbr[b]; ++ tlbr[0] = y0; ++ tlbr[1] = x0; ++ tlbr[2] = y0 + ctb_size; ++ tlbr[3] = xx; ++ } ++ ++ // ??? Coalesce blocks ??? ++ for (i = 0; i <= b; ++i) { ++ const unsigned int * const tlbr = blks_tlbr[i]; ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); ++ } ++ } ++ ++ + // Having accumulated some commands - do them + rpi_cache_flush_finish(rfe); ++ ++ // Await progress as required ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { ++ if (jb->progress[i] >= 0) { ++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); ++ } ++ } ++ } ++ + vpu_qpu_job_finish(vqj); + -+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller ++ worker_pic_reset(&jb->coeffs); + -+#if Y_B_ONLY -+ if (qpu_luma) -+ vpu_qpu_wait(&sync_y); ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ else ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif +#endif -+ // Perform inter prediction -+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); + ++#if RPI_OPT_SEP_PRED + // Wait for transform completion ++ vpu_qpu_wait(&sync_c); + + // Perform intra prediction and residual reconstruction -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); -+#if Y_B_ONLY -+ if (!qpu_luma) -+ vpu_qpu_wait(&sync_y); -+#else ++ rpi_execute_pred_cmds(s, 0, 1); ++ ++ // Wait for transform completion + vpu_qpu_wait(&sync_y); -+#endif ++ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, 1, 0); ++#else ++ // Wait for transform completion ++ vpu_qpu_wait(&sync_y); ++ ++ // Perform intra prediction and residual reconstruction + rpi_execute_pred_cmds(s); ++#endif + + // Perform deblocking for CTBs in this row + rpi_execute_dblk_cmds(s); -+ -+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +} + +static void rpi_do_all_passes(HEVCContext *s) +{ ++ // Called from main thread - must be no pending background jobs ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); ++ + // Do the various passes - common with the worker code + worker_core(s); + // Prepare next batch @@ -5435,99 +9211,90 @@ index b478065..955e426 100644 +} + + -+ +#endif + static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) { HEVCContext *s = avctxt->priv_data; -@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) int y_ctb = 0; int ctb_addr_ts 
= s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +#ifdef RPI -+ s->enable_rpi = s->ps.sps->bit_depth == 8 && -+ s->frame->format == AV_PIX_FMT_SAND128 && -+ !s->ps.pps->cross_component_prediction_enabled_flag; -+ -+ if (!s->enable_rpi) { -+ if (s->ps.pps->cross_component_prediction_enabled_flag) -+ printf("Cross component\n"); -+ } ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ s->enable_rpi = ++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || ++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); +#endif + //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); + if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); return AVERROR_INVALIDDATA; -@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) } } -+#ifdef RPI_WORKER -+ s->pass0_job = 0; -+ s->pass1_job = 0; -+#endif +#ifdef RPI ++ // Worker must be idle at start ++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); + rpi_begin(s); +#endif + while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { - int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; -@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; - hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); - -+ - ff_hevc_cabac_init(s, ctb_addr_ts); - - hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); -@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) - s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; -+#if RPI_INTER -+ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % QPU_N_UV; -+ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; -+#endif -+ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + +#ifdef RPI ++ // Report progress so we can use our MVs in other frames ++ // If we are tiled then this isn't really optimal but given that tiling ++ // can change on a per pic basis (described in PPS) other schemes are ++ // quite a lot harder ++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { ++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ } ++ + if (s->enable_rpi) { -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); -+ //av_assert0(s->num_dblk_cmds[s->pass0_job]pass0_jobpass0_job>=0); -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; -+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; -+ s->ctu_count++; ++ int q_full = 
(++s->ctu_count >= s->max_ctu_count); + -+ if ( s->ctu_count >= s->max_ctu_count ) { -+#ifdef RPI_WORKER -+ if (s->used_for_ref) -+ { -+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); ++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) ++ q_full = 1; ++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) ++ q_full = 1; + -+// worker_wait(s); -+ // Split work load onto separate threads so we make as rapid progress as possible with this frame -+ // Pass on this job to worker thread -+ worker_submit_job(s); ++ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; ++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; + -+ // Make sure we have space to prepare the next job -+ worker_pass0_ready(s); ++ if (q_full) { ++ if (s->used_for_ref) ++ { ++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); + -+ // Prepare the next batch of commands -+ rpi_begin(s); -+ } else { -+ // Non-ref frame so do it all on this thread -+ rpi_do_all_passes(s); ++// worker_wait(s); ++ // Split work load onto separate threads so we make as rapid progress as possible with this frame ++ // Pass on this job to worker thread ++ worker_submit_job(s); ++ ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s); ++ ++ // Prepare the next batch of commands ++ rpi_begin(s); ++ } else { ++ // Non-ref frame so do it all on this thread ++ rpi_do_all_passes(s); ++ } + } -+#else -+ rpi_do_all_passes(s); -+#endif -+ } + + } +#endif @@ -5536,7 +9303,7 @@ index b478065..955e426 100644 if (more_data < 0) { s->tab_slice_address[ctb_addr_rs] = -1; return more_data; -@@ -2350,9 +3977,42 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ctb_addr_ts++; ff_hevc_save_states(s, ctb_addr_ts); @@ -5549,12 +9316,10 @@ index b478065..955e426 100644 +#ifdef RPI + -+#ifdef RPI_WORKER + // Wait for the worker to finish all its jobs + if (s->enable_rpi) { + worker_wait(s); + } -+#endif + + // Finish off any half-completed rows + if (s->enable_rpi && s->ctu_count) { @@ -5579,7 +9344,7 @@ index b478065..955e426 100644 if (x_ctb + ctb_size >= s->ps.sps->width && y_ctb + ctb_size >= s->ps.sps->height) ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); -@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int s = s1->sList[self_id]; lc = s->HEVClc; @@ -5591,10 +9356,13 @@ index b478065..955e426 100644 if(ctb_row) { ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); -@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) if (ret < 0) return ret; +- if (s->max_ra == INT_MAX) { +- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +- s->max_ra = s->poc; + // The definition of _N unit types is "non-reference for other frames + // with the same temporal_id" so they may/will be ref frames for pics + // with a higher temporal_id. 
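For reference, the *_N types the comment above refers to are the even-numbered VCL NAL unit types of H.265 table 7-1; a hedged helper (illustrative only, not the patch's own logic) that classifies them:

    /* Sketch only: H.265 sub-layer non-reference ("_N") VCL types are the
     * even values 0..14 (TRAIL_N, TSA_N, STSA_N, RADL_N, RASL_N, plus the
     * reserved RSV_VCL_N10/12/14). Such a pic is never a reference for
     * another pic in the same temporal sub-layer. */
    static int nal_is_sublayer_non_ref(const int nal_unit_type)
    {
        switch (nal_unit_type) {
        case 0:  /* NAL_TRAIL_N */
        case 2:  /* NAL_TSA_N   */
        case 4:  /* NAL_STSA_N  */
        case 6:  /* NAL_RADL_N  */
        case 8:  /* NAL_RASL_N  */
        case 10: /* RSV_VCL_N10 */
        case 12: /* RSV_VCL_N12 */
        case 14: /* RSV_VCL_N14 */
            return 1;
        default:
            return 0;
        }
    }

In this patch the same classification feeds s->used_for_ref, which in hls_decode_entry above gates whether a full CTU batch is handed to the worker thread or decoded inline on the calling thread.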
@@ -5621,47 +9389,95 @@ index b478065..955e426 100644 + s->is_decoded = 0; + break; + } - if (s->max_ra == INT_MAX) { - if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { - s->max_ra = s->poc; -@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; + } else { + if (IS_IDR(s)) + s->max_ra = INT_MIN; +@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) } } -fail: - if (s->ref && s->threads_type == FF_THREAD_FRAME) -+fail: // Also success path -+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { -+#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); -+#endif - ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); - -+ } -+#if RPI_INTER -+ else if (s->ref && s->enable_rpi) { -+ // When running single threaded we need to flush the whole frame -+ flush_frame(s,s->frame); -+ } ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { ++#ifdef RPI ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +#endif ++ ff_hevc_progress_signal_all_done(s); ++ } ++#ifdef RPI ++ // * Flush frame will become confused if we pass it something ++ // that doesn't have an expected number of planes (e.g. 400) ++ // So only flush if we are sure we can. ++ else if (s->enable_rpi) { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++#endif ++ } return ret; } -@@ -3064,6 +4764,41 @@ fail: +@@ -3070,6 +5056,83 @@ fail: return AVERROR(ENOMEM); } -+#ifdef RPI_WORKER -+static av_cold void hevc_init_worker(HEVCContext *s) ++#ifdef RPI ++static av_cold void hevc_init_worker(HEVCContext * const s) +{ + int err; -+ pthread_cond_init(&s->worker_cond_head, NULL); -+ pthread_cond_init(&s->worker_cond_tail, NULL); -+ pthread_mutex_init(&s->worker_mutex, NULL); + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ memset(s->jobs, 0, sizeof(s->jobs)); ++ ++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { ++ HEVCRpiJob * const jb = s->jobs + job; ++ ++ sem_init(&jb->sem_in, 0, 0); ++ sem_init(&jb->sem_out, 0, 0); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); ++ ++ // ** Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ ++ rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), ++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); ++ rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), ++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); ++ ++ jb->deblk.n = 0; ++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); ++ } ++ s->pass0_job = 0; ++ s->pass1_job = 0; ++ s->jb0 = s->jobs + 0; ++ s->jb1 = s->jobs + 0; ++ + err = pthread_create(&s->worker_thread, NULL, worker_start, s); + if (err) { + printf("Failed to create worker thread\n"); 
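
/*
 * [Editor's sketch - not part of the patch] For frame threading the patch
 * swaps ff_thread_await_progress() / ff_thread_report_progress() for its own
 * two-field progress scheme (field 0 = reconstructed rows, field 1 = motion
 * vectors; see ff_hevc_rpi_progress_wait_field() and the
 * HEVCRPiFrameProgressWait / HEVCRPiFrameProgressState types later in this
 * patch).  Below is a plausible reduction of one field's wait/signal pair to
 * its essentials: 'value' stands in for the row counter actually kept in the
 * frame, and each waiter's sem is assumed initialised once with
 * sem_init(&sem, 0, 0).
 */
#include <pthread.h>
#include <semaphore.h>
#include <stddef.h>

typedef struct progress_wait_s {
    int req;                            /* row this waiter needs completed */
    struct progress_wait_s *next;
    sem_t sem;
} progress_wait_t;

typedef struct progress_state_s {
    progress_wait_t *first;             /* queue of sleeping waiters */
    progress_wait_t *last;
    pthread_mutex_t lock;
    int value;                          /* highest row known complete */
} progress_state_t;

/* Decode thread: block until rows 0..req of the reference are done. */
static void progress_wait(progress_state_t *const ps,
                          progress_wait_t *const pw, const int req)
{
    int waiting = 0;
    pthread_mutex_lock(&ps->lock);
    if (ps->value < req) {              /* enqueue and sleep on our own sem */
        pw->req = req;
        pw->next = NULL;
        if (ps->last == NULL)
            ps->first = pw;
        else
            ps->last->next = pw;
        ps->last = pw;
        waiting = 1;
    }
    pthread_mutex_unlock(&ps->lock);
    if (waiting)
        sem_wait(&pw->sem);
}

/* Owning thread: publish a new row count, wake every satisfied waiter. */
static void progress_signal(progress_state_t *const ps, const int val)
{
    progress_wait_t *p, *keep_first = NULL, *keep_last = NULL;
    pthread_mutex_lock(&ps->lock);
    ps->value = val;
    for (p = ps->first; p != NULL;) {
        progress_wait_t *const next = p->next;
        if (p->req <= val) {
            sem_post(&p->sem);          /* one post per sleeping waiter */
        } else {
            p->next = NULL;             /* keep the still-blocked ones  */
            if (keep_last == NULL)
                keep_first = p;
            else
                keep_last->next = p;
            keep_last = p;
        }
        p = next;
    }
    ps->first = keep_first;
    ps->last = keep_last;
    pthread_mutex_unlock(&ps->lock);
}
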
@@ -5669,83 +9485,74 @@ index b478065..955e426 100644 + } +} + ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ +static av_cold void hevc_exit_worker(HEVCContext *s) +{ + void *res; -+ s->kill_worker=1; -+ pthread_cond_broadcast(&s->worker_cond_tail); ++ unsigned int i; ++ ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ s->jobs[i].terminate = 1; ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ sem_post(&s->jobs[i].sem_in); + pthread_join(s->worker_thread, &res); + -+ pthread_cond_destroy(&s->worker_cond_head); -+ pthread_cond_destroy(&s->worker_cond_tail); -+ pthread_mutex_destroy(&s->worker_mutex); ++ for(i = 0; i < RPI_MAX_JOBS; i++) ++ { ++ HEVCRpiJob * const jb = s->jobs + i; + -+ s->worker_tail=0; -+ s->worker_head=0; -+ s->kill_worker=0; ++ sem_destroy(&jb->sem_in); ++ sem_destroy(&jb->sem_out); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ av_freep(&jb->intra.cmds); ++ av_freep(&jb->deblk.blks); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ } +} ++ +#endif + static av_cold int hevc_decode_free(AVCodecContext *avctx) { HEVCContext *s = avctx->priv_data; -@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) av_freep(&s->cabac_state); +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +#ifdef RPI + -+#ifdef RPI_WORKER + hevc_exit_worker(s); -+#endif -+ -+ for(i=0;iunif_mv_cmds_y[i]); -+ av_freep(&s->unif_mv_cmds_c[i]); -+ av_freep(&s->univ_pred_cmds[i]); -+ -+#if RPI_INTER -+ gpu_free(&s->jobs[i].chroma_mvs_gptr); -+ gpu_free(&s->jobs[i].luma_mvs_gptr); -+#endif -+ } -+ + vpu_qpu_term(); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); + } + + av_rpi_zc_uninit(avctx); +#endif + - for (i = 0; i < 3; i++) { - av_freep(&s->sao_pixel_buffer_h[i]); - av_freep(&s->sao_pixel_buffer_v[i]); -@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); + av_frame_free(&s->output_frame); + + for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { +@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) return 0; } -+#ifdef RPI -+#ifdef RPI_PRECLEAR -+static av_cold void memclear16(int16_t *p, int n) -+{ -+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); -+ //int i; -+ //for(i=0;ipriv_data; - int i; -+#ifdef RPI -+ unsigned int job; -+#endif - - s->avctx = avctx; - -@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) s->HEVClcList[0] = s->HEVClc; s->sList[0] = s; @@ -5759,71 +9566,39 @@ index b478065..955e426 100644 + if (vpu_qpu_init() != 0) + goto fail; + -+ for(job = 0; job < RPI_MAX_JOBS; job++) { -+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); -+ if (!s->unif_mv_cmds_y[job]) -+ goto fail; -+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); -+ if (!s->unif_mv_cmds_c[job]) -+ goto fail; -+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); -+ if (!s->univ_pred_cmds[job]) -+ goto fail; -+ } -+ +#if RPI_INTER -+ // We divide the image into blocks 256 wide and 64 high -+ // We support up to 2048 widths -+ // We compute the number of chroma 
motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted -+ // Also add space for the startup command for each stream. -+ -+ for (job = 0; job < RPI_MAX_JOBS; job++) { -+ HEVCRpiJob * const jb = s->jobs + job; -+#if RPI_CACHE_UNIF_MVS -+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#else -+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); -+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); -+#endif -+ -+ { -+ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_UV; i++) { -+ jb->chroma_mvs[i].qpu_mc_base = p; -+ jb->chroma_mvs[i].qpu_mc_curr = p; -+ p += UV_COMMANDS_PER_QPU; -+ } -+ } -+ { -+ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; -+ for(i = 0; i < QPU_N_Y; i++) { -+ jb->luma_mvs[i].qpu_mc_base = p; -+ jb->luma_mvs[i].qpu_mc_curr = p; -+ p += Y_COMMANDS_PER_QPU; -+ } -+ } ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; + } -+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); -+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); -+ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame -+ s->qpu_filter = qpu_fn(mc_filter); -+ s->qpu_filter_b = qpu_fn(mc_filter_b); ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame ++#endif +#endif + //gpu_malloc_uncached(2048*64,&s->dummy); + + s->enable_rpi = 0; + -+#ifdef RPI_WORKER ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } + hevc_init_worker(s); +#endif -+ -+#endif + s->cabac_state = av_malloc(HEVC_CONTEXTS); if (!s->cabac_state) goto fail; -@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) +@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) + if (!s->DPB[i].frame) + goto fail; + s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; + } + + s->max_ra = INT_MAX; +@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) } if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) @@ -5836,7 +9611,7 @@ index b478065..955e426 100644 return 0; } -@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { +@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = { .update_thread_context = hevc_update_thread_context, .init_thread_copy = hevc_init_thread_copy, .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | @@ -5846,88 +9621,63 @@ index b478065..955e426 100644 .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), }; diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h -index be91010..dd7d152 100644 +index 162ca0e582..d647232638 100644 --- a/libavcodec/hevc.h +++ b/libavcodec/hevc.h -@@ -23,6 +23,9 @@ +@@ -23,6 +23,7 @@ #ifndef AVCODEC_HEVC_H #define AVCODEC_HEVC_H -+// define RPI to split the CABAC/prediction/transform into separate stages -+#include "config.h" -+ ++#include "rpi_opts.h" #include "libavutil/buffer.h" #include "libavutil/md5.h" -@@ -37,6 +40,45 @@ +@@ -37,6 +38,10 @@ #include "thread.h" #include "videodsp.h" -+// define RPI to split the CABAC/prediction/transform into separate stages -+#ifndef RPI -+ -+ #define RPI_INTER 0 -+ #define RPI_TSTATS 0 -+ #define RPI_HEVC_SAND 0 
-+ -+#else -+ -+ #include "rpi_qpu.h" -+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU -+ -+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks -+ #define RPI_WORKER -+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames -+ // This has no effect unless RPI_WORKER is defined -+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as -+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one -+ // free for the foreground to fill in. -+ #define RPI_MAX_JOBS 2 -+ -+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs -+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK -+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) -+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM -+// #define RPI_DEBLOCK_VPU -+ -+ #define RPI_VPU_DEBLOCK_CACHED 1 -+ -+ #if HAVE_NEON -+ #define RPI_HEVC_SAND 1 -+ #else -+ // Sand bust on Pi1 currently - reasons unknown -+ #define RPI_HEVC_SAND 0 -+ #endif -+ -+ #define RPI_TSTATS 0 ++#ifdef RPI ++#include "rpi_qpu.h" +#endif + #define MAX_DPB_SIZE 16 // A.4.1 #define MAX_REFS 16 -@@ -660,17 +702,6 @@ typedef struct CodingUnit { +@@ -463,6 +468,7 @@ typedef struct HEVCSPS { + int implicit_rdpcm_enabled_flag; + int explicit_rdpcm_enabled_flag; + int intra_smoothing_disabled_flag; ++ int high_precision_offsets_enabled_flag; + int persistent_rice_adaptation_enabled_flag; + + ///< coded frame dimension in various units +@@ -660,6 +666,7 @@ typedef struct CodingUnit { uint8_t cu_transquant_bypass_flag; } CodingUnit; --typedef struct Mv { -- int16_t x; ///< horizontal component of motion vector -- int16_t y; ///< vertical component of motion vector --} Mv; -- --typedef struct MvField { -- DECLARE_ALIGNED(4, Mv, mv)[2]; -- int8_t ref_idx[2]; -- int8_t pred_flag; --} MvField; -- ++#if 0 + typedef struct Mv { + int16_t x; ///< horizontal component of motion vector + int16_t y; ///< vertical component of motion vector +@@ -670,6 +677,7 @@ typedef struct MvField { + int8_t ref_idx[2]; + int8_t pred_flag; + } MvField; ++#endif + typedef struct NeighbourAvailable { int cand_bottom_left; - int cand_left; -@@ -747,7 +778,17 @@ typedef struct HEVCFrame { +@@ -745,9 +753,23 @@ typedef struct HEVCFrame { + * A combination of HEVC_FRAME_FLAG_* + */ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; } HEVCFrame; -+#ifdef RPI_WORKER ++#ifdef RPI +typedef struct HEVCLocalContextIntra { + TransformUnit tu; + NeighbourAvailable na; @@ -5935,21 +9685,22 @@ index be91010..dd7d152 100644 +#endif + typedef struct HEVCLocalContext { -+ TransformUnit tu; -+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra ++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) 
++ NeighbourAvailable na; + uint8_t cabac_state[HEVC_CONTEXTS]; uint8_t stat_coeff[4]; -@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { +@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext { int qPy_pred; - TransformUnit tu; - +- uint8_t ctb_left_flag; uint8_t ctb_up_flag; -@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { + uint8_t ctb_up_right_flag; +@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext { int ct_depth; CodingUnit cu; PredictionUnit pu; @@ -5957,20 +9708,18 @@ index be91010..dd7d152 100644 #define BOUNDARY_LEFT_SLICE (1 << 0) #define BOUNDARY_LEFT_TILE (1 << 1) -@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { +@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext { int boundary_flags; } HEVCLocalContext; -+ +#ifdef RPI + +// The processing is done in chunks -+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) -+// This is a distance of 1536 pixels across the screen +// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +// but allocate more memory and increase the latency before data in the next frame can be processed +#define RPI_NUM_CHUNKS 4 +#define RPI_CHUNK_SIZE 12 ++#define RPI_ROUND_TO_LINES 0 + +// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) @@ -5989,9 +9738,6 @@ index be91010..dd7d152 100644 +#define RPI_CMD_CHROMA_BI 3 +#define RPI_CMD_V_BI 4 + -+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? -+// #define RPI_PRECLEAR -+ +// Command for inter prediction +typedef struct HEVCMvCmd { + uint8_t cmd; @@ -6019,6 +9765,10 @@ index be91010..dd7d152 100644 + RPI_PRED_ADD_RESIDUAL, + RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx + RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, + RPI_PRED_INTRA, + RPI_PRED_I_PCM, + RPI_PRED_CMD_MAX @@ -6033,8 +9783,14 @@ index be91010..dd7d152 100644 + struct { // TRANSFORM_ADD + uint8_t * dst; + const int16_t * buf; -+ uint32_t stride; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; + } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; + struct { // INTRA + uint16_t x; + uint16_t y; @@ -6052,32 +9808,87 @@ index be91010..dd7d152 100644 +#endif + +#ifdef RPI ++#include + -+struct qpu_mc_pred_c_s; -+struct qpu_mc_pred_y_s; ++union qpu_mc_pred_cmd_s; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; + -+typedef struct HEVCRpiLumaPred ++typedef struct HEVCRpiInterPredQ +{ -+ struct qpu_mc_pred_y_s *qpu_mc_base; -+ struct qpu_mc_pred_y_s *qpu_mc_curr; -+ struct qpu_mc_pred_y_s *last_lx; ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; + unsigned int load; -+} HEVCRpiLumaPred; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; + -+typedef struct HEVCRpiChromaPred ++typedef struct HEVCRpiInterPredEnv +{ -+ struct qpu_mc_pred_c_s *qpu_mc_base; -+ struct qpu_mc_pred_c_s *qpu_mc_curr; -+ struct qpu_mc_pred_c_s *last_l0; -+ struct qpu_mc_pred_c_s *last_l1; -+ unsigned int load; -+} HEVCRpiChromaPred; ++ HEVCRpiInterPredQ * q; ++ unsigned int n; // Number of Qs ++ unsigned int n_grp; // Number of Q in a group ++ unsigned int curr; // Current Q number (0..n-1) ++ int used; 
// 0 if nothing in any Q, 1 otherwise ++ int used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCeoffEnv { ++ unsigned int n; ++ uint16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCeoffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiDeblkBlk { ++ uint16_t x_ctb; ++ uint16_t y_ctb; ++} HEVCRpiDeblkBlk; ++ ++typedef struct HEVCRpiDeblkEnv { ++ unsigned int n; ++ HEVCRpiDeblkBlk * blks; ++} HEVCRpiDeblkEnv; ++ ++typedef struct HEVCRPiFrameProgressWait { ++ int req; ++ struct HEVCRPiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRPiFrameProgressWait; ++ ++typedef struct HEVCRPiFrameProgressState { ++ struct HEVCRPiFrameProgressWait * first; ++ struct HEVCRPiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRPiFrameProgressState; + +typedef struct HEVCRpiJob { -+ GPU_MEM_PTR_T chroma_mvs_gptr; -+ GPU_MEM_PTR_T luma_mvs_gptr; -+ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; -+ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; ++ volatile int terminate; ++ int pending; ++ sem_t sem_in; // set by main ++ sem_t sem_out; // set by worker ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress[32]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiDeblkEnv deblk; ++ HEVCRPiFrameProgressWait progress_wait; +} HEVCRpiJob; + +#if RPI_TSTATS @@ -6105,78 +9916,42 @@ index be91010..dd7d152 100644 typedef struct HEVCContext { const AVClass *c; // needed by private avoptions AVCodecContext *avctx; -@@ -798,13 +978,103 @@ typedef struct HEVCContext { - - HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; - HEVCLocalContext *HEVClc; -- -+#ifdef RPI_WORKER -+ HEVCLocalContextIntra HEVClcIntra; -+#endif - uint8_t threads_type; - uint8_t threads_number; - +@@ -805,6 +1025,69 @@ typedef struct HEVCContext { int width; int height; -+ int used_for_ref; -+ ++ int used_for_ref; // rpi +#ifdef RPI + int enable_rpi; -+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; -+ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; -+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; -+ int buf_width; -+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; -+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; -+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; -+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; -+ int num_coeffs[RPI_MAX_JOBS][4]; -+ int num_xfm_cmds[RPI_MAX_JOBS]; -+ int num_mv_cmds_y[RPI_MAX_JOBS]; -+ int num_mv_cmds_c[RPI_MAX_JOBS]; -+ int num_pred_cmds[RPI_MAX_JOBS]; -+ int num_dblk_cmds[RPI_MAX_JOBS]; -+ int vpu_id; -+ int pass0_job; // Pass0 does coefficient decode -+ int pass1_job; // Pass1 does pixel processing ++ unsigned int pass0_job; // Pass0 does coefficient decode ++ unsigned int pass1_job; // Pass1 does pixel processing + int ctu_count; // Number of CTUs done in pass0 so far + int max_ctu_count; // Number of CTUs when we trigger a round of processing -+ int ctu_per_y_chan; // Number of CTUs per luma QPU -+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + ++ HEVCRpiJob * jb0; ++ HEVCRpiJob * jb1; + HEVCRpiJob jobs[RPI_MAX_JOBS]; +#if RPI_TSTATS + HEVCRpiStats tstats; +#endif +#if RPI_INTER -+ HEVCRpiChromaPred * curr_pred_c; -+ HEVCRpiLumaPred * curr_pred_y; -+ struct qpu_mc_pred_y_s * last_y8_p; -+ struct qpu_mc_pred_y_s * last_y8_lx; ++ 
struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; + + // Function pointers -+ uint32_t qpu_filter_uv; -+ uint32_t qpu_filter_uv_b0; -+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory -+ uint32_t qpu_filter; -+ uint32_t qpu_filter_b; ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; +#endif + -+#ifdef RPI_WORKER + pthread_t worker_thread; -+ pthread_cond_t worker_cond_head; -+ pthread_cond_t worker_cond_tail; -+ pthread_mutex_t worker_mutex; -+ -+ int worker_tail; // Contains the number of posted jobs -+ int worker_head; // Contains the number of completed jobs -+ int kill_worker; // set to 1 to terminate the worker -+#endif -+ -+#define RPI_DEBLOCK_VPU_Q_COUNT 2 + +#ifdef RPI_DEBLOCK_VPU ++#define RPI_DEBLOCK_VPU_Q_COUNT 2 + int enable_rpi_deblock; + + int uv_setup_width; @@ -6204,34 +9979,25 @@ index be91010..dd7d152 100644 + unsigned int dvq_n; + +#endif -+ ++ HEVCLocalContextIntra HEVClcIntra; ++ HEVCRPiFrameProgressState progress_states[2]; +#endif + uint8_t *cabac_state; /** 1 if the independent slice segment header was successfully parsed */ -@@ -922,6 +1192,9 @@ typedef struct HEVCContext { - uint32_t max_mastering_luminance; - uint32_t min_mastering_luminance; - -+#ifdef RPI -+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; -+#endif - } HEVCContext; - - int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, -@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - int log2_trafo_size, enum ScanType scan_idx, - int c_idx); +@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); + int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id, + uint8_t *buf, int buf_size); +#if RPI_INTER +extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +#endif + - void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); - -@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; + /** + * Reset SEI values that are stored on the Context. 
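
/*
 * [Editor's sketch - not part of the patch] Each job owns one GPU-visible
 * buffer per plane that is carved into one command queue per QPU; the worker
 * later kicks all queues in one go.  The removed hevc_init_context() code
 * earlier in this patch did the carve-up explicitly (qpu_mc_base/qpu_mc_curr
 * stepped by *_COMMANDS_PER_QPU per queue); the new rpi_inter_pred_alloc(),
 * whose body is not shown here, presumably does the equivalent for
 * HEVCRpiInterPredEnv.  ip_queue_t / ip_queues_init() are invented names;
 * the real HEVCRpiInterPredQ also records the QPU setup/sync/exit code
 * addresses and a load counter.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct ip_queue_s {
    uint32_t *qpu_mc_base;        /* start of this QPU's command list */
    uint32_t *qpu_mc_curr;        /* next free command slot           */
} ip_queue_t;

/* Carve one contiguous buffer of n * qsize bytes into n per-QPU queues. */
static void ip_queues_init(ip_queue_t *const q, const unsigned int n,
                           uint8_t *const buf, const size_t qsize)
{
    unsigned int i;
    for (i = 0; i != n; ++i) {
        q[i].qpu_mc_base = (uint32_t *)(buf + i * qsize);
        q[i].qpu_mc_curr = q[i].qpu_mc_base;
    }
}
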
+@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; extern const uint8_t ff_hevc_diag_scan8x8_x[64]; extern const uint8_t ff_hevc_diag_scan8x8_y[64]; @@ -6244,11 +10010,85 @@ index be91010..dd7d152 100644 +extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +#endif + ++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) ++{ ++ if (s->enable_rpi && s->used_for_ref) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, ++ const HEVCFrame * const ref, const int y) ++{ ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++ else ++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); ++} ++ ++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) ++{ ++ if (s->used_for_ref) ++ { ++ if (s->enable_rpi) ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ else ++ ff_thread_report_progress(&s->ref->tf, y, 0); ++ } ++} ++ ++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) ++{ ++ if (s->enable_rpi) ++ { ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++ } ++ else ++ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); ++} ++ ++#else ++ ++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world ++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) ++#define ff_hevc_progress_signal_mv(s, y) ++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) ++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) ++ +#endif ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)&ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} + #endif /* AVCODEC_HEVC_H */ diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c -index 05b2821..733efde 100644 +index 05b2821840..c84886817d 100644 --- a/libavcodec/hevc_cabac.c +++ b/libavcodec/hevc_cabac.c @@ -21,14 +21,76 @@ @@ -6260,12 +10100,11 @@ index 05b2821..733efde 100644 #include "libavutil/attributes.h" #include "libavutil/common.h" --#include "cabac_functions.h" + #include "cabac_functions.h" #include "hevc.h" -+#include "cabac_functions.h" -+ + +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +// BY22 is probably faster than simple bypass if the processor has @@ -6287,7 +10126,7 @@ index 05b2821..733efde 100644 +#if ARCH_ARM +#include "arm/hevc_cabac.h" +#endif - ++ #define 
CABAC_MAX_BIN 31 + @@ -6610,7 +10449,7 @@ index 05b2821..733efde 100644 { return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); } -@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); } @@ -6623,7 +10462,7 @@ index 05b2821..733efde 100644 + +#ifndef coeff_abs_level_remaining_decode_bypass +static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) -+{ + { + CABACContext * const c = &s->HEVClc->cc; + uint32_t y; + unsigned int prefix; @@ -6664,7 +10503,7 @@ index 05b2821..733efde 100644 +#endif + +static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) - { ++{ + CABACContext * const c = &s->HEVClc->cc; int prefix = 0; int suffix = 0; @@ -6740,7 +10579,7 @@ index 05b2821..733efde 100644 + rv = (rv << 1) | b; + } + return rv; -+} + } +#endif + + @@ -6881,22 +10720,21 @@ index 05b2821..733efde 100644 + int * const pPrev_sig) +{ + while (--i >= 0) { -+ unsigned int x_cg = scan_x_cg[i]; -+ unsigned int y_cg = scan_y_cg[i]; ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; + + // For the flag decode we only care about Z/NZ but -+ // we use the full Right + Down * 2 when calculating -+ // significant coeff flags so we obtain it here -+ //. ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // + // The group flag array is one longer than it needs to + // be so we don't need to check for y_cg limits -+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | -+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); + + if (i == 0 || + significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) + { -+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); ++ gf_y[0] |= (1 << x_cg); + *pPrev_sig = prev_sig; + break; + } @@ -6914,35 +10752,128 @@ index 05b2821..733efde 100644 + unsigned int stride = frame->linesize[c_idx]; + unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; + unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; -+ const int is_sliced = rpi_sliced_frame(frame); ++ const int is_sliced = av_rpi_is_sand_frame(frame); + uint8_t * dst = !is_sliced ? + s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(frame, x, y) : -+ rpi_sliced_frame_pos_c(frame, x, y); ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); + -+// if (c_idx != 0) { -+// return; -+// } + if (s->enable_rpi) { -+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; -+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? 
c_idx : 0); -+ cmd->size = log2_trafo_size; -+ cmd->c_idx = c_idx; -+ cmd->ta.buf = coeffs; -+ cmd->ta.dst = dst; -+ cmd->ta.stride = stride; ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->c_idx = c_idx; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } + } + else if (!is_sliced || c_idx == 0) { + s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); + } ++#if RPI_HEVC_SAND ++ // * These should probably never happen + else if (c_idx == 1) { -+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } + else { -+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); ++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); + } - } ++#endif ++} ++ ++ ++static void rpi_add_dc(HEVCContext * const s, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame->linesize[c_idx]; ++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; ++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; ++ const int is_sliced = av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ if (s->enable_rpi) { ++ const unsigned int i = s->jb0->intra.n; ++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ s->jb0->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->c_idx = c_idx; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? 
coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; ++ } ++ } ++} ++ ++ +#endif void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, @@ -6985,6 +10916,7 @@ index 05b2821..733efde 100644 +#endif +#ifdef RPI + int use_vpu; ++ int use_dc = 0; +#endif + int16_t *coeffs; + uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero @@ -7006,7 +10938,6 @@ index 05b2821..733efde 100644 + const int c_idx_nz = (c_idx != 0); + + int may_hide_sign; -+ // Derive QP for dequant if (!lc->cu.cu_transquant_bypass_flag) { @@ -7015,7 +10946,7 @@ index 05b2821..733efde 100644 static const uint8_t rem6[51 + 4 * 6 + 1] = { 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, -@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, }; int qp_y = lc->qp_y; @@ -7036,7 +10967,7 @@ index 05b2821..733efde 100644 } if (c_idx == 0) { -@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, qp += s->ps.sps->qp_bd_offset; } @@ -7127,7 +11058,7 @@ index 05b2821..733efde 100644 &last_significant_coeff_x, &last_significant_coeff_y); if (last_significant_coeff_x > 3) { -@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, int last_x_c = last_significant_coeff_x & 3; int last_y_c = last_significant_coeff_y & 3; @@ -7184,53 +11115,35 @@ index 05b2821..733efde 100644 - for (i = num_last_subset; i >= 0; i--) { - int n, m; - int x_cg, y_cg, x_c, y_c, pos; -+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant -+ -+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; -+ -+ { -+ const unsigned int ccount = 1 << (log2_trafo_size * 2); -+#ifdef RPI -+ use_vpu = 0; -+ if (s->enable_rpi) { -+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; -+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); -+#if HAVE_NEON -+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); -+#else -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+#endif -+ } -+ else -+#endif -+ { -+ coeffs = (int16_t*)(c_idx_nz ? 
lc->edge_emu_buffer2 : lc->edge_emu_buffer); -+ memset(coeffs, 0, ccount * sizeof(int16_t)); -+ } -+ } -+ -+ i = num_last_subset; -+ do { - int implicit_non_zero_coeff = 0; +- int implicit_non_zero_coeff = 0; - int64_t trans_coeff_level; - int prev_sig = 0; - int offset = i << 4; - int rice_init = 0; -+ int n_end; - - uint8_t significant_coeff_flag_idx[16]; -- uint8_t nb_significant_coeff_flag = 0; - +- uint8_t significant_coeff_flag_idx[16]; +- uint8_t nb_significant_coeff_flag = 0; ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant + - x_cg = scan_x_cg[i]; - y_cg = scan_y_cg[i]; -- ++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; + - if ((i < num_last_subset) && (i > 0)) { - int ctx_cg = 0; - if (x_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; - if (y_cg < (1 << (log2_trafo_size - 2)) - 1) - ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; -- ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++#ifdef RPI ++ use_vpu = 0; ++ if (s->enable_rpi) { ++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processinmg ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); + - significant_coeff_group_flag[x_cg][y_cg] = - significant_coeff_group_flag_decode(s, c_idx, ctx_cg); - implicit_non_zero_coeff = 1; @@ -7238,9 +11151,37 @@ index 05b2821..733efde 100644 - significant_coeff_group_flag[x_cg][y_cg] = - ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || - (x_cg == 0 && y_cg == 0)); -- } -- ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ // No need to clear ++ } ++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } + } ++ else ++#endif ++ { ++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ } ++ } + - last_scan_pos = num_coeff - offset - 1; ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; + unsigned int nb_significant_coeff_flag = 0; if (i == num_last_subset) { @@ -7272,23 +11213,24 @@ index 05b2821..733efde 100644 + H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 + V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 + }; ++ // N.B. 
prev_sig = Right * 2 + Down + static const uint8_t ctx_idx_maps[3][4][16] = { + { + D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + }, + { + V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 -+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 -+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 + V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default + } }; @@ -7326,7 +11268,7 @@ index 05b2821..733efde 100644 if (log2_trafo_size == 3) { scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; } else { -@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } } @@ -7375,11 +11317,12 @@ index 05b2821..733efde 100644 significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; nb_significant_coeff_flag++; } -@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } - n_end = nb_significant_coeff_flag; +- + if (nb_significant_coeff_flag != 0) { + const unsigned int gt1_idx_delta = (c_idx_nz << 2) | + ((i != 0 && !c_idx_nz) ? 2 : 0) | @@ -7427,9 +11370,6 @@ index 05b2821..733efde 100644 + coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); + } -+ // Probably not worth the overhead of starting by22 for just one value -+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); - - if (n_end) { - int first_nz_pos_in_cg; - int last_nz_pos_in_cg; @@ -7440,6 +11380,9 @@ index 05b2821..733efde 100644 - int sum_abs = 0; - int sign_hidden; - int sb_type; ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + + if (coded_val) + { + if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { @@ -7450,18 +11393,13 @@ index 05b2821..733efde 100644 + const unsigned int c_rice_param = *stat_coeff >> 2; + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- // initialize first elem of coeff_bas_level_greater1_flag +- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; + trans_coeff_level = 3 + last_coeff_abs_level_remaining; + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + } + } -- // initialize first elem of coeff_bas_level_greater1_flag -- int ctx_set = (i > 0 && c_idx == 0) ? 
2 : 0; -+ { -+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; -+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; -+ const unsigned int scale_m = blk_scale[xy_off->scale]; - - if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { - if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) - sb_type = 2 * (c_idx == 0 ? 1 : 0); @@ -7469,7 +11407,11 @@ index 05b2821..733efde 100644 - sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; - c_rice_param = lc->stat_coeff[sb_type] / 4; - } -- ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; + - if (!(i == num_last_subset) && greater1_ctx == 0) - ctx_set++; - greater1_ctx = 1; @@ -7551,9 +11493,6 @@ index 05b2821..733efde 100644 + { + const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); + const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; -+ -+ sum_abs += last_coeff_abs_level_remaining + 1; -+ *level = trans_coeff_level; - for (m = 0; m < n_end; m++) { - n = significant_coeff_flag_idx[m]; @@ -7574,6 +11513,9 @@ index 05b2821..733efde 100644 - if (lc->stat_coeff[sb_type] > 0) - lc->stat_coeff[sb_type]--; - rice_init = 1; ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ + if (stat_coeff != NULL) + update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); + stat_coeff = NULL; @@ -7678,7 +11620,7 @@ index 05b2821..733efde 100644 if (lc->cu.cu_transquant_bypass_flag) { if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); } } else { @@ -7687,56 +11629,41 @@ index 05b2821..733efde 100644 int rot = s->ps.sps->transform_skip_rotation_enabled_flag && log2_trafo_size == 2 && lc->cu.pred_mode == MODE_INTRA; -@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - for (i = 0; i < 8; i++) - FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); - } -- - s->hevcdsp.transform_skip(coeffs, log2_trafo_size); - - if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && -@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, } } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { -- s->hevcdsp.idct_4x4_luma(coeffs); -+ s->hevcdsp.idct_4x4_luma(coeffs); - } else { + s->hevcdsp.idct_4x4_luma(coeffs); +- } else { ++ } +#ifdef RPI -+ if (!use_vpu) { -+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); -+ if (max_xy == 0) { -+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -+ } else { -+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; -+ if (max_xy < 4) -+ col_limit = FFMIN(4, col_limit); -+ else if (max_xy < 8) -+ col_limit = FFMIN(8, col_limit); -+ else if (max_xy < 12) -+ col_limit = FFMIN(24, col_limit); -+ -+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); -+ } -+ } ++ else if (!use_vpu) +#else ++ else ++#endif ++ { int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); if (max_xy == 0) - 
s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); -@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, - col_limit = FFMIN(24, col_limit); - s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); - } +- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); ++ { ++#ifdef RPI ++ if (use_dc) ++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else +#endif - } - } - if (lc->tu.cross_pf) { -@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } + else { + int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; + if (max_xy < 4) +@@ -1510,7 +2279,14 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); } } +#ifdef RPI -+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ if (!use_dc) ++ { ++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } +#else s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); +#endif @@ -7744,7 +11671,7 @@ index 05b2821..733efde 100644 void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c -index 1f33b0c..3143b4f 100644 +index 9fbcd1d8b8..df129e2e46 100644 --- a/libavcodec/hevc_filter.c +++ b/libavcodec/hevc_filter.c @@ -22,6 +22,12 @@ @@ -7760,26 +11687,31 @@ index 1f33b0c..3143b4f 100644 #include "libavutil/common.h" #include "libavutil/internal.h" -@@ -31,6 +37,11 @@ +@@ -31,6 +37,16 @@ #include "bit_depth_template.c" +#ifdef RPI +#include "rpi_qpu.h" ++#endif ++#if RPI_HEVC_SAND +#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++#else ++#define RPI_ZC_SAND_8_IN_10_BUF 0 +#endif + #define LUMA 0 #define CB 1 #define CR 2 -@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; } +static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +{ -+#ifdef RPI -+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +#else + return s->ps.sps->pixel_shift; +#endif @@ -7788,7 +11720,75 @@ index 1f33b0c..3143b4f 100644 static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, intptr_t stride_dst, intptr_t stride_src) { -@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +@@ -161,12 +186,21 @@ int i, j; + } + } + ++// "DSP" these? 
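
/*
 * [Editor's note] The hevc_cabac.c hunks above call an update_rice() helper
 * in place of the open-coded lc->stat_coeff[] adjustment visible in the
 * removed lines.  Its definition is not shown in this part of the patch; a
 * reconstruction consistent with the removed code and with HEVC's persistent
 * Rice adaptation rule (the Rice parameter being *stat_coeff >> 2) would be:
 */
#include <stdint.h>

static inline void update_rice(uint8_t *const stat_coeff,
                               const unsigned int last_coeff_abs_level_remaining,
                               const unsigned int c_rice_param)
{
    /* Large escape values push the statistic up... */
    if (last_coeff_abs_level_remaining >= (3u << c_rice_param))
        ++*stat_coeff;
    /* ...small ones pull it back down, but never below zero. */
    else if (2 * last_coeff_abs_level_remaining < (1u << c_rice_param) &&
             *stat_coeff > 0)
        --*stat_coeff;
}
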
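
/*
 * [Editor's note] pixel_shift() above, and the av_rpi_sand_frame_pos_y()/_c()
 * calls throughout the filter code below, address pixels in the "sand"
 * layout: the plane is a sequence of vertical stripes, each stride1 bytes
 * wide and stride2 rows high, stored end to end (this is the
 * (x >> 8) * stride2 * stride1 arithmetic in the SAO hunk further down).
 * A sketch of the addressing under that assumption; x is a byte offset from
 * the left edge, so callers fold in the pixel shift first - for chroma, U
 * and V samples are interleaved in one plane, hence the extra +1 in
 * pixel_shift().
 */
#include <stdint.h>

static inline uint8_t *sand_frame_pos(uint8_t *const base,
                                      const unsigned int stride1, /* stripe width (bytes) */
                                      const unsigned int stride2, /* stripe height (rows) */
                                      const unsigned int x,       /* byte offset from left */
                                      const unsigned int y)
{
    /* Skip whole stripes, then walk row by row within the stripe. */
    return base + (x / stride1) * (stride1 * stride2)
                + y * stride1
                + (x % stride1);
}
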
+ static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) + { +- if (pixel_shift) +- *(uint16_t *)dst = *(uint16_t *)src; +- else +- *dst = *src; ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } + } + + static void copy_vert(uint8_t *dst, const uint8_t *src, +@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, + int stride_dst, int stride_src) + { + int i; +- if (pixel_shift == 0) { +- for (i = 0; i < height; i++) { +- *dst = *src; +- dst += stride_dst; +- src += stride_src; +- } +- } else { +- for (i = 0; i < height; i++) { +- *(uint16_t *)dst = *(uint16_t *)src; +- dst += stride_dst; +- src += stride_src; +- } ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; + } + } + +@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, int stride_src, int x, int y, int width, int height, int c_idx, int x_ctb, int y_ctb) { @@ -7797,7 +11797,7 @@ index 1f33b0c..3143b4f 100644 int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; -@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s, int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); @@ -7815,21 +11815,27 @@ index 1f33b0c..3143b4f 100644 for (n = 0; n < (min_pu_size >> vshift); n++) { memcpy(src, dst, len); src += stride_src; -@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, +@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s, static void sao_filter_CTB(HEVCContext *s, int x, int y) { - static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; ++#if SAO_FILTER_N == 5 + static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif HEVCLocalContext *lc = s->HEVClc; int c_idx; int edges[4]; // 0 left 1 top 2 right 3 bottom -@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) uint8_t right_tile_edge = 0; uint8_t up_tile_edge = 0; uint8_t bottom_tile_edge = 0; -+#ifdef RPI -+ const int sliced = rpi_sliced_frame(s->frame); ++#if RPI_HEVC_SAND ++ const int sliced = av_rpi_is_sand_frame(s->frame); + const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +#else + const int plane_count = (s->ps.sps->chroma_format_idc ? 
3 : 1); @@ -7847,7 +11853,7 @@ index 1f33b0c..3143b4f 100644 if (restore) { if (!edges[0]) { left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; -@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) } } @@ -7856,7 +11862,7 @@ index 1f33b0c..3143b4f 100644 int x0 = x >> s->ps.sps->hshift[c_idx]; int y0 = y >> s->ps.sps->vshift[c_idx]; int stride_src = s->frame->linesize[c_idx]; -@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; @@ -7865,24 +11871,24 @@ index 1f33b0c..3143b4f 100644 + ptrdiff_t stride_dst; uint8_t *dst; -+#ifdef RPI -+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; ++#if RPI_HEVC_SAND ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); + const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; + uint8_t * const src = !sliced ? -+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); + const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : + !sliced ? src - (1 << sh) : + c_idx == 0 ? -+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); + const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : + !sliced ? src + (width << sh) : + c_idx == 0 ? 
-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : -+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); ++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + + if (sliced && c_idx > 1) { @@ -7913,7 +11919,7 @@ index 1f33b0c..3143b4f 100644 + dst = lc->edge_emu_buffer; + stride_dst = 2*MAX_PB_SIZE; + copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, @@ -7934,9 +11940,11 @@ index 1f33b0c..3143b4f 100644 - s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, - sao->offset_val[c_idx], sao->band_position[c_idx], - width, height); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { ++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); ++ + s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, + sao->offset_val[1], sao->band_position[1], + sao->offset_val[2], sao->band_position[2], @@ -7952,7 +11960,7 @@ index 1f33b0c..3143b4f 100644 } sao->type_idx[c_idx] = SAO_APPLIED; break; -@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) { int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; @@ -8091,7 +12099,7 @@ index 1f33b0c..3143b4f 100644 - vert_edge, - horiz_edge, - diag_edge); -+#ifdef RPI ++#if RPI_HEVC_SAND + if (sliced && c_idx != 0) + { + // Class always the same for both U & V (which is just as well :-)) @@ -8121,18 +12129,42 @@ index 1f33b0c..3143b4f 100644 + horiz_edge, + diag_edge); + } ++ // ??? Does this actually work for chroma ??? restore_tqb_pixels(s, src, dst, stride_src, stride_dst, x, y, width, height, c_idx); sao->type_idx[c_idx] = SAO_APPLIED; -@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) + } + } } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = s->frame->linesize[0]; ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif } +// Returns 2 or 0. 
static int get_pcm(HEVCContext *s, int x, int y) { int log2_min_pu_size = s->ps.sps->log2_min_pu_size; -@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) uint8_t *src; int x, y; int chroma, beta; @@ -8141,7 +12173,7 @@ index 1f33b0c..3143b4f 100644 uint8_t no_p[2] = { 0 }; uint8_t no_q[2] = { 0 }; -@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->ps.sps->pcm.loop_filter_disable_flag) || s->ps.pps->transquant_bypass_enable_flag; @@ -8157,7 +12189,7 @@ index 1f33b0c..3143b4f 100644 if (x0) { left_tc_offset = s->deblock[ctb - 1].tc_offset; left_beta_offset = s->deblock[ctb - 1].beta_offset; -@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; @@ -8175,14 +12207,14 @@ index 1f33b0c..3143b4f 100644 - s->frame->linesize[LUMA], - beta, tc, no_p, no_q); + } -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + + // This copes properly with no_p/no_q -+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), + s->frame->linesize[LUMA], + beta, tc, no_p, no_q, -+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); + } + else +#endif @@ -8217,21 +12249,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; - src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_y(s->frame, x, y) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +#endif + &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + 4, y - 1); -@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[LUMA], beta, tc, no_p, no_q); } else @@ -8251,17 +12283,19 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_luma(src, s->frame->linesize[LUMA], beta, tc, no_p, no_q); -@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } if (s->ps.sps->chroma_format_idc) { -+#ifdef RPI -+ if (rpi_sliced_frame(s->frame)) { ++#if RPI_HEVC_SAND ++ if (av_rpi_is_sand_frame(s->frame)) { + const int v = 2; + const int h = 2; + + // vertical filtering chroma + for (y = y0; y < y_end; y += 8 * v) { ++// const int demi_y = y + 4 * v >= s->ps.sps->height; ++ const int demi_y = 0; + for (x = x0 ? 
x0 : 8 * h; x < x_end; x += 8 * h) { + const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; + const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; @@ -8269,7 +12303,7 @@ index 1f33b0c..3143b4f 100644 + if ((bs0 == 2) || (bs1 == 2)) { + const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; + const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_y ? 0 : 2 | 8; + + // tc_offset here should be set to cur_tc_offset I think + const uint32_t tc4 = @@ -8289,10 +12323,10 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, -+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), + no_f); + } + } @@ -8307,6 +12341,9 @@ index 1f33b0c..3143b4f 100644 + x_end2 = x_end - 8 * h; + + for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { ++// const int demi_x = x + 4 * v >= s->ps.sps->width; ++ const int demi_x = 0; ++ + const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; + const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; + if ((bs0 == 2) || (bs1 == 2)) { @@ -8315,7 +12352,7 @@ index 1f33b0c..3143b4f 100644 + const uint32_t tc4 = + ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | + ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); -+ unsigned int no_f = 0; ++ unsigned int no_f = !demi_x ? 0 : 2 | 8; + + if (tc4 == 0) + continue; @@ -8331,7 +12368,7 @@ index 1f33b0c..3143b4f 100644 + continue; + } + -+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), + s->frame->linesize[1], + tc4, no_f); + } @@ -8343,21 +12380,21 @@ index 1f33b0c..3143b4f 100644 for (chroma = 1; chroma <= 2; chroma++) { int h = 1 << s->ps.sps->hshift[chroma]; int v = 1 << s->ps.sps->vshift[chroma]; -@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? 
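The tc4 values built in both chroma loops above pack four small tc results into the byte lanes of one word, so tc4 == 0 doubles as a cheap "nothing to filter" test for the whole 8-line span. A minimal model of the packing (illustrative; the lane layout is read off the expressions above):

#include <stdint.h>

// First sub-edge's U/V tc go in bytes 0 and 2, the second sub-edge's in
// bytes 1 and 3; HEVC chroma tc values are small enough for a byte each.
static inline uint32_t pack_tc4(uint32_t tc_u0, uint32_t tc_v0,
                                uint32_t tc_u1, uint32_t tc_v1)
{
    return (tc_u0 | (tc_v0 << 16)) | ((tc_u1 | (tc_v1 << 16)) << 8);
}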
++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x - 1, y); no_p[1] = get_pcm(s, x - 1, y + (4 * v)); -@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8381,21 +12418,21 @@ index 1f33b0c..3143b4f 100644 } } -@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; - src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; + src = -+#ifdef RPI -+ rpi_sliced_frame(s->frame) ? -+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : ++#if RPI_HEVC_SAND ++ av_rpi_is_sand_frame(s->frame) ? ++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +#endif + &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; if (pcmf) { no_p[0] = get_pcm(s, x, y - 1); no_p[1] = get_pcm(s, x + (4 * h), y - 1); -@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) s->frame->linesize[chroma], c_tc, no_p, no_q); } else @@ -8415,7 +12452,7 @@ index 1f33b0c..3143b4f 100644 s->hevcdsp.hevc_h_loop_filter_chroma(src, s->frame->linesize[chroma], c_tc, no_p, no_q); -@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) } } @@ -8485,7 +12522,7 @@ index 1f33b0c..3143b4f 100644 void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_trafo_size) -@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, int log2_min_tu_size = s->ps.sps->log2_min_tb_size; int min_pu_width = s->ps.sps->min_pu_width; int min_tu_width = s->ps.sps->min_tb_width; @@ -8511,7 +12548,7 @@ index 1f33b0c..3143b4f 100644 boundary_upper = y0 > 0 && !(y0 & 7); if (boundary_upper && -@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_upper = 0; @@ -8588,7 +12625,7 @@ index 1f33b0c..3143b4f 100644 boundary_left = x0 > 0 && !(x0 & 7); if (boundary_left && ((!s->sh.slice_loop_filter_across_slices_enabled_flag && -@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) boundary_left = 0; @@ -8691,7 +12728,7 @@ index 1f33b0c..3143b4f 100644 } } } -@@ 
-840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, #undef CB #undef CR @@ -8701,8 +12738,8 @@ index 1f33b0c..3143b4f 100644 +static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +{ + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); + rpi_cache_flush_finish(rfe); +} +#endif @@ -8716,10 +12753,11 @@ index 1f33b0c..3143b4f 100644 + const int d0 = ((int *)f->progress->data)[0]; + const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 + -+ if (curr_y < (unsigned int)f->f->height) { ++ if (curr_y < (unsigned int)s->ps.sps->height) { + rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); -+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, -+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, ++ s->ps.sps->vshift[1], 1, 1); + rpi_cache_flush_finish(rfe); + } + } @@ -8759,7 +12797,7 @@ index 1f33b0c..3143b4f 100644 + // Call VPU + { + const vpu_qpu_job_h vqj = vpu_qpu_job_new(); -+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands ++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands + vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); + vpu_qpu_job_finish(vqj); + } @@ -8796,61 +12834,167 @@ index 1f33b0c..3143b4f 100644 if (s->ps.sps->sao_enabled) { int y_end = y >= s->ps.sps->height - ctb_size; if (y && x) -@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) sao_filter_CTB(s, x - ctb_size, y); if (y && x_end) { sao_filter_CTB(s, x, y - ctb_size); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +#endif - ff_thread_report_progress(&s->ref->tf, y, 0); ++ ff_hevc_progress_signal_recon(s, y); + } } if (x_end && y_end) { sao_filter_CTB(s, x , y); - if (s->threads_type & FF_THREAD_FRAME ) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); + if (s->threads_type == FF_THREAD_FRAME ) { +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size); + } -+ } + } +- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); + } else if (s->threads_type == FF_THREAD_FRAME && x_end) { + //int newh = y + ctb_size - 4; + //int currh = s->ref->tf.progress->data[0]; + //if (((y + ctb_size)&63)==0) +#ifdef RPI_DEBLOCK_VPU + if (s->enable_rpi_deblock) { -+ // we no longer need to flush the luma 
buffer as it is in GPU memory when using deblocking on the rpi -+ if (done_deblock) { -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); -+ } ++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi ++ if (done_deblock) { ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } + } else { +#if RPI_INTER -+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); ++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +#endif -+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); - } -- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); ++ } +#else +#if RPI_INTER + rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); -+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +#endif - ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); ++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +#endif + } } void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c +index 4a6dde0f67..8ee37ebfbc 100644 +--- a/libavcodec/hevc_mvs.c ++++ b/libavcodec/hevc_mvs.c +@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField + return 0; + } + +-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) + { + int tx, scale_factor; + +@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) + (scale_factor * src->y < 0)) >> 8); + } + +-static int check_mvset(Mv *mvLXCol, Mv *mvCol, +- int colPic, int poc, +- RefPicList *refPicList, int X, int refIdxLx, +- RefPicList *refPicList_col, int listCol, int refidxCol) ++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, ++ const int colPic, const int poc, ++ const RefPicList * const refPicList, const int X, const int refIdxLx, ++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) + { + int cur_lt = refPicList[X].isLongTerm[refIdxLx]; + int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; +@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, + refPicList_col, L ## l, temp_col.ref_idx[l]) + + // derive the motion vectors section 8.5.3.1.8 +-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, +- int refIdxLx, Mv *mvLXCol, int X, +- int colPic, RefPicList *refPicList_col) ++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, ++ const int refIdxLx, Mv * const mvLXCol, const int X, ++ const int colPic, const RefPicList * const refPicList_col) + { +- RefPicList *refPicList = s->ref->refPicList; ++ const RefPicList * const refPicList = s->ref->refPicList; + + if (temp_col.pred_flag == PF_INTRA) + return 0; +@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, + /* + * 8.5.3.1.7 temporal luma motion vector prediction + */ +-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, +- int nPbW, int nPbH, int refIdxLx, +- Mv *mvLXCol, int X) ++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ Mv * const mvLXCol, const int X) + { + MvField *tab_mvf; + MvField temp_col; + int x, y, x_pu, y_pu; +- int min_pu_width = 
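The progress calls threaded through these hunks follow one producer/consumer pattern: reconstruction publishes the highest finished luma row of a frame, and temporal-MV prediction (hevc_mvs.c, below) blocks until the row it needs is covered. On the Pi there is the extra obligation of writing those rows back out of the ARM data cache first, since the QPU reads memory through its own view. The pairing below is a sketch assembled from these hunks (hevc.h context assumed), not a new API:

// Producer side, as in ff_hevc_hls_filter() above.
static void rows_done(HEVCContext *s, int y)
{
#if RPI_INTER
    rpi_flush_ref_frame_progress(s, &s->ref->tf, y); // writeback for the QPU
#endif
    ff_hevc_progress_signal_recon(s, y);             // then publish progress
}

// Consumer side, as in temporal_luma_motion_vector() below:
//   ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
// blocks until row y of the collocated reference has been signalled.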
s->ps.sps->min_pu_width; ++ const int min_pu_width = s->ps.sps->min_pu_width; + int availableFlagLXCol = 0; + int colPic; + +- HEVCFrame *ref = s->ref->collocated_ref; ++ HEVCFrame * const ref = s->ref->collocated_ref; + +- if (!ref) { ++ if (ref == NULL || ref->tab_mvf == NULL) { + memset(mvLXCol, 0, sizeof(*mvLXCol)); + return 0; + } +@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + x = x0 + nPbW; + y = y0 + nPbH; + +- if (tab_mvf && +- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && + y < s->ps.sps->height && + x < s->ps.sps->width) { + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); +@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, + } + + // derive center collocated motion vector +- if (tab_mvf && !availableFlagLXCol) { ++ if (!availableFlagLXCol) { + x = x0 + (nPbW >> 1); + y = y0 + (nPbH >> 1); + x &= ~15; + y &= ~15; + if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_await_progress(&ref->tf, y, 0); ++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); + x_pu = x >> s->ps.sps->log2_min_pu_size; + y_pu = y >> s->ps.sps->log2_min_pu_size; + temp_col = TAB_MVF(x_pu, y_pu); diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c -index 83f2ec2..bcf53dc 100644 +index c1b69a0199..455cdaea1c 100644 --- a/libavcodec/hevc_ps.c +++ b/libavcodec/hevc_ps.c -@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) switch (sps->bit_depth) { case 8: if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; @@ -8863,17 +13007,115 @@ index 83f2ec2..bcf53dc 100644 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; break; -@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, - sps->amp_enabled_flag = get_bits1(gb); - sps->sao_enabled = get_bits1(gb); +@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) + break; + case 10: + if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16; ++#if RPI_HEVC_SAND ++ // *** Horrid kludge s.t. we start out with sand format ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10;
++#else
+ if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
++#endif
 if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
 if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
 break;
@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
 skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
 if (sps_extension_flag[0]) {
 int extended_precision_processing_flag;
- int high_precision_offsets_enabled_flag;
 int cabac_bypass_alignment_enabled_flag;
 
 sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
 "extended_precision_processing_flag not yet implemented\n");
 
 sps->intra_smoothing_disabled_flag = get_bits1(gb);
- high_precision_offsets_enabled_flag = get_bits1(gb);
- if (high_precision_offsets_enabled_flag)
+ sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+ if (sps->high_precision_offsets_enabled_flag)
 av_log(avctx, AV_LOG_WARNING,
- "high_precision_offsets_enabled_flag not yet implemented\n");
+ "high_precision_offsets_enabled_flag not fully implemented\n");
 
 sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
 
diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
index df52e401ad..992e994b1a 100644
--- a/libavcodec/hevc_refs.c
+++ b/libavcodec/hevc_refs.c
@@ -23,7 +23,7 @@
 
 #include "libavutil/avassert.h"
 #include "libavutil/pixdesc.h"
-
+#include "libavutil/rpi_sand_fns.h"
 #include "internal.h"
 #include "thread.h"
 #include "hevc.h"
@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
 HEVCFrame *frame = &s->DPB[min_idx];
 AVFrame *dst = out;
 AVFrame *src = frame->frame;
- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+ const int fmt = src->format;
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
 int pixel_shift = !!(desc->comp[0].depth > 8);
 
 ret = av_frame_ref(out, src);
@@ -215,13 +216,31 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
 ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
 if (ret < 0)
 return ret;
-
- for (i = 0; i < 3; i++) {
- int hshift = (i > 0) ? desc->log2_chroma_w : 0;
- int vshift = (i > 0) ? desc->log2_chroma_h : 0;
- int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
- (frame->window.top_offset >> vshift) * dst->linesize[i];
- dst->data[i] += off;
+#ifdef RPI
+ if (av_rpi_is_sand_format(fmt))
+ {
+ // Sand cannot be windowed by offset so add side data if we have an offset
+ const HEVCWindow * const window = &frame->window;
+ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0)
+ {
+ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVFrameDataSandInfo));
+ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+ si->left_offset = window->left_offset;
+ si->top_offset = window->top_offset;
+ si->pic_width = s->ps.sps->width;
+ si->pic_height = s->ps.sps->height;
+ }
+ }
+ else
+#endif
+ {
+ for (i = 0; i < 3; i++) {
+ int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+ int vshift = (i > 0) ? 
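The side-data route above is forced by the layout: cropping a planar frame is just a bias on data[i], but sand positions are recomputed from x on every access, so a biased base pointer would break the stripe arithmetic. AV_FRAME_DATA_SAND_INFO and AVFrameDataSandInfo are this patch's own additions; the consumer below is hypothetical and only shows how a display path might apply the window itself:

#include "libavutil/frame.h"

// Hypothetical consumer of the sand crop side data attached above.
static void apply_sand_window(const AVFrame *frame,
                              unsigned int *x0, unsigned int *y0)
{
    const AVFrameSideData *const sd =
        av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
    if (sd != NULL) {
        const AVFrameDataSandInfo *const si =
            (const AVFrameDataSandInfo *)sd->data;
        *x0 = si->left_offset; // applied at display time, with data[]
        *y0 = si->top_offset;  // still pointing at the uncropped origin
    }
}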
desc->log2_chroma_h : 0; ++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++ (frame->window.top_offset >> vshift) * dst->linesize[i]; ++ dst->data[i] += off; ++ } + } + av_log(s->avctx, AV_LOG_DEBUG, + "Output frame with POC %d.\n", frame->poc); +@@ -426,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) + frame->sequence = s->seq_decode; + frame->flags = 0; + +- if (s->threads_type == FF_THREAD_FRAME) +- ff_thread_report_progress(&frame->tf, INT_MAX, 0); ++ ff_hevc_progress_set_all_done(frame); + + return frame; + } diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c -index 9d773d9..c4d7250 100644 +index 9d773d960e..c9661c3ab1 100644 --- a/libavcodec/hevcdsp.c +++ b/libavcodec/hevcdsp.c @@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { @@ -8997,28 +13239,16 @@ index 9d773d9..c4d7250 100644 void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) { #undef FUNC -@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) -+#ifndef RPI ++#if !RPI_HEVC_SAND +#define SLICED_LOOP_FILTERS(depth) ++#define SLICED_ADD_RESIDUAL(depth) ++#define SLICED_SAO(depth) +#else -+#define SLICED_LOOP_FILTERS(depth)\ -+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ -+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ -+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) -+#endif -+ -+ - #define HEVC_DSP(depth) \ - hevcdsp->put_pcm = FUNC(put_pcm, depth); \ - hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ -@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) - hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ - hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ - hevcdsp->transform_skip = FUNC(transform_skip, depth); \ -+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ ++#define SLICED_ADD_RESIDUAL(depth)\ + hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ + hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ + hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ @@ -9027,30 +13257,77 @@ index 9d773d9..c4d7250 100644 + hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ + hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ + hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 
0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#endif ++ + #define HEVC_DSP(depth) \ + hevcdsp->put_pcm = FUNC(put_pcm, depth); \ +- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \ +- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ +- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ +- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ - hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->transform_skip = FUNC(transform_skip, depth); \ ++ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ -@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ + hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ +@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) + hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ + hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ + \ +- hevcdsp->sao_band_filter[0] = \ +- hevcdsp->sao_band_filter[1] = \ +- hevcdsp->sao_band_filter[2] = \ +- hevcdsp->sao_band_filter[3] = \ +- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ +- hevcdsp->sao_edge_filter[0] = \ +- hevcdsp->sao_edge_filter[1] = \ +- hevcdsp->sao_edge_filter[2] = \ +- hevcdsp->sao_edge_filter[3] = \ +- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ \ -+ hevcdsp->sao_band_filter_c[0] = \ -+ hevcdsp->sao_band_filter_c[1] = \ -+ hevcdsp->sao_band_filter_c[2] = \ -+ hevcdsp->sao_band_filter_c[3] = \ -+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ -+ hevcdsp->sao_edge_filter_c[0] = \ -+ hevcdsp->sao_edge_filter_c[1] = \ -+ hevcdsp->sao_edge_filter_c[2] = \ -+ hevcdsp->sao_edge_filter_c[3] = \ -+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ -+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ -+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ -+ \ QPEL_FUNCS(depth); \ QPEL_UNI_FUNCS(depth); \ - QPEL_BI_FUNCS(depth); \ -@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +@@ -232,6 +383,7 @@ void 
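For orientation, the table wiring above resolves its names through hevcdsp.c's existing paste macro, one template instantiation per bit depth:

// Already present in hevcdsp.c:
//   #define FUNC(a, depth) a ## _ ## depth
// so, for example, with depth == 8 the loops above expand along the lines of
//   hevcdsp->sao_band_filter_c[i] = sao_band_filter_c_8;
//   hevcdsp->add_residual_u[0]    = add_residual4x4_u_8;
// where the _8 bodies come from hevcdsp_template.c via bit_depth_template.c.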
ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) EPEL_UNI_FUNCS(depth); \ EPEL_BI_FUNCS(depth); \ \ @@ -9058,7 +13335,7 @@ index 9d773d9..c4d7250 100644 hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ -@@ -257,6 +404,8 @@ int i = 0; +@@ -257,6 +409,8 @@ int i = 0; break; } @@ -9068,10 +13345,18 @@ index 9d773d9..c4d7250 100644 ff_hevc_dsp_init_x86(hevcdsp, bit_depth); if (ARCH_ARM) diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h -index 9f1f6dd..639ecf1 100644 +index 9f1f6dd59f..c4a1b0f09d 100644 --- a/libavcodec/hevcdsp.h +++ b/libavcodec/hevcdsp.h -@@ -42,11 +42,26 @@ typedef struct SAOParams { +@@ -25,6 +25,7 @@ + #ifndef AVCODEC_HEVCDSP_H + #define AVCODEC_HEVCDSP_H + ++#include "rpi_opts.h" + #include "get_bits.h" + + #define MAX_PB_SIZE 64 +@@ -42,11 +43,40 @@ typedef struct SAOParams { uint8_t type_idx[3]; ///< sao_type_idx } SAOParams; @@ -9085,45 +13370,69 @@ index 9f1f6dd..639ecf1 100644 + int8_t ref_idx[2]; + int8_t pred_flag; +} MvField; ++ ++#ifdef RPI ++#define SAO_FILTER_N 6 ++#else ++#define SAO_FILTER_N 5 ++#endif ++ + typedef struct HEVCDSPContext { void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, struct GetBitContext *gb, int pcm_bit_depth); -+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, -+ struct GetBitContext *gb, int pcm_bit_depth); - void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); ++ // add_residual was transform_add - import 3.3 names + void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); -+ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); ++#if RPI_HEVC_SAND ++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); ++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); ++ ++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); ++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); ++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++#endif void (*transform_skip)(int16_t *coeffs, int16_t log2_size); -@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { +@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext { - void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + void (*idct_dc[4])(int16_t *coeffs); + +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, int16_t *sao_offset_val, int sao_left_class, int width, int height); -+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++#if RPI_HEVC_SAND ++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, + const int16_t *sao_offset_val_v, int sao_left_class_v, + int width, int height); ++#endif /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + 
AV_INPUT_BUFFER_PADDING_SIZE */ - void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, int16_t *sao_offset_val, int sao_eo_class, int width, int height); -+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++#if RPI_HEVC_SAND ++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++#endif void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#if RPI_HEVC_SAND + void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, + struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, + uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++#endif void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, int height, intptr_t mx, intptr_t my, int width); -@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { +@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext { void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, int32_t *tc, uint8_t *no_p, uint8_t *no_q); @@ -9147,23 +13456,24 @@ index 9f1f6dd..639ecf1 100644 void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c -index b840d17..32b9e47 100644 +index 5bca02342d..122fbe8154 100644 --- a/libavcodec/hevcdsp_template.c +++ b/libavcodec/hevcdsp_template.c -@@ -26,6 +26,9 @@ +@@ -26,6 +26,7 @@ #include "bit_depth_template.c" #include "hevcdsp.h" -+#ifdef RPI -+#include "rpi_zc.h" -+#endif ++#include "rpi_shader_template.h" static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, GetBitContext *gb, int pcm_bit_depth) -@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height } } +-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride, int size) ++#if RPI_HEVC_SAND +static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, + GetBitContext *gb, int pcm_bit_depth) +{ @@ -9185,17 +13495,20 @@ index b840d17..32b9e47 100644 + dst += stride; + } +} ++#endif + -+ - static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, - ptrdiff_t stride, int size) ++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride, int size) { -@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe + int x, y; + pixel *dst = (pixel *)_dst; +@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe } } -+static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride, int size) +-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, +- 
ptrdiff_t stride) ++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) +{ + int x, y; + pixel *dst = (pixel *)_dst; @@ -9203,77 +13516,300 @@ index b840d17..32b9e47 100644 + stride /= sizeof(pixel); + + for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ dst[x] = av_clip_pixel(dst[x] + dc); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#if RPI_HEVC_SAND ++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_v, int size) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 4); ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { + for (x = 0; x < size * 2; x += 2) { + dst[x] = av_clip_pixel(dst[x] + *res); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ res++; ++ } ++ dst += stride; ++ } + } + +-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, const int dc_u, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); + res++; + } + dst += stride; + } +} + - static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, ++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, ++ ptrdiff_t stride, unsigned int size) ++{ ++ unsigned int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int16_t * ru = res; ++ const int16_t * rv = res + size * size; ++ ++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); ++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); ++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); ++ } ++ dst += stride; ++ } ++ ++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); ++} ++ ++ ++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ const int dc_v = dc >> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ } ++ dst += stride; ++ } ++} ++ ++ ++#endif ++ ++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, coeffs, stride, 4); ++} ++ ++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs, ptrdiff_t stride) { -@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, - FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +- FUNC(transquant_bypass)(_dst, coeffs, stride, 8); ++ FUNC(add_residual)(_dst, coeffs, stride, 8); } +-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, ++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 16); ++ FUNC(add_residual)(_dst, coeffs, stride, 16); + } + +-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t 
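The _u/_v kernels above encode the "plaited" chroma rule: U and V are interleaved in one plane, so a residual pass over one component must still apply the other component's DC-only correction to the bytes it strides over. A minimal 8-bit model (illustrative; the real code is the templated add_residual_u() above):

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   // av_clip_uint8()

// Even bytes carry U, odd bytes carry V: the U residual pass folds in
// V's DC term as it goes, mirroring add_residual_u() above.
static void add_residual_u_model(uint8_t *dst, const int16_t *res,
                                 ptrdiff_t stride, int dc_v, int size)
{
    for (int y = 0; y < size; y++, dst += stride) {
        for (int x = 0; x < size * 2; x += 2) {
            dst[x]     = av_clip_uint8(dst[x]     + *res++); // U lane
            dst[x + 1] = av_clip_uint8(dst[x + 1] + dc_v);   // V lane, DC only
        }
    }
}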
*coeffs, ++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs, + ptrdiff_t stride) + { +- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); ++ FUNC(add_residual)(_dst, coeffs, stride, 32); + } + ++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 32); ++} ++ ++#if RPI_HEVC_SAND +// -- U -- (plaited) + -+static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 4); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); +} + -+static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 8); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); +} + -+static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 16); ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); +} + -+static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res, -+ ptrdiff_t stride) ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) +{ -+ FUNC(add_residual_uv)(_dst, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} + +// -- V -- (plaited) + -+static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 4); ++ FUNC(add_residual_c)(_dst, res, stride, 4); +} + -+static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 8); ++ FUNC(add_residual_c)(_dst, res, stride, 8); +} + -+static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ 
FUNC(add_residual_uv)(_dst + 1, res, stride, 16); ++ FUNC(add_residual_c)(_dst, res, stride, 16); +} + -+static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, + ptrdiff_t stride) +{ -+ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); +} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++#endif + static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) { -@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size) + assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ + } while (0) + +-static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++static void FUNC(idct_4x4_luma)(int16_t *coeffs) + { + int i; + int shift = 7; +@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride + } + } + ++ ++#if BIT_DEPTH == 10 ++#if RPI_HEVC_SAND ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++#endif ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 + static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, + int *borders, int _width, int _height, +@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9281,7 +13817,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, width = _width, height = _height; -@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9321,7 +13857,7 @@ index b840d17..32b9e47 100644 height--; } } -@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -417,7 +688,6 @@ 
static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, int x, y; pixel *dst = (pixel *)_dst; pixel *src = (pixel *)_src; @@ -9329,7 +13865,7 @@ index b840d17..32b9e47 100644 int sao_eo_class = sao->eo_class[c_idx]; int init_x = 0, init_y = 0, width = _width, height = _height; -@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, if (sao_eo_class != SAO_EO_VERT) { if (borders[0]) { @@ -9370,24 +13906,22 @@ index b840d17..32b9e47 100644 height--; } } -@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, + } } - ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif + +// --- Plaited chroma versions + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, -+ const int16_t *sao_offset_val_u, int sao_left_class_u, -+ const int16_t *sao_offset_val_v, int sao_left_class_v, -+ int width, int height) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++#if RPI_HEVC_SAND ++ +static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, + ptrdiff_t stride_dst, ptrdiff_t stride_src, + const int16_t *sao_offset_val_u, int sao_left_class_u, @@ -9413,23 +13947,17 @@ index b840d17..32b9e47 100644 + for (y = 0; y < height; y++) { + for (x = 0; x < width; x += 2) + { -+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); -+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); + } + dst += stride_dst; + src += stride_src; + } +} -+#endif -+ -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, -+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, -+ int eo, int width, int height) { -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else + +static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, + const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, @@ -9447,9 +13975,12 @@ index b840d17..32b9e47 100644 + int a_stride, b_stride; + int x, y; + ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); ++ + stride_dst /= sizeof(pixel); + width *= 2; + ++ av_assert0(width <= 64); ++ + a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; + b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; + for (y = 0; y < height; y++) { @@ -9467,43 +13998,42 @@ index b840d17..32b9e47 100644 + dst += stride_dst; + } +} -+#endif + -+#if BIT_DEPTH != 8 -+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, 
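sao_band_filter_c() above is the usual SAO band trick run once per plaited component: a sample's top five bits select one of 32 bands, only four consecutive bands starting at sao_left_class carry offsets, and materialising a 32-entry table first leaves a shift and an index in the inner loop. A single-component 8-bit model:

#include <stdint.h>

// Illustrative only; the patch does this twice, with offset_table_u/_v.
static void sao_band_model(uint8_t *dst, const uint8_t *src, int n,
                           const int16_t offsets[4], int left_class)
{
    int16_t table[32] = { 0 };
    for (int k = 0; k < 4; k++)
        table[(k + left_class) & 31] = offsets[k];
    for (int i = 0; i < n; i++) {
        const int v = src[i] + table[src[i] >> 3]; // top 5 bits pick a band
        dst[i] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
    }
}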
AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, -+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, -+ int *borders, int _width, int _height, -+ int c_idx, uint8_t *vert_edge, -+ uint8_t *horiz_edge, uint8_t *diag_edge) -+{ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#else ++// Do once ++#if BIT_DEPTH == 8 +// Any old 2 byte 'normal' restore will work for these -+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10 -+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10 ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +#endif + ++#endif // RPI_HEVC_SAND + + #undef CMP - //////////////////////////////////////////////////////////////////////////////// -@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, +@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, #undef TQ1 #undef TQ2 #undef TQ3 + -+#ifdef RPI ++#if RPI_HEVC_SAND + +// line zero +#define P3 pix_l[0 * xstride] @@ -9717,7 +14247,7 @@ index b840d17..32b9e47 100644 +#endif + diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c -index 02c1766..cea16ea 100644 +index 02c1766059..cea16eade4 100644 --- a/libavcodec/hevcpred.c +++ b/libavcodec/hevcpred.c @@ -24,6 +24,7 @@ @@ -9799,7 +14329,7 @@ index 02c1766..cea16ea 100644 case 9: HEVC_PRED(9); diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h -index eb17663..00ba3f9 100644 +index eb17663683..00ba3f94c0 100644 --- a/libavcodec/hevcpred.h +++ b/libavcodec/hevcpred.h @@ -38,6 +38,17 @@ typedef struct HEVCPredContext { @@ -9821,10 +14351,10 @@ index eb17663..00ba3f9 100644 void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c -index 6ae87cc..c14dddd 100644 +index 6fe33546b1..2f9f5f2798 100644 --- a/libavcodec/hevcpred_template.c +++ b/libavcodec/hevcpred_template.c -@@ -20,13 +20,55 @@ +@@ -20,13 +20,110 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -9836,34 +14366,90 @@ index 6ae87cc..c14dddd 100644 #include "hevcpred.h" +#ifdef RPI -+#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" +#endif + +#define DUMP_PRED 0 + #define POS(x, y) src[(x) + stride * (y)] -+#if PRED_C -+ ++// REPEAT_INCLUDE defined at EOF ++#if defined(RPI) && !defined(INCLUDED_ONCE) +typedef uint8_t (* c8_dst_ptr_t)[2]; +typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* 
c16_dst_ptr_t)[2];
+typedef const uint16_t (* c16_src_ptr_t)[2];
+
+// *** On ARM make these NEON registers
+typedef struct pixel4_16 {
+ uint16_t x[4];
+} pixel4_16;
+typedef struct pixel4_32 {
+ uint32_t x[4];
+} pixel4_32;
+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
+{
+ pixel4_16 t = {{x, x, x, x}};
+ return t;
+}
+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
+{
+ pixel4_32 t = {{x, x, x, x}};
+ return t;
+}
+#endif
+
+#if PRED_C
+// For chroma we double pixel size so we copy pairs
+#undef pixel
+#undef pixel2
+#undef pixel4
+#undef dctcoef
+#undef INIT_CLIP
+#undef no_rnd_avg_pixel4
+#undef rnd_avg_pixel4
+#undef AV_RN2P
+#undef AV_RN4P
+#undef AV_RN4PA
+#undef AV_WN2P
+#undef AV_WN4P
+#undef AV_WN4PA
+#undef CLIP
+#undef FUNC
+#undef FUNCC
+#undef av_clip_pixel
+#undef PIXEL_SPLAT_X4
+
+#if BIT_DEPTH == 8
+#define pixel uint16_t
+#define pixel4 pixel4_16
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
+#define cpel uint8_t
+#define c_src_ptr_t c8_src_ptr_t
+#define c_dst_ptr_t c8_dst_ptr_t
+#else
+#define pixel uint32_t
+#define pixel4 pixel4_32
+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
+#define cpel uint16_t
+#define c_src_ptr_t c16_src_ptr_t
+#define c_dst_ptr_t c16_dst_ptr_t
+#endif
+#define AV_RN4P(p) (*(pixel4*)(p))
+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
+#endif
+
+
+// Get PW prior to horrid PRED_C trickery
+#if BIT_DEPTH == 8
+#define PW 1
+#else
+#define PW 2
+#endif
+
+
+#if DUMP_PRED && !defined(INCLUDED_ONCE)
+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
+{
+ for (unsigned int y = 0; y != size; y++, data += stride * 2) {
+ for (unsigned int x = 0; x != size; x++) {
+ printf("%4d", data[x * 2]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+}
+#endif
+
 static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0,
 int log2_size, int c_idx)
 {
@@ -69,8 +166,11 @@ do { \
 AV_WN4P(&ptr[i], a); \
 else \
 a = PIXEL_SPLAT_X4(ptr[i + 3])
-
+#ifdef RPI
+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ;
+#else
 HEVCLocalContext *lc = s->HEVClc;
+#endif
 int i;
 int hshift = s->ps.sps->hshift[c_idx];
 int vshift = s->ps.sps->vshift[c_idx];
@@ -79,15 +179,23 @@ do { \
 int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
 int size_in_luma_v = size << vshift;
 int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
- int x = x0 >> hshift;
- int y = y0 >> vshift;
+ const int x = x0 >> hshift;
+ const int y = y0 >> vshift;
 int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
 int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
 
 int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb);
 
- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel);
+#if defined(RPI)
+ pixel *const src = !av_rpi_is_sand_frame(s->frame) ?
+ (pixel*)s->frame->data[c_idx] + x + y * stride :
+ c_idx == 0 ? 
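The macro block above is the whole chroma-prediction strategy in miniature: a plaited U,V pair is re-typed as one wider "pixel" (16-bit at depth 8, 32-bit at depth 10), so the untouched copy-style luma code moves both components at once, and only the kernels that do arithmetic on neighbouring samples need the explicit _c rewrites later in this file. An assumed illustration of the aliasing:

#include <stdint.h>

// What '#define pixel uint16_t' means for 8-bit plaited chroma: one
// "pixel" load or store carries a U,V pair together.
typedef union {
    uint8_t  uv[2];  // [0] = U, [1] = V, as stored in the sand plane
    uint16_t pel;    // the same pair as seen by the PRED_C template code
} plaited8;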
-+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : -+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); ++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +#else pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +#endif int min_pu_width = s->ps.sps->min_pu_width; -@@ -95,14 +148,20 @@ do { \ +@@ -95,14 +203,20 @@ do { \ lc->tu.intra_pred_mode; pixel4 a; pixel left_array[2 * MAX_TB_SIZE + 1]; @@ -9941,7 +14526,7 @@ index 6ae87cc..c14dddd 100644 int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); int cand_left = lc->na.cand_left; int cand_up_left = lc->na.cand_up_left; -@@ -114,6 +173,26 @@ do { \ +@@ -114,6 +228,27 @@ do { \ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - (x0 + size_in_luma_h)) >> hshift; @@ -9954,10 +14539,11 @@ index 6ae87cc..c14dddd 100644 +#endif + +#if defined(RPI) -+ if (s->frame->format == AV_PIX_FMT_SAND128) { ++ if (av_rpi_is_sand_frame(s->frame)) { ++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs + const AVFrame * const frame = s->frame; + const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 -+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; + if ((x & mask) == 0) + src_l -= stripe_adj; + if (((x + size) & mask) == 0) @@ -9968,7 +14554,7 @@ index 6ae87cc..c14dddd 100644 if (s->ps.pps->constrained_intra_pred_flag == 1) { int size_in_luma_pu_v = PU(size_in_luma_v); int size_in_luma_pu_h = PU(size_in_luma_h); -@@ -163,23 +242,24 @@ do { \ +@@ -163,23 +298,24 @@ do { \ top[-1] = 128; } if (cand_up_left) { @@ -10000,29 +14586,29 @@ index 6ae87cc..c14dddd 100644 size - bottom_left_size); } -@@ -268,7 +348,11 @@ do { \ +@@ -268,7 +404,11 @@ do { \ cand_up_left = 1; cand_left = 1; } else { // No samples available -+#if PRED_C && BIT_DEPTH == 16 -+ left[-1] = 0x8080; ++#if PRED_C ++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +#else left[-1] = (1 << (BIT_DEPTH - 1)); +#endif EXTEND(top, left[-1], 2 * size); EXTEND(left, left[-1], 2 * size); } -@@ -287,6 +371,9 @@ do { \ +@@ -287,6 +427,9 @@ do { \ top[-1] = left[-1]; // Filtering process -+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to ++ // Sand can only apply to chroma_format_idc == 1 so we don't need to + // worry about chroma smoothing for that case +#if !PRED_C if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { if (mode != INTRA_DC && size != 4){ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; -@@ -342,13 +429,46 @@ do { \ +@@ -342,6 +485,30 @@ do { \ mode); break; } @@ -10052,24 +14638,8 @@ index 6ae87cc..c14dddd 100644 +#endif } -+#if !PRED_C || BIT_DEPTH == 16 #define INTRA_PRED(size) \ - static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ - { \ - FUNC(intra_pred)(s, x0, y0, size, c_idx); \ - } -+#else -+#define INTRA_PRED(size) \ -+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ -+ abort(); \ -+} -+#endif - - INTRA_PRED(2) - INTRA_PRED(3) -@@ -357,6 +477,7 @@ INTRA_PRED(5) +@@ -357,6 +524,7 @@ INTRA_PRED(5) #undef INTRA_PRED @@ -10077,7 +14647,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const 
uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int trafo_size) -@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); } @@ -10088,9 +14658,9 @@ index 6ae87cc..c14dddd 100644 +{ + int x, y; + int size = 1 << trafo_size; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + + for (y = 0; y < size; y++, src += stride) + { @@ -10105,26 +14675,9 @@ index 6ae87cc..c14dddd 100644 +} +#endif -+#if !PRED_C || BIT_DEPTH == 16 #define PRED_PLANAR(size)\ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ - const uint8_t *left, ptrdiff_t stride) \ - { \ - FUNC(pred_planar)(src, top, left, stride, size + 2); \ - } -+#else -+#define PRED_PLANAR(size)\ -+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ -+ const uint8_t *left, ptrdiff_t stride) \ -+{ \ -+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ -+ abort(); \ -+} -+#endif - - PRED_PLANAR(0) - PRED_PLANAR(1) -@@ -386,6 +540,7 @@ PRED_PLANAR(3) +@@ -386,6 +577,7 @@ PRED_PLANAR(3) #undef PRED_PLANAR @@ -10132,7 +14685,7 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, ptrdiff_t stride, int log2_size, int c_idx) -@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, POS(0, y) = (left[y] + 3 * dc + 2) >> 2; } } @@ -10143,9 +14696,9 @@ index 6ae87cc..c14dddd 100644 +{ + unsigned int i, j; + const unsigned int size = (1 << log2_size); -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; + unsigned int dc0 = size; + unsigned int dc1 = size; + @@ -10186,7 +14739,7 @@ index 6ae87cc..c14dddd 100644 static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const uint8_t *_top, const uint8_t *_left, -@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, const pixel *top = (const pixel *)_top; const pixel *left = (const pixel *)_left; @@ -10202,7 +14755,7 @@ index 6ae87cc..c14dddd 100644 int angle = intra_pred_angle[mode - 2]; pixel ref_array[3 * MAX_TB_SIZE + 4]; pixel *ref_tmp = ref_array + size; -@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, } } } @@ -10214,26 +14767,26 @@ index 6ae87cc..c14dddd 100644 + int mode, int size) +{ + int x, y; -+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; -+ c8_src_ptr_t top = (c8_src_ptr_t)_top; -+ c8_src_ptr_t left = (c8_src_ptr_t)_left; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; + + const int angle = intra_pred_angle[mode - 2]; -+ uint8_t ref_array[3 * 
MAX_TB_SIZE + 4][2]; -+ c8_dst_ptr_t ref_tmp = ref_array + size; -+ c8_src_ptr_t ref; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; + const int last = (size * angle) >> 5; + + if (mode >= 18) { + ref = top - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, top - 1, (size + 1) * 2); ++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (y = 0; y < size; y++, src += stride) { @@ -10247,19 +14800,19 @@ index 6ae87cc..c14dddd 100644 + fact * ref[x + idx + 2][1] + 16) >> 5; + } + } else { -+ memcpy(src, ref + idx + 1, size * 2); ++ memcpy(src, ref + idx + 1, size * 2 * PW); + } + } + } else { + ref = left - 1; + if (angle < 0 && last < -1) { -+ memcpy(ref_tmp, left - 1, (size + 1) * 2); ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); + for (x = last; x <= -1; x++) + { + ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; + ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; + } -+ ref = (c8_src_ptr_t)ref_tmp; ++ ref = (c_src_ptr_t)ref_tmp; + } + + for (x = 0; x < size; x++, src++) { @@ -10286,138 +14839,4158 @@ index 6ae87cc..c14dddd 100644 static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, const uint8_t *left, -diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c -index 099a8c5..bdff2d2 100644 ---- a/libavcodec/mmaldec.c -+++ b/libavcodec/mmaldec.c -@@ -24,6 +24,9 @@ - * MMAL Video Decoder - */ +@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); + } -+#pragma GCC diagnostic push -+// Many many redundant decls in the header files -+#pragma GCC diagnostic ignored "-Wredundant-decls" - #include - #include - #include -@@ -31,6 +34,7 @@ - #include - #include - #include -+#pragma GCC diagnostic pop - - #include "avcodec.h" - #include "internal.h" -diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c -index 3adf28d..2f9195f 100644 ---- a/libavcodec/mpeg4videodec.c -+++ b/libavcodec/mpeg4videodec.c -@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - - if (ctx->divx_version >= 0) - s->workaround_bugs |= FF_BUG_HPEL_CHROMA; ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ + #undef EXTEND_LEFT_CIP + #undef EXTEND_RIGHT_CIP + #undef EXTEND_UP_CIP +@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, + #undef EXTEND + #undef MIN_TB_ADDR_ZS + #undef POS ++#undef PW ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif + -+ if (ctx->num_sprite_warping_points > 1) -+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; - } - - if (s->workaround_bugs & FF_BUG_STD_QPEL) { -@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) - s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, - ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); - -+ avctx->workaround_bugs = s->workaround_bugs; - if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && - s->codec_id == AV_CODEC_ID_MPEG4 && - avctx->idct_algo == FF_IDCT_AUTO) { diff --git a/libavcodec/raw.c b/libavcodec/raw.c -index bfa2537..1bca89e 100644 +index d36b68bfae..b526dc393d 100644 --- a/libavcodec/raw.c +++ b/libavcodec/raw.c -@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + /* RPI */ +#ifdef RPI + { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +#endif + /* special */ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c -index d837056..81256b5 100644 +index d83705645c..4c746786ff 100644 --- a/libavcodec/rawenc.c +++ b/libavcodec/rawenc.c -@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS +@@ -31,6 +31,8 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -47,6 +49,73 @@ FF_ENABLE_DEPRECATION_WARNINGS return 0; } -+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) -+{ -+ for (int y = 0; y != frame->height / 2; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; -+ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; -+ for (int i = 0; i < w; ++i) -+ *dst++ = p[i * 2]; -+ } -+ } -+ return dst; -+} -+ -+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++#ifdef RPI ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame) +{ -+ int size = frame->width * frame->height * 3 / 2; ++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; + uint8_t * dst; + int ret; + ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3 / 2; + if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) + return ret; + + dst = pkt->data; + -+ // Luma is "easy" -+ for (int y = 0; y != frame->height; ++y) { -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ const int w = FFMIN(frame->linesize[0], frame->width - x); -+ memcpy(dst, -+ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); -+ dst += w; -+ } -+ } -+ // Chroma is dull -+ dst = cpy_sand_c(dst, frame, 0); -+ dst = cpy_sand_c(dst, frame, 1); -+ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); + return 0; +} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const AVFrameSideData *const sd 
= av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); ++ int size; ++ int width = frame->width; ++ int height = frame->height; ++ int x0 = 0; ++ int y0 = 0; ++ uint8_t * dst; ++ int ret; ++ ++ if (sd != NULL) { ++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; ++ ++ x0 = si->left_offset; ++ y0 = si->top_offset; ++ } ++ ++ size = width * height * 3; ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++#endif ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, const AVFrame *frame, int *got_packet) { -@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +@@ -56,6 +125,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, if (ret < 0) return ret; -+ if (frame->format == AV_PIX_FMT_SAND128) { -+ ret = raw_sand_as_yuv420(avctx, pkt, frame); ++#ifdef RPI ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); + *got_packet = (ret == 0); + return ret; + } ++#endif + if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) return ret; if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, -diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s new file mode 100644 -index 0000000..4309f1c +index 0000000000..391f761df9 --- /dev/null -+++ b/libavcodec/rpi_hevc_transform.h ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,923 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
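++#
++# The assembled VPU code for this source is checked in alongside it as the
++# byte arrays in rpi_hevc_transform8.h and rpi_hevc_transform10.h below
++# (seemingly one assembly per BIT_DEPTH, 8 and 10 - see TRANS_SHIFT below).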
++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# HEVC VPU Transform ++# fe ++# Transform matrix can be thought of as ++# output row vector = input row vector * transMatrix2 ++# ++# The even rows of the matrix are symmetric ++# The odd rows of the matrix are antisymmetric ++# ++# So only need to compute the first half of the results, then can compute the remainder with a butterfly ++# ++# EXAMPLE ++# (a b c d) (1 2 2 1) ++# (3 4 -4 -3) ++# (5 6 6 5) ++# (7 8 -8 -7) ++# ++# x=(a c)(1 2) = 1a+5c 2a+6c ++# (5 6) ++# ++# y=(b d)(3 4) = 3b+7d 4b+8d ++# (7 8) ++# ++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d ++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d ++# ++# Final results are (u , v[::-1]) ++# ++# ++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) ++# Apply the even matrix first and stop before rounding ++# Then apply the odd matrix in a full manner: ++# ++# First step is to compute partial products with the first input (16 cycles) ++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output ++# 2a 4b 6c 8d ++# 2a -4b 6c -8d ++# 1a -3b 5c -7d ++# ++# Second step is to sum partial products into final position (8 cycles) ++# 1a+3b+5c+7d ++# 2a+4b+6c+8d ++# 2a-4b+6c-8d ++# 1a-3b+5c-7d ++# ++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) ++# ++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds) ++# ++# For 8x8 we could compute two in parallel. ++# ++# ++ ++# Columns are transformed first ++# ++# Store top left half of transMatrix2 in ++# Store bottom left half of transMatrix2 in HX(32,32) ++# ++# For 16x16 ++# HX(0:15,0) contains input data before transform ++# HY(0:15,0) contains 32bit output data after transform ++# HX(32,0) contains even rows of left half of transMatrix2 ++# HX(32,32) contains odd rows of left half of transMatrix2 ++# HY(48,0) contains partial products ready for summing ++# ++ ++ ++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# coeffs32 ++# num32: number of 32x32 transforms ++# command 0 for transform, 1 for memclear16(int16_t *dst,num16) ++# ++ ++.equ TRANS_SHIFT, 20 - BIT_DEPTH ++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) ++.equ TRANS_ASL2, 16 - TRANS_SHIFT ++ ++ ++hevc_trans_16x16: ++ cmp r5,1 ++ beq memclear16 ++ cmp r5,2 ++ beq hevc_deblock_16x16 ++ cmp r5,3 ++ beq hevc_uv_deblock_16x16 ++ cmp r5,4 ++ beq hevc_uv_deblock_16x16_with_clear ++ cmp r5,5 ++ beq hevc_run_command_list ++ ++ push r6-r15, lr # TODO cut down number of used registers ++ mov r14,r3 # coeffs32 ++ mov r15,r4 # num32 ++ mov r3, 16*2 # Stride of transMatrix2 in bytes ++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix ++ ++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix ++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ # Now use r0 to describe which matrix we are working on. 
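++ # (The swap is the 'eor r0,r8' in block_loop below: r8 is set to 64*16,
++ # so the VRF offset in r0 toggles between the two halves of the register
++ # file, and the next block is fetched into whichever half is idle while
++ # the current one is transformed.)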
++ # Allows us to prefetch the next block of coefficients for efficiency. ++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++block_loop: ++ eor r0,r8 ++ add r1,r7 ++ # Prefetch the next block ++ vldh HX(0++,0)+r0,(r1 += r3) REP 16 ++ eor r0,r8 ++ sub r1,r7 ++ ++ # Transform the current block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++ ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# ++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of 
coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) ++ # set r8 to 32byte aligned stack pointer ++ add r8,sp,31 ++ lsr r8,5 ++ lsl r8,5 ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++block_loop32: ++ ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++ ++ add sp,sp,32*32*2+32 # Restore stack ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++memclear16: ++ # r0 is address ++ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) ++ vmov HX(0++,0),0 REP 16 ++ mov r2,32 ++loop: ++ vsth HX(0++,0),(r0+=r2) REP 16 ++ add r0,16*16*2 ++ sub r1,16*16 ++ cmp r1,0 ++ bgt loop ++ b lr ++ ++ ++################################################################################ ++# HEVC VPU Deblock ++# ++# Vertical edges before horizontal ++# Decision 
can change every 4 pixels, but only 8 pixel boundaries are deblocked ++# ++# ARM is responsible for storing beta and tc for each 4 pixels horiz and vert edge. ++# The VPU code works in units of 16x16 blocks. ++# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). ++# One final horizontal filter is required at the end. ++# PCM is not allowed in this code. ++# ++# ++# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) ++# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. ++ ++.set P0,63 ++.set P1,62 ++.set P2,61 ++.set P3,60 ++.set Q0,59 ++.set Q1,58 ++.set Q2,57 ++.set Q3,56 ++ ++.set dp,32 ++.set dq,33 ++.set d,34 ++.set decision,35 ++.set beta,36 ++.set beta2,37 ++.set beta3,38 ++.set ptest,39 ++.set qtest,40 ++.set pqtest,41 ++.set thresh,42 ++.set deltatest, 44 ++.set deltap1, 45 ++.set tc25, 46 ++.set setup,47 ++.set tc,48 ++.set tc25,49 ++.set tc2, 50 ++.set do_filter, 51 ++.set delta, 52 ++.set tc10, 53 ++.set delta0, 54 ++.set delta1, 55 ++.set zeros, 0 ++.set setup_input, 1 ++.set deltaq1, 2 ++ ++ ++ ++# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. ++# Row has num16 16x16 blocks across ++# Beta goes from 0 to 64 ++# tc goes from 0 to 24 ++# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] ++# has 8 bytes per edge ++# has 16 bytes per direction ++# has 32 bytes per 16x16 block ++# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) ++hevc_deblock_16x16: ++ push r6-r15, lr ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++ ++process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl vert_filter ++ sub r3,8 ++ b start_deblock_loop ++deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ vstb H(zeros,0),(r4) ++ bl vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels 
for the previous block ++skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) ++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ vstb H(zeros,0),-16(r4) ++ bl horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt start_again ++ pop r6-r15, pc ++start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++vert_filter: ++ push lr ++ ++ vmov HX(P3,0), V(16,12)+r3 ++ vmov HX(P2,0), V(16,13)+r3 ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ vmov HX(Q2,0), V(16,18)+r3 ++ vmov HX(Q3,0), V(16,19)+r3 ++ ++ bl do_luma_filter ++ ++ vadds V(16,13)+r3, HX(P2,0), 0 ++ vadds V(16,14)+r3, HX(P1,0), 0 ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ vadds V(16,17)+r3, HX(Q1,0), 0 ++ vadds V(16,18)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++horz_filter: ++ push lr ++ ++ vmov HX(P3,0), H(12,0)+r3 ++ vmov HX(P2,0), H(13,0)+r3 ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ vmov HX(Q2,0), H(18,0)+r3 ++ vmov HX(Q3,0), H(19,0)+r3 ++ ++ bl do_luma_filter ++ ++ vadds H(13,0)+r3, HX(P2,0), 0 ++ vadds H(14,0)+r3, HX(P1,0), 0 ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ vadds H(17,0)+r3, HX(Q1,0), 0 ++ vadds H(18,0)+r3, HX(Q2,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_luma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 ++ valtl HX(beta,0),H(setup,0),H(setup,0) ++ valtu HX(tc,0),H(setup,0),H(setup,0) ++ vmul HX(tc25,0), HX(tc,0), 5 ++ vadd HX(tc25,0),HX(tc25,0), 1 ++ vasr HX(tc25,0), HX(tc25,0), 1 ++ ++ # Compute decision ++ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 ++ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 ++ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 ++ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 ++ ++ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 ++ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 ++ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 ++ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 ++ ++ vadd HX(d,0), HX(dp,0), HX(dq,0) ++ vasr HX(beta2,0),HX(beta,0),2 ++ vasr HX(beta3,0),HX(beta,0),3 ++ ++ # Compute flags that are negative if all conditions pass ++ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC ++ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC ++ vsub 
HX(decision,0), HX(decision,0), HX(beta3,0) SETF ++ ++ vdist HX(decision,0), HX(P0,0), HX(Q0,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF ++ vadd HX(decision,0), HX(d,0), HX(d,0) IFN ++ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF ++ vmov HX(decision,0), 1 IFNN ++ vadd H(decision,0),H(decision,3),0 IFN ++ vadd H(decision,16),H(decision,19),0 IFN ++ vmov -,HX(decision,0) SETF # N marks strong filter ++ vmov HX(decision,0), 1 IFNN # NN marks normal filter ++ ++ vadd HX(do_filter,0), HX(d,3), HX(d,0) ++ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter ++ vmov HX(decision,0),0 IFNN # Z marks no filter ++ ++ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 ++ # First extract out even terms ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 ++ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 ++ # Now expand back ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 ++ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 ++ ++ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering ++ ++ # Do a quick check to see if there is anything to do ++ mov r11, 0 # Signal no filtering ++ vmov -,1 IFNZ SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ mov r11, 1 # Signal some filtering ++ # And whether there is any strong filtering ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq normal_filtering ++ ++ ############################################################################## ++ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) ++ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 ++ ++ # Take a copy of the original pixels for use in decision calculation ++ vmov HX(P0,32),HX(P0,0) ++ vmov HX(Q0,32),HX(Q0,0) ++ vmov HX(P1,32),HX(P1,0) ++ vmov HX(Q1,32),HX(Q1,0) ++ vmov HX(P2,32),HX(P2,0) ++ vmov HX(Q2,32),HX(Q2,0) ++ ++ vadd -,HX(P2,32),4 CLRA SACC ++ vshl -,HX(P1,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl HX(delta,0),HX(Q1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN ++ ++ vadd -,HX(P2,32),2 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vshl HX(delta,0),HX(Q0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(P1,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q0,32),4 CLRA SACC ++ vadd -,HX(P1,32),HX(P0,32) SACC ++ vmul -,HX(P2,32),3 SACC ++ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(P2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN ++ #vmov HX(P2,0),3 IFN ++ ++ # Now reverse all P/Qs ++ ++ vadd -,HX(Q2,32),4 CLRA SACC ++ vshl -,HX(Q1,32),1 SACC ++ vshl -,HX(Q0,32),1 SACC ++ vshl -,HX(P0,32),1 SACC ++ vshl HX(delta,0),HX(P1,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q0,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN ++ ++ vadd -,HX(Q2,32),2 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vshl HX(delta,0),HX(P0,32),0 SACC ++ vasr HX(delta,0),HX(delta,0), 2 ++ vsub HX(delta,0),HX(delta,0),HX(Q1,32) ++ vclamps HX(delta,0), HX(delta,0), 
HX(tc2,0) ++ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN ++ ++ vadd -,HX(P0,32),4 CLRA SACC ++ vadd -,HX(Q1,32),HX(Q0,32) SACC ++ vmul -,HX(Q2,32),3 SACC ++ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct ++ vasr HX(delta,0),HX(delta,0), 3 ++ vsub HX(delta,0),HX(delta,0),HX(Q2,32) ++ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) ++ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN ++ ++ ############################################################################## ++ # Normal filtering ++normal_filtering: ++ # Invert the decision flags ++ # make instruction more complicated as assembler has error and loses SETF ++ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering ++ vmov -, HX(tc10,0) SETF # IFN means normal filtering ++ ++ vmov -,1 IFN SUMS r5 ++ cmp r5,0 ++ beq filtering_done ++ ++ vasr HX(tc2,0), HX(tc,0), 1 ++ vmul HX(tc10,0), HX(tc,0), 10 ++ ++ vasr HX(thresh,0), HX(beta,0), 1 ++ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) ++ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC ++ ++ vadd HX(ptest,0),HX(dp,3),HX(dp,0) ++ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel ++ vadd HX(qtest,0),HX(dq,3),HX(dq,0) ++ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel ++ # Expand ptest and qtest together ++ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q ++ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ ++ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq ++ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) ++ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) ++ ++ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) ++ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) ++ vmov -,8 CLRA SACC ++ vmul -,HX(delta0,0), 9 SACC ++ vmul HX(delta0,0),HX(delta1,0), r6 SACC ++ vasr HX(delta0,0), HX(delta0,0), 4 ++ vdist HX(deltatest,0), HX(delta0,0), 0 ++ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something ++ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later ++ ++ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) ++ ++ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) ++ vadd HX(deltap1,0), HX(deltap1,0), 1 ++ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC ++ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC ++ vasr HX(deltap1,0), HX(deltap1,0), 1 ++ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) ++ ++ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) ++ vadd HX(deltaq1,0), HX(deltaq1,0), 1 ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC ++ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) ++ vrsub -, HX(delta0,0), 0 SACC ++ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC ++ vasr HX(deltaq1,0), HX(deltaq1,0), 1 ++ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) ++ ++ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN ++ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN ++ ++ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 ++ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN ++ ++ vmov -,HX(deltatest,0) SETF ++ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 ++ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN ++ ++ #vmov HX(P2,0),1 IFN ++ ++filtering_done: ++ b lr ++ ++ ++hevc_uv_deblock_16x16: ++ push r6-r15, lr ++ mov r14,0 ++ b hevc_uv_start ++hevc_uv_deblock_16x16_with_clear: ++ push r6-r15, lr ++ mov r14,1 ++ b hevc_uv_start ++ ++hevc_uv_start: ++ mov r9,r4 ++ mov r4,r3 ++ mov r13,r2 ++ mov r2,r0 ++ mov r10,r0 ++ subscale4 r0,r1 ++ mov r8,63 ++ mov r6,-3 ++ vmov H(zeros,0),0 ++# r7 is number of blocks still to load ++# r0 is 
location of current block - 4 * stride ++# r1 is stride ++# r2 is location of current block ++# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical ++# r4 is setup ++# r5 is for temporary calculations ++# r8 holds 63 ++# r6 holds -3 ++# r9 holds the number of 16 high rows to process ++# r10 holds the original img base ++# r11 returns 0 if no filtering was done on the edge ++# r12 saves a copy of this ++# r13 is copy of width ++# r14 is 1 if we should clear the old contents, or 0 if not ++ ++uv_process_row: ++ # First iteration does not do horizontal filtering on previous ++ mov r7, r13 ++ mov r3,0 ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) # We may wish to prefetch these ++ cmp r14,1 ++ bne uv_skip0 ++ vstb H(zeros,0),(r4) ++uv_skip0: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 ++ bl uv_vert_filter ++ sub r3,8 ++ b uv_start_deblock_loop ++uv_deblock_loop: ++ # Middle iterations do vertical on current block and horizontal on preceding ++ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block ++ vldb H(16++,16)+r3,(r2 += r1) REP 16 ++ vldb H(setup_input,0), (r4) ++ cmp r14,1 ++ bne uv_skip1 ++ vstb H(zeros,0),(r4) ++uv_skip1: ++ bl uv_vert_filter ++ add r3,8 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_vert_filter ++ sub r3,8 ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip3 ++ vstb H(zeros,0),-16(r4) ++uv_skip3: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,8*64 ++ addcmpbeq r12,0,0,uv_skip_save_top ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++uv_start_deblock_loop: ++ # move onto next 16x16 (could do this with circular buffer support instead) ++ add r3,16 ++ and r3,r8 ++ add r4,32 ++ # Perform loop counter operations (may work with an addcmpbgt as well?) 
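++ # (A single 'addcmpbgt r7,-1,0,uv_deblock_loop' should be able to replace
++ # the sub/cmp/bgt below, matching the form used by block_loop in the
++ # transform code above; untried here, so the explicit sequence remains.)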
++ add r0,16 ++ add r2,16 ++ sub r7,1 ++ cmp r7,0 # Are there still more blocks to load ++ bgt uv_deblock_loop ++ ++ # Final iteration needs to just do horizontal filtering ++ vldb H(setup_input,0), -16(r4) ++ cmp r14,1 ++ bne uv_skip2 ++ vstb H(zeros,0),-16(r4) ++uv_skip2: ++ bl uv_horz_filter ++ mov r12,r11 ++ add r3,8*64 ++ vadd H(setup_input,0),H(setup_input,8),0 ++ bl uv_horz_filter ++ sub r3,64*8 ++ addcmpbeq r12,0,0,uv_skip_save_top2 ++ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block ++uv_skip_save_top2: ++ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 ++ ++# Now look to see if we should do another row ++ sub r9,1 ++ cmp r9,0 ++ bgt uv_start_again ++ pop r6-r15, pc ++uv_start_again: ++ # Need to sort out r0,r2 to point to next row down ++ addscale16 r10,r1 ++ mov r2,r10 ++ subscale4 r0,r2,r1 ++ b uv_process_row ++ ++ ++# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered ++# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations ++ ++uv_vert_filter: ++ push lr ++ ++ vmov HX(P1,0), V(16,14)+r3 ++ vmov HX(P0,0), V(16,15)+r3 ++ vmov HX(Q0,0), V(16,16)+r3 ++ vmov HX(Q1,0), V(16,17)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds V(16,15)+r3, HX(P0,0), 0 ++ vadds V(16,16)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# Filter edge at H(16,0)+r3 ++uv_horz_filter: ++ push lr ++ ++ vmov HX(P1,0), H(14,0)+r3 ++ vmov HX(P0,0), H(15,0)+r3 ++ vmov HX(Q0,0), H(16,0)+r3 ++ vmov HX(Q1,0), H(17,0)+r3 ++ ++ bl do_chroma_filter ++ ++ vadds H(15,0)+r3, HX(P0,0), 0 ++ # P3 and Q3 never change so don't bother saving back ++ vadds H(16,0)+r3, HX(Q0,0), 0 ++ ++ pop pc ++ ++# r4 points to array of beta/tc for each 4 length edge ++do_chroma_filter: ++ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 ++ valtl HX(tc,0),H(setup,0),H(setup,0) ++ ++ vsub HX(delta,0),HX(Q0,0),HX(P0,0) ++ vshl HX(delta,0),HX(delta,0),2 CLRA SACC ++ vsub -,HX(P1,0),HX(Q1,0) SACC ++ vmov HX(delta,0),4 SACC ++ vasr HX(delta,0),HX(delta,0),3 ++ vclamps HX(delta,0), HX(delta,0), HX(tc,0) ++ vadd HX(P0,0),HX(P0,0),HX(delta,0) ++ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) ++ b lr ++ ++# r0 = list ++# r1 = number ++hevc_run_command_list: ++ push r6-r7, lr ++ mov r6, r0 ++ mov r7, r1 ++loop_cmds: ++ ld r0,(r6) # How to encode r6++? 
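++ # (Each command is six words, loaded into r0-r5; hevc_trans_16x16 then
++ # dispatches on r5 exactly as at the top of this file.)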
++ add r6,4 ++ ld r1,(r6) ++ add r6,4 ++ ld r2,(r6) ++ add r6,4 ++ ld r3,(r6) ++ add r6,4 ++ ld r4,(r6) ++ add r6,4 ++ ld r5,(r6) ++ add r6,4 ++ bl hevc_trans_16x16 ++ sub r7,1 ++ cmp r7,0 ++ bgt loop_cmds ++ ++ pop r6-r7, pc +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..b0e9902d82 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h @@ -0,0 +1,3070 @@ -+unsigned char rpi_hevc_transform [] = { ++static const unsigned char rpi_hevc_transform10 [] = { ++21, ++106, ++0, ++144, ++47, ++1, ++37, ++106, ++0, ++144, ++66, ++1, ++53, ++106, ++0, ++144, ++192, ++4, ++69, ++106, ++0, ++144, ++192, ++4, ++85, ++106, ++0, ++144, ++220, ++5, ++169, ++3, ++62, ++64, ++79, ++64, ++3, ++232, ++32, ++0, ++0, ++0, ++12, ++248, ++0, ++136, ++0, ++0, ++192, ++248, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++12, ++248, ++0, ++168, ++0, ++0, ++192, ++248, ++0, ++0, ++0, ++96, ++3, ++232, ++32, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++8, ++232, ++0, ++4, ++0, ++0, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++4, ++232, ++64, ++0, ++0, ++0, ++5, ++232, ++0, ++2, ++0, ++0, ++128, ++69, ++113, ++66, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++8, ++4, ++0, ++128, ++69, ++113, ++70, ++128, ++144, ++40, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++16, ++0, ++76, ++254, ++48, ++192, ++9, ++4, ++32, ++8, ++0, ++0, ++4, ++254, ++0, ++144, ++128, ++2, ++0, ++8, ++2, ++0, ++128, ++144, ++23, ++0, ++4, ++255, ++48, ++192, ++128, ++3, ++32, ++8, ++20, ++0, ++76, ++254, ++48, ++192, ++6, ++4, ++32, ++8, ++0, ++0, ++140, ++248, ++44, ++0, ++0, ++0, ++32, ++48, ++4, ++0, ++128, ++69, ++113, ++66, ++242, ++140, ++211, ++192, ++34, ++31, ++41, ++3, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++96, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++70, ++192, ++80, ++7, ++164, ++255, ++36, ++204, ++224, ++2, ++0, ++248, ++62, ++0, ++3, ++255, ++55, ++208, ++120, ++3, ++224, ++3, ++190, ++11, ++16, ++139, ++246, ++91, ++0, ++103, ++90, ++0, ++225, ++64, ++242, ++64, ++3, ++232, ++128, ++0, ++0, ++0, ++7, ++232, ++0, ++2, ++0, ++0, ++57, ++239, ++224, ++247, ++255, ++255, ++72, ++192, ++95, ++207, ++88, ++122, ++88, ++124, ++137, ++64, ++26, ++64, ++4, ++232, ++64, ++0, ++0, ++0, ++149, ++96, ++161, ++64, ++152, ++64, ++128, ++144, ++35, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++27, ++0, ++4, ++232, ++0, ++2, ++0, ++0, ++101, ++96, ++145, ++64, ++168, ++64, ++128, ++144, ++19, ++0, ++72, ++232, ++0, ++4, ++0, ++0, ++65, ++232, ++32, ++0, ++0, ++0, ++128, ++144, ++11, ++0, ++74, ++232, ++0, ++8, ++0, ++0, ++242, ++140, ++221, ++192, ++57, ++239, ++32, ++8, ++0, ++0, ++41, ++3, ++239, ++3, ++12, ++248, ++0, ++128, ++0, ++0, ++192, ++248, ++4, ++0, ++12, ++248, ++0, ++132, ++64, ++0, ++192, ++248, ++4, ++0, ++0, ++96, ++255, ++159, ++154, ++255, ++0, ++232, ++0, ++4, ++0, ++0, ++255, ++159, ++165, ++255, ++4, ++255, ++48, ++204, ++16, ++3, ++224, ++251, ++62, ++0, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++128, ++64, ++6, ++232, ++64, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++32, ++247, ++240, ++207, ++16, ++3, ++32, ++247, ++176, ++207, ++17, ++19, ++32, ++247, ++112, ++207, ++18, ++35, ++32, ++247, ++48, ++207, ++19, ++51, ++32, ++247, ++240, ++206, ++20, ++67, ++32, 
++247, ++176, ++206, ++21, ++83, ++32, ++247, ++112, ++206, ++22, ++99, ++32, ++247, ++48, ++206, ++23, ++115, ++32, ++247, ++240, ++205, ++24, ++131, ++32, ++247, ++176, ++205, ++25, ++147, ++32, ++247, ++112, ++205, ++26, ++163, ++32, ++247, ++48, ++205, ++27, ++179, ++32, ++247, ++240, ++204, ++28, ++195, ++32, ++247, ++176, ++204, ++29, ++211, ++32, ++247, ++112, ++204, ++30, ++227, ++32, ++247, ++48, ++204, ++31, ++243, ++4, ++255, ++51, ++204, ++128, ++3, ++224, ++251, ++16, ++0, ++76, ++254, ++51, ++204, ++128, ++3, ++224, ++251, ++20, ++0, ++0, ++237, ++32, ++0, ++0, ++0, ++140, ++248, ++47, ++0, ++0, ++0, ++224, ++99, ++0, ++0, ++111, ++3, ++4, ++254, ++0, ++128, ++0, ++4, ++0, ++248, ++0, ++0, ++2, ++232, ++32, ++0, ++0, ++0, ++140, ++248, ++32, ++0, ++0, ++0, ++224, ++35, ++0, ++0, ++64, ++232, ++0, ++2, ++0, ++0, ++193, ++232, ++0, ++1, ++0, ++0, ++1, ++106, ++116, ++30, ++90, ++0, ++169, ++3, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++137, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++129, ++0, ++131, ++102, ++0, ++158, ++67, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++108, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++100, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++161, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++150, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++182, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++112, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++101, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++103, ++255, ++239, ++3, ++0, ++254, ++0, ++143, ++92, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++93, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++210, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++211, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++107, ++0, ++8, ++255, ++99, ++23, ++0, ++212, ++192, ++51, ++0, ++0, 
++8, ++255, ++163, ++23, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++52, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++52, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++0, ++143, ++12, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++143, ++13, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++64, ++142, ++18, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++0, ++142, ++19, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++33, ++0, ++8, ++255, ++99, ++3, ++0, ++212, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++3, ++0, ++228, ++192, ++51, ++0, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++8, ++255, ++99, ++4, ++0, ++164, ++192, ++51, ++0, ++0, ++8, ++255, ++163, ++4, ++0, ++148, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++137, ++47, ++240, ++40, ++246, ++2, ++140, ++47, ++240, ++128, ++245, ++99, ++140, ++5, ++4, ++0, ++247, ++99, ++140, ++1, ++20, ++88, ++246, ++99, ++140, ++1, ++20, ++0, ++247, ++35, ++136, ++62, ++226, ++32, ++247, ++35, ++136, ++32, ++210, ++0, ++247, ++34, ++136, ++63, ++2, ++208, ++246, ++34, ++136, ++0, ++4, ++0, ++247, ++99, ++136, ++58, ++162, ++32, ++247, ++99, ++136, ++33, ++146, ++0, ++247, ++98, ++136, ++59, ++18, ++208, ++246, ++98, ++136, ++0, ++20, ++0, ++247, ++162, ++136, ++33, ++2, ++88, ++246, ++98, ++137, ++2, ++68, ++88, ++246, ++162, ++137, ++3, ++68, ++208, ++254, ++227, ++136, ++60, ++242, ++192, ++243, ++188, ++11, ++208, ++254, ++227, ++136, ++56, ++178, ++192, ++243, ++188, ++10, ++32, ++255, ++226, ++136, ++38, ++58, ++192, ++243, ++60, ++0, ++208, ++254, ++227, ++136, ++59, ++242, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++49, ++58, ++192, ++243, ++60, ++128, ++0, ++255, ++226, ++136, ++34, ++34, ++192, ++243, ++60, ++128, ++32, ++255, ++226, ++136, ++37, ++58, ++192, ++243, ++60, ++128, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++194, ++8, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++255, ++202, ++40, ++0, ++52, ++195, ++243, ++0, ++128, ++0, ++254, ++0, ++240, ++35, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++192, ++136, ++1, ++4, ++0, ++240, ++0, ++160, ++0, ++255, ++226, ++140, ++34, ++34, ++195, ++243, ++60, ++0, ++32, ++255, ++227, ++140, ++36, ++58, ++192, ++243, ++60, ++0, ++0, ++254, ++192, ++136, ++0, ++4, ++0, ++240, ++0, ++160, ++16, ++246, ++226, ++136, ++35, ++50, ++16, ++246, ++226, ++136, ++35, ++50, ++32, ++246, ++226, ++136, ++35, ++50, ++32, ++254, ++226, ++136, ++35, ++58, ++192, ++243, ++60, ++0, ++11, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++115, ++5, ++106, ++0, ++144, ++173, ++1, ++27, ++96, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++227, ++0, ++64, ++246, ++163, ++140, ++1, ++4, ++0, ++246, ++192, ++175, ++63, ++2, ++0, ++246, ++192, ++174, ++59, ++2, ++0, ++246, ++128, ++175, ++62, ++2, ++0, ++246, ++128, ++174, ++58, ++2, ++0, ++246, ++64, ++175, ++61, ++2, ++0, ++246, ++64, ++174, ++57, ++2, ++0, ++255, ++43, ++240, ++4, ++212, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++228, ++192, ++243, ++128, 
++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++191, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++143, ++52, ++242, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++212, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++180, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++190, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++143, ++52, ++226, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++180, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++191, ++226, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++212, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++196, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++189, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++143, ++52, ++210, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++148, ++192, ++243, ++128, ++11, ++64, ++254, ++43, ++240, ++1, ++164, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++180, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++240, ++1, ++244, ++192, ++243, ++128, ++10, ++64, ++254, ++43, ++141, ++0, ++228, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++187, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++235, ++142, ++52, ++178, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++2, ++148, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++64, ++254, ++43, ++141, ++0, ++244, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++2, ++68, ++32, ++247, ++35, ++141, ++186, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++171, ++142, ++52, ++162, ++192, ++243, ++60, ++128, ++0, ++255, ++43, ++240, ++4, ++244, ++192, ++243, ++128, ++11, ++0, ++255, ++43, ++240, ++187, ++162, ++192, ++243, ++188, ++10, ++128, ++253, ++43, ++240, ++3, ++148, ++192, ++243, ++128, ++10, ++64, ++254, ++35, ++141, ++1, ++132, ++192, ++243, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++32, ++247, ++35, ++141, ++185, ++66, ++240, ++246, ++35, ++141, ++50, ++66, ++0, ++255, ++107, ++142, ++52, ++146, ++192, ++243, ++60, ++128, ++64, ++255, ++98, ++141, ++0, ++52, ++192, ++243, ++0, ++0, ++0, ++254, ++0, ++240, ++53, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++1, ++4, ++0, ++240, ++64, ++147, ++5, ++106, ++0, ++144, ++177, ++0, ++88, ++246, ++163, ++140, ++1, ++4, ++128, ++245, ++99, ++141, ++10, ++4, ++88, ++246, ++162, ++138, ++1, ++68, ++0, ++247, ++162, ++138, ++36, ++162, ++88, ++254, ++162, ++138, ++3, ++164, ++192, ++243, ++128, ++11, ++0, ++255, ++226, ++137, ++32, ++2, ++195, ++243, ++60, ++0, ++32, ++247, ++226, ++137, ++42, ++114, ++0, ++255, ++34, ++138, ++33, ++18, ++195, ++243, ++60, ++0, ++32, ++247, ++34, ++138, ++42, ++130, ++16, ++246, ++98, ++138, ++40, ++114, ++16, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++98, ++138, ++41, ++146, ++32, ++246, ++226, ++137, ++41, ++146, ++40, ++246, ++34, ++138, ++41, ++146, ++32, ++247, ++163, ++141, ++63, ++178, ++32, ++247, ++227, ++141, ++62, ++162, ++0, ++254, ++0, ++240, ++8, ++4, ++0, ++240, ++128, ++11, ++128, ++253, ++35, ++240, ++9, ++100, ++192, ++243, ++128, 
++10, ++128, ++253, ++163, ++141, ++128, ++115, ++192, ++243, ++152, ++10, ++88, ++246, ++163, ++141, ++4, ++100, ++208, ++246, ++35, ++139, ++0, ++100, ++32, ++255, ++34, ++139, ++53, ++202, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++139, ++0, ++4, ++0, ++240, ++0, ++160, ++240, ++246, ++163, ++141, ++48, ++98, ++0, ++247, ++99, ++139, ++63, ++210, ++0, ++247, ++98, ++139, ++1, ++212, ++88, ++254, ++98, ++139, ++1, ++212, ++192, ++243, ++128, ++11, ++32, ++255, ++99, ++139, ++62, ++98, ++192, ++243, ++188, ++10, ++88, ++246, ++98, ++139, ++1, ++212, ++240, ++246, ++98, ++139, ++50, ++210, ++0, ++247, ++163, ++128, ++59, ++146, ++0, ++247, ++160, ++128, ++1, ++36, ++88, ++254, ++160, ++128, ++1, ++36, ++192, ++243, ++128, ++11, ++0, ++247, ++163, ++128, ++58, ++98, ++64, ++255, ++35, ++240, ++0, ++100, ++192, ++243, ++128, ++10, ++64, ++255, ++163, ++128, ++0, ++164, ++192, ++243, ++128, ++10, ++88, ++246, ++160, ++128, ++1, ++36, ++240, ++246, ++160, ++128, ++50, ++34, ++8, ++255, ++227, ++143, ++54, ++242, ++192, ++243, ++60, ++128, ++40, ++255, ++227, ++142, ++54, ++178, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++39, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++143, ++45, ++226, ++192, ++243, ++60, ++128, ++0, ++254, ++0, ++240, ++44, ++10, ++0, ++240, ++60, ++0, ++0, ++254, ++0, ++240, ++40, ++10, ++0, ++240, ++60, ++128, ++8, ++255, ++163, ++142, ++2, ++162, ++192, ++243, ++60, ++128, ++90, ++0, ++169, ++3, ++14, ++96, ++4, ++31, ++169, ++3, ++30, ++96, ++1, ++31, ++73, ++64, ++52, ++64, ++45, ++64, ++2, ++64, ++10, ++64, ++64, ++198, ++1, ++7, ++8, ++232, ++63, ++0, ++0, ++0, ++6, ++232, ++253, ++255, ++255, ++255, ++0, ++246, ++0, ++0, ++0, ++4, ++215, ++64, ++3, ++96, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++143, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++135, ++0, ++131, ++102, ++0, ++158, ++71, ++0, ++2, ++248, ++0, ++35, ++0, ++0, ++64, ++56, ++0, ++0, ++4, ++248, ++0, ++36, ++0, ++0, ++64, ++56, ++8, ++0, ++0, ++240, ++64, ++0, ++132, ++3, ++30, ++106, ++132, ++24, ++128, ++240, ++0, ++0, ++132, ++3, ++128, ++144, ++112, ++0, ++131, ++98, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++104, ++0, ++131, ++102, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++123, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++112, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, ++3, ++99, ++131, ++71, ++68, ++232, ++32, ++0, ++0, ++0, ++0, ++99, ++2, ++99, ++23, ++102, ++7, ++106, ++127, ++156, ++178, ++255, ++0, ++248, ++64, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++30, ++106, ++134, ++24, ++128, ++248, ++0, ++0, ++112, ++0, ++192, ++243, ++211, ++31, ++128, ++144, ++72, ++0, ++188, ++64, ++67, ++232, ++0, ++2, ++0, ++0, ++0, ++255, ++64, ++0, ++0, ++20, ++200, ++243, ++0, ++0, ++128, ++144, ++61, ++0, ++195, ++232, ++0, ++2, ++0, ++0, ++12, ++128, ++7, ++192, ++130, ++248, ++0, ++0, ++112, ++192, ++224, ++16, ++195, ++31, ++132, ++248, ++1, ++0, ++112, ++0, ++224, ++16, ++203, ++31, 
++25, ++102, ++9, ++106, ++2, ++30, ++41, ++3, ++26, ++87, ++162, ++64, ++64, ++198, ++1, ++23, ++127, ++158, ++95, ++255, ++239, ++3, ++0, ++254, ++128, ++143, ++94, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++95, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++208, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++209, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++47, ++0, ++8, ++255, ++227, ++23, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++52, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++239, ++3, ++0, ++254, ++128, ++143, ++14, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++143, ++15, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++192, ++142, ++16, ++0, ++0, ++240, ++12, ++0, ++0, ++254, ++128, ++142, ++17, ++0, ++0, ++240, ++12, ++0, ++128, ++144, ++13, ++0, ++8, ++255, ++227, ++3, ++0, ++244, ++192, ++51, ++0, ++0, ++8, ++255, ++35, ++4, ++0, ++180, ++192, ++51, ++0, ++0, ++111, ++3, ++32, ++246, ++192, ++11, ++1, ++16, ++32, ++246, ++2, ++140, ++47, ++240, ++32, ++247, ++35, ++141, ++63, ++178, ++64, ++254, ++35, ++141, ++2, ++68, ++192, ++243, ++128, ++11, ++32, ++255, ++35, ++240, ++58, ++226, ++192, ++243, ++188, ++10, ++0, ++254, ++0, ++141, ++4, ++4, ++0, ++240, ++128, ++10, ++88, ++246, ++35, ++141, ++3, ++68, ++240, ++246, ++35, ++141, ++48, ++66, ++0, ++247, ++227, ++143, ++52, ++242, ++32, ++247, ++227, ++142, ++52, ++178, ++90, ++0, ++161, ++3, ++6, ++64, ++23, ++64, ++96, ++8, ++70, ++98, ++97, ++8, ++70, ++98, ++98, ++8, ++70, ++98, ++99, ++8, ++70, ++98, ++100, ++8, ++70, ++98, ++101, ++8, ++70, ++98, ++255, ++159, ++8, ++250, ++23, ++102, ++7, ++106, ++112, ++30, ++33, ++3, ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..2901b6568d +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,3070 @@ ++static const unsigned char rpi_hevc_transform8 [] = { +21, +106, +0, @@ -13487,932 +22060,9 @@ index 0000000..4309f1c +33, +3, +}; -diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s -new file mode 100644 -index 0000000..5543093 ---- /dev/null -+++ b/libavcodec/rpi_hevc_transform.s -@@ -0,0 +1,917 @@ -+# ****************************************************************************** -+# Argon Design Ltd. -+# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
-+#
-+# Module : HEVC
-+# Author : Peter de Rivaz
-+# ******************************************************************************
-+
-+# HEVC VPU Transform
-+#
-+# Transform matrix can be thought of as
-+#   output row vector = input row vector * transMatrix2
-+#
-+# The even rows of the matrix are symmetric
-+# The odd rows of the matrix are antisymmetric
-+#
-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-+#
-+# EXAMPLE
-+#   (a b c d) (1 2  2  1)
-+#             (3 4 -4 -3)
-+#             (5 6  6  5)
-+#             (7 8 -8 -7)
-+#
-+# x=(a c)(1 2) = 1a+5c 2a+6c
-+#        (5 6)
-+#
-+# y=(b d)(3 4) = 3b+7d 4b+8d
-+#        (7 8)
-+#
-+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-+#
-+# Final results are (u , v[::-1])
-+#
-+#
-+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-+# Apply the even matrix first and stop before rounding
-+# Then apply the odd matrix in a full manner:
-+#
-+# First step is to compute partial products with the first input (16 cycles)
-+# 1a 3b 5c 7d    16x1 input coefficients produce 16x16 output
-+# 2a 4b 6c 8d
-+# 2a -4b 6c -8d
-+# 1a -3b 5c -7d
-+#
-+# Second step is to sum partial products into final position (8 cycles)
-+# 1a+3b+5c+7d
-+# 2a+4b+6c+8d
-+# 2a-4b+6c-8d
-+# 1a-3b+5c-7d
-+#
-+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-+#
-+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-+#
-+# For 8x8 we could compute two in parallel.
-+#
-+#
-+
-+# Columns are transformed first
-+#
-+# Store top left half of transMatrix2 in HX(32,0)
-+# Store bottom left half of transMatrix2 in HX(32,32)
-+#
-+# For 16x16
-+# HX(0:15,0) contains input data before transform
-+# HY(0:15,0) contains 32bit output data after transform
-+# HX(32,0) contains even rows of left half of transMatrix2
-+# HX(32,32) contains odd rows of left half of transMatrix2
-+# HY(48,0) contains partial products ready for summing
-+#
-+
-+
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done
-+# coeffs32
-+# num32: number of 32x32 transforms
-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
-+#
-+hevc_trans_16x16:
-+  cmp r5,1
-+  beq memclear16
-+  cmp r5,2
-+  beq hevc_deblock_16x16
-+  cmp r5,3
-+  beq hevc_uv_deblock_16x16
-+  cmp r5,4
-+  beq hevc_uv_deblock_16x16_with_clear
-+  cmp r5,5
-+  beq hevc_run_command_list
-+
-+  push r6-r15, lr # TODO cut down number of used registers
-+  mov r14,r3 # coeffs32
-+  mov r15,r4 # num32
-+  mov r3, 16*2 # Stride of transMatrix2 in bytes
-+  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+
-+  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-+  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+  # Now use r0 to describe which matrix we are working on.
-+  # Allows us to prefetch the next block of coefficients for efficiency.
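
The symmetry trick described above halves the multiply count: fold the even and odd inputs through the half-width matrices, then combine with a butterfly. A minimal C sketch of that decomposition (illustration only, not part of the patch; assumes the matrix is stored as just its left half, mat[N][N/2]):

/* Illustration only: even/odd butterfly for out = in * mat, where even
 * rows of the full matrix are symmetric and odd rows antisymmetric, so
 * only the left N/2 columns need storing. */
#define N 16

static void trans_butterfly(const short in[N], const short mat[N][N / 2],
                            int out[N])
{
    for (int j = 0; j < N / 2; j++) {
        int x = 0, y = 0;                           /* even / odd partial sums */
        for (int i = 0; i < N / 2; i++) {
            x += in[2 * i]     * mat[2 * i][j];     /* even rows */
            y += in[2 * i + 1] * mat[2 * i + 1][j]; /* odd rows  */
        }
        out[j]         = x + y;                     /* u                  */
        out[N - 1 - j] = x - y;                     /* v, stored reversed */
    }
}

With N = 4 and the example matrix above this reproduces (u, v[::-1]) exactly.
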
-+ mov r0,0 # This describes the location where we read our coefficients from -+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) -+ mov r7,16*16*2 # Total block size -+ mov r8,64*16 # Value used to swap from current to next VRF location -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ mov r4,64 # Constant used for rounding first pass -+ mov r5,1<<11 # Constant used for rounding second pass -+ -+ # At start of block r0,r1 point to the current block (that has already been loaded) -+block_loop: -+ eor r0,r8 -+ add r1,r7 -+ # Prefetch the next block -+ vldh HX(0++,0)+r0,(r1 += r3) REP 16 -+ eor r0,r8 -+ sub r1,r7 -+ -+ # Transform the current block -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? -+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position -+ -+ bl col_trans_16 -+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate -+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. -+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) -+ -+ # Save results - note there has been a transposition during the processing so we save columns -+ vsth VX(0,32++)+r0, (r1 += r3) REP 16 -+ -+ # Move onto next block -+ eor r0,r8 -+ add r1,r7 -+ -+ addcmpbgt r2,-1,0,block_loop -+ -+ # Now go and do any 32x32 transforms -+ b hevc_trans_32x32 -+ -+ pop r6-r15, pc -+ -+# r1,r2,r3 r7,r8 should be preserved -+# HX(0++,0)+r0 is the block to be transformed -+# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients -+# Use HY(48,0) for intermediate results -+# r0 can be used, but should be returned to its original value at the end -+col_trans_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+col_trans_odd_16: -+ add r6,r0,16 # Final value for this loop -+col_trans_odd_16_loop: -+ # First compute partial products for a single column -+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 -+ # Then sum up the results and place back -+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC -+ addcmpblt r0,1,r6,col_trans_odd_16_loop -+ sub r0,16 # put r0 back to its original value -+ b lr -+ -+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) -+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd -+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) -+# num: number of 16x16 transforms to be done -+# -+hevc_trans_32x32: -+ mov r1,r14 # coeffs -+ mov r2,r15 # num -+ -+ # Fetch odd transform matrix -+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) -+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix -+ #add 
r0, 16*16*2 -+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix -+ -+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer -+ mov r7, 16*16*2 # Total block size -+ sub sp,sp,32*32*2+32 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) -+ # set r8 to 32byte aligned stack pointer -+ add r8,sp,31 -+ lsr r8,5 -+ lsl r8,5 -+ mov r9,r8 # Backup of the temporary storage -+ mov r10,r1 # Backup of the coefficient buffer -+block_loop32: -+ -+ # COLUMN TRANSFORM -+ mov r4, 64 # Constant used for rounding first pass -+ mov r5, 9 # left shift used for rounding first pass -+ -+ # Transform the first 16 columns -+ mov r1,r10 # Input Coefficient buffer -+ mov r8,r9 # Output temporary storage -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ # ROW TRANSFORM -+ mov r4, 1<<11 # Constant used for rounding second pass -+ mov r5, 4 # left shift used for rounding second pass -+ -+ mov r1,r9 # Input temporary storage -+ mov r8,r10 # Output Coefficient buffer -+ bl trans32 -+ # Transform the second 16 columns -+ add r8,32*16*2 -+ add r1,32 -+ bl trans32 -+ -+ add r10, 32*32*2 # move onto next block of coefficients -+ addcmpbgt r2,-1,0,block_loop32 -+ -+ add sp,sp,32*32*2+32 # Restore stack -+ -+ pop r6-r15, pc -+ -+trans32: -+ push lr -+ # We can no longer afford the VRF space to do prefetching when doing 32x32 -+ # Fetch the even rows -+ vldh HX(0++,0),(r1 += r3) REP 16 -+ # Fetch the odd rows -+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 -+ -+ # Transform the even rows using even matrix -+ mov r0, 0 # Even rows -+ bl col_trans_16 -+ -+ # Now transform the odd rows using odd matrix -+ mov r0, 64*16 # Odd rows -+ bl col_trans_odd_16 -+ -+ # Now apply butterfly to compute the first 16 results -+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ # 16bit results now in HX(48,32) -+ mov r0,r8 -+ mov r6,32*2 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ -+ # Now apply butterfly to compute the second 16 results (in reverse order) -+ vsub HY(63,0),HY(0 ,0),HY(16,0) -+ vsub HY(62,0),HY(1 ,0),HY(17,0) -+ vsub HY(61,0),HY(2 ,0),HY(18,0) -+ vsub HY(60,0),HY(3 ,0),HY(19,0) -+ vsub HY(59,0),HY(4 ,0),HY(20,0) -+ vsub HY(58,0),HY(5 ,0),HY(21,0) -+ vsub HY(57,0),HY(6 ,0),HY(22,0) -+ vsub HY(56,0),HY(7 ,0),HY(23,0) -+ vsub HY(55,0),HY(8 ,0),HY(24,0) -+ vsub HY(54,0),HY(9 ,0),HY(25,0) -+ vsub HY(53,0),HY(10,0),HY(26,0) -+ vsub HY(52,0),HY(11,0),HY(27,0) -+ vsub HY(51,0),HY(12,0),HY(28,0) -+ vsub HY(50,0),HY(13,0),HY(29,0) -+ vsub HY(49,0),HY(14,0),HY(30,0) -+ vsub HY(48,0),HY(15,0),HY(31,0) -+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, -+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate -+ add r0,r8,32 -+ vsth VX(48,32++),(r0+=r6) REP 16 -+ pop pc -+ -+memclear16: -+ # r0 is address -+ # r1 is number of 16bits values to set to 0 (may overrun past end and clear more than specified) -+ vmov HX(0++,0),0 REP 16 -+ mov r2,32 -+loop: -+ vsth HX(0++,0),(r0+=r2) REP 16 -+ add r0,16*16*2 -+ sub r1,16*16 -+ cmp r1,0 -+ bgt loop -+ b lr -+ -+ -+################################################################################ -+# HEVC VPU Deblock -+# -+# Vertical edges before horizontal -+# Decision can change every 4 pixels, but only 8 pixel boundaries are deblocked -+# -+# ARM is responsible for 
storing beta and tc for each 4 pixels horiz and vert edge. -+# The VPU code works in units of 16x16 blocks. -+# We do vertical filtering for the current block followed by horizontal filtering for the previous (except for the first time). -+# One final horizontal filter is required at the end. -+# PCM is not allowed in this code. -+# -+# -+# H(16-4:16+15,0) contains previous block (note that we need 4 lines above of context that may get altered during filtering) -+# H(16:31,16) contains current block (note that we do not need the upper lines until the horizontal filtering. -+ -+.set P0,63 -+.set P1,62 -+.set P2,61 -+.set P3,60 -+.set Q0,59 -+.set Q1,58 -+.set Q2,57 -+.set Q3,56 -+ -+.set dp,32 -+.set dq,33 -+.set d,34 -+.set decision,35 -+.set beta,36 -+.set beta2,37 -+.set beta3,38 -+.set ptest,39 -+.set qtest,40 -+.set pqtest,41 -+.set thresh,42 -+.set deltatest, 44 -+.set deltap1, 45 -+.set tc25, 46 -+.set setup,47 -+.set tc,48 -+.set tc25,49 -+.set tc2, 50 -+.set do_filter, 51 -+.set delta, 52 -+.set tc10, 53 -+.set delta0, 54 -+.set delta1, 55 -+.set zeros, 0 -+.set setup_input, 1 -+.set deltaq1, 2 -+ -+ -+ -+# hevc_deblock_16x16 deblocks an entire row that is 16 pixels high by the full width of the image. -+# Row has num16 16x16 blocks across -+# Beta goes from 0 to 64 -+# tc goes from 0 to 24 -+# setup[block_idx][0=vert,1=horz][0=first edge, 1=second edge][0=beta,1=tc][0..3=edge number] -+# has 8 bytes per edge -+# has 16 bytes per direction -+# has 32 bytes per 16x16 block -+# hevc_deblock_16x16(uint8_t *img (r0), int stride (r1), int num16w (r2), uint8_t setup[num16][2][2][2][4](r3),int num16h(r4)) -+hevc_deblock_16x16: -+ push r6-r15, lr -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+ -+process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl vert_filter -+ sub r3,8 -+ b start_deblock_loop -+deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ vstb H(zeros,0),(r4) -+ bl vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 
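
The loop above realises the schedule described at the top of this section: the vertical edges of the block just loaded are filtered while the horizontal edges of its predecessor are finished and stored, with one extra horizontal pass after the loop. A hedged C sketch of that pipeline (function names hypothetical, illustration only):

/* Illustration of the deblock schedule, not patch code. */
void load_block(int n);
void vert_filter_block(int n);
void horz_filter_block(int n);
void store_block(int n);

void deblock_row(int num16)
{
    load_block(0);
    vert_filter_block(0);
    for (int n = 1; n < num16; n++) {
        load_block(n);
        vert_filter_block(n);      /* vertical edges of block n       */
        horz_filter_block(n - 1);  /* horizontal edges of block n - 1 */
        store_block(n - 1);
    }
    horz_filter_block(num16 - 1);  /* final block still needs its horizontal pass */
    store_block(num16 - 1);
}
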
-+start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) -+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ vstb H(zeros,0),-16(r4) -+ bl horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt start_again -+ pop r6-r15, pc -+start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+vert_filter: -+ push lr -+ -+ vmov HX(P3,0), V(16,12)+r3 -+ vmov HX(P2,0), V(16,13)+r3 -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ vmov HX(Q2,0), V(16,18)+r3 -+ vmov HX(Q3,0), V(16,19)+r3 -+ -+ bl do_luma_filter -+ -+ vadds V(16,13)+r3, HX(P2,0), 0 -+ vadds V(16,14)+r3, HX(P1,0), 0 -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ vadds V(16,17)+r3, HX(Q1,0), 0 -+ vadds V(16,18)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+horz_filter: -+ push lr -+ -+ vmov HX(P3,0), H(12,0)+r3 -+ vmov HX(P2,0), H(13,0)+r3 -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ vmov HX(Q2,0), H(18,0)+r3 -+ vmov HX(Q3,0), H(19,0)+r3 -+ -+ bl do_luma_filter -+ -+ vadds H(13,0)+r3, HX(P2,0), 0 -+ vadds H(14,0)+r3, HX(P1,0), 0 -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ vadds H(17,0)+r3, HX(Q1,0), 0 -+ vadds H(18,0)+r3, HX(Q2,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_luma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # b*8tc*8 -+ valtl HX(beta,0),H(setup,0),H(setup,0) -+ valtu HX(tc,0),H(setup,0),H(setup,0) -+ vmul HX(tc25,0), HX(tc,0), 5 -+ vadd HX(tc25,0),HX(tc25,0), 1 -+ vasr HX(tc25,0), HX(tc25,0), 1 -+ -+ # Compute decision -+ vadd HX(dp,0),HX(P1,0),HX(P1,0) # 2*P1 -+ vsub HX(dp,0),HX(P2,0),HX(dp,0) # P2-2*P1 -+ vadd HX(dp,0),HX(dp,0),HX(P0,0) # P2-2*P1+P0 -+ vdist HX(dp,0),HX(dp,0),0 # abs(P2-2*P1+P0) # dp0 -+ -+ vadd HX(dq,0),HX(Q1,0),HX(Q1,0) # 2*Q1 -+ vsub HX(dq,0),HX(Q2,0),HX(dq,0) # Q2-2*Q1 -+ vadd HX(dq,0),HX(dq,0),HX(Q0,0) # Q2-2*Q1+Q0 -+ vdist HX(dq,0),HX(dq,0),0 # abs(Q2-2*Q1+Q0) # dq0 -+ -+ vadd HX(d,0), HX(dp,0), HX(dq,0) -+ vasr HX(beta2,0),HX(beta,0),2 -+ vasr HX(beta3,0),HX(beta,0),3 -+ -+ # Compute flags that are negative if all conditions pass -+ vdist HX(decision,0), HX(P0,0), HX(P3,0) CLRA SACC -+ vdist HX(decision,0), HX(Q0,0), HX(Q3,0) SACC -+ vsub HX(decision,0), HX(decision,0), HX(beta3,0) SETF -+ -+ vdist HX(decision,0), HX(P0,0), 
HX(Q0,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(tc25,0) IFN SETF -+ vadd HX(decision,0), HX(d,0), HX(d,0) IFN -+ vsub HX(decision,0), HX(decision,0), HX(beta2,0) IFN SETF -+ vmov HX(decision,0), 1 IFNN -+ vadd H(decision,0),H(decision,3),0 IFN -+ vadd H(decision,16),H(decision,19),0 IFN -+ vmov -,HX(decision,0) SETF # N marks strong filter -+ vmov HX(decision,0), 1 IFNN # NN marks normal filter -+ -+ vadd HX(do_filter,0), HX(d,3), HX(d,0) -+ vsub HX(do_filter,0), HX(do_filter,0), HX(beta,0) SETF # IFNN means no filter -+ vmov HX(decision,0),0 IFNN # Z marks no filter -+ -+ # Expand out decision (currently valid one every 4 pixels) 0...1...2...3 -+ # First extract out even terms -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0.1.2.3 -+ vodd HX(decision,0),HX(decision,0),HX(decision,0) # 0123 -+ # Now expand back -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) # 00112233 -+ valtl HX(decision,0),HX(decision,0),HX(decision,0) SETF # 0000111122223333 -+ -+ # HX(decision,0) is negative if want strong filtering, 1 if want normal filtering, 0 if want no filtering -+ -+ # Do a quick check to see if there is anything to do -+ mov r11, 0 # Signal no filtering -+ vmov -,1 IFNZ SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ mov r11, 1 # Signal some filtering -+ # And whether there is any strong filtering -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq normal_filtering -+ -+ ############################################################################## -+ # Strong filtering - could maybe fast case if all have same sign? (especially if all disabled!) -+ vshl HX(tc2,0), HX(tc,0), 1 # Note that in normal filtering tx2 is tc/2, while here it is tc*2 -+ -+ # Take a copy of the original pixels for use in decision calculation -+ vmov HX(P0,32),HX(P0,0) -+ vmov HX(Q0,32),HX(Q0,0) -+ vmov HX(P1,32),HX(P1,0) -+ vmov HX(Q1,32),HX(Q1,0) -+ vmov HX(P2,32),HX(P2,0) -+ vmov HX(Q2,32),HX(Q2,0) -+ -+ vadd -,HX(P2,32),4 CLRA SACC -+ vshl -,HX(P1,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl HX(delta,0),HX(Q1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P0,0),HX(P0,32),HX(delta,0) IFN -+ -+ vadd -,HX(P2,32),2 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vshl HX(delta,0),HX(Q0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(P1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P1,0),HX(P1,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q0,32),4 CLRA SACC -+ vadd -,HX(P1,32),HX(P0,32) SACC -+ vmul -,HX(P2,32),3 SACC -+ vshl HX(delta,0),HX(P3,0),1 SACC # Note that we have not made a copy of P3, so using P3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(P2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(P2,0),HX(P2,32),HX(delta,0) IFN -+ #vmov HX(P2,0),3 IFN -+ -+ # Now reverse all P/Qs -+ -+ vadd -,HX(Q2,32),4 CLRA SACC -+ vshl -,HX(Q1,32),1 SACC -+ vshl -,HX(Q0,32),1 SACC -+ vshl -,HX(P0,32),1 SACC -+ vshl HX(delta,0),HX(P1,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q0,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q0,0),HX(Q0,32),HX(delta,0) IFN -+ -+ vadd -,HX(Q2,32),2 CLRA SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vshl HX(delta,0),HX(P0,32),0 SACC -+ vasr HX(delta,0),HX(delta,0), 2 -+ vsub HX(delta,0),HX(delta,0),HX(Q1,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q1,0),HX(Q1,32),HX(delta,0) IFN -+ -+ vadd -,HX(P0,32),4 CLRA 
SACC -+ vadd -,HX(Q1,32),HX(Q0,32) SACC -+ vmul -,HX(Q2,32),3 SACC -+ vshl HX(delta,0),HX(Q3,0),1 SACC # Note that we have not made a copy of Q3, so using Q3,0 is correct -+ vasr HX(delta,0),HX(delta,0), 3 -+ vsub HX(delta,0),HX(delta,0),HX(Q2,32) -+ vclamps HX(delta,0), HX(delta,0), HX(tc2,0) -+ vadd HX(Q2,0),HX(Q2,32),HX(delta,0) IFN -+ -+ ############################################################################## -+ # Normal filtering -+normal_filtering: -+ # Invert the decision flags -+ # make instruction more complicated as assembler has error and loses SETF -+ vrsub HX(tc10,0), HX(decision,0), 0 SETF # IFN means normal filtering -+ vmov -, HX(tc10,0) SETF # IFN means normal filtering -+ -+ vmov -,1 IFN SUMS r5 -+ cmp r5,0 -+ beq filtering_done -+ -+ vasr HX(tc2,0), HX(tc,0), 1 -+ vmul HX(tc10,0), HX(tc,0), 10 -+ -+ vasr HX(thresh,0), HX(beta,0), 1 -+ vadd HX(thresh,0), HX(thresh,0), HX(beta,0) -+ vasr HX(thresh,0), HX(thresh,0), 3 CLRA SACC -+ -+ vadd HX(ptest,0),HX(dp,3),HX(dp,0) -+ vsub HX(ptest,0),HX(ptest,0),HX(thresh,0) # ptest is negative if we need to do the P2 pixel -+ vadd HX(qtest,0),HX(dq,3),HX(dq,0) -+ vsub HX(qtest,0),HX(qtest,0),HX(thresh,0) # qtest is negative if we need to do the Q2 pixel -+ # Expand ptest and qtest together -+ vodd HX(pqtest,0),HX(ptest,0),HX(qtest,0) # p.p.p.p.q.q.q.q -+ vodd HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppqqqq........ -+ valtl HX(pqtest,0),HX(pqtest,0),HX(pqtest,0) # ppppppppqqqqqqqq -+ valtl HX(ptest,0),HX(pqtest,0),HX(pqtest,0) -+ valtu HX(qtest,0),HX(pqtest,0),HX(pqtest,0) -+ -+ vsub HX(delta0,0), HX(Q0,0), HX(P0,0) -+ vsub HX(delta1,0), HX(Q1,0), HX(P1,0) -+ vmov -,8 CLRA SACC -+ vmul -,HX(delta0,0), 9 SACC -+ vmul HX(delta0,0),HX(delta1,0), r6 SACC -+ vasr HX(delta0,0), HX(delta0,0), 4 -+ vdist HX(deltatest,0), HX(delta0,0), 0 -+ vsub HX(deltatest,0), HX(deltatest,0), HX(tc10,0) IFN SETF # negative if still need to do something -+ vmov HX(deltatest,0), 0 IFNN # clear if no need to do anything so we can reload flags later -+ -+ vclamps HX(delta0,0), HX(delta0,0), HX(tc,0) -+ -+ vadd HX(deltap1,0), HX(P2,0), HX(P0,0) -+ vadd HX(deltap1,0), HX(deltap1,0), 1 -+ vasr HX(deltap1,0), HX(deltap1,0), 1 CLRA SACC -+ vsub HX(deltap1,0), HX(delta0,0), HX(P1,0) SACC -+ vasr HX(deltap1,0), HX(deltap1,0), 1 -+ vclamps HX(deltap1,0), HX(deltap1,0), HX(tc2,0) -+ -+ vadd HX(deltaq1,0), HX(Q2,0), HX(Q0,0) -+ vadd HX(deltaq1,0), HX(deltaq1,0), 1 -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 CLRA SACC -+ vadd HX(deltaq1,0), HX(delta0,0), HX(Q1,0) -+ vrsub -, HX(delta0,0), 0 SACC -+ vrsub HX(deltaq1,0), HX(Q1,0), 0 SACC -+ vasr HX(deltaq1,0), HX(deltaq1,0), 1 -+ vclamps HX(deltaq1,0), HX(deltaq1,0), HX(tc2,0) -+ -+ vadds HX(P0,0), HX(P0,0), HX(delta0,0) IFN -+ vsubs HX(Q0,0), HX(Q0,0), HX(delta0,0) IFN -+ -+ vmov -,HX(ptest,0) IFN SETF # Negative if need to do p1 -+ vadds HX(P1,0), HX(P1,0), HX(deltap1,0) IFN -+ -+ vmov -,HX(deltatest,0) SETF -+ vmov -,HX(qtest,0) IFN SETF # Negative if need to do q1 -+ vadds HX(Q1,0), HX(Q1,0), HX(deltaq1,0) IFN -+ -+ #vmov HX(P2,0),1 IFN -+ -+filtering_done: -+ b lr -+ -+ -+hevc_uv_deblock_16x16: -+ push r6-r15, lr -+ mov r14,0 -+ b hevc_uv_start -+hevc_uv_deblock_16x16_with_clear: -+ push r6-r15, lr -+ mov r14,1 -+ b hevc_uv_start -+ -+hevc_uv_start: -+ mov r9,r4 -+ mov r4,r3 -+ mov r13,r2 -+ mov r2,r0 -+ mov r10,r0 -+ subscale4 r0,r1 -+ mov r8,63 -+ mov r6,-3 -+ vmov H(zeros,0),0 -+# r7 is number of blocks still to load -+# r0 is location of current block - 4 * stride -+# r1 is stride -+# r2 is location of current 
block -+# r3 is offset of start of block (actual edges start at H(16,16)+r3 for horizontal and H(16,0)+r3 for vertical -+# r4 is setup -+# r5 is for temporary calculations -+# r8 holds 63 -+# r6 holds -3 -+# r9 holds the number of 16 high rows to process -+# r10 holds the original img base -+# r11 returns 0 if no filtering was done on the edge -+# r12 saves a copy of this -+# r13 is copy of width -+# r14 is 1 if we should clear the old contents, or 0 if not -+ -+uv_process_row: -+ # First iteration does not do horizontal filtering on previous -+ mov r7, r13 -+ mov r3,0 -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # Load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) # We may wish to prefetch these -+ cmp r14,1 -+ bne uv_skip0 -+ vstb H(zeros,0),(r4) -+uv_skip0: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 # Rotate to second set of 8 -+ bl uv_vert_filter -+ sub r3,8 -+ b uv_start_deblock_loop -+uv_deblock_loop: -+ # Middle iterations do vertical on current block and horizontal on preceding -+ vldb H(12++,16)+r3,(r0 += r1) REP 4 # load the current block -+ vldb H(16++,16)+r3,(r2 += r1) REP 16 -+ vldb H(setup_input,0), (r4) -+ cmp r14,1 -+ bne uv_skip1 -+ vstb H(zeros,0),(r4) -+uv_skip1: -+ bl uv_vert_filter -+ add r3,8 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_vert_filter -+ sub r3,8 -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip3 -+ vstb H(zeros,0),-16(r4) -+uv_skip3: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,8*64 -+ addcmpbeq r12,0,0,uv_skip_save_top -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+uv_start_deblock_loop: -+ # move onto next 16x16 (could do this with circular buffer support instead) -+ add r3,16 -+ and r3,r8 -+ add r4,32 -+ # Perform loop counter operations (may work with an addcmpbgt as well?) 
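
Both the luma and chroma paths walk the same per-block parameter array, documented above as setup[num16][2][2][2][4] with 8 bytes per edge, 16 per direction and 32 per 16x16 block. As a hypothetical C type (illustration only):

#include <stdint.h>

/* Hypothetical C view of one block's deblock parameters. */
typedef uint8_t deblock_setup_t[2 /* vert, horz        */]
                               [2 /* first/second edge */]
                               [2 /* beta, tc          */]
                               [4 /* 4-pel sub-edges   */];
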
-+ add r0,16 -+ add r2,16 -+ sub r7,1 -+ cmp r7,0 # Are there still more blocks to load -+ bgt uv_deblock_loop -+ -+ # Final iteration needs to just do horizontal filtering -+ vldb H(setup_input,0), -16(r4) -+ cmp r14,1 -+ bne uv_skip2 -+ vstb H(zeros,0),-16(r4) -+uv_skip2: -+ bl uv_horz_filter -+ mov r12,r11 -+ add r3,8*64 -+ vadd H(setup_input,0),H(setup_input,8),0 -+ bl uv_horz_filter -+ sub r3,64*8 -+ addcmpbeq r12,0,0,uv_skip_save_top2 -+ vstb H(12++,0)+r3,-16(r0 += r1) REP 4 # Save the deblocked pixels for the previous block -+uv_skip_save_top2: -+ vstb H(16++,0)+r3,-16(r2 += r1) REP 16 -+ -+# Now look to see if we should do another row -+ sub r9,1 -+ cmp r9,0 -+ bgt uv_start_again -+ pop r6-r15, pc -+uv_start_again: -+ # Need to sort out r0,r2 to point to next row down -+ addscale16 r10,r1 -+ mov r2,r10 -+ subscale4 r0,r2,r1 -+ b uv_process_row -+ -+ -+# At this stage H(16,16)+r3 points to the first pixel of the 16 high edge to be filtered -+# So we can reuse the code we move the parts to be filtered into HX(P0/P1/P2/P3/Q0/Q1/Q2/Q3,0) - we will perform a final saturation step on placing them back into the correct locations -+ -+uv_vert_filter: -+ push lr -+ -+ vmov HX(P1,0), V(16,14)+r3 -+ vmov HX(P0,0), V(16,15)+r3 -+ vmov HX(Q0,0), V(16,16)+r3 -+ vmov HX(Q1,0), V(16,17)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds V(16,15)+r3, HX(P0,0), 0 -+ vadds V(16,16)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# Filter edge at H(16,0)+r3 -+uv_horz_filter: -+ push lr -+ -+ vmov HX(P1,0), H(14,0)+r3 -+ vmov HX(P0,0), H(15,0)+r3 -+ vmov HX(Q0,0), H(16,0)+r3 -+ vmov HX(Q1,0), H(17,0)+r3 -+ -+ bl do_chroma_filter -+ -+ vadds H(15,0)+r3, HX(P0,0), 0 -+ # P3 and Q3 never change so don't bother saving back -+ vadds H(16,0)+r3, HX(Q0,0), 0 -+ -+ pop pc -+ -+# r4 points to array of beta/tc for each 4 length edge -+do_chroma_filter: -+ valtl H(setup,0),H(setup_input,0),H(setup_input,0) # tc*8 -+ valtl HX(tc,0),H(setup,0),H(setup,0) -+ -+ vsub HX(delta,0),HX(Q0,0),HX(P0,0) -+ vshl HX(delta,0),HX(delta,0),2 CLRA SACC -+ vsub -,HX(P1,0),HX(Q1,0) SACC -+ vmov HX(delta,0),4 SACC -+ vasr HX(delta,0),HX(delta,0),3 -+ vclamps HX(delta,0), HX(delta,0), HX(tc,0) -+ vadd HX(P0,0),HX(P0,0),HX(delta,0) -+ vsub HX(Q0,0),HX(Q0,0),HX(delta,0) -+ b lr -+ -+# r0 = list -+# r1 = number -+hevc_run_command_list: -+ push r6-r7, lr -+ mov r6, r0 -+ mov r7, r1 -+loop_cmds: -+ ld r0,(r6) # How to encode r6++? 
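
The do_chroma_filter sequence above is the standard HEVC weak chroma filter. A C sketch of the same arithmetic (illustration only; the VPU code relies on saturating stores for the final pixel clip):

/* Sketch of the chroma deblock delta computed above, not patch code. */
static inline void chroma_filter(int *p0, int *q0, int p1, int q1, int tc)
{
    int delta = ((*q0 - *p0) * 4 + p1 - q1 + 4) >> 3;
    if (delta < -tc) delta = -tc;   /* vclamps against +/-tc */
    if (delta >  tc) delta =  tc;
    *p0 += delta;                   /* saturated on store in the VPU */
    *q0 -= delta;
}
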
-+ add r6,4 -+ ld r1,(r6) -+ add r6,4 -+ ld r2,(r6) -+ add r6,4 -+ ld r3,(r6) -+ add r6,4 -+ ld r4,(r6) -+ add r6,4 -+ ld r5,(r6) -+ add r6,4 -+ bl hevc_trans_16x16 -+ sub r7,1 -+ cmp r7,0 -+ bgt loop_cmds -+ -+ pop r6-r7, pc diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c new file mode 100644 -index 0000000..0255f5d +index 0000000000..0255f5dd44 --- /dev/null +++ b/libavcodec/rpi_mailbox.c @@ -0,0 +1,149 @@ @@ -14567,7 +22217,7 @@ index 0000000..0255f5d + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h new file mode 100644 -index 0000000..b316878 +index 0000000000..b3168788d2 --- /dev/null +++ b/libavcodec/rpi_mailbox.h @@ -0,0 +1,58 @@ @@ -14629,12 +22279,64 @@ index 0000000..b316878 +int mbox_get_image_params(int fd, VC_IMAGE_T * img); + +#endif +diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h +new file mode 100644 +index 0000000000..e6127749ea +--- /dev/null ++++ b/libavcodec/rpi_opts.h +@@ -0,0 +1,46 @@ ++#ifndef AVCODEC_RPI_OPTS_H ++#define AVCODEC_RPI_OPTS_H ++ ++// define RPI to split the CABAC/prediction/transform into separate stages ++#ifndef RPI ++ ++ #define RPI_INTER 0 ++ #define RPI_TSTATS 0 ++ #define RPI_HEVC_SAND 0 ++ ++#else ++ #include "config.h" ++ ++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ ++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames ++ // This has no effect unless RPI_WORKER is defined ++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as ++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one ++ // free for the foreground to fill in. ++ #define RPI_MAX_JOBS 2 ++ ++ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs ++ // As it stands there is something mildy broken in VPU deblock - looks mostly OK ++ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) ++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM ++// #define RPI_DEBLOCK_VPU ++ ++ #define RPI_VPU_DEBLOCK_CACHED 1 ++ ++ #if HAVE_NEON ++ #define RPI_HEVC_SAND 1 ++ #else ++ // Sand bust on Pi1 currently - reasons unknown ++ #define RPI_HEVC_SAND 0 ++ #endif ++ ++ ++ #define RPI_QPU_EMU_Y 0 ++ #define RPI_QPU_EMU_C 0 ++ ++ #define RPI_TSTATS 0 ++#endif ++ ++#endif ++ diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c new file mode 100644 -index 0000000..7c0eedd +index 0000000000..e872b855b7 --- /dev/null +++ b/libavcodec/rpi_qpu.c -@@ -0,0 +1,902 @@ +@@ -0,0 +1,935 @@ +#ifdef RPI +#include +#include @@ -14653,8 +22355,9 @@ index 0000000..7c0eedd +#include "rpi_mailbox.h" +#include "rpi_qpu.h" +#include "rpi_shader.h" -+#include "rpi_hevc_transform.h" -+#include "rpi_zc.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" + +#pragma GCC diagnostic push +// Many many redundant decls in the header files @@ -14678,26 +22381,13 @@ index 0000000..7c0eedd +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + -+// On Pi2 there is no way to access the VPU L2 cache -+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) -+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly -+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug. 
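
For reference, each entry consumed by hevc_run_command_list above is six 32-bit words loaded straight into r0..r5, with r5 selecting the operation per the dispatch at hevc_trans_16x16. A hypothetical C-side view (not part of the patch):

#include <stdint.h>

/* Hypothetical layout of one VPU command-list entry: r5 selects the op
 * (0 = transform, 1 = memclear16, 2 = luma deblock, 3/4 = chroma deblock
 * without/with setup clear, 5 = nested command list). */
typedef struct vpu_cmd_s {
    uint32_t r0, r1, r2, r3, r4, r5;
} vpu_cmd_t;
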
-+#define GPU_MEM_FLG 0x4 -+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) -+#define GPU_MEM_MAP 0x0 -+ +#define vcos_verify_ge0(x) ((x)>=0) + -+/*static const unsigned code[] = -+{ -+ #include "rpi_shader.hex" -+};*/ -+ +// Size in 32bit words -+#define QPU_CODE_SIZE 2048 ++#define QPU_CODE_SIZE 4098 +#define VPU_CODE_SIZE 2048 + -+const short rpi_transMatrix2even[32][16] = { // Even rows first ++static const short rpi_transMatrix2even[32][16] = { // Even rows first +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, @@ -14737,7 +22427,8 @@ index 0000000..7c0eedd +struct GPU +{ + unsigned int qpu_code[QPU_CODE_SIZE]; -+ unsigned int vpu_code[VPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; + short transMatrix2even[16*16*2]; +}; + @@ -14749,8 +22440,9 @@ index 0000000..7c0eedd +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + +struct rpi_cache_flush_env_s { -+ unsigned int n; -+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++// unsigned int n; ++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; ++ struct vcsm_user_clean_invalid2_s v; +}; + +#define WAIT_COUNT_MAX 16 @@ -14774,7 +22466,6 @@ index 0000000..7c0eedd +typedef struct vq_wait_s +{ + sem_t sem; -+ unsigned int cost; + struct vq_wait_s * next; +} vq_wait_t; + @@ -14793,7 +22484,7 @@ index 0000000..7c0eedd + int open_count; + int init_count; + int mb; -+ unsigned int current_load; ++ int vpu_i_cache_flushed; + GPU_MEM_PTR_T code_gm_ptr; + vq_wait_pool_t wait_pool; +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -14866,8 +22557,8 @@ index 0000000..7c0eedd + +// GPU_MEM_PTR_T alloc fns +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { -+ p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); @@ -14878,12 +22569,14 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); ++ + return 0; +} + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + p->numbytes = numbytes; -+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); ++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + av_assert0(p->vcsm_handle); + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + av_assert0(p->vc_handle); @@ -14891,6 +22584,7 @@ index 0000000..7c0eedd + av_assert0(p->arm); + p->vc = mbox_mem_lock(mb, p->vc_handle); + av_assert0(p->vc); ++// printf("***** %s, %d\n", __func__, numbytes); + return 0; +} + @@ -14899,6 +22593,7 @@ index 0000000..7c0eedd + vcsm_unlock_ptr(p->arm); + vcsm_free(p->vcsm_handle); + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++// 
printf("***** %s\n", __func__); +} + + @@ -14955,9 +22650,14 @@ index 0000000..7c0eedd + } + // And the VPU code + { -+ int num_bytes = sizeof(rpi_hevc_transform); ++ int num_bytes = sizeof(rpi_hevc_transform8); + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); -+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); ++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); ++ } ++ { ++ int num_bytes = sizeof(rpi_hevc_transform10); ++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); ++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + } + // And the transform coefficients + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); @@ -15048,10 +22748,18 @@ index 0000000..7c0eedd + gpu_unlock_unref(ge); +} + -+unsigned int vpu_get_fn(void) { ++unsigned int vpu_get_fn(const unsigned int bit_depth) { + // Make sure that the gpu is initialized + av_assert0(gpu != NULL); -+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); ++ switch (bit_depth){ ++ case 8: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ case 10: ++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ default: ++ av_assert0(0); ++ } ++ return 0; +} + +unsigned int vpu_get_constants(void) { @@ -15081,95 +22789,75 @@ index 0000000..7c0eedd +// +// Cache flush functions + ++#define CACHE_EL_MAX 16 + +rpi_cache_flush_env_t * rpi_cache_flush_init() +{ -+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); -+ if (rfe == NULL) -+ return NULL; ++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + ++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); ++ if (rfe == NULL) ++ return NULL; + -+ rfe->n = 0; -+ return rfe; ++ rfe->v.op_count = 0; ++ return rfe; +} + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) +{ -+ if (rfe != NULL) -+ free(rfe); ++ if (rfe != NULL) ++ free(rfe); +} + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) +{ -+ int rc = 0; -+ unsigned int na; -+ unsigned int nr; ++ int rc = 0; + -+ // Clear any reamaining ents in the final block -+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) -+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ rc = -1; + -+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) -+ { -+ if (vcsm_clean_invalid(rfe->a + na) != 0) -+ rc = -1; -+ } ++ free(rfe); + -+ free(rfe); ++ if (rc == 0) ++ return 0; + -+ if (rc == 0) -+ return 0; -+ -+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); -+ return rc; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); ++ return rc; +} + -+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || gm->numbytes == 0) -+ return; ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; ++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + -+ av_assert0(rfe->n < 
CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm; -+ a->s[n].size = gm->numbytes; -+ ++rfe->n; -+ } ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; +} + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + const unsigned int offset, const unsigned int size) +{ -+ // Deal with empty pointer trivially -+ if (gm == NULL || size == 0) -+ return; ++ // Deal with empty pointer trivially ++ if (gm == NULL || size == 0) ++ return; + -+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); ++ av_assert0(offset <= gm->numbytes); ++ av_assert0(size <= gm->numbytes); ++ av_assert0(offset + size <= gm->numbytes); + -+ av_assert0(offset <= gm->numbytes); -+ av_assert0(size <= gm->numbytes); -+ av_assert0(offset + size <= gm->numbytes); -+ -+ { -+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); -+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; -+ -+ av_assert0(rfe->n < CFE_ENT_COUNT); -+ -+ a->s[n].cmd = mode; -+ a->s[n].handle = gm->vcsm_handle; -+ a->s[n].addr = (unsigned int)gm->arm + offset; -+ a->s[n].size = size; -+ ++rfe->n; -+ } ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +} + ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) +{ +#if !RPI_ONE_BUF @@ -15186,21 +22874,27 @@ index 0000000..7c0eedd + } +} + -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) +{ -+ const unsigned int y_offset = frame->linesize[0] * start_line; -+ const unsigned int y_size = frame->linesize[0] * n; ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; + // Round UV up/down to get everything + const unsigned int uv_rnd = (1U << uv_shift) >> 1; -+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); -+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; + ++#if 0 ++ // *** frame->height is cropped height so not good + // As all unsigned they will also reject -ve + // Test individually as well as added to reject overflow -+ av_assert0(start_line <= (unsigned int)frame->height); ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + av_assert0(n <= (unsigned int)frame->height); + 
av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif + + if (!gpu_is_buf1(frame)) + { @@ -15212,7 +22906,7 @@ index 0000000..7c0eedd + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + } + } -+ else if (!rpi_sliced_frame(frame)) ++ else if (!av_rpi_is_sand_frame(frame)) + { + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + if (do_luma) { @@ -15225,16 +22919,30 @@ index 0000000..7c0eedd + } + else + { -+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); -+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); -+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { -+ if (do_luma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size); -+ } -+ if (do_chroma) { -+ rpi_cache_flush_add_gm_range(rfe, gm, mode, -+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size); -+ } ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; + } + } +} @@ -15275,13 +22983,11 @@ index 0000000..7c0eedd + + +// If sem_init actually takes time then maybe we want a pool... 
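
A hedged usage sketch of the flush interface defined above (not patch code; the mode argument would be one of the rpi_cache_flush_mode_t constants): invalidate a freshly decoded region before the ARM reads it.

#include "libavutil/frame.h"
/* assumes the declarations from rpi_qpu.h above */

static int flush_decoded_region(const AVFrame *const frame,
                                const rpi_cache_flush_mode_t mode,
                                const unsigned int y0, const unsigned int height)
{
    rpi_cache_flush_env_t *const rfe = rpi_cache_flush_init();
    if (rfe == NULL)
        return -1;
    rpi_cache_flush_add_frame_block(rfe, frame, mode,
                                    0, y0, frame->width, height,
                                    1 /* uv_shift for 4:2:0 */,
                                    1 /* luma */, 1 /* chroma */);
    return rpi_cache_flush_finish(rfe); /* one vcsm_clean_invalid2(), then frees rfe */
}
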
-+static vq_wait_t * vq_wait_new(const unsigned int cost) ++static vq_wait_t * vq_wait_new(void) +{ + gpu_env_t * const ge = gpu_lock_ref(); + vq_wait_t * const wait = ge->wait_pool.head; + ge->wait_pool.head = wait->next; -+ ge->current_load += cost; -+ wait->cost = cost; + wait->next = NULL; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT @@ -15337,17 +23043,13 @@ index 0000000..7c0eedd + +static void vq_wait_post(vq_wait_t * const wait) +{ -+#if !RPI_TRACE_TIME_VPU_QPU_WAIT -+ if (wait->cost != 0) -+#endif ++#if RPI_TRACE_TIME_VPU_QPU_WAIT + { + gpu_env_t *const ge = gpu_lock(); -+ ge->current_load -= wait->cost; -+#if RPI_TRACE_TIME_VPU_QPU_WAIT + tto_end(&ge->ttw.active, ns_time()); -+#endif + gpu_unlock(); + } ++#endif + + sem_post(&wait->sem); +} @@ -15363,7 +23065,6 @@ index 0000000..7c0eedd +{ + unsigned int n; + unsigned int mask; -+ unsigned int cost; + struct gpu_job_s j[VPU_QPU_JOB_MAX]; +}; + @@ -15396,23 +23097,26 @@ index 0000000..7c0eedd + vqj->mask |= VPU_QPU_MASK_VPU; + + j->command = EXECUTE_VPU; -+ j->u.v.q[0] = vpu_code; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + j->u.v.q[1] = r0; + j->u.v.q[2] = r1; + j->u.v.q[3] = r2; + j->u.v.q[4] = r3; + j->u.v.q[5] = r4; + j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; + } +} + +// flags are QPU_FLAGS_xxx -+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) +{ + if (n != 0) { + struct gpu_job_s *const j = new_job(vqj); + vqj->mask |= VPU_QPU_MASK_QPU; -+ vqj->cost += cost; + + j->command = EXECUTE_QPU; + j->u.q.jobs = n; @@ -15442,7 +23146,7 @@ index 0000000..7c0eedd + } + + // We are going to want a sync object -+ wait = vq_wait_new(vqj->cost); ++ wait = vq_wait_new(); + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + // If we only posted one thing or only QPU jobs @@ -15464,7 +23168,6 @@ index 0000000..7c0eedd + j->callback.cookie = wait; + } + -+ vqj->cost = 0; + vqj->mask = 0; + *wait_h = wait; +} @@ -15483,11 +23186,6 @@ index 0000000..7c0eedd + return rv; +} + -+unsigned int vpu_qpu_current_load(void) -+{ -+ return gpu_ptr()->current_load; -+} -+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) +{ + if (wait_h != NULL) @@ -15536,13 +23234,50 @@ index 0000000..7c0eedd + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); +} + ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = 
qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +#endif // RPI diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h new file mode 100644 -index 0000000..a95f7d9 +index 0000000000..485a08f8ba --- /dev/null +++ b/libavcodec/rpi_qpu.h -@@ -0,0 +1,200 @@ +@@ -0,0 +1,206 @@ +#ifndef RPI_QPU_H +#define RPI_QPU_H + @@ -15687,21 +23422,35 @@ index 0000000..a95f7d9 +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); -+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, -+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + +// init, add, finish for one gm ptr +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + +// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ +uint32_t qpu_fn(const int * const mc_fn); + -+#define QPU_N_GRP_UV 4 -+#define QPU_N_UV 8 -+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU -+#define QPU_N_Y 12 ++#define QPU_N_GRP 4 ++#define QPU_N_MAX 12 + +#define QPU_MAIL_EL_VALS 2 + @@ -15717,27 +23466,19 @@ index 0000000..a95f7d9 +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); -+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + -+ -+extern unsigned int vpu_get_fn(void); ++extern unsigned int vpu_get_fn(const unsigned int bit_depth); +extern unsigned int vpu_get_constants(void); + +// Waits for previous post_codee to complete and Will null out *wait_h after use +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); -+unsigned int vpu_qpu_current_load(void); +int vpu_qpu_init(void); +void vpu_qpu_term(void); + -+// Simple test of shader code -+extern int 
rpi_test_shader(void); -+ -+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); -+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); -+ +extern int gpu_get_mailbox(void); +void gpu_ref(void); +void gpu_unref(void); @@ -15745,10 +23486,10 @@ index 0000000..a95f7d9 +#endif diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c new file mode 100644 -index 0000000..0898ecd +index 0000000000..2c6541a8fb --- /dev/null +++ b/libavcodec/rpi_shader.c -@@ -0,0 +1,670 @@ +@@ -0,0 +1,1570 @@ +#include "rpi_shader.h" + +#ifdef _MSC_VER @@ -15772,648 +23513,1548 @@ index 0000000..0898ecd +__attribute__((aligned(8))) +#endif +unsigned int rpi_shader[] = { -+// ::mc_setup_c -+/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif -+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif -+/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 -+/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 -+/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 -+/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 -+/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 -+/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 -+/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 -+/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 -+/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 -+/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 -+/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b -+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a -+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 -+/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 -+/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // 
mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y ++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 -+/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 -+/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif -+/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif -+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num -+/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 -+/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 -+/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 -+/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) -+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) -+/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 -+/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif -+/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif -+/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a -+/* [0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b -+/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num -+/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif -+/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif -+/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 -+/* [0x000001e0] 
*/ 0x149de1c0, 0xd0020827, // and r0, r0, -2 -+/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 -+/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch -+/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 -+/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 -+/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 -+/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 -+/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y -+/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif -+/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif -+/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif -+/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 -+// ::mc_filter_uv -+/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 -+/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif -+/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif -+/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a -+/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b -+/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b -+/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c -+/* [0x00000308] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 -+// :uvloop -+/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000338] */ 0x936807f6, 
0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 -+/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 -+/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 -+/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 -+/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 -+/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop -+/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 -+/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 -+/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_filter_uv_b0 -+/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 -+/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next -+/* 
[0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif -+/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 -+/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a -+/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 -+/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a -+/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 -+/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif -+/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 -+/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a -+/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b -+/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c -+/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 -+/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d -+/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif -+/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif -+// :uvloop_b0 -+/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 -+/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y -+/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, 
// add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 -+/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 -+/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 -+/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 -+/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 -+/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 -+/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 -+/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 -+/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin -+/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif -+/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif -+/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 -+/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 -+/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 -+/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin -+/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+// :uv_b0_post12 -+/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 -+/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 -+/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 -+/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 -+// :uv_b0_post_fin -+/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num -+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 -+/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif -+/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next -+/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif -+/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 -+/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif -+/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif -+/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a -+/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add 
rb_base2_next, r3, r0 -+/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b -+/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c -+/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d -+/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 -+/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 -+// :uvloop_b -+/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 -+/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 -+/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 -+/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 -+/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 -+/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 -+/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 -+/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 -+/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 -+/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+/* [0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 -+/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 -+/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 -+/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 -+/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 -+/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 -+/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 -+/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b -+/* 
[0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 -+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 -+/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 -+/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif -+// ::mc_interrupt_exit8c -+/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 -+/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit -+// ::mc_exit_c -+/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) -+/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop -+/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_interrupt_exit12 -+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 -+/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) -+/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_exit1 -+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait -+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 -+/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 -+/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend -+/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop -+/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop -+// ::mc_setup -+/* 
[0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif -+/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif -+/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif -+/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif -+/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif -+/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif -+/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 -+/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 -+/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif -+/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) -+/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch -+/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num -+/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 -+/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 -+/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b -+/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 -+/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 -+/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 -+/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 -+/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b -+/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 -+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 -+/* [0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 -+/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 -+/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 -+/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 -+/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 -+/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 -+/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 -+/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 -+/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 -+/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, 
qpu_num -+/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 -+/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 -+/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 -+/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 -+/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) -+/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 -+/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) -+/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 -+/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 -+/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 -+/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 -+/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 -+/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch -+/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base -+/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 -+/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y -+/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 -+/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch -+/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 -+// :per_block_setup -+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif -+/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num -+/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next -+/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next -+/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 -+/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 -+/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch -+/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 -+/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b -+/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif -+/* [0x00000cf0] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 -+/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 -+/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x -+/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 -+/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 -+/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 -+/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 -+/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 -+/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b -+/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif -+/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov 
vw_setup, rb28 -+/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width -+/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height -+/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 -+/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 -+/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width -+/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 -+/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif -+/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif -+/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 -+/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a -+/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 -+/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d -+/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c -+/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d -+/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c -+/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 -+/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d -+/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c -+/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 -+/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d -+/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c -+/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 -+/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d -+/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c -+/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 -+/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d -+/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c -+/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 -+/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d -+/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c -+/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 -+/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d -+/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c -+/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif -+/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b -+/* [0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c -+/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 -+/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif -+/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link -+/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 -+/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 -+/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d -+// ::mc_filter -+/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 -+// :yloop -+/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // 
mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 -+/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001010] */ 
0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait -+/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 -+/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 -+/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop -+/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 -+/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop -+/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop -+// ::mc_filter_b -+// :yloopb -+/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next -+/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 -+/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 -+/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 -+/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y -+/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 -+/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+/* [0x00001188] */ 0x40027030, 0x180049e3, // 
nop ; mul24 r3, ra0.8a, r0 -+/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 -+/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 -+/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 -+/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 -+/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 -+/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 -+/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a -+/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b -+/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 -+/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 -+/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 -+/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 -+/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 -+/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 -+/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 -+/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 -+/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait -+/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 -+/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb -+/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, 
rb13 -+/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 -+/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 -+/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 -+/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 -+/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 -+/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 -+/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup -+/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 -+/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 -+/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest -+/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 -+/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 -+/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 -+/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 -+/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 -+/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch -+/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 -+/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 -+/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb -+/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop -+/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop ++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000160] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001d8] */ 
0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c_p ++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* 
[0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000003f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_p_l1 ++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov 
vrx_xshift, vrx_xshift_next ++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add 
r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b ++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c_b ++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x000006f0] */ 
0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000828] */ 
0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync_q0 ++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000980] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) 
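++// Note on the mc_sync_qN entry points (q0..q8 above, q9..q11 below): the
++// sacq/srel pattern appears to implement a 12-QPU barrier using the VC4
++// hardware semaphores. Each "slave" QPU releases its group's semaphore
++// (srel) and then blocks on an acquire (sacq); the "master" QPUs 0, 4 and
++// 8 each collect three releases from their group, synchronise with one
++// another through semaphores 12-14, then release their slaves in a chain
++// before branching back via ra_link. This is a reading inferred from the
++// generated sacq/srel sequence, not taken from accompanying documentation.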
++// ::mc_sync_q9 ++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_qn ++// ::mc_exit_y_qn ++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_q0 ++// ::mc_exit_y_q0 ++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cc0] */ 
0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* 
[0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_8 ++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000fb0] */ 0x90227383, 0x1c424044, 
// ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y_pxx ++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* 
[0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, 
r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000014b0] 
*/ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x000015f8] */ 0x8e5509bf, 
0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_setup_c10_q0 ++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c10_qn ++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1 ++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif ++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch ++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, 
elem_num ++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif ++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif ++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif ++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 ++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y ++// :1 ++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000018e0] */ 0x00000000, 
0xf0f7c9e7, // bra -, ra_link ++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c10_p ++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 
@ "mul_used", 0 ++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_p_l1 ++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, 
r1, ra_blk_height ++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b ++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y ++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d ++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif ++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 ++// :1 ++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 ++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 ++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 ++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 ++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 ++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 ++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait 
++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b ++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif ++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif ++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif ++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift ++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a ++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b ++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c ++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif ++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 ++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov 
ra9, rb_max_y ; mov rb11, ra3.8d ++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 ++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif ++// :1 ++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 ++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 ++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8) ++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 ++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 ++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 
++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 ++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 ++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00002090] */ 0x15827d80, 
0x100207a7, // mov ra_link, unif ++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov 
dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif ++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 ++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 ++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask ++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif ++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift ++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num ++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000023c8] */ 0xf49dc1d2, 
0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 ++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 ++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth ++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002550] */ 
0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 ++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift ++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift ++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 ++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d ++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c ++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 ++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d ++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c ++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 ++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d ++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c ++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 ++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 ++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 ++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026b8] */ 
0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif ++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 ++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d ++/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 ++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif ++// ::mc_filter_y10_pxx ++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 ++// :1 ++/* [0x00002720] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, 
ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 ++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height ++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next ++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* 
[0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif ++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif ++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif ++/* [0x00002998] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base ++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 ++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif ++// :1 ++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf 
-, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++// :1 ++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 ++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 ++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 ++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 ++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++/* [0x00002ba8] */ 
0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 ++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 ++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 ++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 ++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 ++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 ++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 ++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 ++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 ++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 ++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 ++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002cf8] */ 0x8c613cbf, 
0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask ++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 ++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002d50] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 ++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 ++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest ++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b ++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 ++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 ++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +// ::mc_end +}; +#ifdef __HIGHC__ @@ -16421,35 +25062,79 @@ index 0000000..0898ecd +#endif diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h new file mode 100644 -index 0000000..d17b9fd +index 0000000000..82bf380eb4 --- /dev/null +++ b/libavcodec/rpi_shader.h -@@ -0,0 +1,19 @@ +@@ -0,0 +1,63 @@ +#ifndef rpi_shader_H +#define rpi_shader_H + +extern unsigned int rpi_shader[]; + -+#define mc_setup_c (rpi_shader + 0) -+#define mc_filter_uv (rpi_shader + 152) -+#define mc_filter_uv_b0 (rpi_shader + 280) -+#define mc_interrupt_exit8c (rpi_shader + 554) -+#define mc_exit (rpi_shader + 582) -+#define mc_exit_c (rpi_shader + 582) -+#define mc_interrupt_exit12 (rpi_shader + 598) -+#define mc_exit1 (rpi_shader + 634) -+#define mc_setup (rpi_shader + 650) -+#define mc_filter (rpi_shader + 942) -+#define mc_filter_b (rpi_shader + 1094) -+#define mc_end (rpi_shader + 1246) ++#define mc_setup_c_q0 (rpi_shader + 0) ++#define mc_start (rpi_shader + 0) ++#define mc_setup_c_qn (rpi_shader + 2) ++#define mc_filter_c_p (rpi_shader + 142) ++#define mc_filter_c_p_l1 (rpi_shader + 272) ++#define mc_filter_c_b (rpi_shader + 402) ++#define mc_sync_q0 (rpi_shader + 590) ++#define mc_sync_q1 (rpi_shader + 608) ++#define mc_sync_q2 (rpi_shader + 620) ++#define mc_sync_q3 (rpi_shader + 632) ++#define mc_sync_q4 (rpi_shader + 644) ++#define mc_sync_q5 (rpi_shader + 662) ++#define mc_sync_q6 (rpi_shader + 674) ++#define mc_sync_q7 (rpi_shader + 686) ++#define mc_sync_q8 (rpi_shader + 698) ++#define mc_sync_q9 (rpi_shader + 716) ++#define mc_sync_q10 (rpi_shader + 728) ++#define mc_sync_q11 (rpi_shader + 740) ++#define 
mc_exit_c_qn (rpi_shader + 752)
++#define mc_exit_y_qn (rpi_shader + 752)
++#define mc_exit_c_q0 (rpi_shader + 770)
++#define mc_exit_y_q0 (rpi_shader + 770)
++#define mc_setup_y_q0 (rpi_shader + 790)
++#define mc_setup_y_qn (rpi_shader + 792)
++#define mc_filter_y_pxx (rpi_shader + 1032)
++#define mc_filter_y_bxx (rpi_shader + 1162)
++#define mc_filter_y_p00 (rpi_shader + 1292)
++#define mc_filter_y_b00 (rpi_shader + 1382)
++#define mc_setup_c10_q0 (rpi_shader + 1462)
++#define mc_setup_c10_qn (rpi_shader + 1464)
++#define mc_filter_c10_p (rpi_shader + 1600)
++#define mc_filter_c10_p_l1 (rpi_shader + 1728)
++#define mc_filter_c10_b (rpi_shader + 1856)
++#define mc_sync10_q0 (rpi_shader + 2042)
++#define mc_sync10_q1 (rpi_shader + 2060)
++#define mc_sync10_q2 (rpi_shader + 2072)
++#define mc_sync10_q3 (rpi_shader + 2084)
++#define mc_sync10_q4 (rpi_shader + 2096)
++#define mc_sync10_q5 (rpi_shader + 2114)
++#define mc_sync10_q6 (rpi_shader + 2126)
++#define mc_sync10_q7 (rpi_shader + 2138)
++#define mc_sync10_q8 (rpi_shader + 2150)
++#define mc_sync10_q9 (rpi_shader + 2168)
++#define mc_sync10_q10 (rpi_shader + 2180)
++#define mc_sync10_q11 (rpi_shader + 2192)
++#define mc_exit_c10_q0 (rpi_shader + 2204)
++#define mc_exit_y10_q0 (rpi_shader + 2204)
++#define mc_exit_c10_qn (rpi_shader + 2224)
++#define mc_exit_y10_qn (rpi_shader + 2224)
++#define mc_setup_y10_q0 (rpi_shader + 2242)
++#define mc_setup_y10_qn (rpi_shader + 2244)
++#define mc_filter_y10_pxx (rpi_shader + 2494)
++#define mc_filter_y10_p00 (rpi_shader + 2624)
++#define mc_filter_y10_bxx (rpi_shader + 2716)
++#define mc_filter_y10_b00 (rpi_shader + 2846)
++#define mc_end (rpi_shader + 2926)
+
+#endif
diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
new file mode 100644
-index 0000000..aa3fe47
+index 0000000000..ba6cc13a95
--- /dev/null
+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1259 @@
+@@ -0,0 +1,1741 @@
+
+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
+# the warning that we are using rotation & ra/rb registers. r0..3 can be
+# rotated through all 16 elems ra regs can only be rotated through their
+# local 4. As it happens this is what is wanted here as we do not want the
+# constants from the other half of the calc.
+
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# However in the current world there seems to be no benefit (and a small
++# overhead) in setting this bigger than 2.
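++#
++# (Illustrative note, not part of the original comment: with PREREAD = 4
++# the preload loops in the setup code issue 4 requests per TMU before the
++# first ldtmu, exactly filling the 4-entry output FIFO; after that each
++# main-loop iteration pairs one new request with one ldtmu, so no more
++# than 4 results are ever outstanding.)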
++ ++.set PREREAD, 4 ++ ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 ++ +# register allocation +# -+# ra0...ra7 eight horizontal filter coefficients -+# -+# rb0 rx_shift2 -+# rb1 rb_y2_next -+# -+# rb4...rb7 -+# -+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent) -+# -+# (ra15 isn't clamped to zero - this happens during the -+# copy to ra14, and during its use in the vertical filter) -+# -+# rb8...rb11 eight vertical filter coefficients + -+# ra4 y: Fiter, UV: part -of b0 -> b stash ++# ra0-3 ++# Used as temp and may be loop filter coeffs (split into .8s) ++# or temp in loop. Check usage on an individual basis. + -+# rb12 offset to add before shift (round + weighting offsets) -+# rb13 shift: denom + 6 + 9 -+# rb14 L0 weight (U on left, V on right) -+# rb15 -- free -- -+# -+# ra16 width:height -+# ra17 ra_y:ra_xshift -+# ra18 L1 weight (Y) -+# ra19 ra_y_next:ra_xshift_next -+# -+# rb16 pitch -+# rb17 height + 1 -+# rb18 max(height,16) + 3 -+# rb19 frame_base2_next -+# -+# ra20 1 -+# ra21 ra_y2_next:ra_y2 (luma); free (chroma) -+# ra22 ra_k256 256 -+# ra23 0 -+# -+# rb20 -- free -- -+# rb21 -- free -- -+# rb22 rb_k255 255 -+# rb23 dest (Y) -+# -+# rb24 vdw_setup_1(dst_pitch) -+# rb25 frame width-1 -+# rb26 height<<23 + width<<16 + vdw_setup_0 -+# rb27 vdw_setup_0 (depends on QPU number) -+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM -+# rb29 vdw_setup_1(dst_pitch-width) -+# rb30 frame height-1 -+# rb31 used as temp to count loop iterations -+# -+# ra24 src frame base -+# ra25 src frame base 2 -+# ra26 next ra24 -+# ra27 next ra25 -+# ra28 -- free -- -+# ra29 -- free -- -+# -+# Use an even numbered register as a link register to avoid corrupting flags -+# ra30 next kernel address -+# ra31 chroma-B height+3; free otherwise ++# ra4-7 ++# C: L0 H filter out FIFO ++# otherwise -- free -- + -+.set rb_max_x, rb25 -+.set rb_max_y, rb30 -+.set rb_pitch, rb16 ++# ra8-11 ++# temp in some places - check usage ++# Y: (with rb8-11) horiz out FIFO ++ ++# ra12-15 ++# -- free -- ++ ++# uniform: width:height +.set ra_width_height, ra16 +.set ra_width, ra16.16b +.set ra_height, ra16.16a -+.set ra_y2, ra21.16a -+.set ra_y2_next, ra21.16b + -+.set rb_base2_next, rb19 ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b + -+.set rb_dest, rb23 ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff100100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k16, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# Max 
pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# -- free -- ra23.8d ++ ++# Loop: src frame base (L0) +.set ra_base, ra24 -+.set ra_base_next, ra26 -+.set ra_xshift, ra17.16a + ++# Loop: src frame base (L1) +.set ra_base2, ra25 + -+# Note ra_xy & ra_xy_next should have same structure! -+.set ra_xshift_next, ra19.16a ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# -- free -- ra27 ++# -- free -- ra28 ++# -- free -- ra29 ++ ++# Use an even numbered register as a link register to avoid corrupting flags ++.set ra_link, ra30 ++ ++# -- free -- ra31 ++ +.set rb_xshift2, rb0 +.set rb_xshift2_next, rb1 + -+.set ra_y_next, ra19.16b -+.set ra_y, ra17.16b ++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 + -+.set ra_k1, ra20 ++# El Flags ++# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n ++.set rb_ef, rb3 ++ ++# rb4-7 ++# C-B: L1 H filter out FIFO ++# Y: (with ra2.8x) Y vertical filter coeffs ++ ++# rb8-11 ++# C: Vertical filter coeffs ++# Y: (with ra8-11) horiz out FIFO ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# Setup: denom + 6 + 9 ++.set rb_wt_den_p15, rb13 ++ ++# -- free -- rb14 ++# -- free -- rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice +.set rb_xpitch, rb20 -+.set rb_k255, rb22 -+.set ra_k256, ra22 -+.set ra_k0, ra23 + -+.set ra_link, ra30 ++# -- free -- rb21 ++ ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb22 ++ ++# Loop: destination address ++.set rb_dest, rb23 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb24 ++ ++# Setup: pic width - 1 ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. ++.set rb_max_x, rb25 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set rb_dma0, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# -- free -- rb31 ++ ++ ++ + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
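+# (Illustrative note, assuming the QPU's usual -16..15 small-immediate
+# range: an immediate shift of 16 cannot be encoded directly, but -16 can,
+# and since the shifter only uses the bottom 5 bits a shift by i_shift16
+# (-16) behaves as a shift by 16. The same trick applies to the other
+# i_shift constants defined next.)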
+.set i_shift16, -16 @@ -16564,8 +25344,10 @@ index 0000000..aa3fe47 +# Macros that express this - obviously these can't be overlapped +# so are probably unsuitable for loop code + -+.macro m_calc_dma_regs, r_vpm, r_dma ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version + asr r1, r2, 2 + shl r1, r1, 6 + and r0, r2, 3 @@ -16576,811 +25358,983 @@ index 0000000..aa3fe47 + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + shl r0, r0, 5 -+ add r_dma, r0, r1 # DMA out -+.endm + -+# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 -+.macro m_calc_dma_regs_c, r_vpm, r_dma -+ mov r2, qpu_num ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 + asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else + shl r1, r1, 5 ++.endif + and r0, r2, 1 + or r0, r0, r1 + -+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit -+ add r_vpm, r0, r1 # VPM 8bit storage ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) -+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + shl r0, r0, 6 ++.endif + add r_dma, r0, r1 # DMA out +.endm + + ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ +################################################################################ +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) -+::mc_setup_c -+ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 # No swap TMUs + +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 + + mov ra_base, unif # Store frame c base + +# Read image dimensions -+ sub rb_max_x, unif, 1 # pic c width -+ sub rb_max_y, unif, 1 # pic c height ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height + +# load constants -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + -+# touch registers to keep simulator happy -+ -+ # ra/b4..7: B0 -> B stash registers -+ mov ra4, 0 ; mov rb4, 0 -+ mov ra5, 0 ; mov rb5, 0 -+ mov ra6, 0 ; mov rb6, 0 -+ mov ra7, 0 ; mov rb7, 0 -+ -+ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base -+ -+# ; ra12..15: vertical scroll registers +# get source pitch -+ mov rb_xpitch, unif ; mov ra12, 0 # stride2 -+ mov rb_pitch, unif ; mov ra13, 0 # stride1 -+ mov r0, elem_num ; mov ra14, 0 -+# get destination vdw setup -+ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # vdw_setup_1 ++ mov rb_xpitch, unif # stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ 
add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ add r0, r0, ra0.16b # Add elem no to x to get X for this slice ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + min r0, r0, rb_max_x + +# Get shift -+ and r1, r0, 1 -+ shl ra_xshift_next, r1, 4 ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif + -+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y ++ add r0, r0, r1 + add ra_base, ra_base, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 -+ min r0, r0, rb_max_y -+ -+ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t0s, ra_base, r0 -+ -+ add rb13, 9, unif # denominator -+ mov -, unif # Unused ++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + +# Compute part of VPM to use for DMA output -+ m_calc_dma_regs_c rb28, rb27 ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
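++# (From the macro above: rb_vpm_init ends up holding this QPU's VPM write
++# setup and rb_dma0_base the vdw_setup_0 skeleton with the QPU-dependent
++# X/Y packed in - block height & width get added in later, per block)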
++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + -+# ----------------- +# And again for L1, but only worrying about frame2 stuff + -+ mov ra_link, unif # Next fn -+ +# Load first request location -+ mov ra0, unif # next_x_y ++ mov ra0, unif # next_x_y + -+ mov ra_base2, unif # Store frame c base ++ mov ra_base2, unif # [ra0 delay] Store frame c base + +# Compute base address for first and second access +# ra_base ends up with t0s base +# ra_base2 ends up with t1s base + -+ mov ra_y2, ra0.16a # Store y -+ mov r0, ra0.16b # Load x -+ add r0, r0, elem_num # Add QPU slice -+ max r0, r0, 0 ; mov -, unif # Unused 0 -+ min r0, r0, rb_max_x ; mov -, unif # Unused 1 ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ++ min r0, r0, rb_max_x + -+# Get shift -+ and r1, r0, 1 ; mov -, unif # Unused 2 -+ shl rb_xshift2_next, r1, 4 ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + -+ and r0, r0, -2 -+ add r0, r0, r0 ; v8subs r1, r1, r1 -+ sub r1, r1, rb_pitch ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch + and r1, r0, r1 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra_y2 ++ add r0, r0, r1 ; mov r2, ra_y2 + add ra_base2, ra_base2, r0 + -+ max r0, r1, 0 -+ min r0, r0, rb_max_y ++# Do preloads ++# r0 = ra_y, r2 = ra_y2 ++ mov r3, PREREAD ; mov r0, ra_y + -+# submit texture requests for first line -+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for 2nd line -+ -+ max r0, r1, 0 ; mov -, unif # Unused 4 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + ++ mov ra_link, unif # link ++# touch registers to keep simulator happy ++ # ra/b4..7: B0 -> B stash registers ++ mov ra4, 0 ; mov rb4, 0 + bra -, ra_link -+ -+ min r0, r0, rb_max_y ; mov -, unif # Unused 5 -+ add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch -+ add t1s, ra_base2, r0 -+ ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 +# >>> ra_link -+ -+ -+.macro setf_nz_if_v -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +.endm + ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 + +################################################################################ + -+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) ++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv -+ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set 
v_v_shift, i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift, 8
++.set v_dma_wh_shift, 15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift, rb_xshift2 # b side more convenient
++.set vrx_xshift_next, ra_xshift_next
++.set vra_y_next, ra_y_next
++.set vrx_base_next, ra_base_next
++.set vra_y, ra_y
++.set vra_base, ra_base
++.set vr_txs, t0s
++.else
++.set vrx_xshift, ra_xshift # a side more convenient
++.set vrx_xshift_next, rb_xshift2_next
++.set vra_y_next, ra_y2_next
++.set vrx_base_next, rb_base2_next
++.set vra_y, ra_y2
++.set vra_base, ra_base2
++.set vr_txs, t1s
++.endif
+
+# per-channel shifts were calculated on the *previous* invocation
+# get base addresses and per-channel shifts for *next* invocation
++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
+
++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
+
++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+
++.if v_bit_depth <= 8
++ shl vrx_xshift_next, r0, 3
++ and r0, r0, -4
++.endif
++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
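++# (Same stripe-offset idiom as in setup: r1 = x & -rb_pitch selects the
++# stripe - this assumes rb_pitch is a power of 2 - the xor below leaves
++# x % stripe_width and the mul24 by rb_xpitch scales the stripe base out
++# to its column in memory)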
+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height + +# set up VPM write -+ -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight -+ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + +# ; unpack filter coefficients + -+ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area -+ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight ++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a ++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + -+ shl r1, r1, rb13 ; mov rb10, ra3.8c -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + -+ asr rb12, r1, 1 -+ shl rb14, ra1.16a, 1 # b14 = weight*2 ++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + -+# rb14 - weight L0 * 2 -+# rb13 = weight denom + 6 + 9 -+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) ++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link ++ sub ra3, rb_wt_den_p15, ra_k1 + ++# r5 = 0 (loop counter) ++# ra9 = alias for rb_max_y ++# ra_wt_mul_l0 = weight L0 ++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] ++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
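++# (Loop below, as I read it: one chroma row per pass - collect the TMU
++# result, queue the next fetch (switching to the next block's base when
++# r5 hits rb_i_tmu, which gives the PREREAD overlap), H-filter into the
++# ra4-7 FIFO, then after 4 rows of preroll V-filter/weight a row out to
++# the VPM until r5 reaches rb_lcount)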
++
++:1
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
++.if v_tmu == 0
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
++.else
++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment
++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
++.endif
+
++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
++ min r3, r3, ra9 ; mov.ifnc r0, r2
+
++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch
++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte
+
+# apply horizontal filter
+# The filter coeffs for the two halves of this are the same (unlike in the
+# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
+
++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+
++# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift)
++# Have to dup block as we need to move the brr - code is more common than it
++# looks at first glance
++.if v_bit_depth <= 8
++ brr.anyn -, r:1b
++ add r2, r2, r3 ; mov ra5, ra6
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8
++.else
++ add r2, r2, r3 ; mov ra5, ra6
++ brr.anyn -, r:1b
++ mov ra6, ra7 ; mul24 r1, ra7, rb10
++ sub r2, r2, r0 ; mul24 r0, ra4, rb8
++ asr ra7, r2, v_bit_depth - 8
++.endif
++# >>> .anyn 1b
+
++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # [ra7 delay]
++ add r1, 
r1, r0 ; mul24 r0, ra7, rb11 + sub r1, r1, r0 -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ shl r1, r1, 8 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, ra3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b + -+ add r1, r1, rb12 -+ asr ra1.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop -+ asr ra1.8bs, r1, rb13 -+ mov -, vw_wait -+ mov vpm, ra1 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+# >>> ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# DMA out for U & stash for V -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # u_dst_addr -+# >>> ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++# At 10 bits ++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) ++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 ++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) ++# (P) ++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) ++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) ++# ... should be OK ++# ++# (B) ++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) ++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) ++# So signed overflow if we sign extend here :-( ++# ++# In practice this doesn't happen (we need a maximal offset and a very unlucky ++# filter). ++# ++# This could be fixed by offsetting the filters s.t. 
they are unsigned until ++# weight mul and then removing the offset with the weighting offset (I think ++# this should work) or splitting the rounding & offsetting ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 + +################################################################################ + -+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) ++# mc_filter_c_b + +# At this point we have already issued two pairs of texture requests for the current block +# ra_x, ra_x16_base point to the current coordinates for this block -+::mc_filter_uv_b0 -+ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) + +# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + -+ setf_nz_if_v # Also acts as delay slot for ra2 ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + -+ shl ra_xshift_next, r0, 4 ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif + -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y_next, ra2.16a -+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height -+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + +# set up VPM write + -+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs -+ add rb17, r1, 1 -+ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + -+# ; unpack filter coefficients ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, 
rb_dma0_base ; mov ra1, unif # ; H filter coeffs + -+ add r0, r0, r2 ; mov rb9, ra3.8b -+ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ++# L1 - uniform layout could possibly be optimized + -+ mov r3, 0 ; mov rb11, ra3.8d # Loop count ++ shl r0, ra3.16b, v_x_shift # r0=x*2 ++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs ++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight ++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs ++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + -+ mov rb14, unif # U weight -+ mov.ifnz rb14, unif # V weight ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif + -+# rb14 unused in b0 but will hang around till the second pass -+ -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# r3 = 0 -+:uvloop_b0 -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y -+ -+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next -+ min r2, r2, rb_max_y -+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte -+ -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b0 -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b0 -+ -+# apply vertical filter and write to B-FIFO -+ -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap -+ sub r1, r1, r0 ; mov ra7, rb6 -+ -+# FIFO goes: -+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b -+# This arrangement optimizes the inner loop FIFOs at the expense of making the -+# bulk shift between loops quite a bit nastier -+# a8 used as temp -+ -+ sub.setf -, r3, ra31 -+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad -+ brr.anyn -, r:uvloop_b0 -+ mov ra5, rb4 ; mov rb4, ra4 -+ mov ra4, rb5 ; mov rb5, ra6 -+ mov ra6, rb7 ; mov rb7, ra8 -+# >>> -+ -+# 1st half done all results now in the a/b4..7 fifo -+ -+# Need to bulk rotate FIFO for heights other than 16 -+# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with -+# we are allowed 3/4 cb_size w/h :-( -+ -+# Destination uniforms discarded -+# At the end drop through to _b - we will always do b after b0 -+ -+ sub.setf -, 15, r3 # 12 + 3 of preroll -+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) -+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr -+ mov r0, i_shift16 ; mov ra_link, unif -+ mov r1, 0x10000 -+# >>> -+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially -+# If h != 16 && h != 12 then h <= 8 so -+# shift 8 with discard (.16b = .16a on all regs) -+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 -+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 -+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 -+# >>> -+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 -+ -+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N -+# Shift 4 -+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 -+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 -+ # If we shifted by 4 here then the max length remaining is 4 -+ # so that is it -+ -+ brr -, r:uv_b0_post_fin -+# Shift 2 -+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 -+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 -+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 -+ # 6 / 2 so need 6 outputs -+# >>> -+ -+:uv_b0_post12 -+# this one is annoying as we need to swap halves of things that don't -+# really want to be swapped -+ -+# b7a, a6a, b5a, a4a -+# b4a, a5a, b6a, a7a -+# b7b, a6b, b5b, a4b -+# b4b, a5b, b6b, a7b -+ -+ mov r2, ra6 ; mov r3, rb7 -+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 -+ mov ra5, r2 ; mov rb4, r3 -+ -+ mov r2, ra4 ; mov r3, rb5 -+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 -+ mov ra7, r2 ; mov rb6, r3 -+ -+:uv_b0_post_fin -+ -+##### L1 B processing -+ -+# per-channel shifts were calculated on the *previous* invocation -+ -+# get base addresses and per-channel shifts for *next* invocation -+ mov ra2, unif ; mov r0, elem_num -+ -+ setf_nz_if_v # Also acts as delay slot for ra2 -+ -+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 -+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base -+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B -+ min r0, r0, rb_max_x ; mov -, unif # ; width_height -+ -+ shl rb_xshift2_next, r0, 4 -+ -+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs -+ add r0, r0, r0 ; mov ra_y2_next, ra2.16a -+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight ++ and r1, r0, r1 ; mov rb10, ra3.8c + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs ++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + add rb_base2_next, r3, r0 + -+ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight -+ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight ++ mov ra9, rb_max_y ; mov rb11, ra3.8d ++ shl r1, 
ra_wt_off_l1, rb_wt_den_p15 ++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link + -+ nop ; mov rb11, ra3.8d -+ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 -+ asr rb12, r1, 1 -+ -+# ra1.16a used directly in the loop ++# r5 loop counter ++# ra0 H coeffs L0 ++# ra1 H coeffs L1 ++# ra2 V coeffs L0 ++# ra3 temp ++# ra4-7 L0 H FIFO ++# rb4-7 L1 H FIFO ++# rb8-rb11 V coeffs L1 ++# ra9 rb_max_y alias + ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment ++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++ add ra_y, 1, ra_y ; mov r3, ra_y + -+# r3 = 0 ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+:uvloop_b -+# retrieve texture results and pick out bytes -+# then submit two more texture requests ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment -+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next -+ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 ++# L0 H-filter ++# H FIFO scrolls are spread all over this loop ++ mov rb4, rb5 ; mov ra4, ra5 # ? Just moves + -+ max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next -+ min r2, r2, rb_max_y -+ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch -+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 ++.if v_bit_depth <= 8 ++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++.else ++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 ++ asr ra3, r2, (v_bit_depth - 8) ++.endif + -+# generate seven shifted versions -+# interleave with scroll of vertical context ++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 ++ shr r1, r2, v_v_shift ; mov r3, ra_y2 ++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 + -+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ max r3, r3, ra_k0 ; mov r0, r1 << 15 ++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 + -+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 -+ sub.setf -, r3, 4 ; mov ra12, ra13 -+ brr.anyn -, r:uvloop_b -+ mov ra13, ra14 ; mul24 r1, ra14, rb9 -+ mov ra14, ra15 ; mul24 r2, ra15, rb10 -+ mov ra15, r0 ; mul24 r0, ra12, rb8 -+# >>> .anyn uvloop_b ++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# apply vertical filter and write to VPM ++# L1 H-filter + -+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) -+ add r1, r1, r2 ; mul24 r0, ra15, rb11 -+ sub r1, r1, r0 ; mul24 r0, 
ra7.16b, rb14 -+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 -+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 ++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 ++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 ++# V filters - start in branch delay slots of H ++# Final asr not needed for 8-bit but we can#t (currently) save a whole instruction ++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ brr.anyn -, r:1b ++ mov ra6, ra7 ; mul24 r3, ra7, rb10 ++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a ++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 ++# >>> .anyn 1b + -+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a -+ add r1, r1, r0 ; mov rb4, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] ++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 ++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 ++ add r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 + -+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend -+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) ++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 ++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 + -+ sub.setf -, r3, ra31 ; mov ra6, rb7 -+ asr ra3.8as, r1, rb13 -+ nop ; mov r1, r1 << 8 -+ brr.anyn -, r:uvloop_b -+ asr ra3.8bs, r1, rb13 -+ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov -+ mov vpm, ra3 -+# >>> ++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) ++ add r1, r1, r2 ; mov r3, ra_blk_height ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ bra -, ra_link -+ mov vw_setup, rb26 -+ mov vw_setup, rb29 -+ mov vw_addr, unif # c_dst_addr ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_c_b ++ m_filter_c_b 8 + +################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ 
# >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. ++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 + +# mc_exit() -+ -+::mc_interrupt_exit8c -+ ldtmu0 -+ ldtmu1 -+ ldtmu1 -+ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW -+ -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+# mov -,sacq(0) # 8 -+# mov -,sacq(0) # 9 -+# mov -,sacq(0) # 10 -+# mov -,sacq(0) # 11 -+ -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ +# Chroma & Luma the same now -+::mc_exit_c -+::mc_exit -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,srel(0) ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn + -+ nop ; nop ; thrend -+ nop ; nop # delay slot 1 -+ nop ; nop # delay slot 2 + + +# mc_interrupt_exit12() -+::mc_interrupt_exit12 -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + -+ mov -,sacq(0) # 1 -+ mov -,sacq(0) # 2 -+ mov -,sacq(0) # 3 -+ mov -,sacq(0) # 4 -+ mov -,sacq(0) # 5 -+ mov -,sacq(0) # 6 -+ mov -,sacq(0) # 7 -+ mov -,sacq(0) # 8 -+ mov -,sacq(0) # 9 -+ mov -,sacq(0) # 10 -+ mov -,sacq(0) # 11 ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm + -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 -+ -+ -+::mc_exit1 -+ mov 
-, vw_wait # wait on the VDW -+ -+ ldtmu0 -+ ldtmu1 -+ ldtmu0 -+ ldtmu1 -+ nop ; nop ; thrend -+ mov interrupt, 1; nop # delay slot 1 -+ nop ; nop # delay slot 2 ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 + +# LUMA CODE + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. +# For P frames we make the second x,y coordinates offset by +8 + ++ +################################################################################ -+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) -+::mc_setup ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ + # Need to save these because we need to know the frame dimensions before computing texture coordinates -+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x -+ mov ra9, unif # ref_y_base -+ mov ra10, unif # y2_x2 -+ mov ra11, unif # ref_y2_base ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ mov ra11, unif # ref_y2_base ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ++ ++ ++ mov ra_kff100100, 0xff100100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ ++# Compute part of VPM to use + +# Read image dimensions -+ mov ra3, unif # width_height -+ mov rb_xpitch, unif # stride2 ++ mov ra3, unif # width_height ++ mov rb_xpitch, unif # stride2 ++.if v_x_shift == 0 + sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif + sub rb_max_y, ra3.16a, 1 -+ mov rb_pitch, unif # stride1 ++ mov rb_pitch, unif # stride1 + +# get destination pitch + mov r1, vdw_setup_1(0) -+ or rb24, r1, rb_pitch ++ or rb_dma1_base, r1, rb_pitch + +# Compute base address for first and second access + mov r3, elem_num -+ add r0, ra8.16a, r3 # Load x + elem_num ++ add r0, ra0.16b, r3 # Load x + elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x + shl ra_xshift_next, r0, 3 # Compute shifts + -+ -+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs ++# X is byte offset - we can only load words - mask + + and r0, r0, -4 ; v8subs r2, r2, r2 + sub r2, r2, rb_pitch + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base, ra9, r0 + -+ mov r1, ra8.16b # Load y -+ add ra_y, r1, 1 # Set for next -+ max r1, r1, 0 -+ min r1, r1, rb_max_y -+ -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t0s, ra_base, r1 -+ -+ + # r3 still contains elem_num -+ add r0, ra10.16a, r3 # Load x ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif + max r0, r0, 0 + min r0, r0, rb_max_x -+ shl rb_xshift2_next, r0, 3 # Compute shifts ++ shl rb_xshift2_next, r0, 3 # Compute shifts + + # r2 still contains mask + and r0, r0, -4 + and r1, r0, r2 + xor r0, r0, r1 ; mul24 r1, r1, 
rb_xpitch -+ add r0, r0, r1 # Add stripe offsets ++ add r0, r0, r1 # Add stripe offsets + add ra_base2, ra11, r0 + -+ mov r1, ra10.16b # Load y -+ add ra_y2, r1, 1 # Set for next -+ max r1, r1, 0 ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 + min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 + -+# submit texture requests for first line -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, ra_base2, r1 ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, ra_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b + -+# load constants ++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + -+ mov ra_k1, 1 -+ mov ra_k256, 256 -+ mov rb_k255, 255 -+ mov ra_k0, 0 ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn + +# touch vertical context to keep simulator happy -+ + mov ra8, 0 ; mov rb8, 0 ++ bra -, ra_link + mov ra9, 0 ; mov rb9, 0 + mov ra10, 0 ; mov rb10, 0 + mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm + -+# Compute part of VPM to use -+ m_calc_dma_regs rb28, rb27 -+ -+# Weighted prediction denom -+ add rb13, unif, 9 # unif = weight denom + 6 -+ -+# submit texture requests for second line -+ max r1, ra_y, 0 -+ min r1, r1, rb_max_y -+ add ra_y, ra_y, 1 -+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; -+ add t0s, r1, ra_base -+ -+ max r1, ra_y2, 0 -+ min r1, r1, rb_max_y -+ add ra_y2, ra_y2, 1 -+ nop ; mul24 r1, r1, rb_pitch -+ add t1s, r1, ra_base2 -+ -+# FALL THROUGHT TO PER-BLOCK SETUP ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 + ++################################################################################ ++# +# Start of per-block setup code +# P and B blocks share the same setup code to save on Icache space -+:per_block_setup -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] -+ mov ra_link, unif -+#### We do all the setup even if we are about to exit - reading junk from unif.... + -+ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? -+ -+# per-channel shifts were calculated on the *previous* invocation -+ mov ra_xshift, ra_xshift_next -+ mov rb_xshift2, rb_xshift2_next ++# luma_setup_delay3 done in delay slots of branch that got us here + +# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation + -+ add r0, ra1.16a, r3 # Load x -+ max r0, r0, 0 ++# 1st 3 instructions of per_block-setup in branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# ++ ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have have label pasting but I have no idea how... ++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? 
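++# (unif order from here through per_block_setup follows qpu_mc_pred_y_p_t
++# above: next_src1.x_y, next_src1.base, next_src2.x_y, next_src2.base,
++# w/h, mymx21, wo1, wo2, dst_addr, next_fn - assuming qpu_mc_src_t is
++# {x_y, base})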
++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm ++ ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts -+ and r0, r0, -4 ; v8subs r2, r2, r2 -+ sub r2, r2, rb_pitch -+ and r1, r0, r2 -+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add ra_base_next, unif, r0 # Base1 -+ mov ra_y_next, ra1.16b # Load y -+ mov ra1, unif # x2_y2 -+ nop # ra1 delay -+ -+ add r0, ra1.16a, r3 # Load x2 -+ max r0, r0, 0 -+ min r0, r0, rb_max_x -+ -+ shl rb_xshift2_next, r0, 3 # Compute shifts + and r0, r0, -4 -+ and r1, r0, r2 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch -+ add r0, r0, r1 # Add stripe offsets -+ add rb_base2_next, unif, r0 # Base1 -+ mov ra_y2_next, ra1.16b # Load y -+ mov ra_width_height, unif # width_height ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] + -+# set up VPM write -+ mov vw_setup, rb28 # [ra1 delay] ++ add r0, ra1.16b, r3 # Load x2 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ add rb_base2_next, rb_base2_next, r0 + -+# get width,height of block (unif load above) -+ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width) -+ add rb17, ra_height, 5 ; mov r0, ra_height -+ mov r1, 16 -+ min r0, r0, r1 -+ add rb18, r0, 7 -+ shl r0, r0, 7 -+ add r0, r0, ra_width # Combine width and height of destination area -+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register -+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, 7 ++ shl r0, r0, v_dma_h_shift ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register ++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + +# get filter coefficients and discard unused B frame values -+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight -+ mov r2, 0x01040400 # [ra5 delay] -+ shl ra8, r0, 3 ; mov rb14, ra5.16a ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov r3, ra_k255 + +# Pack the 1st 4 filter coefs for H & V tightly ++# Coeffs are all abs 
values here as that means mul24 works (no sign extend from .8) + -+ mov r1,0x00010100 # -ve ++ mov r1,0x00010100 # -ve [ra8 delay] + ror ra2.8a, r1, ra8.8d + ror ra0.8a, r1, ra8.8c + -+ ror ra2.8b, r2, ra8.8d -+ ror ra0.8b, r2, ra8.8c ++ mov r1, 0x01040400 ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c + + mov r1,0x050b0a00 # -ve + ror ra2.8c, r1, ra8.8d @@ -17390,49 +26344,44 @@ index 0000000..aa3fe47 + ror ra2.8d, r1, ra8.8d + ror ra0.8d, r1, ra8.8c + -+# In the 2nd vertical half we use b registers due to -+# using a-side fifo regs. The easiest way to achieve this to pack it -+# and then unpack! ++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + mov r1,0x3a281100 -+ ror ra3.8a, r1, ra8.8d -+ ror ra1.8a, r1, ra8.8c ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + mov r1,0x0a0b0500 # -ve -+ ror ra3.8b, r1, ra8.8d -+ ror ra1.8b, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + mov r1,0x04040100 -+ ror ra3.8c, r1, ra8.8d -+ ror ra1.8c, r1, ra8.8c ++ ror r0, r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 ++ ++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + mov r1,0x01010000 # -ve -+ ror ra3.8d, r1, ra8.8d -+ ror ra1.8d, r1, ra8.8c -+ -+# Extract weighted prediction information in parallel -+# We are annoyingly A src limited here -+ -+ mov rb4, ra3.8a ; mov ra18, unif -+ mov rb5, ra3.8b -+ mov rb6, ra3.8c -+ mov.ifnz ra5, ra18 -+ -+ mov rb_dest, unif # Destination address ++ ror r0, r1, ra8.8d + + bra -, ra_link ++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 + -+ shl r0, ra5.16b, rb13 # Offset calc -+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use -+ mov r3, 0 ; mov rb7, ra3.8d ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +# >>> branch ra_link -+# -+# r3 = 0 -+# ra18.16a = weight L1 -+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) -+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) -+# rb13 = weight denom + 6 + 9 -+# rb14 = weight L0 ++ ++# r5 = 0 ++# ra_wt_mul_l1 = weight L1 ++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) ++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) ++# rb_wt_den_p15 = weight denom + 6 + 9 ++# rb_wt_mul_l0 = weight L0 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ + + +################################################################################ @@ -17440,137 +26389,118 @@ index 0000000..aa3fe47 +# In a P block, y2_x2 should be y_x+8 +# At this point we have already issued two pairs of texture requests for the current block + -+::mc_filter -+# ra5.16a = weight << 16; We want weight * 2 in rb14 ++.macro m_filter_y_pxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+ shl rb14, ra5.16a, 1 ++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + -+# r3 = 0 ++# r5 = 0 (loop count) + -+:yloop ++:1 +# retrieve texture results and pick out bytes +# then submit two more texture requests + -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? -+ +# N.B. 
Whilst y == y2 as far as this loop is concerned we will start +# the grab for the next block before we finish with this block and that +# might be B where y != y2 so we must do full processing on both y and y2 + -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ 
"mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloop -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # >>> .anyn yloop ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov -, vw_wait ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 +# At this point r1 is a 22-bit signed quantity: 8 (original sample), +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) +# The top 8 bits have rubbish in them as mul24 is unsigned +# The low 6 bits need discard before weighting -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + asr r1, r1, 14 -+ nop ; mul24 r1, r1, rb14 -+ add r1, r1, rb12 ++ nop ; mul24 r1, r1, ra_wt_mul_l0 ++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop ++ ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch + -+ shl r1, r1, 8 -+ brr.anyn -, r:yloop -+ asr r1, r1, rb13 -+# We have a saturating pack unit - I can't help feeling it should be useful here -+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 -+ max vpm, r1, 0 # Delay 3 +# >>> branch.anyn yloop + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup -+ -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 -+ -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloop -+ nop -+ nop -+ nop -+# >>> -+ ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 + + +################################################################################ @@ -17578,243 +26508,1106 @@ index 0000000..aa3fe47 +# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +# In a P block, only the first half of coefficients contain used information. +# At this point we have already issued two pairs of texture requests for the current block -+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) -+# Can fill in the coefficients so only -+# Can also assume default weighted prediction for B frames. +# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +# Or possibly by taking advantage of symmetry? -+# From 19->7 32bits per command. + -+::mc_filter_b -+ # r0 = weightL0 << 16, we want it in rb14 -+# asr rb14, r0, i_shift16 ++.macro m_filter_y_bxx, v_bit_depth ++ m_luma_setup v_bit_depth + -+:yloopb -+# retrieve texture results and pick out bytes -+# then submit two more texture requests -+ -+# If we knew there was no clipping then this code would get simpler. -+# Perhaps we could add on the pitch and clip using larger values? 
-+ -+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 -+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 -+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 -+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch -+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + max r2, ra_y, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 -+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + -+ max r2, ra_y2, 0 # y -+ min r2, r2, rb_max_y -+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 -+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ; mov ra7, ra8 ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + -+# generate seven shifted versions -+# interleave with scroll of vertical context -+ -+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] ++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + +# apply horizontal filter -+ nop ; mul24 r3, ra0.8a, r0 -+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 -+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 -+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 -+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 -+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 -+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 -+ sub r0, r2, r3 ; mov r3, rb31 ++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 ++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 + -+ sub.setf -, r3, 8 ; mov r1, ra8 -+ mov ra8, ra9 ; mov rb8, rb9 -+ brr.anyn -, r:yloopb -+ mov ra9, ra10 ; mov rb9, rb10 -+ mov ra10, ra11 ; mov rb10, rb11 -+ mov ra11, r0 ; mov rb11, r1 -+ # 
>>> .anyn yloopb ++ sub.setf -, r5, 8 ; mov ra9, ra10 ++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a ++ brr.anyn -, r:1b ++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b ++ mov ra10, ra11 ; mov rb10, rb11 ++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 ++ # >>> .anyn 1b + + # apply vertical filter and write to VPM -+ nop ; mul24 r0, rb8, ra2.8a -+ nop ; mul24 r1, rb9, ra2.8b -+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c -+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d -+ add r1, r1, r0 ; mul24 r0, ra8, rb4 -+ add r1, r1, r0 ; mul24 r0, ra9, rb5 -+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 -+ add r1, r1, r0 ; mul24 r0, ra11, rb7 -+ sub r1, r1, r0 ; mov r2, rb12 ++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb4 ++ add r1, r1, r0 ; mul24 r0, ra9, rb5 ++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 ++ add r1, r1, r0 ; mul24 r0, ra11, rb7 ++ sub r1, r1, r0 ; mov r2, rb_wt_off +# As with P-pred r1 is a 22-bit signed quantity in 32-bits +# Top 8 bits are bad - low 6 bits should be discarded -+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + + asr r1, r1, 14 -+ nop ; mul24 r0, r1, rb14 -+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 ++ nop ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 + -+ add r1, r1, r0 ; mov -, vw_wait -+ shl r1, r1, 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height ++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b + -+ brr.anyn -, r:yloopb -+ asr r1, r1, rb13 # Delay 1 -+ min r1, r1, rb_k255 # Delay 2 -+ max vpm, r1, 0 # Delay 3 ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) + ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height + -+# If looping again the we consumed 16 height last loop -+ # rb29 (stride) remains constant -+ # rb17 remains const (based on total height) -+ # recalc rb26, rb18 based on new segment height -+ # N.B. 
r3 is loop counter still -+ -+ mov r1, 16 -+ sub r0, ra_height, r1 -+ mov ra_height, r0 -+ max.setf r0, r0, 0 # Done if Z now ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + +# DMA out -+ brr.anyz -, r:per_block_setup -+ mov vw_setup, rb26 # VDW setup 0 Delay 1 -+ mov vw_setup, rb29 # Stride Delay 2 -+ mov vw_addr, rb_dest # start the VDW Delay 3 -+# >>> .anyz per_block_setup ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link + -+ min r0, r0, r1 -+ add rb18, rb18, r0 -+ sub r0, r0, r1 -+ shl r0, r0, i_shift23 -+ add rb26, rb26, r0 ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm + -+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 -+ add rb_dest, rb_dest, r0 -+ -+ mov vw_setup, rb28 # Reset our VDM write pointer -+ -+ brr -, r:yloopb -+ nop -+ nop -+ nop ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 + +################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in rb_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r3, elem_num # y_x ++ mov ra_xshift, ra_xshift_next # [ra0 delay] ++ add r0, ra0.16b, r3 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination area ; weight_offset ++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr ++ add rb_dma0, r0, rb_dma0_base ++ ++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 ++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use ++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 
++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) ++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r0, 7 ++ sub rb_i_tmu, rb_i_tmu, r0 ++ sub rb_lcount, rb_lcount, r0 ++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 ++ shl rb_wt_off, rb_wt_off, r0 ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, rb_wt_den_p15 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height (currently always 16) ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc rb_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ add rb_lcount, rb_lcount, r0 ++ 
brr -, r:1b ++ add rb_dma0, rb_dma0, r1 ++ add rb_dest, rb_dest, r2 ++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ + +::mc_end +# Do not add code here because mc_end must appear after all other code. diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h new file mode 100644 -index 0000000..27cbb59 +index 0000000000..9f8983da52 --- /dev/null +++ b/libavcodec/rpi_shader_cmd.h -@@ -0,0 +1,88 @@ +@@ -0,0 +1,128 @@ +#ifndef RPI_SHADER_CMD_H +#define RPI_SHADER_CMD_H + +#pragma pack(push, 4) + -+typedef struct qpu_mc_pred_c_s { ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
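++//
++// Illustration (hypothetical caller, not part of this header): the point of
++// the typedef pair is that command-building code compiles unchanged whether
++// it is feeding the C emulation (host pointers) or the real QPU (32-bit VC
++// bus addresses), e.g.
++//
++//   qpu_mc_src_t src = { .y = 0, .x = 0, .base = frame_base };
++//
++// where frame_base is a hypothetical value of type qpu_mc_src_addr_t.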
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; + uint32_t next_fn; -+ int16_t next_src_y; -+ int16_t next_src_x; -+ uint32_t next_src_base_c; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ uint32_t weight_u1; ++ uint32_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { + union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } p; -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t weight_u; -+ uint32_t weight_v; -+ uint32_t dummy0; -+ } b0; -+ struct { -+ uint32_t dummy0; -+ uint32_t coeffs_x; -+ uint32_t coeffs_y; -+ uint32_t wo_u; -+ uint32_t wo_v; -+ uint32_t dst_addr_c; -+ } b1; -+ struct { -+ uint32_t pic_cw; // C Width (== Y width / 2) -+ uint32_t pic_ch; // C Height (== Y Height / 2) -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s0; -+ struct { -+ uint32_t dummy0; -+ uint32_t dummy1; -+ uint32_t dummy2; -+ uint32_t dummy3; -+ uint32_t dummy4; -+ uint32_t dummy5; -+ } s1; ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; + }; +} qpu_mc_pred_c_t; + -+typedef struct qpu_mc_pred_y_s { -+ int16_t next_src1_x; -+ int16_t next_src1_y; -+ uint32_t next_src1_base; -+ int16_t next_src2_x; -+ int16_t next_src2_y; -+ uint32_t next_src2_base; -+ union { -+ struct { -+ uint16_t h; -+ uint16_t w; -+ uint32_t mymx21; -+ uint32_t wo1; -+ uint32_t wo2; -+ uint32_t dst_addr; -+ } p; -+ struct { -+ uint16_t pic_h; -+ uint16_t pic_w; -+ uint32_t stride2; -+ uint32_t stride1; -+ uint32_t wdenom; -+ uint32_t dummy0; -+ } s; -+ }; ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; + uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++ uint32_t next_fn; ++} qpu_mc_pred_y_s_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; 
++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; +} qpu_mc_pred_y_t; + ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ uint32_t data[1]; ++} qpu_mc_pred_cmd_t; ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ +#pragma pack(pop) + +#endif + +diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c +new file mode 100644 +index 0000000000..1925ab7a79 +--- /dev/null ++++ b/libavcodec/rpi_shader_template.c +@@ -0,0 +1,65 @@ ++#ifdef RPI ++ ++#include "hevc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_shader_cmd.h" ++#include "rpi_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t wdenom; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCContext *const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_shader_template_fn.h" ++ ++#endif ++ +diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h +new file mode 100644 +index 0000000000..ecf5b8185a +--- /dev/null ++++ b/libavcodec/rpi_shader_template.h +@@ -0,0 +1,24 @@ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++#ifdef RPI ++struct HEVCContext; ++struct HEVCRpiInterPredEnv; ++ ++void rpi_shader_c8(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_shader_c16(struct HEVCContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++#endif ++ +diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h +new file mode 100644 +index 0000000000..b5ac2ceed6 +--- /dev/null ++++ b/libavcodec/rpi_shader_template_fn.h +@@ -0,0 +1,477 @@ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define 
PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ 
FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// w, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(rpi_shader_c)(HEVCContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->wdenom = c->wdenom; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 
0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ 
s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other 
sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c new file mode 100644 -index 0000000..b061fe0 +index 0000000000..b502de0a2c --- /dev/null +++ b/libavcodec/rpi_zc.c -@@ -0,0 +1,581 @@ +@@ -0,0 +1,745 @@ +#include "config.h" +#ifdef RPI ++#include "libavcodec/avcodec.h" +#include "rpi_qpu.h" +#include "rpi_mailbox.h" +#include "rpi_zc.h" +#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" +#include + +#include "libavutil/buffer_internal.h" @@ -17841,21 +27634,11 @@ index 0000000..b061fe0 + struct ZcPool * pool; +} ZcPoolEnt; + -+#if 1 -+//#define ALLOC_PAD 0x1000 -+#define ALLOC_PAD 0 -+#define ALLOC_ROUND 0x1000 -+//#define ALLOC_N_OFFSET 0x100 -+#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 0x80 -+#define STRIDE_OR 0x80 -+#else +#define ALLOC_PAD 0 +#define ALLOC_ROUND 0x1000 +#define ALLOC_N_OFFSET 0 -+#define STRIDE_ROUND 32 ++#define STRIDE_ROUND 64 +#define STRIDE_OR 0 -+#endif + +#define DEBUG_ZAP0_BUFFERS 0 + @@ -18032,13 +27815,22 @@ index 0000000..b061fe0 + { + case AV_PIX_FMT_YUV420P: + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; -+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + geo.stride_c = geo.stride_y / 2; -+ // geo.height_y = (video_height + 15) & ~15; + geo.height_y = (video_height + 32 + 31) & ~31; + geo.height_c = geo.height_y / 2; + geo.planes_c = 2; + geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; + break; + + case AV_PIX_FMT_SAND128: @@ -18073,6 +27865,7 @@ index 0000000..b061fe0 + geo.height_c = img.pitch / stripe_w - geo.height_y; + geo.planes_c = 1; + geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; + + pthread_mutex_unlock(&sand_lock); + @@ -18081,6 +27874,45 @@ index 0000000..b061fe0 + break; + } + ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ gpu_ref(); ++ mbox_get_image_params(gpu_get_mailbox(), &new_img); ++ gpu_unref(); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ + default: + memset(&geo, 0, sizeof(geo)); + break; @@ -18153,8 +27985,12 @@ index 0000000..b061fe0 + frame->linesize[0] = geo.stride_y; + frame->linesize[1] = geo.stride_c; + 
frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply + if (geo.stripes > 1) -+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride ++ frame->linesize[3] = geo.height_y + geo.height_c; + + frame->data[0] = buf->data; + frame->data[1] = frame->data[0] + size_y; @@ -18164,6 +28000,11 @@ index 0000000..b061fe0 + frame->extended_data = frame->data; + // Leave extended buf alone + ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++ + return 0; +} + @@ -18182,7 +28023,7 @@ index 0000000..b061fe0 + rv = avcodec_default_get_buffer2(s, frame, flags); + } + else if (frame->format == AV_PIX_FMT_YUV420P || -+ frame->format == AV_PIX_FMT_SAND128) ++ av_rpi_is_sand_frame(frame)) + { + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + } @@ -18212,6 +28053,7 @@ index 0000000..b061fe0 + unsigned int i; + uint8_t * psrc, * pdest; + ++ dest->format = src->format; + dest->width = src->width; + dest->height = src->height; + @@ -18243,29 +28085,142 @@ index 0000000..b061fe0 +} + + ++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * psrc2, * pdest; ++ ++ memset(dest, 0, sizeof(*dest)); ++ dest->format = AV_PIX_FMT_SAND128; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ uint16_t * s = (uint16_t*)psrc; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); ++ for (unsigned int j = 0; j != n; ++j) ++ *d++ = (uint8_t)(*s++ >> 2); ++ d += (dest->linesize[3] - 1) * dest->linesize[0]; ++ } ++ } ++ ++ // C ++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) ++ { ++ const uint16_t * su = (uint16_t*)psrc; ++ const uint16_t * sv = (uint16_t*)psrc2; ++ uint8_t * d = pdest; ++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) ++ { ++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; ++ for (unsigned int j = 0; j != n; ++j) ++ { ++ *d++ = (uint8_t)(*su++ >> 2); ++ *d++ = (uint8_t)(*sv++ >> 2); ++ } ++ d += (dest->linesize[3] - 1) * dest->linesize[1]; ++ } ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ AVFrame dest_frame = { ++ .format = AV_PIX_FMT_SAND128, ++ .width = src->width, ++ .height = src->height ++ }; ++ AVFrame * const dest = &dest_frame; ++ const unsigned int shr = src_bits - 8; ++ ++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) ++ { ++ return NULL; ++ } ++ ++ // Y ++ av_rpi_sand16_to_sand8(dest->data[0], 
dest->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height, shr); ++ // C ++ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), ++ src->width, src->height / 2, shr); ++ ++ return dest->buf[0]; ++} ++ ++ ++ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy) ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) +{ + assert(s != NULL); + + if (frame->format != AV_PIX_FMT_YUV420P && -+ frame->format != AV_PIX_FMT_SAND128) ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) + { + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + return NULL; + } + -+ if (frame->buf[1] != NULL) ++ if (frame->buf[1] != NULL || frame->format != expected_format) + { -+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ + if (maycopy) + { -+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); -+ return zc_copy(s, frame); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(s, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(s, frame, 10); ++ ++ default: ++ return zc_copy(s, frame); ++ } + } + else + { -+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); ++ if (frame->buf[1] != NULL) ++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + return NULL; + } + } @@ -18392,10 +28347,10 @@ index 0000000..b061fe0 + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h new file mode 100644 -index 0000000..f4aeb78 +index 0000000000..26fb3be999 --- /dev/null +++ b/libavcodec/rpi_zc.h -@@ -0,0 +1,137 @@ +@@ -0,0 +1,105 @@ +#ifndef LIBAVCODEC_RPI_ZC_H +#define LIBAVCODEC_RPI_ZC_H + @@ -18406,23 +28361,33 @@ index 0000000..f4aeb78 +// bit of memory for the frame when can then be reference counted until +// display has finished with it. 
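++//
++// Rough usage sketch (hypothetical caller; of the names used here only
++// av_rpi_zc_ref is declared in this header, and AVRpiZcRefPtr is just an
++// AVBufferRef pointer so the standard unref applies):
++//
++//   AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1);
++//   if (ref != NULL) {
++//       // ... pass the underlying GPU buffer to the display ...
++//       av_buffer_unref(&ref);   // release once display has finished
++//   }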
+ -+#include "libavutil/frame.h" -+#include "libavcodec/avcodec.h" ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; + +// "Opaque" pointer to whatever we are using as a buffer reference -+typedef AVBufferRef * AVRpiZcRefPtr; ++typedef struct AVBufferRef * AVRpiZcRefPtr; + +struct AVZcEnv; +typedef struct AVZcEnv * AVZcEnvPtr; + +typedef struct AVRpiZcFrameGeometry +{ -+ unsigned int stride_y; -+ unsigned int height_y; -+ unsigned int stride_c; -+ unsigned int height_c; -+ unsigned int planes_c; -+ unsigned int stripes; ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma stride (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; +} AVRpiZcFrameGeometry; + + @@ -18448,7 +28413,7 @@ index 0000000..f4aeb78 +// the data, then allocate a new buffer and copy the data into it +// Otherwise return NULL +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, -+ const AVFrame * const frame, const int maycopy); ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + +// Get the vc_handle from the frame ref +// Returns -1 if ref doesn't look valid @@ -18489,52 +28454,10 @@ index 0000000..f4aeb78 + + + -+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) -+{ -+ return frame->linesize[3]; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y + stride2 * x2; -+} -+ -+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) -+{ -+ const unsigned int stride1 = frame->linesize[0]; -+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); -+ const unsigned int x = x_c * 2; -+ const unsigned int x1 = x & (stride1 - 1); -+ const unsigned int x2 = x ^ x1; -+ -+ return x1 + stride1 * y_c + stride2 * x2; -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); -+} -+ -+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) -+{ -+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); -+} -+ -+static inline int rpi_sliced_frame(const AVFrame * const frame) -+{ -+ return frame->format == AV_PIX_FMT_SAND128; -+} -+ -+ +#endif + diff --git a/libavcodec/utils.c b/libavcodec/utils.c -index f7adb52..3b398a3 100644 +index c4af9cbb17..c1b806e51b 100644 --- a/libavcodec/utils.c +++ b/libavcodec/utils.c @@ -26,6 +26,12 @@ @@ -18550,7 +28473,15 @@ index f7adb52..3b398a3 100644 #include "libavutil/atomic.h" #include "libavutil/attributes.h" #include "libavutil/avassert.h" -@@ -64,6 +70,10 @@ +@@ -39,6 +45,7 @@ + #include 
"libavutil/mathematics.h" + #include "libavutil/mem_internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" + #include "libavutil/imgutils.h" + #include "libavutil/samplefmt.h" + #include "libavutil/dict.h" +@@ -64,6 +71,10 @@ #include "libavutil/ffversion.h" const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; @@ -18561,7 +28492,7 @@ index f7adb52..3b398a3 100644 #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS static int default_lockmgr_cb(void **arg, enum AVLockOp op) { -@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, +@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, return ret; } @@ -18609,7 +28540,7 @@ index f7adb52..3b398a3 100644 static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) { FramePool *pool = avctx->internal->pool; -@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) +@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) av_buffer_pool_uninit(&pool->pools[i]); pool->linesize[i] = linesize[i]; if (size[i]) { @@ -18624,20 +28555,20 @@ index f7adb52..3b398a3 100644 pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, CONFIG_MEMORY_POISONING ? NULL : -@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags +@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags { int ret; +#ifdef RPI + // This is going to end badly if we let it continue -+ av_assert0(frame->format != AV_PIX_FMT_SAND128); ++ av_assert0(!av_rpi_is_sand_frame(frame)); +#endif + if ((ret = update_frame_pool(avctx, frame)) < 0) return ret; diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c -index 21f8d9e..71ce7b9 100644 +index 21f8d9e00d..71ce7b9186 100644 --- a/libavfilter/avfilter.c +++ b/libavfilter/avfilter.c @@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) @@ -18649,7 +28580,7 @@ index 21f8d9e..71ce7b9 100644 #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR if ( !strcmp(filter->filter->name, "format") || diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c -index b31d233..2767306 100644 +index 6767b65ec8..f270190d57 100644 --- a/libavformat/mpegts.c +++ b/libavformat/mpegts.c @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { @@ -18662,10 +28593,10 @@ index b31d233..2767306 100644 { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, diff --git a/libavformat/utils.c b/libavformat/utils.c -index 6f343f2..83f26d5 100644 +index 5a35953d24..d36fdc3199 100644 --- a/libavformat/utils.c +++ b/libavformat/utils.c -@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in +@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in int default_stream_index = av_find_default_stream_index(s); if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { for (i = 0; i < s->nb_streams; i++) { @@ -18674,8 +28605,84 @@ index 6f343f2..83f26d5 100644 continue; s->streams[i]->pts_wrap_reference = pts_wrap_reference; s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 1e061763a2..cbc9bc145b 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -59,6 +59,8 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ 
++ rpi_sand_fn_pw.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -136,6 +138,7 @@ OBJS = adler32.o \ + reverse.o \ + rc4.o \ + ripemd.o \ ++ rpi_sand_fns.o \ + samplefmt.o \ + sha.o \ + sha512.o \ +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..dbffdaefa4 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,40 @@ ++#include "libavutil/arm/asm.S" ++ ++@ void rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, [r0] ++@ const uint8_t * src1, [r1] ++@ const uint8_t * src2, [r2] ++@ unsigned int lines); [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ diff --git a/libavutil/buffer.c b/libavutil/buffer.c -index 694e116..203ca7b 100644 +index 694e116a3c..203ca7b3a8 100644 --- a/libavutil/buffer.c +++ b/libavutil/buffer.c @@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) @@ -18689,7 +28696,7 @@ index 694e116..203ca7b 100644 + return buf->opaque; +} diff --git a/libavutil/buffer.h b/libavutil/buffer.h -index 0c0ce12..82e0bc3 100644 +index 0c0ce12cf2..82e0bc3058 100644 --- a/libavutil/buffer.h +++ b/libavutil/buffer.h @@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); @@ -18702,11 +28709,51 @@ index 0c0ce12..82e0bc3 100644 /** * @} */ +diff --git a/libavutil/frame.h b/libavutil/frame.h +index 2b5c3320c3..990347e484 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -120,7 +120,20 @@ enum AVFrameSideDataType { + * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer. + * This is set on the first frame of a GOP that has a temporal reference of 0. + */ +- AV_FRAME_DATA_GOP_TIMECODE ++ AV_FRAME_DATA_GOP_TIMECODE, ++ ++ /** ++ * The data represents the AVSphericalMapping structure defined in ++ * libavutil/spherical.h. 
++ */ ++ AV_FRAME_DATA_SPHERICAL, ++ ++ /** ++ * Extra data required to deal with a cropped Sand frame ++ * AVFrame holds the cropped size, but we cannot simply offset the start ++ * address to get the picture as we can for planar formats ++ */ ++ AV_FRAME_DATA_SAND_INFO, + }; + + enum AVActiveFormatDescription { +@@ -133,6 +146,13 @@ enum AVActiveFormatDescription { + AV_AFD_SP_4_3 = 15, + }; + ++typedef struct AVFrameDataSandInfo ++{ ++ unsigned int left_offset; ++ unsigned int top_offset; ++ unsigned int pic_width; ++ unsigned int pic_height; ++} AVFrameDataSandInfo; + + /** + * Structure to hold side data for an AVFrame. diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c -index 0dffa4d..5644176 100644 +index 0dffa4dbdb..17134b4f38 100644 --- a/libavutil/pixdesc.c +++ b/libavutil/pixdesc.c -@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { +@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, }, @@ -18721,35 +28768,486 @@ index 0dffa4d..5644176 100644 + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + }, + .flags = 0, -+ } ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, }; #if FF_API_PLUS1_MINUS1 FF_ENABLE_DEPRECATION_WARNINGS diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h -index 0ed01c4..4705e80 100644 +index 0ed01c4844..2155b78704 100644 --- a/libavutil/pixfmt.h +++ b/libavutil/pixfmt.h -@@ -303,7 +303,10 @@ enum AVPixelFormat { +@@ -303,7 +303,22 @@ enum AVPixelFormat { AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian - AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions ++ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec ++ ++ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian ++ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian ++ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian ++ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian ++ ++ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian ++ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian ++ +// RPI - not on ifdef so can be got at by calling progs -+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions }; #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..52d52a2a83 +--- /dev/null ++++ b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,182 @@ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 
1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * 
dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2;
++ const unsigned int w = _w * 2;
++ const unsigned int mask = stride1 - 1;
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const unsigned int sstride_p = (sstride - stride1) / PW;
++
++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++}
++
++
++#undef pixel
++#undef STRCAT
++#undef FUNC
++
+diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
+new file mode 100644
+index 0000000000..ec4cfadf8a
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.c
+@@ -0,0 +1,99 @@
++#include "config.h"
++#ifdef RPI
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if HAVE_NEON
++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines);
++#endif
++
++#if 1
++// Simple round
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ const unsigned int rnd = (1 << shr) >> 1;
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ *dst++ = (*src++ + rnd) >> shr;
++ }
++}
++#else
++// Dithered variation
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ unsigned int rnd = (1 << shr) >> 1;
++ const unsigned int mask = ((1 << shr) - 1);
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ rnd = *src++ + (rnd & mask);
++ *dst++ = rnd >> shr;
++ }
++}
++#endif
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr)
++{
++ const unsigned int n = dst_stride1 / 2;
++ unsigned int j;
++
++ // This is true for our current layouts
++ 
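// e.g. with the current layouts both stride1 values are 128 bytes, so
++ // n is 64 and each 128-byte row of a dest stripe is filled from one row
++ // of each of two adjacent src stripes (s1 and s2 below), 64 16-bit
++ // values apiece
++ 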
av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ ++#endif // RPI ++ +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..aa880d0f63 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,129 @@ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++#ifdef RPI ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const 
unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++ // * We could replace this with a fixed 128 which would allow the compiler
++ // to optimize a whole lot better
++ return frame->linesize[0];
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++#endif
++
diff --git a/libswscale/input.c b/libswscale/input.c
-index 14ab5ab..e61b67a 100644
+index 14ab5abb3a..7a827c71e3 100644
--- a/libswscale/input.c
+++ b/libswscale/input.c
-@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
+@@ -719,6 +719,13 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
 }
 }
-+
+static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ int width, uint32_t *unused)
#define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, -@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) +@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) case AV_PIX_FMT_P010BE: c->chrToYV12 = p010BEToUV_c; break; + case AV_PIX_FMT_SAND128: -+ c->chrToYV12 = sand128ToUV_c; ++ case AV_PIX_FMT_SAND64_10: ++ c->chrToYV12 = sand128ToUV_c; // NIF + break; } if (c->chrSrcHSubSample) { switch (srcFormat) { diff --git a/libswscale/utils.c b/libswscale/utils.c -index 576d8f0..d7206cc 100644 +index 576d8f0d5a..fd88a5e51e 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c -@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { +@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { [AV_PIX_FMT_AYUV64LE] = { 1, 1}, [AV_PIX_FMT_P010LE] = { 1, 0 }, [AV_PIX_FMT_P010BE] = { 1, 0 }, +#ifdef RPI + [AV_PIX_FMT_SAND128] = { 1, 0 }, ++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, +#endif }; int sws_isSupportedInput(enum AVPixelFormat pix_fmt) -diff --git a/pi-util/conf.sh b/pi-util/conf.sh -new file mode 100755 -index 0000000..8b596a2 ---- /dev/null -+++ b/pi-util/conf.sh -@@ -0,0 +1,33 @@ -+echo "Configure for Pi2/3" -+ -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" -+#RPI_DEFS="-D__VCCOREVER__=0x04000000" -+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" -+#RPI_KEEPS="-save-temps=obj" -+RPI_KEEPS="" -+ -+./configure --enable-cross-compile\ -+ --arch=armv6t2\ -+ --cpu=cortex-a7\ -+ --target-os=linux\ -+ --disable-stripping\ -+ --disable-thumb\ -+ --enable-mmal\ -+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ -+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ -+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ -+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ -+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- -+ -+# --enable-extra-warnings\ -+# --arch=armv71\ -+# --enable-shared\ -+ -+# gcc option for getting asm listing -+# -Wa,-ahls -diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt new file mode 100644 -index 0000000..160e149 +index 0000000000..b1e99a6a89 --- /dev/null -+++ b/pi-util/conf1.sh -@@ -0,0 +1,34 @@ -+echo "Configure for Pi1" ++++ b/pi-util/BUILD.txt +@@ -0,0 +1,25 @@ ++Building Pi FFmpeg ++================== + -+RPI_BUILDROOT=`pwd`/build -+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot -+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf -+RPI_OPT_VC=$RPI_ROOTFS/opt/vc -+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" -+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads 
-I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
-+#RPI_DEFS="-D__VCCOREVER__=0x04000000"
-+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+./configure --enable-cross-compile\
-+ --cpu=arm1176jzf-s\
-+ --arch=armv\
-+ --disable-neon\
-+ --target-os=linux\
-+ --disable-stripping\
-+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
-+
-+
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+# --enable-shared\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+new file mode 100644
+index 0000000000..b1e99a6a89
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,25 @@
++Building Pi FFmpeg
++==================
++
++Configuration:
++=============
++
++pi-util/conf_pi2.sh
++
++contains suitable options to build the code for Pi2/3. It expects to find
++git clones of
++
++https://github.com/raspberrypi/tools
++https://github.com/raspberrypi/firmware
++
++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
++lot of history you don't want.
++
++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
++rebuilt. Otherwise the prebuilt .c & .h files will be used.
++Likewise, a copy of vasmvidcore_std in ../local/bin will enable a VPU code rebuild.
++
++pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time
++H265 QPU acceleration is broken on Pi1 and so it is disabled.
++
++
+diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
+new file mode 100644
+index 0000000000..f05b7753f7
+--- /dev/null
++++ b/pi-util/conf_h265.2016.csv
+@@ -0,0 +1,193 @@
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 ++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 ++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 ++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 ++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 ++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 +diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv +new file mode 100644 +index 0000000000..6082641271 +--- /dev/null ++++ b/pi-util/conf_h265.2016_HEVC_v1.csv +@@ -0,0 +1,147 @@ 
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv new file mode 100644 -index 0000000..fc14f2a +index 0000000000..fc14f2a3c2 --- /dev/null +++ b/pi-util/conf_h265.csv @@ -0,0 +1,144 @@ @@ -19013,14 +29817,88 @@ index 0000000..fc14f2a +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000000..ec25b81c31 +--- /dev/null ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,31 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ 
--arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000000..f8e5e75375 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,30 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++./configure --enable-cross-compile\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py -new file mode 100644 -index 0000000..c896bc6 +new file mode 100755 +index 0000000000..70f7be22bb --- /dev/null +++ b/pi-util/ffconf.py -@@ -0,0 +1,154 @@ +@@ -0,0 +1,174 @@ +#!/usr/bin/env python + ++import string +import os +import subprocess +import re @@ -19029,12 +29907,20 @@ index 0000000..c896bc6 +import csv +from stat import * + -+conf_root = "/opt/conform/h265" +ffmpeg_exec = "./ffmpeg" + -+def testone(fileroot, name, es_file, md5_file): ++def testone(fileroot, srcname, es_file, md5_file): + tmp_root = "/tmp" + ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ + dec_file = os.path.join(tmp_root, name + ".dec.md5") + try: + os.remove(dec_file) @@ -19079,10 +29965,10 @@ index 0000000..c896bc6 + +def scandir(root): + aconf = [] -+ ents = os.listdir(conf_root) ++ ents = os.listdir(root) + ents.sort(key=str.lower) + for name in ents: -+ test_path = os.path.join(conf_root, name) ++ test_path = os.path.join(root, name) + if S_ISDIR(os.stat(test_path).st_mode): + files = os.listdir(test_path) + es_file = "?" 
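(A rough sketch of the row format shared by ffconf.py and the conf_h265*.csv files
above; the helper below is illustrative and not part of the patch. Each row reads
<expected-result>,<test-dir>,<es-file>,<md5-file>, where the first field is 0 to
skip the test, 1 when a pass is expected, 2 when an md5 mismatch is the known
outcome, and 3 when the decoder is expected to abort.)

    # Illustrative only: mirrors the expected-result codes tested in doconf()
    EXPECT_SKIP, EXPECT_PASS, EXPECT_FAIL, EXPECT_ABORT = 0, 1, 2, 3

    def describe_row(row):
        exp = int(row[0])
        name = row[1]
        if exp == EXPECT_SKIP:
            return "%s: skipped" % name
        wants = {EXPECT_PASS: "expect ok",
                 EXPECT_FAIL: "expect md5 mismatch",
                 EXPECT_ABORT: "expect abort"}
        return "%s: %s" % (name, wants[exp])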
@@ -19093,7 +29979,7 @@ index 0000000..c896bc6 + pass + elif ext == ".bit" or ext == ".bin": + es_file = f -+ elif ext == ".md5": ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + if md5_file == "?": + md5_file = f + elif base[-3:] == "yuv": @@ -19105,13 +29991,15 @@ index 0000000..c896bc6 + if not tests: + return True + for t in tests: -+ if name[0:len(t)] == t: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: + return True -+ return False ++ return False + -+def doconf(csva, tests): -+ failures = [] ++def doconf(csva, tests, test_root): ++ unx_failures = [] + unx_success = [] ++ failures = 0 ++ successes = 0 + for a in csva: + exp_test = int(a[0]) + if (exp_test and runtest(a[1], tests)): @@ -19119,17 +30007,25 @@ index 0000000..c896bc6 + print "==== ", name, + sys.stdout.flush() + -+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ + if (rv == 0): + if exp_test == 2: + print ": * OK *" + unx_success.append(name) + else: + print ": ok" -+ elif exp_test > 1 and rv == 1: ++ elif exp_test == 2 and rv == 1: + print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" + else: -+ failures.append(name) ++ unx_failures.append(name) + if rv == 1: + print ": * FAIL *" + elif (rv == 2) : @@ -19139,11 +30035,11 @@ index 0000000..c896bc6 + else : + print ": * BANG *" + -+ if failures or unx_success: -+ print "Unexpected Failures:", failures ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures + print "Unexpected Success: ", unx_success + else: -+ print "All tests normal" ++ print "All tests normal:", successes, "ok,", failures, "failed" + + +class ConfCSVDialect(csv.Dialect): @@ -19159,2536 +30055,184 @@ index 0000000..c896bc6 + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + argp.add_argument("tests", nargs='*') ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") -+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + args = argp.parse_args() + + if args.csvgen: -+ csv.writer(sys.stdout).writerows(scandir(conf_root)) ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + exit(0) + + with open(args.csv, 'rt') as csvfile: + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + -+ doconf(csva, args.tests) ++ doconf(csva, args.tests, args.test_root) + -diff --git a/pi-util/qasm.py b/pi-util/qasm.py -new file mode 100644 -index 0000000..1eacc04 +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..27cc453963 --- /dev/null -+++ b/pi-util/qasm.py -@@ -0,0 +1,2502 @@ -+#!/usr/bin/env python ++++ b/pi-util/ffperf.py +@@ -0,0 +1,124 @@ ++#!/usr/bin/env python3 + -+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment -+# add r0, r0, 1 # implicit mul nop -+# nop # explicit add nop, implicit mul nop -+# bkpt # implicit add/mul nop -+# mov r0, 0x1234 # hex immediate -+# mov r0, 20 * 40 # expressions... 
-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits -+# mov r0, a:label # put address of label in r0 -+# :label -+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address -+# :1 -+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address -+# :1 # multiple definitions of numeric labels (differentiated using f/b) -+# .set my_val, 3 # introduce alias for 3 -+# .set my_reg, r0 # and for r0 -+# mov my_reg, my_val # then use them -+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 -+# .macro my_add, a, b, c # a, b, c act as if .set on entry -+# .set my_val, 10 -+# add a, b, c -+# mov r0, my_val # 10 -+# .endm # forget all .sets since .macro (including arg .sets) -+# mov r0, my_val # 3 -+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) -+ -+import math -+import optparse -+import os -+import random -+import re -+import struct -+import sys +import time -+ -+############################################################################### -+# constants -+############################################################################### -+ -+# ops -+###### -+ -+# negatives are internal qasm ops -+ -+AOP_MOV = -3 # two operands -+AOP_BRA = -2 # two operands -+AOP_BRR = -1 # two operands -+AOP_NOP = 0x00 # no operands -+AOP_FADD = 0x01 -+AOP_FSUB = 0x02 -+AOP_FMIN = 0x03 -+AOP_FMAX = 0x04 -+AOP_FMINABS = 0x05 -+AOP_FMAXABS = 0x06 -+AOP_FTOI = 0x07 # two operands -+AOP_ITOF = 0x08 # two operands -+AOP_ADD = 0x0c -+AOP_SUB = 0x0d -+AOP_SHR = 0x0e -+AOP_ASR = 0x0f -+AOP_ROR = 0x10 -+AOP_SHL = 0x11 -+AOP_MIN = 0x12 -+AOP_MAX = 0x13 -+AOP_AND = 0x14 -+AOP_OR = 0x15 -+AOP_XOR = 0x16 -+AOP_NOT = 0x17 # two operands -+AOP_CLZ = 0x18 # two operands -+AOP_V8ADDS = 0x1e -+AOP_V8SUBS = 0x1f -+ -+MOP_MOV = -1 # two operands -+MOP_NOP = 0x0 # no operands -+MOP_FMUL = 0x1 -+MOP_MUL24 = 0x2 -+MOP_V8MULD = 0x3 -+MOP_V8MIN = 0x4 -+MOP_V8MAX = 0x5 -+MOP_V8ADDS = 0x6 -+MOP_V8SUBS = 0x7 -+ -+# ldi modes -+############ -+ -+LDI_32 = 0 -+LDI_EL_SIGNED = 1 -+LDI_EL_UNSIGNED = 3 -+LDI_SEMA = 4 -+ -+# conds -+######## -+ -+COND_NEVER = 0 -+COND_ALWAYS = 1 -+COND_IFZ = 2 -+COND_IFNZ = 3 -+COND_IFN = 4 -+COND_IFNN = 5 -+COND_IFC = 6 -+COND_IFNC = 7 -+ -+BCOND_ALLZ = 0 -+BCOND_ALLNZ = 1 -+BCOND_ANYZ = 2 -+BCOND_ANYNZ = 3 -+BCOND_ALLN = 4 -+BCOND_ALLNN = 5 -+BCOND_ANYN = 6 -+BCOND_ANYNN = 7 -+BCOND_ALLC = 8 -+BCOND_ALLNC = 9 -+BCOND_ANYC = 10 -+BCOND_ANYNC = 11 -+BCOND_ALWAYS = 15 -+ -+# packing/unpacking -+#################### -+ -+# regfile a pack modes -+PACK_A_NOP = 0 -+PACK_A_16A = 1 -+PACK_A_16B = 2 -+PACK_A_8888 = 3 -+PACK_A_8A = 4 -+PACK_A_8B = 5 -+PACK_A_8C = 6 -+PACK_A_8D = 7 -+PACK_A_32S = 8 -+PACK_A_16AS = 9 -+PACK_A_16BS = 10 -+PACK_A_8888S = 11 -+PACK_A_8AS = 12 -+PACK_A_8BS = 13 -+PACK_A_8CS = 14 -+PACK_A_8DS = 15 -+ -+# mul unit pack modes -+PACK_MUL_NOP = 0 -+PACK_MUL_8888 = 3 -+PACK_MUL_8A = 4 -+PACK_MUL_8B = 5 -+PACK_MUL_8C = 6 -+PACK_MUL_8D = 7 -+ -+# regfile a unpack modes -+UNPACK_A_NOP = 0 -+UNPACK_A_16A = 1 -+UNPACK_A_16B = 2 -+UNPACK_A_8R = 3 -+UNPACK_A_8A = 4 -+UNPACK_A_8B = 5 -+UNPACK_A_8C = 6 -+UNPACK_A_8D = 7 -+ -+# r4 unpack modes -+UNPACK_R4_NOP = 0 -+UNPACK_R4_16A = 1 -+UNPACK_R4_16B = 2 -+UNPACK_R4_8R = 3 -+UNPACK_R4_8A = 4 -+UNPACK_R4_8B = 5 -+UNPACK_R4_8C = 6 -+UNPACK_R4_8D = 7 -+ -+PACK_TYPE_INT = 0 -+PACK_TYPE_FLOAT = 1 -+PACK_TYPE_EITHER = -1 -+ -+PACK_MODE_A = 0 # regfile a -+PACK_MODE_M = 1 # mul unit -+PACK_MODE_EITHER = -1 -+ -+UNPACK_LOC_A = 0 # regfile a -+UNPACK_LOC_R4 = 1 # r4 -+UNPACK_LOC_AB = 2 # either 
regfile a or regfile b -+UNPACK_LOC_OTHER = 3 # somewhere else -+ -+# args -+####### -+ -+# loc_t, ie internal -+MUX_AC = 0 -+MUX_ANY = 1 -+MUX_A = 2 -+MUX_B = 3 -+RW_EITHER = 0 -+RW_READ = 1 -+RW_WRITE = 2 -+ -+RADDR_NOP = 39 -+ -+# negatives are for internal use -+RMUX_SEMA = -6 -+RMUX_LABEL = -5 -+RMUX_IMMV = -4 -+RMUX_IMM = -3 -+RMUX_AC = -2 -+RMUX_ANY = -1 -+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 -+RMUX_A = 6 -+RMUX_B = 7 -+ -+WADDR_R0 = 32 # followed by R1, R2, R3 -+WADDR_NOP = 39 -+ -+WMUX_ANY = 0 -+WMUX_A = 1 -+WMUX_B = 2 -+ -+# signals -+########## -+ -+SIG_BKPT = 0 -+SIG_NORMAL = 1 -+SIG_THRSW = 2 -+SIG_THREND = 3 -+SIG_SBWAIT = 4 -+SIG_SBDONE = 5 -+SIG_INT = 6 # on a0 -+SIG_LTHRSW = 6 # on b0 -+SIG_LOADCV = 7 -+SIG_LOADC = 8 -+SIG_LDCEND = 9 -+SIG_LDTMU0 = 10 -+SIG_LDTMU1 = 11 -+SIG_ROTATE = 12 # on a0 -+SIG_LOADAM = 12 # on b0 -+SIG_SMALLIMMED = 13 -+SIG_IMMED = 14 -+SIG_BRANCH = 15 -+ -+# multi-line assembler constructs -+################################## -+ -+CONSTRUCT_MACRO = 0x1 -+CONSTRUCT_IF = 0x2 -+CONSTRUCT_ELSE = 0x4 -+CONSTRUCT_REP = 0x8 -+ -+############################################################################### -+# helpers -+############################################################################### -+ -+def asm_error(message, location = None): -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm ERROR: %s\n' % message) -+ else: -+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) -+ sys.exit(-1) -+ -+def asm_warning(message, location = None): -+ if disable_warnings or (nwarn_level != 0): -+ return -+ if location is None: -+ location = current_location -+ if location == '': -+ sys.stderr.write('qasm WARNING: %s\n' % message) -+ else: -+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) -+ if warnings_are_errors: -+ asm_error('warnings are errors!', location) -+ -+# smart_split('') = [] -+# smart_split('a') = ['a'] -+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] -+def smart_split(s, delim = ',', count = 0): -+ if len(s) == 0: -+ return [] -+ parts = [] -+ depth = 0 -+ i = 0 -+ for j in xrange(len(s)): -+ if s[j] in '([{': -+ depth += 1 -+ elif s[j] in ')]}': -+ depth -= 1 -+ elif (s[j] == delim) and (depth == 0): -+ parts.append(s[i:j]) -+ i = j + 1 -+ if len(parts) == count: -+ break -+ if depth != 0: -+ asm_error('bracket nesting fail') -+ parts.append(s[i:]) -+ return parts -+ -+def is_int(x): -+ return isinstance(x, int) or isinstance(x, long) -+ -+############################################################################### -+# "parsing" stuff -+############################################################################### -+ -+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') -+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') -+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') -+re_include = re.compile('\\.include\\s(?P.+)$') -+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') -+re_unset = re.compile('\\.unset\\s+(?P\\w+)$') -+re_eval = re.compile('\\.eval\\s(?P.+)$') -+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') -+re_assert = re.compile('\\.assert\\s(?P.+)$') -+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') -+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') -+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') -+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') 
-+re_label_ref_left = re.compile('\\b([ar]):') -+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') -+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... -+ -+# ops -+###### -+ -+aops = { -+ 'mov': (AOP_MOV, 2), -+ 'bra': (AOP_BRA, 2), -+ 'brr': (AOP_BRR, 2), -+ 'nop': (AOP_NOP, 0), -+ 'fadd': (AOP_FADD, 3), -+ 'fsub': (AOP_FSUB, 3), -+ 'fmin': (AOP_FMIN, 3), -+ 'fmax': (AOP_FMAX, 3), -+ 'fminabs': (AOP_FMINABS, 3), -+ 'fmaxabs': (AOP_FMAXABS, 3), -+ 'ftoi': (AOP_FTOI, 2), -+ 'itof': (AOP_ITOF, 2), -+ 'add': (AOP_ADD, 3), -+ 'sub': (AOP_SUB, 3), -+ 'shr': (AOP_SHR, 3), -+ 'asr': (AOP_ASR, 3), -+ 'ror': (AOP_ROR, 3), -+ 'shl': (AOP_SHL, 3), -+ 'min': (AOP_MIN, 3), -+ 'max': (AOP_MAX, 3), -+ 'and': (AOP_AND, 3), -+ 'or': (AOP_OR, 3), -+ 'xor': (AOP_XOR, 3), -+ 'not': (AOP_NOT, 2), -+ 'clz': (AOP_CLZ, 2), -+ 'v8adds': (AOP_V8ADDS, 3), -+ 'v8subs': (AOP_V8SUBS, 3)} -+ -+def get_aop(aop): -+ if aop not in aops: -+ asm_error('invalid aop') -+ return aops[aop] -+ -+mops = { -+ 'mov': (MOP_MOV, 2), -+ 'nop': (MOP_NOP, 0), -+ 'fmul': (MOP_FMUL, 3), -+ 'mul24': (MOP_MUL24, 3), -+ 'v8muld': (MOP_V8MULD, 3), -+ 'v8min': (MOP_V8MIN, 3), -+ 'v8max': (MOP_V8MAX, 3), -+ 'v8adds': (MOP_V8ADDS, 3), -+ 'v8subs': (MOP_V8SUBS, 3)} -+ -+def get_mop(mop): -+ if mop not in mops: -+ asm_error('invalid mop') -+ return mops[mop] -+ -+# conds -+######## -+ -+conds = { -+ 'ifz': COND_IFZ, -+ 'ifnz': COND_IFNZ, -+ 'ifn': COND_IFN, -+ 'ifnn': COND_IFNN, -+ 'ifc': COND_IFC, -+ 'ifnc': COND_IFNC} -+ -+def get_cond(cond): -+ if not cond: -+ return COND_ALWAYS -+ if cond not in conds: -+ asm_error('invalid cond') -+ return conds[cond] -+ -+bconds = { -+ 'allz': BCOND_ALLZ, -+ 'allnz': BCOND_ALLNZ, -+ 'anyz': BCOND_ANYZ, -+ 'anynz': BCOND_ANYNZ, -+ 'alln': BCOND_ALLN, -+ 'allnn': BCOND_ALLNN, -+ 'anyn': BCOND_ANYN, -+ 'anynn': BCOND_ANYNN, -+ 'allc': BCOND_ALLC, -+ 'allnc': BCOND_ALLNC, -+ 'anyc': BCOND_ANYC, -+ 'anync': BCOND_ANYNC} -+ -+def get_bcond(bcond): -+ if not bcond: -+ return BCOND_ALWAYS -+ if bcond not in bconds: -+ asm_error('invalid bcond') -+ return bconds[bcond] -+ -+def get_setf(setf): -+ if not setf: -+ return False -+ return True -+ -+# packing/unpacking -+#################### -+ -+packs = { -+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), -+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), -+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), -+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), -+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), -+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), -+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} -+ -+def get_pack(pack): -+ if not pack: -+ 
return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) -+ if pack not in packs: -+ asm_error('invalid pack') -+ return packs[pack] -+ -+a_unpacks = { -+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), -+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), -+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), -+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), -+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), -+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), -+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), -+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), -+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), -+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), -+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), -+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), -+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} -+ -+def get_a_unpack(unpack): -+ if not unpack: -+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) -+ if unpack not in a_unpacks: -+ asm_error('invalid ra unpack') -+ return a_unpacks[unpack] + (UNPACK_LOC_A,) -+ -+r4_unpacks = { -+ '16af': UNPACK_R4_16A, -+ '16bf': UNPACK_R4_16B, -+ '8dr': UNPACK_R4_8R, -+ '8ac': UNPACK_R4_8A, -+ '8bc': UNPACK_R4_8B, -+ '8cc': UNPACK_R4_8C, -+ '8dc': UNPACK_R4_8D} -+ -+def get_r4_unpack(unpack): -+ if not unpack: -+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ if unpack not in r4_unpacks: -+ asm_error('invalid r4 unpack') -+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) -+ -+# args -+####### -+ -+class loc_t: -+ def __init__(self, mux, i, rot, r5_rot, pack, rw): -+ self.mux = mux -+ self.i = i -+ self.rot = rot % 16 -+ self.r5_rot = r5_rot % 16 -+ self.pack = pack -+ self.rw = rw -+ -+ def copy(self): -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __add__(self, i): -+ if not is_int(i): -+ raise Exception('can only add integer to loc') -+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __sub__(self, i): -+ if not is_int(i): -+ raise Exception('can only subtract integer from loc') -+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) -+ -+ def __cmp__(self, other): -+ if is_int(other): -+ return cmp(self.i, other) -+ if not isinstance(other, loc_t): -+ raise Exception('can only compare loc to integer or other loc') -+ if self.mux != other.mux: -+ return cmp(self.mux, other.mux) -+ if self.i != other.i: -+ return cmp(self.i, other.i) -+ if self.rot != other.rot: -+ return cmp(self.rot, other.rot) -+ if self.r5_rot != other.r5_rot: -+ return cmp(self.r5_rot, other.r5_rot) -+ return cmp(self.pack, other.pack) -+ -+ def is_r5(self): -+ return (self.mux == MUX_AC) and (self.i == 5) -+ -+ def shift(self, rot, left): -+ if isinstance(rot, loc_t) and rot.is_r5(): -+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: -+ raise Exception('can\'t rotate by rotated/unpacked r5') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) -+ if not is_int(rot): -+ raise Exception('can only rotate by integer or r5') -+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) -+ -+ def __lshift__(self, rot): -+ return self.shift(rot, True) -+ -+ def __rshift__(self, rot): -+ return self.shift(rot, False) -+ -+ def __getattr__(self, name): -+ # discard the first character if it is an underscore. 
this is a total hack -+ # to allow packs starting with a digit to work -+ if name[0] == '_': -+ name = name[1:] -+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): -+ if self.pack: -+ raise Exception('can\'t specify two packs') -+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) -+ raise AttributeError() -+ -+ def __str__(self): -+ if self.mux == MUX_AC: -+ return 'r%d' % self.i -+ if self.mux == MUX_ANY: -+ return 'rany%d' % self.i -+ if self.mux == MUX_A: -+ return 'ra%d' % self.i -+ if self.mux == MUX_B: -+ return 'rb%d' % self.i -+ assert 0 -+ -+class sema_t: -+ def __init__(self, acq, i): -+ if not is_int(i): -+ raise Exception('semaphore index must be integer') -+ self.acq = acq -+ self.i = i -+ -+class label_t: -+ def __init__(self, rel, name, offset): -+ self.rel = rel -+ self.name = name -+ self.offset = offset -+ -+ def __add__(self, offset): -+ return label_t(self.rel, self.name, self.offset + offset) -+ -+ def __sub__(self, offset): -+ return label_t(self.rel, self.name, self.offset - offset) -+ -+class label_maker_t: -+ def __init__(self, rel): -+ self.rel = rel -+ -+ def __getattr__(self, name): -+ # we discard the first character. this is a total hack to allow numeric labels to work -+ if not re_label_ref_right.match(name[1:]): -+ raise Exception('invalid label reference') -+ return label_t(self.rel, name[1:], 0) -+ -+def bits(x, n): -+ if (x >> n) != 0: -+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) -+ return x -+ -+def bitsw(x, n): -+ if x == (1 << n): -+ x = 0 -+ return bits(x, n) -+ -+def bitsws(x, n): -+ if x == (1 << (n - 1)): -+ x = 0 -+ if -(1 << (n - 1)) <= x < 0: -+ x += 1 << n -+ return bits(x, n) -+ -+def vpm_setup(n, stride, addr, v2 = False): -+ horiz, laned, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if horiz: -+ if x != 0: -+ raise Exception('horizontal accesses must have x of 0') -+ else: -+ if (y & 0xf) != 0: -+ raise Exception('vertical accesses must be 16 row aligned') -+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) -+ if v2: -+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | -+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) -+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | -+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) -+ -+def vdw_setup_0(n, m, addr): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | -+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) -+ -+def vdr_setup_0(n, m, addr, vpm_stride, stride): -+ horiz, size, y, x, p = addr -+ if size not in (0, 1, 2): -+ raise Exception('addr size should be 0, 1, or 2') -+ if (stride < 8) or (stride & (stride - 1)): -+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') -+ log2_stride = 3 -+ while (1 << log2_stride) != stride: -+ log2_stride += 1 -+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | -+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | -+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) -+ -+class allocator_t: -+ def __init__(self, *available): -+ self.available = list(available) -+ self.allocated = {} -+ self.reserved = [] -+ -+ def copy(self): -+ a = allocator_t() -+ a.available = self.available[:] -+ 
a.allocated = self.allocated.copy() -+ a.reserved = self.reserved[:] -+ return a -+ -+ def forget(self): -+ self.__init__(self.available + self.allocated.values() + self.reserved) -+ -+ def reserve(self, *rs): -+ for r in rs: -+ self.available.remove(r) -+ self.reserved.append(r) -+ -+ def retire(self, name): -+ r = self.allocated.pop(name) -+ del r.__invert__ -+ del r.retire -+ self.available.append(r) -+ return r -+ -+ def __getattr__(self, name): -+ if name not in self.allocated: -+ r = self.available.pop() -+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax -+ r.__invert__ = r.retire -+ self.allocated[name] = r -+ return self.allocated[name] -+ -+def pragma_allow_xor_0(x): -+ global allow_xor_0 -+ -+ if not isinstance(x, bool): -+ raise Exception('allow_xor_0 must be bool') -+ x, allow_xor_0 = allow_xor_0, x -+ return x -+ -+def pragma_dont_warn_when_mul_rot_inp_r5(x): -+ global dont_warn_when_mul_rot_inp_r5 -+ -+ if not isinstance(x, bool): -+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') -+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x -+ return x -+ -+arg_defs = { -+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) -+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), -+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), -+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), -+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), -+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), -+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), -+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), -+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), -+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), -+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), -+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), -+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), -+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), -+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), -+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), -+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), -+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), -+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), -+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), -+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), -+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), -+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), -+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), -+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), -+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), -+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), -+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), -+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), -+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), -+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), -+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), -+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), -+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), -+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), -+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), -+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), -+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), -+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), -+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), -+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), -+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), -+ -+ # semaphore acq/rel -+ 'sacq': 
lambda i: sema_t(True, i), -+ 'srel': lambda i: sema_t(False, i), -+ -+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) -+ 'r_label_maker': label_maker_t(True), -+ 'a_label_maker': label_maker_t(False), -+ -+ # handy functions -+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], -+ 'sqrt': math.sqrt, -+ 'sin': math.sin, -+ 'cos': math.cos, -+ 'atan2': math.atan2, -+ 'pi': math.pi, -+ 'rseed': random.seed, -+ 'rand': lambda: int(random.getrandbits(32)), -+ 'bits': bits, -+ 'bitsw': bitsw, -+ 'bitsws': bitsws, -+ -+ # handy vpm/vdw/vdr stuff -+ 'h32': lambda y: (1, 0, 0, y, 0, 0), -+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), -+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), -+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), -+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), -+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), -+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), -+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), -+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), -+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), -+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), -+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), -+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), -+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), -+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), -+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), -+ 'vpm_setup': vpm_setup, -+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), -+ 'vdw_setup_0': vdw_setup_0, -+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), -+ 'vdr_setup_0': vdr_setup_0, -+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride -+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), -+ -+ # annotations -+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), -+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), -+ 'preserve_cond': ('preserve_cond', 1), -+ -+ # somewhat experimental register allocator -+ 'allocator_t': allocator_t, -+ -+ # pragmas -+ 'pragma_allow_xor_0': pragma_allow_xor_0, -+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} -+ -+# accumulators and regs (regular names -- r0, ra0, etc) -+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) -+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) -+ -+def arg_eval(arg, sets): -+ s = (arg.strip().split('.', 1) + [None])[:2] -+ if s[0] == '-': -+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) -+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... -+ arg = re_pack.sub('._\\1', arg) -+ try: -+ # todo: i would like to be able to pass both arg_defs and sets in here -+ # (with sets hiding arg_defs in the case of conflicts), but the obvious -+ # dict(arg_defs, **sets) won't permit things such as: -+ # .set f, lambda x: y -+ # .set y, 4 -+ # (the y in the lambda will be looked up in the temporary dict we created -+ # when evaluating the f .set, which doesn't contain y) -+ # -+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the -+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. 
-+ # also, before dumping sets at the end, we strip out the arg_defs stuff -+ # (this isn't entirely correct as we want to dump sets that are hiding -+ # arg_defs) -+ return eval(arg, sets) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while evaluating argument') -+ -+# doesn't check/fixup pack -+def check_and_fixup_loc(loc, read): -+ if (not read) and (loc.rw == RW_READ): -+ asm_error('writing to read-only hardware register') -+ if read and (loc.rw == RW_WRITE): -+ asm_error('reading from write-only hardware register') -+ if not read: -+ # conceptually, we are writing to a location rotated right by -+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by -+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location -+ loc.rot = -loc.rot % 16 -+ loc.r5_rot = -loc.r5_rot % 16 -+ if (loc.rot != 0) and (loc.r5_rot != 0): -+ asm_error('can\'t rotate by both r5 and immediate') -+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): -+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) -+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later -+ if not read: -+ asm_error('target doesn\'t support write rotation') -+ if loc.mux == MUX_ANY: -+ loc.mux = MUX_A # can't do rotated read from regfile b -+ if loc.mux != MUX_A: -+ asm_error('rotation on read only allowed from regfile a') -+ if loc.i >= 32: -+ asm_warning('rotation only works from physical regfile') -+ if loc.mux == MUX_AC: -+ if (loc.i < 0) or (loc.i >= 6): -+ asm_error('reg out of range') -+ if not read: -+ if loc.i == 4: -+ asm_error('not allowed to write to r4') -+ if loc.i == 5: -+ -+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') -+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): -+ if (loc.i < 0) or (loc.i >= 64): -+ asm_error('reg out of range') -+ else: -+ assert 0 -+ -+def get_dst(dst, sets): -+ if not dst: -+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 -+ dst = arg_eval(dst, sets) -+ if not isinstance(dst, loc_t): -+ asm_error('invalid dst') -+ dst = dst.copy() -+ check_and_fixup_loc(dst, False) -+ pack = get_pack(dst.pack) -+ if dst.mux == MUX_AC: -+ if pack[2] == PACK_MODE_A: -+ asm_warning('ra packing only works when writing to physical regfile') -+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation -+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): -+ asm_warning('ra packing only works when writing to physical regfile') -+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_ANY: -+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot -+ if dst.mux == MUX_B: -+ if pack[2] == PACK_MODE_A: -+ asm_error('this packing operation can only be used for regfile a') -+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot -+ assert 0 -+ -+def get_src(src, sets): -+ if not src: -+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None -+ src = arg_eval(src, sets) -+ if isinstance(src, sema_t): -+ if not have_sema: -+ asm_error('target does not support semaphores') -+ if (src.i < 0) or (src.i >= 16): -+ asm_error('semaphore number must be in [0, 16)') -+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, label_t): -+ return (src.name, src.rel, src.offset), 
RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if isinstance(src, list): -+ if len(src) != 16: -+ asm_error('vector immediate must have length 16') -+ src = src[:] -+ for i in xrange(16): -+ if not is_int(src[i]): -+ asm_error('all elements of vector immediate must be integers') -+ src[i] &= (1 << 32) - 1 -+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if is_int(src): -+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 -+ if not isinstance(src, loc_t): -+ asm_error('invalid src') -+ src = src.copy() -+ check_and_fixup_loc(src, True) -+ if mulw_rotate: -+ srot, sr5rot = 0, 0 -+ drot, dr5rot = src.rot, src.r5_rot -+ else: -+ srot, sr5rot = src.rot, src.r5_rot -+ drot, dr5rot = 0, 0 -+ if src.mux == MUX_AC: -+ if src.i == 4: -+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b -+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot -+ if src.mux == MUX_ANY: -+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot -+ if src.mux == MUX_B: -+ if src.pack: -+ asm_error('unpack only allowed for regfile a or r4') -+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot -+ assert 0 -+ -+# signals -+########## -+ -+sigs = { -+ 'bkpt': SIG_BKPT, -+ 'thrsw': SIG_THRSW, -+ 'thrend': SIG_THREND, -+ 'sbwait': SIG_SBWAIT, -+ 'sbdone': SIG_SBDONE, -+ 'int': SIG_INT, -+ 'loadcv': SIG_LOADCV, -+ 'loadc': SIG_LOADC, -+ 'ldcend': SIG_LDCEND, -+ 'ldtmu0': SIG_LDTMU0, -+ 'ldtmu1': SIG_LDTMU1} -+ -+def get_sig(sig): -+ if sig not in sigs: -+ return SIG_NORMAL -+ return sigs[sig] -+ -+# annotations -+############## -+ -+def get_annots(annot, sets): -+ annots = arg_eval(annot, sets) -+ if isinstance(annots, list): -+ annots = annots[:] -+ else: -+ annots = [annots] -+ for i, annot in enumerate(annots): -+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or -+ (not is_int(annot[1]))): -+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') -+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) -+ return annots -+ -+############################################################################### -+# core -+############################################################################### -+ -+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): -+ needfloat = PACK_TYPE_EITHER -+ havefloata = False -+ havefloatr4 = False -+ unpacka = None -+ unpackr4 = None -+ forcebs = [False, False, False, False] -+ forcerafloat = False -+ -+ pm = PACK_MODE_EITHER -+ for i in (0, 1, 2, 3): -+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): -+ assert rpacks[i][0] == 0 -+ else: -+ if rpacks[i][2] == UNPACK_LOC_A: -+ if unpacka is None: -+ unpacka = rpacks[i][0] -+ elif unpacka != rpacks[i][0]: -+ asm_error('conflicting unpack operations on regfile a') -+ havefloata = havefloata or rfloats[i] -+ elif rpacks[i][2] == UNPACK_LOC_R4: -+ if unpackr4 is None: -+ unpackr4 = rpacks[i][0] -+ elif unpackr4 != rpacks[i][0]: -+ asm_error('conflicting unpack operations on r4') -+ havefloatr4 = havefloatr4 or rfloats[i] -+ else: -+ assert 0 -+ -+ if rpacks[i][1] != PACK_TYPE_EITHER: -+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): 
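-+                # e.g. unpacking one source with .16a (int) and another with
-+                # .16af (float) asks for both types at once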
-+ asm_error('conflicting unpack float requirements') -+ needfloat = rpacks[i][1] -+ for i in (0, 1, 2, 3): -+ if rpacks[i][2] == UNPACK_LOC_AB: -+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): -+ forcebs[i] = True # non-nop unpack from regfile a. must use b -+ -+ if unpacka: -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: -+ havefloata = True -+ forcerafloat = True -+ havefloat = havefloata -+ else: -+ havefloat = havefloatr4 -+ -+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): -+ asm_error('float unpack operation used in integer alu operations') -+ if (needfloat == PACK_TYPE_INT) and havefloat: -+ asm_error('integer unpack operation used in float alu operation') -+ -+ unpack = 0 -+ if unpacka and unpackr4: -+ asm_error('cannot specify pack operation for both regfile a and r4') -+ if unpacka: -+ pm = PACK_MODE_A -+ unpack = unpacka -+ elif unpackr4: -+ pm = PACK_MODE_M -+ unpack = unpackr4 -+ -+ pack = 0 -+ if wpacks[0][2] == PACK_MODE_M: -+ asm_error('mul-unit pack operation used on add result') -+ for i in (0, 1): -+ if wpacks[i][2] == PACK_MODE_A: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_A -+ pack = wpacks[i][0] -+ elif wpacks[i][2] == PACK_MODE_M: -+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): -+ asm_error('conflicting pack modes') -+ pm = PACK_MODE_M -+ pack = wpacks[i][0] -+ -+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): -+ asm_error('float pack operation used with integer alu result') -+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: -+ asm_error('integer pack operation used with float alu result') -+ -+ if pm == PACK_MODE_EITHER: -+ pm = PACK_MODE_A -+ return pm, pack, unpack, forcebs, forcerafloat -+ -+# immediates that can be encoded with SIG_SMALLIMMED -+bimms = {} -+bimms.update((i, i) for i in xrange(16)) -+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) -+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) -+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) -+ -+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): -+ if rmux == RMUX_SEMA: -+ asm_error('semaphore op can only be used with mov') -+ if rmux == RMUX_LABEL: -+ asm_error('label not allowed here') -+ if rmux == RMUX_IMMV: -+ asm_error('vector immediate can only be used with mov') -+ if rmux == RMUX_IMM: -+ if raddr not in bimms: -+ asm_error('can\'t encode immediate 0x%08x' % raddr) -+ raddr = bimms[raddr] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and immediates don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if rmux == RMUX_AC: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr -+ if rmux == RMUX_ANY: -+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if (not immb) and (raddr_b == raddr): -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ if raddr_a is None: -+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) -+ raddr_a = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if raddr_b is None: -+ assert not immb -+ raddr_b = raddr -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ asm_error('no free read slots') -+ if rmux == RMUX_A: -+ if (not mulw_rotate) and (raddr_a is not None) and ( -+ ((raddr[1] != 0) | ((raddr[2] != 
0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): -+ asm_error('conflicting rotations from regfile a') -+ if raddr_a is None: -+ raddr_a = raddr[0] -+ elif raddr_a != raddr[0]: -+ asm_error('can only read from one location in each regfile') -+ arot_r5 = raddr[2] -+ if raddr[1] == 0: -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ raddr = 48 + raddr[1] -+ if not immb: -+ if raddr_b is not None: -+ asm_error('regfile b and rotation don\'t mix') -+ raddr_b = raddr -+ immb = True -+ elif raddr_b != raddr: -+ asm_error('can only encode one rotation/immediate') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A -+ if rmux == RMUX_B: -+ if immb: -+ asm_error('regfile b and rotation/immediates don\'t mix') -+ if raddr_b is None: -+ raddr_b = raddr -+ elif raddr_b != raddr: -+ asm_error('can only read from one location in each regfile') -+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B -+ assert 0 -+ -+# ok if: -+# - accumulator (r0-r3) -+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, -+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it -+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: -+# what about vr_wait/vw_wait/mutex? -+def read_rot_ok(rmux, raddr_a, raddr_b): -+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or -+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy -+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy -+ -+def asm_flush_prog_data(): -+ global prog_data -+ -+ while len(prog_data) & 7: -+ prog_data.append(0) -+ for i in xrange(0, len(prog_data), 8): -+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), -+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) -+ prog_data = [] -+ -+def asm_line(sets, location, line): -+ global current_location, construct, nwarn_level -+ -+ prev_location = current_location -+ current_location = location -+ -+ try: -+ if construct != None: -+ if re_macro.match(line): -+ construct_stack.append(CONSTRUCT_MACRO) -+ elif re_if.match(line): -+ construct_stack.append(CONSTRUCT_IF) -+ elif re_rep.match(line): -+ construct_stack.append(CONSTRUCT_REP) -+ else: -+ else_m = line == '.else' -+ elif_m = re_elif.match(line) -+ if elif_m: -+ end_construct = CONSTRUCT_IF -+ else: -+ end_construct = { -+ '.endm': CONSTRUCT_MACRO, -+ '.else': CONSTRUCT_IF, -+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, -+ '.endr': CONSTRUCT_REP}.get(line) -+ if end_construct is not None: -+ end_construct &= construct_stack.pop() -+ if end_construct == 0: -+ if elif_m: -+ asm_error('unexpected .elif') -+ asm_error('unexpected %s' % line) -+ if len(construct_stack) == 0: -+ lines = construct -+ construct = None -+ if end_construct == CONSTRUCT_MACRO: -+ return -+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): -+ condition_if, condition_else = lines[0] -+ lines = lines[1:] -+ if condition_if: -+ for location, line in lines: -+ asm_line(sets, location, line) -+ if else_m: -+ construct = [(condition_else, False)] -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ if elif_m.group('set'): -+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) -+ else: -+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) -+ condition_else = condition_else and (not condition_if) -+ construct = [(condition_if, condition_else)] 
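-+                            # the .elif reopens the construct: condition_else
-+                            # stays False once any earlier branch has run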
-+ construct_stack.append(CONSTRUCT_IF) -+ return -+ if end_construct == CONSTRUCT_REP: -+ name, count = lines[0] -+ lines = lines[1:] -+ for i in xrange(count): -+ sets[name] = i -+ for location, line in lines: -+ asm_line(sets, location, line) -+ return -+ assert 0 -+ if else_m: -+ construct_stack.append(CONSTRUCT_ELSE) -+ elif elif_m: -+ construct_stack.append(CONSTRUCT_IF) -+ construct.append((current_location, line)) -+ return -+ -+ if line in ('.endm', '.else', '.endif', '.endr'): -+ asm_error('unexpected %s' % line) -+ if re_elif.match(line): -+ asm_error('unexpected .elif') -+ -+ m = re_macro.match(line) -+ if m: -+ construct = [] -+ construct_stack.append(CONSTRUCT_MACRO) -+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) -+ return -+ -+ m = re_if.match(line) -+ if m: -+ if m.group('set'): -+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) -+ else: -+ # not not forces condition to a bool (this matters if condition is -+ # something mutable like a list) -+ condition = not not arg_eval(m.group('condition'), sets) -+ construct = [(condition, not condition)] -+ construct_stack.append(CONSTRUCT_IF) -+ return -+ -+ m = re_rep.match(line) -+ if m: -+ count = arg_eval(m.group('count'), sets) -+ if not is_int(count): -+ asm_error('.rep count must be integer') -+ construct = [(m.group('name'), count)] -+ construct_stack.append(CONSTRUCT_REP) -+ return -+ -+ m = re_include.match(line) -+ if m: -+ filename = arg_eval(m.group('filename'), sets) -+ if not isinstance(filename, str): -+ asm_error('expected string') -+ asm_file(sets, '%s: %s' % (current_location, filename), filename) -+ return -+ -+ m = re_set.match(line) -+ if m: -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) -+ return -+ -+ m = re_unset.match(line) -+ if m: -+ name = m.group('name') -+ if name not in sets: -+ asm_error('%s not set' % name) -+ if name in arg_defs: # todo: see arg_eval -+ sets[name] = arg_defs[name] -+ else: -+ del sets[name] -+ return -+ -+ m = re_eval.match(line) -+ if m: -+ arg_eval(m.group('expr'), sets) -+ return -+ -+ m = re_print_info_warn_error.match(line) -+ if m: -+ def print_fn(message): -+ print message -+ def info_fn(message): -+ sys.stderr.write('%s\n' % message) -+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ -+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) -+ return -+ -+ m = re_assert.match(line) -+ if m: -+ if not arg_eval(m.group('condition'), sets): -+ asm_error('assertion failure: \'%s\'' % m.group('condition')) -+ return -+ -+ m = re_data.match(line) -+ if m: -+ size = int(m.group('size')) -+ for datum in smart_split(m.group('data')): -+ datum = arg_eval(datum, sets) -+ if not is_int(datum): -+ asm_error('datum must be integer') -+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) -+ return -+ -+ m = re_macro_inst.match(line) -+ if m: -+ name = m.group('name') -+ if name in macros: -+ params, lines = macros[name] -+ args = smart_split(m.group('args')) -+ if len(args) > len(params): -+ asm_error('too many arguments to macro') -+ sets = sets.copy() -+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) -+ for param in params[len(args):]: -+ if param in sets: -+ if param in arg_defs: # todo: see arg_eval -+ sets[param] = arg_defs[param] -+ else: -+ del sets[param] -+ for location, line in lines: -+ asm_line(sets, '%s: %s' % (current_location, location), line) -+ return -+ -+ if line == '.pushnwarn': -+ nwarn_level += 1 -+ 
return -+ if line == '.popnwarn': -+ if nwarn_level == 0: -+ asm_error('.popnwarn without .pushnwarn') -+ nwarn_level -= 1 -+ return -+ -+ # everything below assumes prog is up to date -+ asm_flush_prog_data() -+ -+ m = re_label.match(line) -+ if m: -+ name = m.group('name') -+ if name[0].isdigit(): -+ labels.setdefault(name, []).append(len(prog)) -+ else: -+ if name[0] == ':': -+ undecorated_name = name[1:] -+ else: -+ undecorated_name = name -+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): -+ asm_error('named label defined twice') -+ labels[name] = len(prog) -+ return -+ -+ annots = line.split('@') -+ ops = [op.strip() for op in annots[0].split(';')] -+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) -+ sig = get_sig(ops[-1]) -+ if sig != SIG_NORMAL: -+ ops = ops[:-1] -+ if len(ops) > 2: -+ asm_error('too many ops') -+ elif (len(ops) == 1) and (ops[0] == ''): -+ ops = [] -+ ops = (ops + ['nop', 'nop'])[:2] -+ m = re_op.match(ops[0]) -+ if not m: -+ asm_error('invalid syntax') -+ aop, aargs_n = get_aop(m.group('op')) -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ acond = get_bcond(m.group('cond')) -+ else: -+ acond = get_cond(m.group('cond')) -+ asf = get_setf(m.group('sf')) -+ aargs = smart_split(m.group('args')) -+ if len(aargs) != aargs_n: -+ asm_error('wrong operand count') -+ ard, ara, arb = (aargs + [None, None, None])[:3] -+ m = re_op.match(ops[1]) -+ if not m: -+ asm_error('invalid syntax') -+ mop, margs_n = get_mop(m.group('op')) -+ mcond = get_cond(m.group('cond')) -+ msf = get_setf(m.group('sf')) -+ margs = smart_split(m.group('args')) -+ if len(margs) != margs_n: -+ asm_error('wrong operand count') -+ mrd, mra, mrb = (margs + [None, None, None])[:3] -+ # eval srcs first so allocator can retire and reuse registers for dst -+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) -+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) -+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) -+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) -+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) -+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) -+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or -+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): -+ asm_error('cannot have 2 arguments with different rotations') -+ if aarmux is not None: -+ awrot = (awrot + aadrot) % 16 -+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 -+ if (awrot != 0) or awrot_r5: -+ asm_error('rotate not allowed on add write') -+ if marmux is not None: -+ mwrot = (mwrot + madrot) % 16 -+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 -+ -+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) -+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) -+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( -+ [aarpack, abrpack, marpack, mbrpack], -+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], -+ aop == AOP_FTOI, -+ [awpack, mwpack], -+ [afloatw, mop == MOP_FMUL]) -+ if forcebs[0]: -+ aarmux = RMUX_B -+ if forcebs[1]: -+ abrmux = RMUX_B -+ if forcebs[2]: -+ marmux = RMUX_B -+ if forcebs[3]: -+ mbrmux = RMUX_B -+ -+ # extend nops to 3 operands -+ if aop == AOP_NOP: -+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC -+ if mop == MOP_NOP: -+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 
0, RMUX_AC, 0, RMUX_AC -+ -+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) -+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): -+ if forcerafloat: -+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand -+ # instead of duplicating the 2nd operand, take the ra operand from -+ # the mul op thus forcing the ra value to be considered a float for -+ # the purposes of unpacking -+ if marmux == RMUX_A: -+ abraddr, abrmux = maraddr, marmux -+ else: -+ assert mbrmux == RMUX_A -+ abraddr, abrmux = mbraddr, mbrmux -+ else: -+ abraddr, abrmux = aaraddr, aarmux -+ else: -+ assert not forcerafloat # can only forcerafloat if we have an unused operand -+ -+ # handle write addrs -+ if (awmux == mwmux) and (awmux != WMUX_ANY): -+ asm_error('add/mul ops not allowed to write to same regfile') -+ ws = (awmux == WMUX_B) or (mwmux == WMUX_A) -+ -+ # handle branch -+ if (aop == AOP_BRA) or (aop == AOP_BRR): -+ # check setf -+ if asf: -+ asm_error('setf not allowed on bra/brr') -+ -+ # check pack/unpack -+ if (pack != 0) or (unpack != 0): -+ asm_error('pack/unpack not allowed with bra/brr') -+ -+ # handle read address -+ if aarmux == RMUX_LABEL: -+ if (aop == AOP_BRA) and aaraddr[1]: -+ asm_warning('bra with rel label') -+ if (aop == AOP_BRR) and (not aaraddr[1]): -+ asm_warning('brr with abs label') -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ if aarmux == RMUX_ANY: -+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A -+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): -+ asm_error('branch destination must be either label, immediate, or from regfile a') -+ if aarmux == RMUX_IMM: -+ imm = aaraddr -+ raddr = 0 # can't use RADDR_NOP -+ elif aarmux == RMUX_A: -+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): -+ asm_error('rotation of read from regfile a not allowed with branch') -+ if aop == AOP_BRR: -+ asm_warning('brr with ra') -+ imm = 0 -+ raddr = aaraddr[0] -+ else: -+ assert 0 -+ -+ # check mul op is nop -+ if mop != MOP_NOP: -+ asm_error('mul op not allowed with branch') -+ -+ # check sig -+ if sig != SIG_NORMAL: -+ asm_error('no signal allowed with branch') -+ -+ if raddr >= 32: -+ asm_error('can only branch to register locations in physical regfile') -+ if raddr & 1: -+ asm_warning('branch instruction will destroy flags (see hw-2780)') -+ -+ # construct branch instruction -+ prog.append((imm, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), -+ line, annots)) -+ -+ return -+ -+ # use COND_NEVER when possible (might save power / allow mul setf) -+ if not dict(annots).get('preserve_cond', 0): -+ if (awaddr == WADDR_NOP) and (not asf): -+ acond = COND_NEVER -+ if (mwaddr == WADDR_NOP) and (not msf): -+ mcond = COND_NEVER -+ -+ # attempt to convert movs to ldi -+ if (# no mul setf -+ (not msf) and -+ # ops must either be nop or mov of sema/label/imm/immv -+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and -+ # but we don't want 2 nops -+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and -+ # if both ops are movs, srcs must be identical -+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and -+ # no signal -+ (sig == SIG_NORMAL)): -+ # make sure aarmux/aaraddr contains the value -+ if aop != AOP_MOV: -+ aarmux = marmux 
-+ aaraddr = maraddr -+ -+ # convert immediate -+ if aarmux == RMUX_SEMA: -+ ldi_mode = LDI_SEMA -+ elif aarmux == RMUX_LABEL: -+ ldi_mode = LDI_32 -+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM -+ elif aarmux == RMUX_IMMV: -+ signed, unsigned = True, True -+ imm = 0 -+ for i, elem in enumerate(aaraddr): -+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): -+ signed = False -+ if elem not in (0, 1, 2, 3): -+ unsigned = False -+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) -+ if not (signed or unsigned): -+ asm_error('can\'t encode vector immediate') -+ if signed: -+ ldi_mode = LDI_EL_SIGNED -+ else: -+ ldi_mode = LDI_EL_UNSIGNED -+ aaraddr, aarmux = imm, RMUX_IMM -+ elif aarmux == RMUX_IMM: -+ ldi_mode = LDI_32 -+ else: -+ assert 0 -+ -+ # construct ldi instruction -+ prog.append((aaraddr, -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), -+ line, annots)) -+ -+ return -+ -+ # convert movs to alu ops -+ if aop == AOP_MOV: -+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): -+ aop = AOP_XOR -+ aaraddr, aarmux = 0, RMUX_AC -+ abraddr, abrmux = 0, RMUX_AC -+ else: -+ aop = AOP_OR -+ abraddr, abrmux = aaraddr, aarmux -+ if mop == MOP_MOV: -+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): -+ mop = MOP_V8SUBS -+ maraddr, marmux = 0, RMUX_AC -+ mbraddr, mbrmux = 0, RMUX_AC -+ else: -+ mop = MOP_V8MIN -+ mbraddr, mbrmux = maraddr, marmux -+ -+ # normal alu instruction... -+ -+ # handle setf -+ if asf and (aop == AOP_NOP): -+ asm_error('nop.setf is not allowed in add pipe') -+ if msf and (mop == MOP_NOP): -+ asm_warning('nop.setf, really?') -+ if (aop == AOP_NOP) or (acond == COND_NEVER): -+ sf = msf -+ else: -+ if msf: -+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') -+ sf = asf -+ -+ # handle read addrs -+ raddr_a = None -+ raddr_b = None -+ immb = False -+ arot_r5 = False -+ muxes = [0, 0, 0, 0] -+ if mwrot != 0: -+ raddr_b = 48 + mwrot -+ immb = True -+ if mwrot_r5 and have_am: -+ raddr_b = 48 -+ immb = True -+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last -+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): -+ if f(rmux): -+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) -+ add_a, add_b, mul_a, mul_b = muxes -+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): -+ # some output elements might not be as expected -+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): -+ bad_elems = 0xffff -+ else: -+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 -+ if mwrot > 12: -+ bad_elems ^= 0xffff -+ bad_elems &= dict(annots).get('mul_used', 0xffff) -+ if not msf: -+ if mwaddr == WADDR_NOP: -+ # not writing anywhere and not setting flags. no elements used -+ bad_elems = 0 -+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or -+ ((not ws) and (mwaddr == 37))): -+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ -+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. -+ # only use element 0 -+ bad_elems &= 0x0001 -+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or -+ ((not ws) and (mwaddr == 42))): -+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting -+ # flags. 
only use elements 0, 4, 8, and 12 -+ bad_elems &= 0x1111 -+ if bad_elems: -+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). output may not be as expected') -+ if raddr_a is None: -+ raddr_a = RADDR_NOP -+ if raddr_b is None: -+ raddr_b = RADDR_NOP -+ if immb: -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates and signal don\'t mix') -+ sig = SIG_SMALLIMMED -+ if arot_r5 or (mwrot_r5 and (not have_am)): -+ if sig != SIG_NORMAL: -+ asm_error('rotation/immediates/signal don\'t mix') -+ sig = SIG_ROTATE -+ -+ # construct instruction -+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), -+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), -+ line, annots)) -+ finally: -+ current_location = prev_location -+ -+def preprocess_passthrough(file): -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ yield line_number, line -+ -+def asm_file(sets, location, filename, preprocess = None): -+ global current_dir, current_location -+ -+ if filename is None: -+ location = '' -+ file = sys.stdin -+ -+ prev_dir = current_dir -+ else: -+ filename = os.path.normpath(os.path.join(current_dir, filename)) -+ -+ try: -+ file = open(filename) -+ except Exception, e: -+ asm_error(e) -+ except: -+ asm_error('unknown error while opening file %s' % filename) -+ -+ prev_dir = current_dir -+ current_dir = os.path.dirname(filename) -+ -+ prev_location = current_location -+ current_location = location -+ -+ if preprocess is None: -+ preprocess = preprocess_passthrough -+ -+ try: -+ for line_number, line in preprocess(file): -+ # strip off comments and whitespace -+ line = line.split('#')[0].strip() -+ if line == '': -+ continue -+ -+ asm_line(sets, '%s: %d' % (current_location, line_number), line) -+ finally: -+ current_dir = prev_dir -+ current_location = prev_location -+ -+def asm_end_prog(): -+ # check we aren't in a multi-line construct (eg .macro or .rep) -+ if construct != None: -+ asm_error({ -+ CONSTRUCT_MACRO: '.macro without .endm', -+ CONSTRUCT_IF: '.if/.elif without .endif', -+ CONSTRUCT_ELSE: '.else without .endif', -+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) -+ -+ # check no warnings level back to 0 -+ if nwarn_level != 0: -+ asm_error('.pushnwarn without .popnwarn') -+ -+ # flush queued up data -+ asm_flush_prog_data() -+ -+ # fixup all the label references we can -+ for pc in xrange(len(prog)): -+ if isinstance(prog[pc][0], tuple): -+ location, label, rel, offset = prog[pc][0] -+ if label[0].isdigit(): -+ label_pcs = labels.get(label[:-1], []) -+ if label[-1] == 'b': -+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] -+ else: -+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] -+ if label_pcs == []: -+ asm_error('search for label reached begin/end of file', location = location) -+ imm = label_pcs[0] -+ elif label in labels: -+ imm = labels[label] -+ elif (':' + label) in labels: -+ imm = labels[':' + label] -+ elif external_link: -+ continue # let the external linker deal with it -+ else: -+ asm_error('undefined label', location = location) -+ imm = (imm * 8) + offset -+ if rel: -+ imm -= (pc + 4) * 8 # relative to instruction after delay slots -+ imm &= (1 << 32) - 1 -+ else: -+ if not external_link: -+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) -+ imm = (location, label, rel, offset, imm) -+ prog[pc] = (imm,) + prog[pc][1:] -+ -+def asm_init(): -+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level -+ -+ current_dir = os.getcwd() -+ current_location = '' -+ prog = [] -+ prog_data = [] -+ macros = { -+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), -+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} -+ labels = {} -+ construct = None -+ construct_stack = [] -+ nwarn_level = 0 -+ -+def asm_reset_prog(): -+ global prog, labels -+ -+ prog = [] -+ labels = {} -+ -+############################################################################### -+# dumping -+############################################################################### -+ -+def print_lines(lines): -+ for line in lines: -+ print line -+ -+class dumper_t: -+ def external_link(self): return False -+ def begin(self): pass -+ def label(self, pc, name): pass -+ def line(self, pc, ls, ms, line, annots, first): pass -+ def end(self): pass -+ def sets(self, sets): pass -+ def direct(self, line): pass -+ -+class clif_dumper_t(dumper_t): -+ def __init__(self): -+ self.annot_mode = 0 -+ -+ def external_link(self): -+ return True -+ -+ def parse_annot_mode(self, line): -+ l = line.split(',') -+ self.annot_mode = int(l[0]) -+ if self.annot_mode not in (0, 1, 2): -+ asm_error('bad annot mode') -+ if self.annot_mode == 2: -+ if len(l) != 2: -+ asm_error('expected buffer name') -+ self.annot_name = l[1].strip() -+ self.annot_offset = 0 -+ elif len(l) != 1: -+ asm_error('unexpected comma') -+ -+ def label(self, pc, name): -+ if (self.annot_mode != 1) and (name[0] == ':'): -+ if self.annot_mode == 2: -+ name = name + '_annotations' -+ print '@label %s' % name[1:] -+ else: -+ print '// :%s' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if self.annot_mode == 0: -+ if isinstance(ls, tuple): -+ if len(ls) == 5: -+ location, label, rel, offset, offset_from_prog = ls -+ assert not rel -+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog)
-+                else:
-+                    location, label, rel, offset = ls
-+                    if rel:
-+                        asm_error('relative external label references not allowed in this mode', location = location)
-+                    ls = '[%s + %d]' % (label, offset)
-+            else:
-+                ls = '0x%08x' % ls
-+            print '%s 0x%08x // %s' % (ls, ms, line)
-+        elif self.annot_mode == 1:
-+            print '// %s' % line
-+            for annot in annots:
-+                print '0x%08x 0x%08x // %s' % ({
-+                    # todo: would rather not have these hard coded
-+                    'mul_used': 1,
-+                    'preserve_cond': 2,
-+                    'geomd_open': 3,
-+                    'geomd_i': 4,
-+                    'geomd_tris_clear': 5,
-+                    'geomd_verts': 6,
-+                    'geomd_tris_add': 7,
-+                    'geomd_tris_set_center': 8,
-+                    'geomd_region_clear': 9,
-+                    'geomd_region_set': 10,
-+                    'geomd_images_clear': 11,
-+                    'geomd_images_l': 12,
-+                    'geomd_images_b': 13,
-+                    'geomd_images_r': 14,
-+                    'geomd_images_t': 15,
-+                    'geomd_images_add_vpm': 16,
-+                    'trace_4c': 17,
-+                    'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0])
-+            if len(annots) != 0:
-+                print '0x00000000 // end'
-+        else:
-+            assert self.annot_mode == 2
-+            if len(annots) == 0:
-+                print '0x00000000 // %s' % line
-+            else:
-+                print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line)
-+                self.annot_offset += (len(annots) * 8) + 4
-+
-+    def direct(self, line):
-+        print line
-+
-+class plain_dumper_t(dumper_t):
-+    def line(self, pc, ls, ms, line, annots, first):
-+        print '0x%08x, 0x%08x, // %s' % (ls, ms, line)
-+
-+class c_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.header_name = header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        self.external_labels = set()
-+        self.lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print ' #include <stdint.h>'
-+        print ' /* cast through uintptr_t to avoid warnings */'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))'
-+        print '#else'
-+        print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))'
-+        print '#endif'
-+        print ''
-+        print '#ifdef __cplusplus'
-+        print 'extern "C" { /* the types are probably wrong... */'
-+        print '#endif'
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if isinstance(ls, tuple):
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog)
-+            else:
-+                location, label, rel, offset = ls
-+                if rel:
-+                    asm_error('relative external label references not allowed in this mode', location = location)
-+                if label not in self.external_labels:
-+                    self.external_labels.add(label)
-+                    print 'extern uint8_t %s[];' % label
-+                ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset)
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line))
-+
-+    def end(self):
-+        print '#ifdef __cplusplus'
-+        print '}'
-+        print '#endif'
-+        print ''
-+        print '#ifdef _MSC_VER'
-+        print '__declspec(align(8))'
-+        print '#elif defined(__GNUC__)'
-+        print '__attribute__((aligned(8)))'
-+        print '#endif'
-+        print 'unsigned int %s[] = {' % self.array_name
-+        print_lines(self.lines)
-+        print '};'
-+        print '#ifdef __HIGHC__'
-+        print '#pragma Align_to(8, %s)' % self.array_name
-+        print '#endif'
-+
-+class c_h_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, array_name):
-+        self.full_header_name = full_header_name
-+        self.array_name = array_name
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        print '#ifndef %s_H' % self.full_header_name
-+        print '#define %s_H' % self.full_header_name
-+        print ''
-+        print 'extern unsigned int %s[];' % self.array_name
-+        print ''
-+
-+    def label(self, pc, name):
-+        if name[0] == ':':
-+            print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2)
-+
-+    def end(self):
-+        print ''
-+        print '#endif'
-+
-+class ml_c_dumper_t(dumper_t):
-+    def __init__(self, header_name, full_header_name, name, annots):
-+        self.header_name = header_name
-+        self.name = name
-+        self.annots = annots
-+
-+    def external_link(self):
-+        return True
-+
-+    def begin(self):
-+        if self.annots:
-+            self.annot_lines = []
-+        self.lines = []
-+        self.external_labels = set()
-+        self.link_lines = []
-+
-+        print '#include "%s.h"' % self.header_name
-+        print '#include <assert.h>'
-+        if self.annots:
-+            print '#ifdef SIMPENROSE'
-+            print '#include <stddef.h>'
-+            print '#include "v3d/verification/tools/2760sim/simpenrose.h"'
-+        print ''
-+
-+    def label(self, pc, name):
-+        self.lines.append('// :%s' % name)
-+
-+    def line(self, pc, ls, ms, line, annots, first):
-+        if self.annots:
-+            if len(annots) == 0:
-+                self.annot_lines.append('NULL,')
-+            else:
-+                print 'static unsigned int const annotations_%d[] = {' % pc
-+                for annot in annots:
-+                    print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])
-+                print ' SIMPENROSE_SHADER_ANNOTATION_END};'
-+                print ''
-+                self.annot_lines.append('annotations_%d,' % pc)
-+        if isinstance(ls, tuple):
-+            self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2))
-+            if len(ls) == 5:
-+                location, label, rel, offset, offset_from_prog = ls
-+                assert not rel
-+                self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog))
-+            else:
-+                location, label, rel, offset = ls
-+                self.external_labels.add(label)
-+                if rel:
-+                    self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8))
-+                else:
-+                    self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset))
-+            ls = '0xdeadbeef'
-+        else:
-+            ls = '0x%08x' % ls
-+        self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc 
* 8, ls, ms, line)) -+ -+ def end(self): -+ if self.annots: -+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name -+ print_lines(self.annot_lines) -+ print '};' -+ print '#endif' -+ print '' -+ print 'static unsigned int const array[] = {' -+ print_lines(self.lines) -+ print '};' -+ print '' -+ print 'void %s_link(void *p_in, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' )' -+ print '{' -+ print ' unsigned int *p = (unsigned int *)p_in;' -+ print ' unsigned int i;' -+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() -+ print ' p[i] = array[i];' -+ print ' }' -+ print_lines(self.link_lines) -+ print '}' -+ -+class ml_h_dumper_t(dumper_t): -+ def __init__(self, header_name, full_header_name, name, annots): -+ self.full_header_name = full_header_name -+ self.name = name -+ self.annots = annots -+ -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ self.external_labels = set() -+ self.lines_n = 0 -+ -+ print '#ifndef %s_H' % self.full_header_name -+ print '#define %s_H' % self.full_header_name -+ print '' -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name -+ print '#endif' -+ print '' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) -+ if self.annots: -+ print '#ifdef SIMPENROSE' -+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) -+ print '#endif' -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple) and (len(ls) != 5): -+ self.external_labels.add(ls[1]) -+ self.lines_n += 1 -+ -+ def end(self): -+ print '' -+ print 'extern void %s_link(void *p, unsigned int base' % self.name -+ for label in sorted(self.external_labels): -+ print ' , unsigned int %s' % label -+ print ' );' -+ print '' -+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) -+ print '' -+ print '#endif' -+ -+def print_lines_lc(lines): -+ for line in lines: -+ print '%s \\' % line -+ -+def print_groups_lc(groups): -+ first = True -+ for group in groups: -+ if first: -+ print '{ \\' -+ else: -+ print ', { \\' -+ print_lines_lc(group) -+ print '} \\' -+ first = False -+ -+class inline_c_dumper_t(dumper_t): -+ def __init__(self, annots): -+ self.annots = annots -+ self.iteration = False -+ -+ def begin_iteration(self): -+ assert not self.iteration -+ self.iteration = True -+ self.iteration_lines = [] -+ if self.annots: -+ self.iteration_annot_lines = [] -+ self.annot_arrs = [] -+ -+ def end_iteration(self): -+ assert self.iteration -+ self.iteration = False -+ print '%d, \\' % self.iteration_n -+ if self.annots: -+ print '( \\' -+ print_groups_lc(self.iteration_lines) -+ if self.annots: -+ print '), ( \\' -+ print_groups_lc(self.iteration_annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def begin(self): -+ self.n = 0 -+ self.lines = [] -+ if self.annots: -+ self.annot_lines = [] -+ if not self.iteration: -+ self.annot_arrs = [] -+ -+ def label(self, pc, name): -+ self.lines.append('/* :%s */' % name) -+ if self.annots: -+ self.annot_lines.append('/* :%s */' % name) -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ self.n += 1 -+ if first: -+ prefix = '' -+ else: -+ prefix = ', ' -+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) -+ if self.annots: -+ if len(annots) == 
0: -+ a = 'NULL' -+ else: -+ a = 'annotations_%d' % len(self.annot_arrs) -+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] -+ for annot in annots: -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) -+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') -+ self.annot_arrs.append(annot_arr) -+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) -+ -+ def end(self): -+ if self.iteration: -+ if len(self.iteration_lines) == 0: -+ self.iteration_n = self.n -+ elif self.iteration_n != self.n: -+ asm_error('number of instructions differs between iterations') -+ self.iteration_lines.append(self.lines) -+ if self.annots: -+ self.iteration_annot_lines.append(self.annot_lines) -+ else: -+ if self.annots: -+ print '( \\' -+ print_lines_lc(self.lines) -+ if self.annots: -+ print '), ( \\' -+ print_lines_lc(self.annot_lines) -+ print '), ( \\' -+ for annot_arr in self.annot_arrs: -+ print_lines_lc(annot_arr) -+ print ') \\' -+ -+ def direct(self, line): -+ print line -+ -+class asvc_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '.align 8' -+ -+ def label(self, pc, name): -+ if name[0] == ':': -+ print '%s::' % name[1:] -+ else: -+ print '%s:' % name -+ -+ def line(self, pc, ls, ms, line, annots, first): -+ if isinstance(ls, tuple): -+ location, label, rel, offset = ls[:4] -+ if rel: -+ ls = '%s + %d - (. + 32)' % (label, offset) -+ else: -+ ls = '%s + %d' % (label, offset) -+ else: -+ ls = '0x%08x' % ls -+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) -+ -+def is_ra_or_rb(val): -+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) -+ -+class aliases_dumper_t(dumper_t): -+ def external_link(self): -+ return True -+ -+ def begin(self): -+ print '#ifndef JUST_DQASM_ARGS' -+ -+ def label(self, pc, name): -+ if not name[0].isdigit(): -+ if name[0] == ':': -+ name = name[1:] -+ print '"bs%s", "bs%x",' % (name, pc * 8) -+ print '"bu%s", "bu%x",' % (name, pc * 8) -+ -+ def end(self): -+ print '#endif' -+ -+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm -+ def sets(self, sets): -+ dqasm_args = [] -+ print '#ifndef JUST_DQASM_ARGS' -+ for name in sets: -+ if is_ra_or_rb(sets[name]): -+ dqasm_args.append('-r%s=%s' % (sets[name], name)) -+ print '"%s", "%s",' % (name, sets[name]) -+ elif isinstance(sets[name], list): -+ for i, val in enumerate(sets[name]): -+ if is_ra_or_rb(val): -+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) -+ print '"%s[%d]", "%s",' % (name, i, val) -+ print '#endif' -+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) -+ -+def dump(dumper): -+ if (len(prog) != 0) or (len(labels) != 0): -+ dumper.begin() -+ -+ sorted_labels = [] -+ for name in labels: -+ if name[0].isdigit(): -+ for pc in labels[name]: -+ sorted_labels.append((pc, name)) -+ else: -+ sorted_labels.append((labels[name], name)) -+ sorted_labels.sort(reverse = True) -+ -+ first = True -+ for pc in xrange(len(prog)): -+ ls, ms, line, annots = prog[pc] -+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): -+ dumper.label(*sorted_labels.pop()) -+ dumper.line(pc, ls, ms, line, annots, first) -+ first = False -+ for sorted_label in sorted_labels: -+ assert sorted_label[0] == len(prog) -+ dumper.label(*sorted_label) -+ -+ dumper.end() -+ -+############################################################################### -+# preprocessing -+############################################################################### -+ -+def preprocess_inline_c(dumper): -+ def preprocess(file): -+ ls = None -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ while True: -+ if ls is None: -+ l = line.split('%[', 1) -+ if len(l) == 1: -+ dumper.direct(l[0].rstrip()) -+ break -+ dumper.direct('%s \\' % l[0].rstrip()) -+ line = l[1] -+ ls = [] -+ else: -+ l = line.split('%]', 1) -+ ls.append((line_number, l[0])) -+ if len(l) == 1: -+ break -+ line = l[1] -+ l = ls[-1][1].split('%|', 1) -+ if len(l) == 1: -+ for l_number, l in ls: -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ else: -+ ls[-1] = (ls[-1][0], l[0]) -+ if hasattr(dumper, 'begin_iteration'): -+ dumper.begin_iteration() -+ for repls in l[1].split('%,'): -+ repls = [repl.strip() for repl in repls.split('%/')] -+ for l_number, l in ls: -+ for i, repl in enumerate(repls): -+ l = l.replace('%' + str(i), repl) -+ yield l_number, l -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if hasattr(dumper, 'end_iteration'): -+ dumper.end_iteration() -+ ls = None -+ return preprocess -+ -+def preprocess_clif(dumper): -+ def preprocess(file): -+ in_asm = False -+ line_number = 0 -+ for line in file: -+ line_number += 1 -+ if in_asm: -+ if line.strip() == '%]': -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ in_asm = False -+ else: -+ yield line_number, line -+ else: -+ if line.strip() == '%[': -+ in_asm = True -+ elif (line[:1] == '%') and (line[:2] != '%@'): -+ yield line_number, line[1:] -+ else: -+ asm_end_prog() -+ dump(dumper) -+ asm_reset_prog() -+ if line[:2] == '%@': -+ if hasattr(dumper, 'parse_annot_mode'): -+ dumper.parse_annot_mode(line[2:]) -+ else: -+ dumper.direct(line.rstrip()) -+ return preprocess -+ -+############################################################################### -+# main -+############################################################################### ++import string ++import os ++import tempfile ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++class tstats: ++ close_threshold = 0.01 ++ ++ def __init__(self, stats_dict=None): ++ if 
stats_dict != None: ++ self.name = stats_dict["name"] ++ self.elapsed = float(stats_dict["elapsed"]) ++ self.user = float(stats_dict["user"]) ++ self.sys = float(stats_dict["sys"]) ++ ++ def times_str(self): ++ ctime = self.sys + self.user ++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) ++ ++ def dict(self): ++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} ++ ++ def is_close(self, other): ++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold ++ ++ def __lt__(self, other): ++ return self.elapsed < other.elapsed ++ def __gt__(self, other): ++ return self.elapsed > other.elapsed ++ ++ def time_file(name, prefix): ++ stats = tstats() ++ stats.name = name ++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, ++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); ++ pinfo = os.wait4(cproc.pid, 0) ++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); ++ stats.elapsed = end_time - start_time ++ stats.user = pinfo[2].ru_utime ++ stats.sys = pinfo[2].ru_stime ++ return stats ++ ++ ++def common_prefix(s1, s2): ++ for i in range(min(len(s1),len(s2))): ++ if s1[i] != s2[i]: ++ return s1[:i] ++ return s1[:i+1] + +def main(): -+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 -+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate ++ global flog + -+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work ++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" ++To blank the screen before starting use "xdg-screensaver activate" ++(For some reason this doesn't seem to work from within python). 
++""") + -+ # parse command line -+ parser = optparse.OptionParser(usage = 'usage: %prog [options] ') -+ parser.add_option('-m', '--mode', dest = 'mode', -+ help = ' should be clif, plain, ' + -+ 'c_c:,,, ' + -+ 'c_h:,,, ' + -+ 'ml_c:,,[,annots], ' + -+ 'ml_h:,,[,annots], ' + -+ 'inline_c[:annots], asvc, or aliases[:]', metavar = '') -+ parser.add_option('-t', '--target', dest = 'target', -+ help = ' should be a0, b0, or hera', metavar = '') -+ parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False) -+ parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False) -+ parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False) -+ parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False) -+ parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '=') -+ options, args = parser.parse_args() -+ if len(args) == 0: -+ filename = None -+ elif len(args) == 1: -+ filename = args[0] -+ else: -+ parser.print_help() -+ sys.exit(-1) ++ argp.add_argument("streams", nargs='*') ++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") ++ argp.add_argument("--csv_in", help="CSV input filename") ++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") + -+ # handle mode -+ mode = options.mode or 'clif' # assume clif if no mode specified -+ if mode == 'clif': -+ dumper = clif_dumper_t() -+ preprocess = preprocess_clif(dumper) -+ elif mode == 'plain': -+ dumper = plain_dumper_t() -+ preprocess = None -+ elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'): -+ mode_options = mode[4:].split(',') -+ if len(mode_options) != 3: -+ asm_error('badly formatted mode on command line') -+ dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options) -+ preprocess = None -+ elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'): -+ mode_options = mode[5:].split(',') -+ if (len(mode_options) != 3) and ((len(mode_options) != 4) or (mode_options[3] != 'annots')): -+ asm_error('badly formatted mode on command line') -+ dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t -+ }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4])) -+ preprocess = None -+ elif mode == 'inline_c': -+ dumper = inline_c_dumper_t(False) -+ preprocess = preprocess_inline_c(dumper) -+ elif mode == 'inline_c:annots': -+ dumper = inline_c_dumper_t(True) -+ preprocess = preprocess_inline_c(dumper) -+ elif mode == 'asvc': -+ dumper = asvc_dumper_t() -+ preprocess = None -+ elif mode == 'aliases': -+ dumper = aliases_dumper_t() -+ preprocess = None -+ elif mode == 'aliases:inline_c': -+ dumper = aliases_dumper_t() -+ preprocess = preprocess_inline_c(dumper) -+ else: -+ asm_error('invalid mode') -+ external_link = dumper.external_link() ++ args = argp.parse_args() + -+ # handle target -+ target = options.target or 'b0' # assume b0 if no target specified -+ if target == 'a0': -+ have_sema = False -+ have_am = False -+ mulw_rotate = False -+ have_lthrsw = False -+ elif target == 'b0': -+ have_sema = True -+ have_am = True -+ mulw_rotate = True -+ have_lthrsw = True -+ elif target == 'hera': -+ have_sema = True -+ have_am = False -+ mulw_rotate = True -+ have_lthrsw = True -+ else: -+ asm_error('invalid target') -+ if have_am: -+ sigs['loadam'] = SIG_LOADAM -+ arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, 
RW_WRITE) -+ if have_lthrsw: -+ sigs['lthrsw'] = SIG_LTHRSW -+ del sigs['int'] -+ arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE) ++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) ++ csv_out.writeheader() + -+ # handle misc options -+ allow_xor_0 = options.allow_xor_0 -+ dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5 -+ warnings_are_errors = options.warnings_are_errors -+ disable_warnings = options.disable_warnings ++ stats_in = {} ++ if args.csv_in != None: ++ with open(args.csv_in, 'r', newline='') as f_in: ++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} + -+ # make options visible to asm -+ arg_defs['mode'] = mode -+ arg_defs['target'] = target ++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") + -+ # arg_defs all setup at this point -+ sets = arg_defs.copy() # todo: see arg_eval ++ streams = args.streams ++ if not streams: ++ if not stats_in: ++ print ("No source streams specified") ++ return 1 ++ prefix = "" if args.prefix == None else args.prefix ++ streams = [k for k in stats_in] ++ elif args.prefix != None: ++ prefix = args.prefix ++ else: ++ prefix = streams[0] ++ for f in streams[1:]: ++ prefix = common_prefix(prefix, f) ++ pp = prefix.rpartition(os.sep) ++ prefix = pp[0] + pp[1] ++ streams = [s[len(prefix):] for s in streams] + -+ # handle command line sets -+ re_options_set = re.compile('(?P\\w+)=(?P.+)$') -+ for options_set in options.sets: -+ m = re_options_set.match(options_set) -+ if not m: -+ asm_error('badly formatted set on command line') -+ sets[m.group('name')] = arg_eval(m.group('val'), sets) ++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): ++ print ("====", f) ++ ++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) ++ for i in range(3): ++ t = tstats.time_file(f, prefix) ++ print ("...", t.times_str()) ++ if t0 > t: ++ t0 = t ++ ++ if t0.name in stats_in: ++ pstat = stats_in[t0.name] ++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) ++ ++ csv_out.writerow(t0.dict()) ++ ++ print () ++ ++ return 0 + -+ # assemble input file and dump -+ asm_file(sets, filename, filename, preprocess) -+ asm_end_prog() -+ dump(dumper) -+ for name in arg_defs: # todo: see arg_eval -+ del sets[name] -+ dumper.sets(sets) + +if __name__ == '__main__': -+ main() ++ exit(main()) ++ +diff --git a/pi-util/make_array.py b/pi-util/make_array.py +new file mode 100755 +index 0000000000..864fa5e704 +--- /dev/null ++++ b/pi-util/make_array.py +@@ -0,0 +1,19 @@ ++#!/usr/bin/env python ++ ++# Usage ++# make_array file.bin ++# Produces file.h with array of bytes. 
++# ++import sys ++for file in sys.argv[1:]: ++ prefix,suffix = file.split('.') ++ assert suffix=='bin' ++ name=prefix.split('/')[-1] ++ print 'Converting',file ++ with open(prefix+'.h','wb') as out: ++ print >>out, 'static const unsigned char',name,'[] = {' ++ with open(file,'rb') as fd: ++ for byte in fd.read(): ++ print >>out, '%d,' % ord(byte) ++ print >>out,'};' ++ diff --git a/pi-util/qem.sh b/pi-util/qem.sh -new file mode 100644 -index 0000000..47dd071 +new file mode 100755 +index 0000000000..5ce2eeaf72 --- /dev/null +++ b/pi-util/qem.sh @@ -0,0 +1,9 @@ +TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex -+QASM=python\ pi-util/qasm.py ++QASM=python\ ../local/bin/qasm.py +SRC_FILE=libavcodec/rpi_shader.qasm +DST_BASE=shader + @@ -21696,101 +30240,9 @@ index 0000000..47dd071 +$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h + -diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py -new file mode 100755 -index 0000000..6a9a33f ---- /dev/null -+++ b/pi-util/rebase_liblinks.py -@@ -0,0 +1,37 @@ -+#!/usr/bin/env python -+ -+import os, sys -+from stat import * -+ -+def walktree(top, callback, n, prefix): -+ '''recursively descend the directory tree rooted at top, -+ calling the callback function for each regular file''' -+ -+ for f in os.listdir(top): -+ pathname = os.path.join(top, f) -+ mode = os.lstat(pathname).st_mode -+ if S_ISDIR(mode): -+ # It's a directory, recurse into it -+ walktree(pathname, callback, n+1, prefix) -+ elif S_ISLNK(mode): -+ # It's a file, call the callback function -+ callback(pathname, os.readlink(pathname), n, prefix) -+ -+def visitfile(file, linkname, n, prefix): -+ if (linkname.startswith(prefix + 'lib/')): -+ newlink = "../" * n + linkname[len(prefix):] -+ print 'relinking', file, "->", newlink -+ os.remove(file) -+ os.symlink(newlink, file) -+ -+if __name__ == '__main__': -+ argc = len(sys.argv) -+ if argc == 2: -+ walktree(sys.argv[1], visitfile, 0, "/") -+ elif argc == 3: -+ walktree(sys.argv[1], visitfile, 0, sys.argv[2]) -+ else: -+ print "rebase_liblinks.py []" -+ -+ -+ -diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh -new file mode 100755 -index 0000000..d8bdd91 ---- /dev/null -+++ b/pi-util/syncroot.sh -@@ -0,0 +1,43 @@ -+set -e -+ -+if [ "$1" == "" ]; then -+ echo Usage: $0 \ [\] -+ echo src_dir is a source for rsync so may contain m/c name. 
-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing -+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 -+ exit 1 -+fi -+ -+SYSROOT_NAME=$2 -+if [ "$SYSROOT_NAME" == "" ]; then -+ SYSROOT_NAME=raspian_jessie_pi1 -+fi -+ -+DST_ROOT=`pwd` -+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot -+SRC=$1 -+ -+echo Sync src: $SRC -+echo Sync dest: $DST -+ -+mkdir -p $DST/lib -+mkdir -p $DST/opt/vc/include -+mkdir -p $DST/usr/lib/pkgconfig -+mkdir -p $DST/usr/bin -+mkdir -p $DST/usr/share -+ -+#### MUST NOT include /opt/vc/include/*GL* -+# Creates conflicts with GL includes inside Chrome -+ -+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib -+rsync -rl $SRC/opt/vc/lib $DST/opt/vc -+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include -+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include -+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib -+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib -+rsync -rl $SRC/usr/include $DST/usr -+ -+pi-util/rebase_liblinks.py $DST -+ -+ diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py -new file mode 100644 -index 0000000..5935a11 +new file mode 100755 +index 0000000000..5935a11ca5 --- /dev/null +++ b/pi-util/v3dusage.py @@ -0,0 +1,128 @@ diff --git a/packages/tools/bcm2835-bootloader/package.mk b/packages/tools/bcm2835-bootloader/package.mk index 49c28979d3..950193f90f 100644 --- a/packages/tools/bcm2835-bootloader/package.mk +++ b/packages/tools/bcm2835-bootloader/package.mk @@ -17,7 +17,7 @@ ################################################################################ PKG_NAME="bcm2835-bootloader" -PKG_VERSION="756dd85" +PKG_VERSION="ab5eb99" PKG_ARCH="arm" PKG_LICENSE="nonfree" PKG_SITE="http://www.broadcom.com" diff --git a/projects/RPi/patches/kodi/kodi-001-backport.patch b/projects/RPi/patches/kodi/kodi-001-backport.patch index ed3b647051..2f28a65810 100644 --- a/projects/RPi/patches/kodi/kodi-001-backport.patch +++ b/projects/RPi/patches/kodi/kodi-001-backport.patch @@ -1,7 +1,7 @@ -From 6cebd3b7186d58ee1dd14263f532f9a8c6f005bd Mon Sep 17 00:00:00 2001 +From 9eeffaec4cf147576df92e0e97cd9fd8ca248c53 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 28 Oct 2014 00:19:40 +0000 -Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats +Subject: [PATCH 01/78] [cec] Add settings for configuring button repeats --- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++ @@ -10,10 +10,10 @@ Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index e0060d1fae556de529274dbc6be07455701573a3..6443f3dd885bf0aa8e031039e36e273972a310ae 100644 +index 9009023f4d39d10b180cdbe981c0329cc3a3c3b2..7735c3cb0f010bb824896f5fb70ff28e9548b5ac 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19745,3 +19745,18 @@ msgstr "" +@@ -19787,3 +19787,18 @@ msgstr "" msgctxt "#39010" msgid "Select sort method" msgstr "" @@ -48,7 +48,7 @@ index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df06 diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644 +index 
c78d1c206c14ea6d7ee92cd7fd03fbc62f0fd1d5..88289b3cbabacbe51aab3ab2ed0e1f2d46b5be79 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu @@ -82,10 +82,10 @@ index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e82 if (GetSettingBool("pause_playback_on_deactivate")) { -From 0fdeeb63794764ebdd628e52d170bf8bac330efd Mon Sep 17 00:00:00 2001 +From 527184b27a880ec5bc2722d8c1e3075416889818 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 26 Apr 2014 17:27:52 +0100 -Subject: [PATCH 02/75] [cec] Don't suspend pi on tv switch off - it can't wake +Subject: [PATCH 02/78] [cec] Don't suspend pi on tv switch off - it can't wake up --- @@ -106,10 +106,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325 -From 36f4544b7ac9c810c875e8ae19ab92b3f3dafb59 Mon Sep 17 00:00:00 2001 +From 9bc5c32ef31ccd55b48689b7287cf5afa003514f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 7 Apr 2014 18:19:32 +0100 -Subject: [PATCH 03/75] [rbp/omxplayer] When opening a stream don't try to +Subject: [PATCH 03/78] [rbp/omxplayer] When opening a stream don't try to update gui so often --- @@ -133,10 +133,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f dialog->ProcessRenderLoop(false); if (allowCancel && dialog->IsCanceled()) -From 2be0471046b5e75078f1a284348b3d2fbd033555 Mon Sep 17 00:00:00 2001 +From eb6f9850358766675b79c8724d9f645ac8d9c280 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 8 Mar 2014 15:36:06 +0000 -Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958 +Subject: [PATCH 04/78] [hifiberry] Hack: force it to be recognised as IEC958 capable to enable passthrough options --- @@ -144,10 +144,10 @@ Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958 1 file changed, 4 insertions(+) diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp -index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5ca4f62f5 100644 +index 4d87afa2c94c4e18e8001b9c105e0b5e6cc379d8..274000806ed7dc43130f4282cc0aedb3ae4ee209 100644 --- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp +++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp -@@ -1351,6 +1351,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev +@@ -1356,6 +1356,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev if (snd_card_get_name(cardNr, &cardName) == 0) info.m_displayName = cardName; @@ -159,10 +159,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5 info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI") { -From e2b718b239b65f2132406355dfdf9c66da744b9c Mon Sep 17 00:00:00 2001 +From 3d01a30955c492a7992442dd493eecbfb2f4a4c6 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 1 May 2014 16:28:39 +0100 -Subject: [PATCH 05/75] Improved file buffering in CArchive +Subject: [PATCH 05/78] Improved file buffering in CArchive Even though memcpy is typically inlined by the compiler into byte/word loads and stores (at least for release builds), the frequency with which 1, 2 and 4 @@ -222,10 +222,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a } -From e59492cefc6ebc66027e7fb96475f14ad14a650c Mon Sep 17 00:00:00 2001 +From 
3db1ae9a40311ab19b2b31a6b311b1e9e95db224 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 10 Aug 2014 16:45:16 +0100 -Subject: [PATCH 06/75] filesystem: Make support of browsing into archives +Subject: [PATCH 06/78] filesystem: Make support of browsing into archives optional The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices. @@ -244,10 +244,10 @@ We'll let people who don't use archives disable it manually 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 6443f3dd885bf0aa8e031039e36e273972a310ae..7dfc5355cc0d85d94360ba21bc738733e4878f3d 100644 +index 7735c3cb0f010bb824896f5fb70ff28e9548b5ac..6b275ba8dba63f6d09a92c92a4d599af8ef46bec 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19388,6 +19388,15 @@ msgstr "" +@@ -19430,6 +19430,15 @@ msgstr "" #: system/settings/rbp.xml msgctxt "#38010" msgid "GPU accelerated" @@ -284,10 +284,10 @@ index 62e9c8ed2199f8c57a640b06b0216ee4c8f0ca1e..e8b0d3d472b02fd161a4b51e957b9129 + diff --git a/xbmc/Util.cpp b/xbmc/Util.cpp -index c3567941192c724f2600494a8d7e355584b57b52..da1508dcedbd196789988d895e64548a08439d8f 100644 +index ca99cf148057f44883ca6be08d340956bbe40f80..4fdac55278ee3a7e4c88f038bb6a39ddb54211cd 100644 --- a/xbmc/Util.cpp +++ b/xbmc/Util.cpp -@@ -1899,7 +1899,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, +@@ -1904,7 +1904,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, URIUtils::RemoveExtension(strCandidate); if (StringUtils::StartsWithNoCase(strCandidate, videoName)) { @@ -296,7 +296,7 @@ index c3567941192c724f2600494a8d7e355584b57b52..da1508dcedbd196789988d895e64548a CUtil::ScanArchiveForAssociatedItems(pItem->GetPath(), "", item_exts, associatedFiles); else { -@@ -1909,7 +1909,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, +@@ -1914,7 +1914,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, } else { @@ -335,10 +335,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7 { CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url); -From 73698542aed16c452fc15f5cd5a438e127676b68 Mon Sep 17 00:00:00 2001 +From d32758d3aef8d023416c0911983901fb85912bfc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Oct 2014 13:06:57 +0000 -Subject: [PATCH 07/75] [rbp] Make cachemembuffersize default depend on memory +Subject: [PATCH 07/78] [rbp] Make cachemembuffersize default depend on memory size --- @@ -402,7 +402,7 @@ index a35a509a91483f13e2cf0e688fc7e9528f254290..fffa5182126159f6dfcf750b21fa0464 void Deinitialize(); int GetArmMem() { return m_arm_mem; } diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176abebd27c 100644 +index aa802635ba3c295bd5d425af204e9ea98dee0a17..96021d579fe144d0050a7bb813e7a0dbc9d3c804 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -50,6 +50,9 @@ @@ -440,10 +440,10 @@ index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176 } -From 48eb57a16b9d386dc54b42ab04700f8f7f85fab9 Mon Sep 17 00:00:00 2001 +From d31cd7eb3c58e0478dd1f388162aa3c665cc918b Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 30 May 2014 14:58:43 +0100 -Subject: [PATCH 08/75] 
[settings] Experiment: Report DESKTOP resolution in +Subject: [PATCH 08/78] [settings] Experiment: Report DESKTOP resolution in video settings --- @@ -465,10 +465,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9 StringUtils::Format("%dx%d%s", resolution->width, resolution->height, ModeFlagsToString(resolution->flags, false).c_str()), -From 952474c036385667d8ec894c178f58490af6f69c Mon Sep 17 00:00:00 2001 +From 883c180b3e1e5faf2391e2a5770a20c086608893 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 24 Sep 2014 23:13:52 +0100 -Subject: [PATCH 09/75] [audio] Add settings option to boost centre channel +Subject: [PATCH 09/78] [audio] Add settings option to boost centre channel when downmixing This allows a dB volume increase to be added to centre channel. @@ -486,10 +486,10 @@ Should work with Pi Sink (dvdplayer/paplayer) and omxplayer 5 files changed, 46 insertions(+) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390ada04bca8 100644 +index 6b275ba8dba63f6d09a92c92a4d599af8ef46bec..a1b537ff0d3b1d72f0b4e122c93f2e9fbfc4f4ac 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19608,6 +19608,21 @@ msgstr "" +@@ -19650,6 +19650,21 @@ msgstr "" #empty strings from id 38062 to 38099 @@ -512,10 +512,10 @@ index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390a #: system/settings/settings.xml msgctxt "#38100" diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 301e7276e5b79e00457db1f33b1cd576bdef4c85..5f1f3ca48342ef1a4eeed7432221d7b2dda354e8 100644 +index 5ff71b9741c5d4d3c555042929e6764f3c6426da..536c2881d73e36ebb42ef495b426fc3fc34ba8ee 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml -@@ -2358,6 +2358,18 @@ +@@ -2374,6 +2374,18 @@ @@ -594,10 +594,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5 // stereo upmix if (upmix && m_src_channels == 2 && m_dst_channels > 2) -From 1296ca8ae16f160bd8bdf00491582f94577122c5 Mon Sep 17 00:00:00 2001 +From a11f649848327bd03eaed9224112c14a59e092cc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Oct 2014 15:23:51 +0000 -Subject: [PATCH 10/75] [rbp] Default extract thumbnails to false +Subject: [PATCH 10/78] [rbp] Default extract thumbnails to false It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors. It genereates a lot of support issues. 
Best to default to disabled and let users enable it if they must @@ -623,10 +623,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc -From 221907efb819c990488518eb9c4b7cfd91151e4e Mon Sep 17 00:00:00 2001 +From 1917960dc4fd495cb2b180d8a36235b6a1879773 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 27 Nov 2014 16:31:56 +0000 -Subject: [PATCH 11/75] [languageinvoker] Reduce priority of python threads +Subject: [PATCH 11/78] [languageinvoker] Reduce priority of python threads --- xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++ @@ -649,10 +649,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1 } -From cf222655784da191a022a153fa5614cfbb4d79bd Mon Sep 17 00:00:00 2001 +From e9010a2ae44ca3ea0175a19721fd2dcd010e3019 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 29 Nov 2014 15:25:16 +0000 -Subject: [PATCH 12/75] [rbp] hack: wait for splash to complete before changing +Subject: [PATCH 12/78] [rbp] hack: wait for splash to complete before changing hdmi mode --- @@ -736,10 +736,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928 RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode(); -From 7c77d589e065637bb0644889b520f3902b44b880 Mon Sep 17 00:00:00 2001 +From cc4bfd3f49bf4829e781fdc0a01743c9fa3927f5 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 11 Dec 2014 17:00:57 +0000 -Subject: [PATCH 13/75] Fix for UI not showing both extractflags and +Subject: [PATCH 13/78] Fix for UI not showing both extractflags and extractthumb --- @@ -748,7 +748,7 @@ Subject: [PATCH 13/75] Fix for UI not showing both extractflags and 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7 100644 +index a1b537ff0d3b1d72f0b4e122c93f2e9fbfc4f4ac..78ef8335f01cf1b023416a536155fdb5f3f62458 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po @@ -12451,7 +12451,7 @@ msgstr "" @@ -778,7 +778,7 @@ index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea msgstr "" #: system/settings/settings.xml -@@ -19784,3 +19784,7 @@ msgstr "" +@@ -19826,3 +19826,7 @@ msgstr "" msgctxt "#38052" msgid "Remote button press release time (ms)" msgstr "" @@ -787,7 +787,7 @@ index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea +msgid "Extract thumbnails from video files" +msgstr "" diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b29942315a 100644 +index 536c2881d73e36ebb42ef495b426fc3fc34ba8ee..eb96ec79d8c14a5a17af5228dd953699ae867008 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml @@ -974,8 +974,8 @@ @@ -802,10 +802,10 @@ index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b2 -From 9e7d22b484cbccf5d54293a36c3cae38ce7426dd Mon Sep 17 00:00:00 2001 +From f61ebfb47fb09969e0f4e2ada140c08c1b5f08f0 Mon Sep 17 00:00:00 2001 From: anaconda Date: Thu, 11 Sep 2014 21:30:43 +0200 -Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while +Subject: [PATCH 14/78] Disable autoscrolling while on screensaver and while opening streams. 
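The change boils down to a simple gate: autoscroll timers only advance while the UI is genuinely interactive. A minimal standalone sketch of that idea, with stand-in types, since the real logic is spread across CApplication and the GUI label controls in the diff below — all names here are illustrative, not the actual Kodi API:

// Sketch only: AppState stands in for the real application object.
struct AppState { bool inScreenSaver; bool openingStream; };

// Advance the autoscroll delay only when the UI is interactive; otherwise
// refresh the reference timestamp so no scroll time is "owed" on resume.
void UpdateAutoScroll(const AppState& app, unsigned int currentTime,
                      unsigned int& lastRenderTime, unsigned int& autoScrollDelayTime)
{
  if (app.inScreenSaver || app.openingStream)
  {
    lastRenderTime = currentTime; // freeze: consume the elapsed time
    return;
  }
  if (lastRenderTime)
    autoScrollDelayTime += currentTime - lastRenderTime;
  lastRenderTime = currentTime;
}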
--- @@ -818,10 +818,10 @@ Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp -index 947f0937d73cde5e4a4f39ed1a7932bd1e8eb0fe..593acafd15bb0409b4446b6e598f7aa4d7baf434 100644 +index a2448dc49e3be651761d5d6357ee946b46163ca9..1575f31827b842b19beea072b01ce3234c5d31b8 100644 --- a/xbmc/Application.cpp +++ b/xbmc/Application.cpp -@@ -5232,3 +5232,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const +@@ -5241,3 +5241,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const return false; } @@ -936,10 +936,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144 if (m_lastRenderTime) m_autoScrollDelayTime += currentTime - m_lastRenderTime; -From 831794fa04a8589069317953f813ada9f0d3bf54 Mon Sep 17 00:00:00 2001 +From add9791eb2b60ba9a0269a2d8e749bc8c4e58d5c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 13 Dec 2014 18:35:20 +0000 -Subject: [PATCH 15/75] [demuxer] Avoid memcpy on every demuxer packet +Subject: [PATCH 15/78] [demuxer] Avoid memcpy on every demuxer packet Avoids an unnecessary memcpy on every demuxer packet which for high bitrate videos can be significant. @@ -1039,10 +1039,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96 } catch(...) { -From 9673bb4533c0a82f4712752b6f6d28f5f1ceb24e Mon Sep 17 00:00:00 2001 +From ce7935004d48f8ac5fa752e3eb08bcdba156ff23 Mon Sep 17 00:00:00 2001 From: anaconda Date: Wed, 25 Feb 2015 18:22:21 +0100 -Subject: [PATCH 16/75] Load OSD dialogs on startup. +Subject: [PATCH 16/78] Load OSD dialogs on startup. Fixes skipped frames the first time they're loaded in memory on less powered devices, like a Raspberry Pi, when using DVDPlayer. 
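The fix pays the one-off allocation cost at startup instead of on first use. A minimal sketch of the pattern under stand-in GUI types (the real patch constructs the OSD dialog objects inside the window manager, as the diff below shows; Dialog and WindowManager here are hypothetical, not Kodi's classes):

#include <initializer_list>

// Sketch only: Dialog/WindowManager stand in for Kodi's CGUIDialog/CGUIWindowManager.
struct Dialog { virtual void AllocResources() = 0; virtual ~Dialog() {} };
struct WindowManager { Dialog* GetWindow(int /*id*/) { return nullptr; } }; // lookup elided in this sketch

void PreloadOsdDialogs(WindowManager& wm, std::initializer_list<int> dialogIds)
{
  for (int id : dialogIds)
    if (Dialog* d = wm.GetWindow(id))
      d->AllocResources(); // load layout/textures once, not on first keypress
}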
@@ -1137,10 +1137,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e CGUIDialogVideoSettings::~CGUIDialogVideoSettings() { } -From 19b2018244c328f5f88f90271e31de66bea486e3 Mon Sep 17 00:00:00 2001 +From 72df8a38ceba40e384e32f035e92f74685d5f5ef Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Apr 2015 20:51:14 +0100 -Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen +Subject: [PATCH 17/78] [gui] Also limit GUI updates when in non full-screen video mode --- @@ -1148,10 +1148,10 @@ Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp -index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea61b2ddcc 100644 +index 1575f31827b842b19beea072b01ce3234c5d31b8..7d597841f438ad6175444e1d6da601e479ee445d 100644 --- a/xbmc/Application.cpp +++ b/xbmc/Application.cpp -@@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) +@@ -2780,7 +2780,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU) // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode // it makes only sense on architectures with multiple layers @@ -1160,7 +1160,7 @@ index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE); #endif -@@ -2784,6 +2784,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) +@@ -2793,6 +2793,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) { if (!m_skipGuiRender) g_windowManager.Process(CTimeUtils::GetFrameTime()); @@ -1170,10 +1170,10 @@ index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea g_windowManager.FrameMove(); } -From b7e74e740581f7e6ab94609171000b747da9c911 Mon Sep 17 00:00:00 2001 +From 5a463508bcd3a5c3863227f1c4fa53891b00da57 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 5 May 2015 23:58:06 +0100 -Subject: [PATCH 18/75] [screensaver] Leave GUI contents available for +Subject: [PATCH 18/78] [screensaver] Leave GUI contents available for screensaver --- @@ -1203,10 +1203,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac // Add window to the history list (we must do this before we activate it, -From fe4cef6b6e2a35352ede135ac84ff3539d1ff09e Mon Sep 17 00:00:00 2001 +From d18202353c69d022f480e48d8a6b1457ea7bd162 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 6 Jun 2015 18:43:57 +0100 -Subject: [PATCH 19/75] ffmpeg: Automatic switch to software decode for GMC +Subject: [PATCH 19/78] ffmpeg: Automatic switch to software decode for GMC with more than one warp point --- @@ -1434,10 +1434,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4 else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1()) supported = true; -From f5dabe10623f19cd9e8ea015e2d248d47c03900c Mon Sep 17 00:00:00 2001 +From 352bf85b5b1e081fb0222625869943fc136f2a6e Mon Sep 17 00:00:00 2001 From: Claudio-Sjo Date: Mon, 16 Feb 2015 14:51:26 +0100 -Subject: [PATCH 20/75] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer +Subject: [PATCH 20/78] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer - fixes #15794 --- @@ -1629,10 +1629,10 @@ index 
0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3 lsn_t m_lsnCurrent; // Position inside the track in logical sector number lsn_t m_lsnEnd; // End of m_iTrack in logical sector number -From 9e3b4fd8c161b01d324220252289a5b3a49fb7e8 Mon Sep 17 00:00:00 2001 +From ef632a0c5391db64b1bd65181141a8bab14af2e6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 24 Jun 2016 19:38:13 +0100 -Subject: [PATCH 21/75] codecoverlay: Include codec name in overlay +Subject: [PATCH 21/78] codecoverlay: Include codec name in overlay --- xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++ @@ -1726,10 +1726,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291 double m_iSubtitleDelay; bool m_bRenderSubs; -From 119f7291d3b7c1a57d3a86b3836c8a73a7cd1211 Mon Sep 17 00:00:00 2001 +From e207be7c755180ebf556c455e70fd9cbba0e9540 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 8 Mar 2016 21:20:58 +0300 -Subject: [PATCH 22/75] [DebugInfo] Add cpu usage info. +Subject: [PATCH 22/78] [DebugInfo] Add cpu usage info. --- .../VideoPlayer/VideoRenderers/DebugRenderer.cpp | 56 ++++++++-------------- @@ -1899,10 +1899,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef m_debugTimer.Set(1000); -From 21927619971ef137030d64a0dd102a90a7effaf0 Mon Sep 17 00:00:00 2001 +From d5f5094b8c01bbfe96a13d56bdf4bbdd680f1876 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 22 May 2015 13:56:29 +0100 -Subject: [PATCH 23/75] ffmpeg: Allow neon to be enabled in unified builds +Subject: [PATCH 23/78] ffmpeg: Allow neon to be enabled in unified builds --- tools/depends/target/ffmpeg/Makefile | 4 ++++ @@ -1925,10 +1925,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153 ifeq ($(OS), linux) ffmpg_config += --target-os=$(OS) --cpu=$(CPU) -From 7c9767ac163fada0423cf8cc27b05f0d74482220 Mon Sep 17 00:00:00 2001 +From c652b5f7d6541c5a6110f8003ea85162d36418b7 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 27 Feb 2015 14:37:27 +0000 -Subject: [PATCH 24/75] ffmpeg: Add some upstream HEVC optimisations +Subject: [PATCH 24/78] ffmpeg: Add some upstream HEVC optimisations --- tools/depends/target/ffmpeg/Makefile | 6 +- @@ -5726,10 +5726,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81 +2.5.0 + -From f15eaf9000104c97d5bfc5ea046b4407cab2a261 Mon Sep 17 00:00:00 2001 +From 246118b5d0b263cc78efc5fedf3bfb0dc87727b6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 7 May 2015 14:04:18 +0100 -Subject: [PATCH 25/75] [ffmpeg] Add GPU acceleration to hevc +Subject: [PATCH 25/78] [ffmpeg] Add GPU acceleration to hevc --- tools/depends/target/ffmpeg/Makefile | 4 +- @@ -43915,10 +43915,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71 +2.7.4 + -From 88b331888a7677058bb3dfb064d7eb952b0ce1a9 Mon Sep 17 00:00:00 2001 +From 2fb19cb4de89eb353132eddf05725bb802cf4a15 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 12 Jan 2016 16:29:57 +0000 -Subject: [PATCH 26/75] ffmpeg: Add cabac opimisations for hevc +Subject: [PATCH 26/78] ffmpeg: Add cabac opimisations for hevc --- .../0001-Squashed-commit-of-the-following.patch | 2179 ++++++++++++++++++++ @@ -46163,10 +46163,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b make -j ${BUILDTHREADS} -From ce532b19d18df015cecb0e2e2ec85f0c89885a25 Mon Sep 17 00:00:00 2001 +From 9f9d02837471153722950ee6455ce8cd0b92b4fc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 16 Sep 2015 
19:05:12 +0100 -Subject: [PATCH 27/75] [3d] Make MVC a valid 3D filename tag +Subject: [PATCH 27/78] [3d] Make MVC a valid 3D filename tag --- xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++ @@ -46195,7 +46195,7 @@ index b34873cba6534086ae243326550385867a03256a..1443acaf0f25df458ae49766e13dd032 } diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 46d72aa072d34119f4a7273dc8f71176abebd27c..cca5c7f932241d146291d2bb0a0042f99fa0d596 100644 +index 96021d579fe144d0050a7bb813e7a0dbc9d3c804..0c636c9ed2f57b7a39d58c361012337c862128bc 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -402,6 +402,7 @@ void CAdvancedSettings::Initialize() @@ -46227,10 +46227,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef bool m_useDisplayControlHWStereo; -From df4fc81637ca4b47d4ce0e64110d8bab4bd77cd4 Mon Sep 17 00:00:00 2001 +From f9e13ac675d5d62b461ed4c6bb54e52dd1c85685 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 5 Oct 2015 14:58:05 +0100 -Subject: [PATCH 28/75] [3d] Swap top/bottom sides of GUI +Subject: [PATCH 28/78] [3d] Swap top/bottom sides of GUI --- xbmc/guilib/GraphicContext.cpp | 2 +- @@ -46250,10 +46250,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d } if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL) -From 2373df61c862bc62538391596c098a80968d1c0d Mon Sep 17 00:00:00 2001 +From 67c2ae9f396aa82586b5fd896ee36447ceb5cdba Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 11 Oct 2015 20:51:37 +0100 -Subject: [PATCH 29/75] Revert "Revert "Disable extra logging by default"" +Subject: [PATCH 29/78] Revert "Revert "Disable extra logging by default"" This reverts commit a880554325be187b877cd8f0e2b338e7267da636. --- @@ -46261,10 +46261,10 @@ This reverts commit a880554325be187b877cd8f0e2b338e7267da636. 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2fdbe11b2 100644 +index eb96ec79d8c14a5a17af5228dd953699ae867008..3a0d9bd1274b0664e34eb8865f41caf816bc2c30 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml -@@ -2834,12 +2834,12 @@ +@@ -2850,12 +2850,12 @@ 1 @@ -46280,10 +46280,10 @@ index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2 loggingcomponents , -From a0543043a26699a0e4a8bed989481ab1320e3f0c Mon Sep 17 00:00:00 2001 +From 620756eeb948638bc08d6ca831cd12d82955dfa4 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 21 Dec 2015 22:17:25 +0000 -Subject: [PATCH 30/75] [omximage] Fall back to arm jpeg encode/decode when gpu +Subject: [PATCH 30/78] [omximage] Fall back to arm jpeg encode/decode when gpu is busy --- @@ -46526,10 +46526,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218 extern COMXImage g_OMXImage; -From 72ad7c69c3f847ade231f29ac23ffb96ebaf2ae4 Mon Sep 17 00:00:00 2001 +From 3492f4e80fc154aeaebf4178079db75b536cb9d7 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 9 Dec 2015 13:31:14 +0000 -Subject: [PATCH 31/75] [mmalcodec] Fail to open when width is invalid. Can +Subject: [PATCH 31/78] [mmalcodec] Fail to open when width is invalid. 
Can happen with mpegts files --- @@ -46551,10 +46551,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568 if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software) return false; -From 0e735b38e2891c582c5a37dc5ded26cb954948a8 Mon Sep 17 00:00:00 2001 +From 93000006d2c2edaf16d7b3302914f3f32135830f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 Sep 2014 11:54:49 +0100 -Subject: [PATCH 32/75] [videoplayer/rbp] Add pi specific option to maintain +Subject: [PATCH 32/78] [videoplayer/rbp] Add pi specific option to maintain vsync with pll adjustment New A/V sync option in settings/video/playback to do "Adjust PLL". @@ -46576,10 +46576,10 @@ or drop/dupe audio packets which is normally required. 12 files changed, 143 insertions(+), 21 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7..55ec0a9985a8e77873d787e879d73c076e13b2c6 100644 +index 78ef8335f01cf1b023416a536155fdb5f3f62458..097464415a596cf13b3c245bbedd616f5a4e49ef 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19788,3 +19788,35 @@ msgstr "" +@@ -19830,3 +19830,35 @@ msgstr "" msgctxt "#38190" msgid "Extract thumbnails from video files" msgstr "" @@ -46641,7 +46641,7 @@ index 289dc55ec41aa44848519a05f8ee1ccc72740085..2572e25753712186f69390965ee1448b diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp -index f5671b8dfb03216301d936ae3b08bfc3e8225729..68399ab14faf813bd195d2fdf03a4a376307b4cd 100644 +index a1ea0791f48888257db50ce3b7807fd6ced6e8c1..45a293d1d34c12d77c03027cc282c3a3a7940354 100644 --- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp +++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp @@ -363,11 +363,12 @@ void CActiveAE::StateMachine(int signal, Protocol *port, Message *msg) @@ -46995,10 +46995,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50 extern CRBP g_RBP; -From d4a5c46043ced09c53dea24e6ca090a574806e3b Mon Sep 17 00:00:00 2001 +From e9ca06686d8335b2292676c0f1cbc7b6ab66b1fc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 7 May 2015 15:35:43 +0100 -Subject: [PATCH 33/75] rbp: Support zero copy interface with hevc acceleration +Subject: [PATCH 33/78] rbp: Support zero copy interface with hevc acceleration --- xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++ @@ -47042,10 +47042,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803 if (g_advancedSettings.CanLogComponent(LOGVIDEO)) CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem); -From 0fbf365c6de020f0d094c8ab221b159593eecce5 Mon Sep 17 00:00:00 2001 +From c666a1a3c43b1dd4bf03cf64a433a945140597d3 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 16 May 2015 18:26:04 +0100 -Subject: [PATCH 34/75] ffmpeg: use upstream mvc patches +Subject: [PATCH 34/78] ffmpeg: use upstream mvc patches --- ...vcodec-add-h264_mvc-codec-id-and-profiles.patch | 68 ++++++++++++ @@ -47355,10 +47355,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b +2.7.4 + -From f303faf857227cee88db21f5e95bd0a7d2f8c06e Mon Sep 17 00:00:00 2001 +From 
af658a2b7faaf843451a5ccac8113949c8cd0de7 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 29 Jan 2016 17:18:50 +0300 -Subject: [PATCH 35/75] [win32] Settings: Added setting to enable/disable MVC +Subject: [PATCH 35/78] [win32] Settings: Added setting to enable/disable MVC decoder. --- @@ -47388,10 +47388,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9 -From 9f1937bc8941347695d09078e624cc30beab4a6d Mon Sep 17 00:00:00 2001 +From 9f1ecf9e4e7dab6b7649d3ee549e0dee28274a92 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Wed, 20 Jan 2016 17:02:16 +0300 -Subject: [PATCH 36/75] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc +Subject: [PATCH 36/78] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc streams. --- @@ -47454,10 +47454,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81 } case AVMEDIA_TYPE_DATA: -From a451efc2d79422565ef1cbf931444c3ef5165125 Mon Sep 17 00:00:00 2001 +From c9931f6dca15e99b1557269f4277ebfb68bfc52c Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 25 Feb 2016 11:21:25 +0300 -Subject: [PATCH 37/75] [Stereo3D] Added block_lr and block_rl to supported +Subject: [PATCH 37/78] [Stereo3D] Added block_lr and block_rl to supported modes. --- @@ -47507,10 +47507,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814 i++; } -From 39522c63603fb5bf00b95a0eba5df6a626ea240f Mon Sep 17 00:00:00 2001 +From 368c41c30868369865b59bea96f21a21b01bf9b9 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Sat, 23 Jan 2016 10:21:32 +0300 -Subject: [PATCH 38/75] [VideoPlayer] Fix possible wrong aspect. +Subject: [PATCH 38/78] [VideoPlayer] Fix possible wrong aspect. --- xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +- @@ -47530,10 +47530,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd else m_fForcedAspectRatio = 0.0; -From b362a9d5e20db180bc6fce923188a921e7a0e985 Mon Sep 17 00:00:00 2001 +From db6058178103680919790f134a21bc7898c6561a Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 22 Jan 2016 18:18:33 +0300 -Subject: [PATCH 39/75] [VideoPlayer] DemuxFFmpeg: ssif remux +Subject: [PATCH 39/78] [VideoPlayer] DemuxFFmpeg: ssif remux --- xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt | 2 + @@ -47954,7 +47954,7 @@ index e4f8aed0af96fe0dceec4d8517087742f2c7df81..30076937bd084936571abf0e6eeecf5a LIB = DVDDemuxers.a diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555c927d455 100644 +index 0c636c9ed2f57b7a39d58c361012337c862128bc..1c23e5b0f25f0c6f2e5f7cab166aac825af5a30e 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -391,7 +391,7 @@ void CAdvancedSettings::Initialize() @@ -47967,10 +47967,10 @@ index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555 m_discStubExtensions = ".disc"; // internal music extensions -From 0bd2f0f4af5d90cd685380e36379590a378d024d Mon Sep 17 00:00:00 2001 +From 327cb97b6787b445345bb92befc6edb937fd11f7 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 23 Feb 2016 16:02:46 +0300 -Subject: [PATCH 40/75] [3DBD] Added support of 3D-BluRay playback. +Subject: [PATCH 40/78] [3DBD] Added support of 3D-BluRay playback. 
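Taken together with the MVC patches above (34, 37, 39), the playback path reduces to: detect the dependent-view (second-eye) stream in the BluRay playlist, hand the decoder an MVC codec id, and tag the stream with a stereo mode the renderer understands. A compact sketch under those assumptions — the struct and function names are illustrative; only the codec id and the "block_lr" mode come from the patches themselves:

#include <string>

// Sketch only: a pared-down stand-in for the demuxer hint handed to the decoder.
struct StreamHint {
  int codec;               // an AV_CODEC_ID_* value
  std::string stereo_mode; // consumed later by the stereoscopics manager
};

void TagMvcStream(StreamHint& hint, bool hasMvcExtension, int avCodecIdH264Mvc)
{
  if (hasMvcExtension)
  {
    hint.codec = avCodecIdH264Mvc;  // base view + dependent view decoded together
    hint.stereo_mode = "block_lr";  // one of the modes patch 37 registers
  }
}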
--- lib/DllLibbluray.h | 8 + @@ -48960,10 +48960,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d typedef std::shared_ptr SOverlay; typedef std::list SOverlays; -From 913cd365b12a9730cb04bb8a9d5ebddde02d5503 Mon Sep 17 00:00:00 2001 +From d308e52086d930545d3411b0280cfb552d237c8e Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 11 Mar 2016 16:58:53 +0300 -Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream +Subject: [PATCH 41/78] [VideoPlayer] HasVideo returns true if video stream exists. This don't allow start visualization if audio is opened before video. --- @@ -48971,7 +48971,7 @@ Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp -index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45fb2497640 100644 +index f909c8e451a057aa9f1d7d4c3264c8a7059185c1..69ce875fd44606d55e3186868927aaaec99e934c 100644 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp @@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause() @@ -48984,10 +48984,10 @@ index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45f bool CVideoPlayer::HasAudio() const -From e8a09603950b958dd1934cb460fda960759485f8 Mon Sep 17 00:00:00 2001 +From 206d1d4f0b845f29638f2139c26d3461215c3b43 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 10 Mar 2016 18:11:33 +0300 -Subject: [PATCH 42/75] fixup! Revert supporting crappy tab/sbs subtitles. this +Subject: [PATCH 42/78] fixup! Revert supporting crappy tab/sbs subtitles. this fixes regular subtitles. --- @@ -49024,10 +49024,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45 CDVDOverlayImage* overlay = new CDVDOverlayImage(); -From f10689878e33dc69a2ebbd559f41de12e72784c5 Mon Sep 17 00:00:00 2001 +From c4eb63ed988135ca18ab0b77357def3513bd3585 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 7 Apr 2016 17:28:50 +0300 -Subject: [PATCH 43/75] [VideoPlayer] Disable reading extension stream from +Subject: [PATCH 43/78] [VideoPlayer] Disable reading extension stream from input stream if decoder doesn't support it. --- @@ -49257,7 +49257,7 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa class CDVDAudioCodec; diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp -index b244a21ac083c6f7b0e2d455e2b7a45fb2497640..69b031a5623888a1b9a8c0ca7fe34fe3b1900fdc 100644 +index 69ce875fd44606d55e3186868927aaaec99e934c..abdaef946b4155a74ea4abe9f8bf0db9403be710 100644 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp @@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset) @@ -49284,10 +49284,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1 // classes CDVDOverlayContainer* m_pOverlayContainer; -From 74d399ad03a76c6f63c4fab2ba8ba2760a2f2180 Mon Sep 17 00:00:00 2001 +From edc458879b9e964f886c17cee950951ac3d1000e Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 16 Sep 2016 11:37:48 +0300 -Subject: [PATCH 44/75] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from +Subject: [PATCH 44/78] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from platform settings to common settings. 
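With the definition moved out of the platform-only rbp.xml (diff below), any decoder can gate MVC support on one boolean query. A minimal sketch: Settings is a stand-in for Kodi's CSettings singleton, and the id string is an assumption following the videoplayer.* naming the surrounding settings use:

#include <map>
#include <string>

// Sketch only: a tiny stand-in for the settings store.
struct Settings {
  std::map<std::string, bool> bools;
  bool GetBool(const std::string& id) const {
    auto it = bools.find(id);
    return it != bools.end() && it->second;
  }
};

bool DecoderSupportsMvc(const Settings& s)
{
  return s.GetBool("videoplayer.supportmvc"); // id assumed from SETTING_VIDEOPLAYER_SUPPORTMVC
}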
--- @@ -49317,7 +49317,7 @@ index 2572e25753712186f69390965ee1448bff3fadd5..7098edf32dff8c00e192229c3ffb060b
diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 850abcd174cc8773319639c7e337f2e2fdbe11b2..0fb9464a598cad05893bff627cbd7ddee7341ca8 100644 +index 3a0d9bd1274b0664e34eb8865f41caf816bc2c30..e18bc802a49be8b12fcaac2af583c8b3c167b249 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml @@ -343,6 +343,12 @@ @@ -49383,10 +49383,10 @@ index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540 m_simpleConditions.insert("have_lcms2"); #endif -From 1f0f86550e8cfed2a5de0d436c5c1e1e2ea642a1 Mon Sep 17 00:00:00 2001 +From 3b9367571d460fefdb993c055d6a5a618976ed61 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 4 Nov 2016 22:56:56 +0300 -Subject: [PATCH 45/75] [VideoPlayer] SSIF: fix for corner case when mvc stream +Subject: [PATCH 45/78] [VideoPlayer] SSIF: fix for corner case when mvc stream is switched before the last packet is read from previous stream. --- @@ -49575,17 +49575,17 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9 typedef std::shared_ptr SOverlay; typedef std::list SOverlays; -From ddc42633af64cfc6e9447d40f988c86a9a04250d Mon Sep 17 00:00:00 2001 +From 2cfde24377717ce4f3f10879b5f7fc547a00b324 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 23 Feb 2016 16:01:08 +0300 -Subject: [PATCH 46/75] [libbluray] bump libbluray to 0.9.2-mvc. +Subject: [PATCH 46/78] [libbluray] bump libbluray to 0.9.2-mvc. --- project/BuildDependencies/scripts/0_package.list | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list -index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c619c436f 100644 +index 2d15488f24703223db57848459e536cc08eb22cf..9c0c60ce3447e4d0e992457e5ca3be95d4296ea9 100644 --- a/project/BuildDependencies/scripts/0_package.list +++ b/project/BuildDependencies/scripts/0_package.list @@ -17,7 +17,7 @@ freetype-db5a22-win32-vc140.7z @@ -49598,10 +49598,10 @@ index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c libcec-4.0.1-win32-vc140-2.7z libfribidi-0.19.2-win32.7z -From 30060bc20c7f25701009d77d6b566e26ef77fa14 Mon Sep 17 00:00:00 2001 +From 2ca5b9e1beff48b0eb69c9d73e44acc7b8bc36bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 29 Feb 2016 17:00:50 +0000 -Subject: [PATCH 47/75] libbluray: Bump to Nevcairie's v0.9.2 +Subject: [PATCH 47/78] libbluray: Bump to Nevcairie's v0.9.2 This includes 3D support --- @@ -51258,10 +51258,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220 + + return fp; -From d3ad5d1c9d8da1ee7c63cd9302bef058b1da1135 Mon Sep 17 00:00:00 2001 +From 38193615b554a1fccc44f24dc43586c22bf59637 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 6 Mar 2016 12:54:59 +0000 -Subject: [PATCH 48/75] mvc: Automatically enable stereo mode +Subject: [PATCH 48/78] mvc: Automatically enable stereo mode --- xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++- @@ -51319,10 +51319,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd break; case AV_CODEC_ID_MPEG4: -From f1b065ebbb0f130da3e28a6a4375f9458cee3fd3 Mon Sep 17 00:00:00 2001 +From ba2b2c2b0373f3b598d32fef5beb1de84043a7f0 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 24 Mar 2016 13:02:58 +0000 -Subject: [PATCH 49/75] ffmpeg: mvc: fix for pixelation from packets with no +Subject: [PATCH 49/78] ffmpeg: mvc: fix for pixelation from packets with no pts/dts --- @@ 
-51384,10 +51384,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba cd $(PLATFORM);\ CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \ -From 332a8c9c8739a159f62542856c686ee14e996bdd Mon Sep 17 00:00:00 2001 +From 7d414c38f171bb4bc394725b3c9057cd5ef0e2af Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 11 Nov 2016 15:53:53 +0000 -Subject: [PATCH 50/75] stereoscopicmanager: fixups for rbp +Subject: [PATCH 50/78] stereoscopicmanager: fixups for rbp --- xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++ @@ -51625,10 +51625,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782 }; -From 2d81f94dcaf52e951bb7e203ea248b48c24d15aa Mon Sep 17 00:00:00 2001 +From 28a24fe8f76d7a8506ec472a2075bcb110009471 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 10 Mar 2016 18:11:33 +0300 -Subject: [PATCH 51/75] fixup! Revert supporting crappy tab/sbs subtitles. this +Subject: [PATCH 51/78] fixup! Revert supporting crappy tab/sbs subtitles. this fixes regular subtitles. --- @@ -51648,10 +51648,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18 } -From 48664856527a85a6d242649a5dcebf85d9420171 Mon Sep 17 00:00:00 2001 +From 669cd51bee072e99b3b0d2b2a911b9738b1f8e75 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 26 Nov 2016 18:24:18 +0000 -Subject: [PATCH 52/75] DemuxMVC: fixup after SeekTime API change +Subject: [PATCH 52/78] DemuxMVC: fixup after SeekTime API change --- xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +- @@ -51685,17 +51685,17 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425 virtual int GetStreamLength() { return 0; }; virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; }; -From 945b547c444e7ec5039c88e31b612c57b25edd1b Mon Sep 17 00:00:00 2001 +From e9a6484af92ae3058b3e1afbdfb96c3bc8434a12 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 3 Nov 2014 23:17:46 +0000 -Subject: [PATCH 53/75] [cec] Don't discard buttons when repeat mode is enabled +Subject: [PATCH 53/78] [cec] Don't discard buttons when repeat mode is enabled --- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644 +index 88289b3cbabacbe51aab3ab2ed0e1f2d46b5be79..543a65716cd2eec73210fa80ba992ed5acc13b6b 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key) @@ -51711,17 +51711,17 @@ index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27 if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0) { -From 70d24188f34e2846d42f18146baf43952c31aae3 Mon Sep 17 00:00:00 2001 +From a4f0a7285c72b817b79cbe5ee5ff0c30b00ab758 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 4 Nov 2014 18:50:00 +0000 -Subject: [PATCH 54/75] [cec] Temp - more logging +Subject: [PATCH 54/78] [cec] Temp - more logging --- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index 
febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7adde96cb8 100644 +index 543a65716cd2eec73210fa80ba992ed5acc13b6b..7192a651afef38e34f0cd6def89160c86ea39ee0 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -800,12 +800,15 @@ void CPeripheralCecAdapter::GetNextKey(void) @@ -51766,10 +51766,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a } -From 0d75b80f8862d67a4edc9f769acc0d18448ad268 Mon Sep 17 00:00:00 2001 +From b5674b0835773d0e8f95a882b706f69de2f4f7a6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 25 May 2016 18:31:17 +0100 -Subject: [PATCH 55/75] rbp: Hard code the number of buffers to improve audio +Subject: [PATCH 55/78] rbp: Hard code the number of buffers to improve audio sync --- @@ -51811,10 +51811,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d } -From b7bcc39b920c47e7c4273895feae92d4a82ba08f Mon Sep 17 00:00:00 2001 +From e4311d9bb897c794054551aa3b2b2a4715daa93b Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 4 Jul 2016 18:30:03 +0100 -Subject: [PATCH 56/75] rbp: Update the GL libs to new naming scheme +Subject: [PATCH 56/78] rbp: Update the GL libs to new naming scheme As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues. As such we are renaming the firmware GL driver to avoid this. @@ -51828,7 +51828,7 @@ will be dropped at some point 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac -index cbaefbe0a6a42f7d863800d87281a3f680cfea5b..2329e126f807b3eccb8cfd4e6ef3117ec20c85b5 100644 +index 71e942b1c3236a686ad6ff9fc930fff8b2019e0a..0336f766234f0825c164de17fec8e074120f1828 100644 --- a/configure.ac +++ b/configure.ac @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then @@ -51879,10 +51879,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71 fi -From e63ee8ac3fd87a12bdcf197827a182043e58b4af Mon Sep 17 00:00:00 2001 +From 22915642ef576f666ed2976230d15e23c5d153a8 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 28 Jun 2016 14:46:01 +0100 -Subject: [PATCH 57/75] ffmpeg: hacky fix for files with GMC +Subject: [PATCH 57/78] ffmpeg: hacky fix for files with GMC --- xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++-- @@ -51904,10 +51904,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675 { if (pStream->codec->codec_id == AV_CODEC_ID_PROBE) -From 73498b227b428c32c7e5ebc5623d094020fe98a7 Mon Sep 17 00:00:00 2001 +From 1e9adac965b29cac5e640b15c5aedb1dc2908114 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 19 Jul 2016 20:39:18 +0100 -Subject: [PATCH 58/75] mmalrender: Add sharpness control +Subject: [PATCH 58/78] mmalrender: Add sharpness control --- addons/resource.language.en_gb/resources/strings.po | 2 +- @@ -51916,7 +51916,7 @@ Subject: [PATCH 58/75] mmalrender: Add sharpness control 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 55ec0a9985a8e77873d787e879d73c076e13b2c6..eea89feb0f698619623ec67ed0078d30d18c22fc 100644 +index 097464415a596cf13b3c245bbedd616f5a4e49ef..8b9af01094e5f2e6e47008ab8cc6fd07c95574e3 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po @@ -8694,7 +8694,7 @@ msgstr "" @@ -51979,10 +51979,10 @@ 
index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd CCriticalSection m_sharedSection; MMAL_COMPONENT_T *m_vout; -From 57c94de16036e00a6822e374cc8ebbc8a042dc6b Mon Sep 17 00:00:00 2001 +From 9d38391944a046fe348943ff09ba6d340e22079d Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 14 Oct 2016 15:37:53 +0100 -Subject: [PATCH 59/75] MMALFFMpeg: Report as SW decode in codec overlay info +Subject: [PATCH 59/78] MMALFFMpeg: Report as SW decode in codec overlay info --- xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- @@ -52002,10 +52002,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0 CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str()); } -From 43c6b165b6d0f754f938d54bba00655d436679fd Mon Sep 17 00:00:00 2001 +From 74be792acc04d3bcc6c8f4cfc788b28f682af79f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 7 Nov 2016 18:28:01 +0000 -Subject: [PATCH 60/75] advancedsettings: Add option to set cache size on +Subject: [PATCH 60/78] advancedsettings: Add option to set cache size on libass E.g to set total cache size in libass to 32M @@ -52071,7 +52071,7 @@ index f9de4f15e7c612d69ef46e7cad870ecb61afaec3..b5303fd100f1a930eb5c010a95193206 END_METHOD_RESOLVE() }; diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index edbc96f7be3ae4dae994320f8c137555c927d455..7f3325392993823b8d2d6a915579c48401ca2c12 100644 +index 1c23e5b0f25f0c6f2e5f7cab166aac825af5a30e..173cefba5b1e7f364d364020ad9ac1496f20c583 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -364,6 +364,8 @@ void CAdvancedSettings::Initialize() @@ -52107,10 +52107,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97 unsigned int m_jsonTcpPort; -From 84623dff0ea921cf494fb9f15379b1bbc43844a0 Mon Sep 17 00:00:00 2001 +From 5bd43949ffaf2781febfbbea59ce5fcc7bfa3298 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 13 Nov 2016 20:30:15 +0000 -Subject: [PATCH 61/75] [rbp] Experimental limit libass cache size depending on +Subject: [PATCH 61/78] [rbp] Experimental limit libass cache size depending on arm memory size --- @@ -52141,7 +52141,7 @@ index 6e8529001b1a464b4547a846f553d98f5bc0b6c0..238eba372af2cbab11d7543c857ee476 response[sizeof(response) - 1] = '\0'; CLog::Log(LOGNOTICE, "Config:\n%s", response); diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73f0b3bedd 100644 +index 173cefba5b1e7f364d364020ad9ac1496f20c583..bc6945c09397c3bd7d2107d96cbb3bc7fbd1cd7f 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -361,8 +361,10 @@ void CAdvancedSettings::Initialize() @@ -52156,10 +52156,10 @@ index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73 m_libAssCache = 0; -From b5d95824c6e029b58aaf3b1d6dd2774661925096 Mon Sep 17 00:00:00 2001 +From 96c80f46bcba665013551f6a946a17d7f6b31046 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 22 Jun 2015 21:46:57 +0100 -Subject: [PATCH 62/75] [rbp] Use default resampling setting on Pi2 +Subject: [PATCH 62/78] [rbp] Use default resampling setting on Pi2 --- system/settings/rbp2.xml | 5 +++++ @@ -52182,10 +52182,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
-From c6165dc89c629abd2583eb7181e0543d6b69c255 Mon Sep 17 00:00:00 2001 +From 2dd3e5dcde97a08d92e03003bdecfedffa8e634a Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 1 Dec 2016 17:06:01 +0000 -Subject: [PATCH 63/75] MMALRender: Allow advanced deinterlace with software +Subject: [PATCH 63/78] MMALRender: Allow advanced deinterlace with software decode Uses YUV420 directly which improves performance. @@ -52208,10 +52208,10 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e status = mmal_port_format_commit(m_deint_output); if (status != MMAL_SUCCESS) -From 15e9791cb79c6c3b5f8c09bba979761451bea04c Mon Sep 17 00:00:00 2001 +From f26960cc83c044117dbd4d5f3458f24b1dd88e79 Mon Sep 17 00:00:00 2001 From: Nuno Senica Date: Tue, 27 Dec 2016 20:59:56 +0000 -Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading +Subject: [PATCH 64/78] Apply ffmpeg patches automatically after downloading and extracting the ffmpeg tar ball --- @@ -52219,10 +52219,10 @@ Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccbc452297d 100644 +index 28cc80710ea4a1e29f5d7050e3797d7c28901b70..88c976efe765f24034238b9933871d90a08704d4 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -261,7 +261,17 @@ if(NOT FFMPEG_FOUND) +@@ -264,7 +264,17 @@ if(NOT FFMPEG_FOUND) && ${CMAKE_COMMAND} -E copy ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake @@ -52242,10 +52242,10 @@ index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccb file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper "#!/bin/bash -From 358df1970de1f6f107e1681785ed723db0756f0e Mon Sep 17 00:00:00 2001 +From b2e11686321e5d6d2504085f4e49e272aa813e12 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 1 May 2016 19:56:43 +0100 -Subject: [PATCH 65/75] omxplayer: Avoid CAEFactory::Suspend which should only +Subject: [PATCH 65/78] omxplayer: Avoid CAEFactory::Suspend which should only be called by application --- @@ -52345,10 +52345,10 @@ index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9 COMXCoreComponent m_omx_render_analog; COMXCoreComponent m_omx_render_hdmi; -From dd69c1880f97b81981df1ad50f09bfb457ad8532 Mon Sep 17 00:00:00 2001 +From d824f2b09a72ee5d74b558513908a0b68f0bce1a Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 1 Mar 2017 21:40:22 +0000 -Subject: [PATCH 66/75] MMALRender: default to bob (x2) deinterlace for HD +Subject: [PATCH 66/78] MMALRender: default to bob (x2) deinterlace for HD There are still issues with some dvb dongles run on the same Pi as playback. Default to bob. Users who aren't using these devices will have to manually enable advanced. 
@@ -52390,10 +52390,10 @@ index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE) { -From b96bf65f71bca91e4e029ed64c7e3dc86c0d0dad Mon Sep 17 00:00:00 2001 +From 11190e20978752245187e681aab3b089a2b504e4 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 17 Feb 2017 17:58:13 +0000 -Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service +Subject: [PATCH 67/78] ffmpeg: Update hevc optimisation to use the gpu service --- project/cmake/modules/FindFFMPEG.cmake | 14 +- @@ -52403,10 +52403,10 @@ Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service 4 files changed, 16342 insertions(+), 35924 deletions(-) diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index eec635ef493d13ea97c9b806eb57cccbc452297d..301534c5b0113815f9a196fdd0ed9db6ff587cda 100644 +index 88c976efe765f24034238b9933871d90a08704d4..db2d4465e3182363a812325d6bd1aeb146018e01 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -262,14 +262,14 @@ if(NOT FFMPEG_FOUND) +@@ -265,14 +265,14 @@ if(NOT FFMPEG_FOUND) ${CMAKE_COMMAND} -E copy ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake && @@ -105705,10 +105705,10 @@ index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e ++ ++ -From 1ec8569a01645467680e3090afba9927cea120d0 Mon Sep 17 00:00:00 2001 +From 700c032b538e91c8a138f4c71bebd310340ce1bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 4 Mar 2017 19:25:40 +0000 -Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL +Subject: [PATCH 68/78] ffmpeg: Call get_format to fix an issue with MMAL rendering --- @@ -105720,10 +105720,10 @@ Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL create mode 100644 tools/depends/target/ffmpeg/0001-ffmpeg-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index 301534c5b0113815f9a196fdd0ed9db6ff587cda..2cfd61642d52f05a84bea5ca7eb1766ad8e8ddbd 100644 +index db2d4465e3182363a812325d6bd1aeb146018e01..7e0aeefab542fb063595b09c47e69514a656bd85 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -270,7 +270,8 @@ if(NOT FFMPEG_FOUND) +@@ -273,7 +273,8 @@ if(NOT FFMPEG_FOUND) #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch && #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch && #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch && @@ -105830,10 +105830,10 @@ index 3d970429012c1f3aede4df0545ced5006c165d50..e070d96fc340f5bff94d72ae9004c4a9 CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \ ./configure --prefix=$FFMPEG_PREFIX \ -From b230c015d539db71bb2eb04232b25805703014c6 Mon Sep 17 00:00:00 2001 +From a3e069e98940801c936f03908275d5f67a47d847 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Mar 2017 22:52:37 +0000 -Subject: [PATCH 69/75] MMAL: Remove periodic prime calls and handle from +Subject: [PATCH 69/78] MMAL: Remove periodic prime calls and handle from buffer destructor If a number of buffers are released at once we can end up stalled in GetPicture with the buffers @@ -105927,10 +105927,10 @@ index 
9279966fa634f6f5a3e00f12dd528337392cf038..c6ba9b024b3c3bbe53d3f0870dd8c839 CLog::Log(LOGDEBUG, "%s::%s - stopping", CLASSNAME, __func__); } -From 6f29617ca776bb2e6459a55710a4a569311c8d7e Mon Sep 17 00:00:00 2001 +From f3393e3c78e5ffa39d26e88b9ff28207217e408f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Mar 2017 23:22:43 +0000 -Subject: [PATCH 70/75] MMALCodec: Include a timeout of GetPicture in default +Subject: [PATCH 70/78] MMALCodec: Include a timeout of GetPicture in default debug logging --- @@ -105951,10 +105951,10 @@ index 22d594cdc217f32f820e3618b4d90a1d75fc769b..e8bc3b930e84e058460b6cfd7caca0d7 return ret; -From a3185132fc1828162ad59e09155464b26a7f35b0 Mon Sep 17 00:00:00 2001 +From b558e52052ebab00434f5d44f36f2c7b8be212c2 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 21 Mar 2017 20:15:55 +0000 -Subject: [PATCH 71/75] ffmpeg: Add calls to init and deinit gpu service +Subject: [PATCH 71/78] ffmpeg: Add calls to init and deinit gpu service --- tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 6 ++++-- @@ -105994,10 +105994,10 @@ index 852815d5f4ae80771c5304f6f3520b5e49b18a67..b4c15b782a4deb36c35a006e8547ce69 + + mbox_close(mb); -From 9ef1f2fdde0e49ae3c5da03defa83d32ab2e432d Mon Sep 17 00:00:00 2001 +From 02a75c682b1f599af712cd67a625d708281d878c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Mar 2017 20:06:42 +0100 -Subject: [PATCH 72/75] squash: ffmpeg: hevc: Remove rules that require qasm +Subject: [PATCH 72/78] squash: ffmpeg: hevc: Remove rules that require qasm --- tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 12 ------------ @@ -106027,10 +106027,10 @@ index b4c15b782a4deb36c35a006e8547ce69665a10fe..58379fb0874521205184c53be5aae893 index 54efaad..02a89c3 100644 --- a/libavcodec/allcodecs.c -From 38a49f21a7430779830d9d4e2468e76de6faf92c Mon Sep 17 00:00:00 2001 +From 5493a835c968dda113c577afa0ba879371023f15 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:11:37 +0100 -Subject: [PATCH 73/75] RBP: Add api to query gpu frame geometry +Subject: [PATCH 73/78] RBP: Add api to query gpu frame geometry --- xbmc/linux/RBP.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++- @@ -106204,10 +106204,10 @@ index 815d758e7086d73b4d4eb16849fdbb509a3c251d..a7f07403854b81996cca72eff82e3a7d double GetAdjustHDMIClock() { return m_actual_pll_adjust; } -From 1856e86917eef62f3069c465d7c8ff2f8e213395 Mon Sep 17 00:00:00 2001 +From 450096d2ae373ac00618675372926275cef37e6f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:12:28 +0100 -Subject: [PATCH 74/75] MMALFFmpeg: Add Sand/YUVUV128 support +Subject: [PATCH 74/78] MMALFFmpeg: Add Sand/YUVUV128 support --- .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- @@ -106349,10 +106349,10 @@ index f9b7172c45d5a0158259ebfb53ea75696f0acb6d..456214a679779469ea52db7ce846a387 return false; -From ed215d6a95935eabbbb5f56d9259b24e8ab4929d Mon Sep 17 00:00:00 2001 +From 00ac50fa4ab6c087dce909394efd6d0a33d2151c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:10:42 +0100 -Subject: [PATCH 75/75] ffmpeg: hevc: Update to latest version +Subject: [PATCH 75/78] ffmpeg: hevc: Update to latest version --- .../target/ffmpeg/pfcd_hevc_optimisations.patch | 11940 ++++++++++++------- @@ -121057,3 +121057,41750 @@ index 58379fb0874521205184c53be5aae893cfd39d49..96cfa9ae30e72b377b2561cf7a329e02 ++ ++ do_logparse(args.logfile) ++ + +From d2b967d8c520b416fb30a2162dfe40a9aad3d6c0 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 13 Oct 2017 17:33:36 +0100 
+Subject: [PATCH 76/78] ffmpeg: hevc: Update to latest version + +--- + .../target/ffmpeg/pfcd_hevc_optimisations.patch | 39170 +++++++++++-------- + 1 file changed, 23811 insertions(+), 15359 deletions(-) + +diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch +index 96cfa9ae30e72b377b2561cf7a329e02b9212ceb..abd1499a6d9b2500fe379c8754bdeac54e44006d 100644 +--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch ++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch +@@ -1,8 +1,16 @@ + diff --git a/.gitignore b/.gitignore +-index 524fb73..305632b 100644 ++index 524fb73c16..bcc983739f 100644 + --- a/.gitignore + +++ b/.gitignore +-@@ -23,6 +23,7 @@ ++@@ -1,6 +1,7 @@ ++ *.a ++ *.o ++ *.o.* +++*.bin ++ *.d ++ *.def ++ *.dll ++@@ -23,6 +24,7 @@ + .\#* + /.config + /.version +@@ -11,7 +19,7 @@ index 524fb73..305632b 100644 + /ffplay + /ffprobe + diff --git a/ffmpeg.c b/ffmpeg.c +-index 9ffd833..e2474e5 100644 ++index cdded8673f..5eee7dfd40 100644 + --- a/ffmpeg.c + +++ b/ffmpeg.c + @@ -23,6 +23,11 @@ +@@ -20,13 +28,21 @@ index 9ffd833..e2474e5 100644 + + +#ifdef RPI + +#define RPI_DISPLAY +-+#define RPI_ZERO_COPY +++#define RPI_DISPLAY_ALL 0 + +#endif + + + #include "config.h" + #include + #include +-@@ -66,6 +71,25 @@ ++@@ -42,6 +47,7 @@ ++ #include "libavformat/avformat.h" ++ #include "libavdevice/avdevice.h" ++ #include "libswresample/swresample.h" +++#include "libavutil/atomic.h" ++ #include "libavutil/opt.h" ++ #include "libavutil/channel_layout.h" ++ #include "libavutil/parseutils.h" ++@@ -66,6 +72,25 @@ + # include "libavfilter/buffersrc.h" + # include "libavfilter/buffersink.h" + +@@ -38,21 +54,21 @@ index 9ffd833..e2474e5 100644 + +#include + +#include + +#include +++#include + +#include + +#include + +#include + +#include + +#pragma GCC diagnostic pop +-+#ifdef RPI_ZERO_COPY + +#include "libavcodec/rpi_qpu.h" +-+#endif +++#include "libavutil/rpi_sand_fns.h" + +#include "libavcodec/rpi_zc.h" + +#endif + + + #if HAVE_SYS_RESOURCE_H + #include + #include +-@@ -158,6 +182,182 @@ static int restore_tty; ++@@ -158,6 +183,241 @@ static int restore_tty; + static void free_input_threads(void); + #endif + +@@ -60,39 +76,36 @@ index 9ffd833..e2474e5 100644 + + + +#define NUM_BUFFERS 4 + + +-+static MMAL_COMPONENT_T* rpi_display = NULL; +-+static MMAL_POOL_T *rpi_pool = NULL; +-+static volatile int rpi_display_count = 0; + + +-+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) +++typedef struct rpi_display_env_s +++{ +++ MMAL_COMPONENT_T* display; +++ MMAL_COMPONENT_T* isp; +++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup +++ MMAL_CONNECTION_T * conn; +++ +++ MMAL_POOL_T *rpi_pool; +++ volatile int rpi_display_count; +++ enum AVPixelFormat avfmt; +++} rpi_display_env_t; +++ +++static rpi_display_env_t * rpi_display_env = NULL; +++ +++ +++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) + +{ + + MMAL_POOL_T* pool; +-+ size_t i; +-+ size_t size = (w*h*3)/2; +-+#ifdef RPI_ZERO_COPY + + mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
+ + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + + assert(pool); +-+#else +-+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); +-+ +-+ for (i = 0; i < NUM_BUFFERS; ++i) +-+ { +-+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; +-+ char * bufPtr = buffer->data; +-+ memset(bufPtr, i*30, w*h); +-+ memset(bufPtr+w*h, 128, (w*h)/2); +-+ } +-+#endif + + + + return pool; + +} + + + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { +-+#ifdef RPI_ZERO_COPY +++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + + av_rpi_zc_unref(buffer->user_data); +-+ --rpi_display_count; +-+#endif +++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + + mmal_buffer_header_release(buffer); + +} + + +@@ -100,9 +113,12 @@ index 9ffd833..e2474e5 100644 + + mmal_buffer_header_release(buffer); + +} + + +-+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +++#define DISPLAY_PORT_DEPTH 4 +++ +++static rpi_display_env_t * +++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) + +{ +-+ MMAL_COMPONENT_T* display; +++ MMAL_STATUS_T err; + + MMAL_DISPLAYREGION_T region = + + { + + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, +@@ -111,51 +127,113 @@ index 9ffd833..e2474e5 100644 + + .fullscreen = 0, + + .dest_rect = {x, y, w, h} + + }; +++#if RPI_ZC_SAND_8_IN_10_BUF +++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; +++#else +++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; +++#endif + + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); +++ rpi_display_env_t * de; +++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); +++ +++ bcm_host_init(); // Needs to be done by someone... +++ +++ if ((de = av_mallocz(sizeof(*de))) == NULL) { +++ return NULL; +++ } + + +-+ bcm_host_init(); // TODO is this needed? +-+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); +-+ assert(display); +++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); +++ av_assert0(de->display); +++ de->port_in = de->display->input[0]; +++ +++ if (isp_req) +++ { +++ mmal_component_create("vc.ril.isp", &de->isp); +++ de->port_in = de->isp->input[0]; +++ } + + +-+ mmal_port_parameter_set(display->input[0], ®ion.hdr); +++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + + + { +-+ MMAL_ES_FORMAT_T* format = display->input[0]->format; +-+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; +++ MMAL_PORT_T * const port = de->port_in; +++ MMAL_ES_FORMAT_T* const format = port->format; +++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; +++ port->buffer_num = DISPLAY_PORT_DEPTH; +++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : +++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : +++ MMAL_ENCODING_I420; + + format->es->video.width = geo.stride_y; +-+ format->es->video.height = geo.height_y; +++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? 
+++ (h + 15) & ~15 : geo.height_y; // Magic + + format->es->video.crop.x = 0; + + format->es->video.crop.y = 0; + + format->es->video.crop.width = w; + + format->es->video.crop.height = h; +-+ mmal_port_format_commit(display->input[0]); +++ mmal_port_format_commit(port); + + } + + +-+ mmal_component_enable(display); +++ de->rpi_pool = display_alloc_pool(de->port_in); +++ mmal_port_enable(de->port_in,display_cb_input); +++ +++ if (isp_req) { +++ MMAL_PORT_T * const port_out = de->isp->output[0]; +++ mmal_log_dump_port(de->port_in); +++ mmal_format_copy(port_out->format, de->port_in->format); +++ if (fmt == AV_PIX_FMT_SAND64_10) { +++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || +++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) +++ { +++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); +++ } +++ else +++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + + +-+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); +++ } +++ port_out->format->encoding = MMAL_ENCODING_I420; +++ mmal_log_dump_port(port_out); +++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) +++ { +++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); +++ goto fail; +++ } +++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { +++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); +++ goto fail; +++ } +++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { +++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); +++ goto fail; +++ } +++ mmal_port_enable(de->isp->control,display_cb_control); +++ mmal_component_enable(de->isp); +++ } + + +-+ mmal_port_enable(display->input[0],display_cb_input); +-+ mmal_port_enable(display->control,display_cb_control); +++ mmal_component_enable(de->display); +++ mmal_port_enable(de->display->control,display_cb_control); +++ de->avfmt = fmt; + + + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + +-+ return display; +++ return de; +++ +++fail: +++ // **** Free stuff +++ return NULL; + +} + + +-+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) +++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) + +{ + + MMAL_BUFFER_HEADER_T* buf; + + +-+ if (!display || !rpi_pool) +++ if (de == NULL) + + return; + + +-+ if (rpi_display_count >= 3) { +++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + + return; + + } + + +-+ buf = mmal_queue_get(rpi_pool->queue); +++ buf = mmal_queue_get(de->rpi_pool->queue); + + if (!buf) { + + // Running too fast so drop the frame + + printf("Q alloc failure\n"); +@@ -165,67 +243,64 @@ index 9ffd833..e2474e5 100644 + + buf->cmd = 0; + + buf->offset = 0; // Offset to valid data + + buf->flags = 0; +-+#ifdef RPI_ZERO_COPY +-+{ +-+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); +-+ if (fr_buf == NULL) { +-+ mmal_buffer_header_release(buf); +-+ return; +-+ } +-+ +-+ buf->user_data = fr_buf; +-+ buf->data = av_rpi_zc_vc_handle(fr_buf); +-+ buf->offset = av_rpi_zc_offset(fr_buf); +-+ buf->length = av_rpi_zc_length(fr_buf); +-+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); +-+#if 0 + + { +-+ unsigned int n; +-+ 
for (n = 0; n < fr->width; n += 128) { +-+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); +++ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); +++ if (fr_buf == NULL) { +++ mmal_buffer_header_release(buf); +++ return; + + } +-+ } +-+#endif +-+ ++rpi_display_count; +-+} +-+#else +-+{ +-+#error YYY +-+ int w = fr->width; +-+ int h = fr->height; +-+ int w2 = (w+31)&~31; +-+ int h2 = (h+15)&~15; +-+ +-+ buf->length = (w2 * h2 * 3)/2; +-+ buf->user_data = NULL; +-+ +-+ //mmal_buffer_header_mem_lock(buf); +-+ memcpy(buf->data, fr->data[0], w2 * h); +-+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); +-+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); +-+ //mmal_buffer_header_mem_unlock(buf); +-+} +-+#endif + + +-+ while (rpi_display_count >= 3) { +++ buf->user_data = fr_buf; +++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal +++ buf->offset = av_rpi_zc_offset(fr_buf); +++ buf->length = av_rpi_zc_length(fr_buf); +++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); +++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); +++ } +++#if RPI_DISPLAY_ALL +++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + + usleep(5000); + + } +++#endif + + +-+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) +++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + + { +-+ printf("** send failed: depth=%d\n", rpi_display_count); +-+ display_cb_input(NULL, buf); +++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); +++ display_cb_input(de->port_in, buf); + + } + +} + + +-+static void display_exit(MMAL_COMPONENT_T* display) +++static void display_exit(rpi_display_env_t ** const pde) + +{ +++ rpi_display_env_t * const de = *pde; +++ *pde = NULL; +++ +++ if (de != NULL) { + +// sleep(120); +-+ if (display) { +-+ mmal_component_destroy(display); +-+ } +-+ if (rpi_pool) { +-+ mmal_port_pool_destroy(display->input[0], rpi_pool); +++ +++ if (de->port_in != NULL) { +++ mmal_port_disable(de->port_in); +++ } +++ +++ // The above disable should kick out all buffers - check that +++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { +++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); +++ } +++ +++ if (de->conn != NULL) { +++ mmal_connection_destroy(de->conn); +++ } +++ if (de->isp != NULL) { +++ mmal_component_destroy(de->isp); +++ } +++ if (de->display != NULL) { +++ mmal_component_destroy(de->display); +++ } +++ if (de->rpi_pool != NULL) { +++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); +++ } +++ +++ av_free(de); + + } + +} + + +@@ -235,29 +310,29 @@ index 9ffd833..e2474e5 100644 + /* sub2video hack: + Convert subtitles to video with alpha to insert them in filter graphs. + This is a temporary solution until libavfilter gets real subtitles support. 
+-@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) ++@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret) + avformat_close_input(&input_files[i]->ctx); + av_freep(&input_files[i]); + } + + + +#ifdef RPI_DISPLAY +-+ display_exit(rpi_display); +++ display_exit(&rpi_display_env); + +#endif + + + for (i = 0; i < nb_input_streams; i++) { + InputStream *ist = input_streams[i]; + +-@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) ++@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret) + av_freep(&ist->filters); + av_freep(&ist->hwaccel_device); + +-+#ifdef RPI_ZERO_COPY +++#ifdef RPI_DISPLAY + + av_rpi_zc_uninit(ist->dec_ctx); + +#endif + avcodec_free_context(&ist->dec_ctx); + + av_freep(&input_streams[i]); +-@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) ++@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret) + } + term_exit(); + ffmpeg_exited = 1; +@@ -265,28 +340,28 @@ index 9ffd833..e2474e5 100644 + } + + void remove_avoptions(AVDictionary **a, AVDictionary *b) +-@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, ++@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s, + if (ost->source_index >= 0) + ist = input_streams[ost->source_index]; + + +#ifdef RPI_DISPLAY + + if (next_picture && ist != NULL) + + { +-+ if (!rpi_display) +-+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); +-+ display_frame(ist->dec_ctx, rpi_display, next_picture); +++ if (rpi_display_env == NULL) +++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); +++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + + } + +#endif + + + if (filter->inputs[0]->frame_rate.num > 0 && + filter->inputs[0]->frame_rate.den > 0) + duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); +-@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ++@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) + ist->dec_ctx->opaque = ist; + ist->dec_ctx->get_format = get_format; + ist->dec_ctx->get_buffer2 = get_buffer; + + +-+#ifdef RPI_ZERO_COPY +++#ifdef RPI_DISPLAY + + // Overrides the above get_buffer2 + + av_rpi_zc_init(ist->dec_ctx); + +#endif +@@ -295,66 +370,74 @@ index 9ffd833..e2474e5 100644 + + av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); + diff --git a/libavcodec/Makefile b/libavcodec/Makefile +-index fd0d1f0..1740768 100644 ++index bb28aea1e2..741aa0bdc4 100644 + --- a/libavcodec/Makefile + +++ b/libavcodec/Makefile +-@@ -5,6 +5,12 @@ NAME = avcodec ++@@ -5,6 +5,16 @@ NAME = avcodec + HEADERS = avcodec.h \ + avdct.h \ + avfft.h \ +++ rpi_opts.h \ + + rpi_qpu.h \ + + rpi_shader.h \ +-+ rpi_shader_cmd.h \ +++ rpi_shader_cmd.h \ +++ rpi_shader_template.h \ +++ rpi_shader_template_fn.h \ + + rpi_mailbox.h \ +-+ rpi_hevc_transform.h \ +++ rpi_hevc_transform8.h \ +++ rpi_hevc_transform10.h \ + + rpi_zc.h \ + d3d11va.h \ + dirac.h \ + dv_profile.h \ +-@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ ++@@ -43,6 +53,11 @@ OBJS = allcodecs.o \ + resample.o \ + resample2.o \ + utils.o \ + + rpi_qpu.o \ + + rpi_shader.o \ +++ rpi_shader_template.o \ + + rpi_mailbox.o \ + + rpi_zc.o \ + vorbis_parser.o \ + xiph.o \ + +-@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h ++@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h + $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h + $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h + endif + + +-+QASM := 
$(SUBDIR)../pi-util/qasm.py +++QASM_PY := ../local/bin/qasm.py +++VASMVIDCORE := ../local/bin/vasmvidcore_std + + +-+ifneq ("$(wildcard $(QASM))","") +++ifneq ("$(wildcard $(QASM_PY))","") + +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm +-+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ +++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + + + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm +-+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +++endif +++ +++ifneq ("$(wildcard $(VASMVIDCORE))","") +++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s +++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ +++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s +++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ +++ +++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin +++ python pi-util/make_array.py $< +++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin +++ python pi-util/make_array.py $< +++ + +endif + + +-+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h +-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +-index 54efaad..02a89c3 100644 +---- a/libavcodec/allcodecs.c +-+++ b/libavcodec/allcodecs.c +-@@ -667,6 +667,7 @@ void avcodec_register_all(void) +- REGISTER_PARSER(H261, h261); +- REGISTER_PARSER(H263, h263); +- REGISTER_PARSER(H264, h264); +-+ REGISTER_PARSER(H264_MVC, h264_mvc); +- REGISTER_PARSER(HEVC, hevc); +- REGISTER_PARSER(MJPEG, mjpeg); +- REGISTER_PARSER(MLP, mlp); +++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h + diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +-index a4ceca7..cafd25d 100644 ++index a4ceca7f46..f8229a80e2 100644 + --- a/libavcodec/arm/Makefile + +++ b/libavcodec/arm/Makefile +-@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ ++@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o + NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ +@@ -363,13 +446,15 @@ index a4ceca7..cafd25d 100644 + + arm/hevcdsp_epel_neon.o \ + arm/hevcdsp_idct_neon.o \ + - arm/hevcdsp_qpel_neon.o +++ arm/hevcdsp_cres_neon.o \ +++ arm/hevcdsp_res16_neon.o \ + + arm/hevcdsp_qpel_neon.o \ + + arm/hevcdsp_sao_neon.o + NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o + NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ + arm/rv40dsp_neon.o + diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +-index fdbf86b..0a3980a 100644 ++index fdbf86b45e..0a3980a1ef 100644 + --- a/libavcodec/arm/cabac.h + +++ b/libavcodec/arm/cabac.h + @@ -26,13 +26,34 @@ +@@ -552,7 +637,7 @@ index fdbf86b..0a3980a 100644 + #endif /* AVCODEC_ARM_CABAC_H */ + diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h + new file mode 100644 +-index 0000000..31d3c59 ++index 0000000000..31d3c59205 + --- /dev/null + +++ b/libavcodec/arm/hevc_cabac.h + @@ -0,0 +1,491 @@ +@@ -1047,9 +1132,239 @@ index 0000000..31d3c59 + +#endif /* HAVE_ARMV6T2_INLINE */ + + + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ ++diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S ++new file mode 100644 ++index 0000000000..380d3c8d3b ++--- /dev/null +++++ b/libavcodec/arm/hevc_idct_fn_neon.S 
++@@ -0,0 +1,224 @@ +++@ Included multiple times from hevc_idct_neon.S +++@ Macros defined there +++ +++#define DC_SHIFT (15 - BIT_DEPTH) +++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) +++#define TRN_SHIFT (20 - BIT_DEPTH) +++ +++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q0, r1 +++ vdup.16 q1, r1 +++ vst1.16 {q0, q1}, [r0] +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++ vstm r0, {q8-q15} +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++ vstm r0!, {q8-q15} +++ vstm r0!, {q8-q15} +++ vstm r0!, {q8-q15} +++ vstm r0, {q8-q15} +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ mov r3, #16 +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++1: subs r3, #1 +++ vstm r0!, {q8-q15} +++ bne 1b +++ bx lr +++endfunc +++ +++ +++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 +++ vpush {d8-d15} +++ vld1.16 {q14, q15}, [r0] // coeffs +++ ldr r3, =0x00240053 // 36 and 83 +++ vmov.32 d0[0], r3 +++ +++ tr4_shift d28, d29, d30, d31, #7 +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ vst1.16 {q14, q15}, [r0] +++ vpop {d8-d15} +++ bx lr +++endfunc +++ +++ +++ +++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 +++ vpush {d8-d15} +++ vld1.16 {q14, q15}, [r0] // coeffs +++ ldr r3, =0x4a // 74 +++ vmov.32 d0[0], r3 +++ ldr r3, =0x1d // 29 +++ vmov.32 d0[1], r3 +++ ldr r3, =0x37 // 55 +++ vmov.32 d1[0], r3 +++ +++ tr4_luma_shift d28, d29, d30, d31, #7 +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ vst1.16 {q14, q15}, [r0] +++ vpop {d8-d15} +++ bx lr +++endfunc +++ +++ +++ +++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 +++ push {r4-r8} +++ vpush {d8-d15} +++ mov r5, #16 +++ +++ adrl r3, tr4f +++ vld1.16 {d0, d1}, [r3] +++ +++ // left half +++ vld1.16 {d24}, [r0], r5 +++ vld1.16 {d25}, [r0], r5 +++ vld1.16 {d26}, [r0], r5 +++ vld1.16 {d27}, [r0], r5 +++ vld1.16 {d28}, [r0], r5 +++ vld1.16 {d29}, [r0], r5 +++ vld1.16 {d30}, [r0], r5 +++ vld1.16 {d31}, [r0], r5 +++ sub r0, #128 +++ tr8_begin d25, d27, d29, d31 +++ tr4 d24, d26, d28, d30 +++ tr8_end #7 +++ vst1.16 {d2}, [r0], r5 +++ vst1.16 {d3}, [r0], r5 +++ vst1.16 {d4}, [r0], r5 +++ vst1.16 {d5}, [r0], r5 +++ vst1.16 {d6}, [r0], r5 +++ vst1.16 {d7}, [r0], r5 +++ vst1.16 {d8}, [r0], r5 +++ vst1.16 {d9}, [r0], r5 +++ sub r0, #128 +++ //skip right half if col_limit in r1 is less than 4 +++ cmp r1, #4 +++ blt 1f +++ //right half +++ add r0, #8 +++ vld1.16 {d24}, [r0], r5 +++ vld1.16 {d25}, [r0], r5 +++ 
vld1.16 {d26}, [r0], r5 +++ vld1.16 {d27}, [r0], r5 +++ vld1.16 {d28}, [r0], r5 +++ vld1.16 {d29}, [r0], r5 +++ vld1.16 {d30}, [r0], r5 +++ vld1.16 {d31}, [r0], r5 +++ sub r0, #128 +++ tr8_begin d25, d27, d29, d31 +++ tr4 d24, d26, d28, d30 +++ tr8_end #7 +++ vst1.16 {d2}, [r0], r5 +++ vst1.16 {d3}, [r0], r5 +++ vst1.16 {d4}, [r0], r5 +++ vst1.16 {d5}, [r0], r5 +++ vst1.16 {d6}, [r0], r5 +++ vst1.16 {d7}, [r0], r5 +++ vst1.16 {d8}, [r0], r5 +++ vst1.16 {d9}, [r0], r5 +++ sub r0, #136 +++1: +++ // top half +++ vldm r0, {q12-q15} // coeffs +++ transpose_16b_4x4 d24, d26, d28, d30 +++ transpose_16b_4x4 d25, d27, d29, d31 +++ tr8_begin d26, d30, d27, d31 +++ tr4 d24, d28, d25, d29 +++ tr8_end #(TRN_SHIFT) +++ transpose_16b_4x4 d2, d3, d4, d5 +++ transpose_16b_4x4 d6, d7, d8, d9 +++ vswp d7, d5 +++ vswp d7, d8 +++ vswp d3, d6 +++ vswp d6, d4 +++ vstm r0!, {q1-q4} +++ +++ // bottom half +++ vldm r0, {q12-q15} // coeffs +++ transpose_16b_4x4 d24, d26, d28, d30 +++ transpose_16b_4x4 d25, d27, d29, d31 +++ tr8_begin d26, d30, d27, d31 +++ tr4 d24, d28, d25, d29 +++ tr8_end #(TRN_SHIFT) +++ transpose_16b_4x4 d2, d3, d4, d5 +++ transpose_16b_4x4 d6, d7, d8, d9 +++ vswp d7, d5 +++ vswp d7, d8 +++ vswp d3, d6 +++ vswp d6, d4 +++ //vstm r0, {q1-q4} +++ vst1.16 {q1-q2}, [r0] +++ add r0, #32 +++ vst1.16 {q3-q4}, [r0] +++ sub r0, #32 +++ vpop {d8-d15} +++ pop {r4-r8} +++ bx lr +++endfunc +++ +++#undef DC_SHIFT +++#undef DC_ADD +++#undef TRN_SHIFT +++ + diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S + new file mode 100644 +-index 0000000..373576b ++index 0000000000..373576b4cb + --- /dev/null + +++ b/libavcodec/arm/hevc_misc_neon.S + @@ -0,0 +1,62 @@ +@@ -1115,8 +1430,310 @@ index 0000000..373576b + + + +endfunc + + ++diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S ++new file mode 100644 ++index 0000000000..bafefd4318 ++--- /dev/null +++++ b/libavcodec/arm/hevcdsp_cres_neon.S ++@@ -0,0 +1,296 @@ +++#include "libavutil/arm/asm.S" +++#include "neon.S" +++ +++@ General notes: +++@ +++@ Residual is only guaranteed to be cliped to 16 bits +++@ This means that we do need to do movul, qadd, qmovun +++@ rather than addw, qmovun (if we were clipped to 15 then we could get away +++@ with this) +++ +++@ ============================================================================ +++@ U add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_4x4_u_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vld1.16 {q0, q1}, [r1] +++ vdup.16 q2, r3 +++ vdup.16 q3, r3 +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_8x8_u_neon_8, export=1 +++ mov r12, #4 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {d16, d17}, [r0, :128], r2 +++ vld2.8 {d18, d19}, [r0, :128] +++ vld1.16 {q0, 
q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ sub r0, r2 +++ vmovl.u8 q11, d18 +++ vqadd.s16 q0, q10 +++ vaddw.u8 q2, q15, d17 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q3, q15, d19 +++ vqmovun.s16 d16, q0 +++ vqmovun.s16 d17, q2 +++ vqmovun.s16 d18, q1 +++ vqmovun.s16 d19, q3 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ vst2.8 {d18, d19}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_u( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_16x16_u_neon_8, export=1 +++ mov r12, #16 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q2, q15, d18 +++ vaddw.u8 q3, q15, d19 +++ vqmovun.s16 d16, q0 +++ vqmovun.s16 d17, q1 +++ vqmovun.s16 d18, q2 +++ vqmovun.s16 d19, q3 +++ vst2.8 {q8, q9}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ V add +++ +++@ add_residual4x4_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_4x4_v_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vld1.16 {q2, q3}, [r1] +++ vdup.16 q0, r3 +++ vdup.16 q1, r3 +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_8x8_v_neon_8, export=1 +++ mov r12, #4 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {d16, d17}, [r0, :128], r2 +++ vld2.8 {d18, d19}, [r0, :128] +++ vld1.16 {q0, q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d17 +++ sub r0, r2 +++ vmovl.u8 q11, d19 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q2, q15, d16 +++ vaddw.u8 q3, q15, d18 +++ vqmovun.s16 d17, q0 +++ vqmovun.s16 d16, q2 +++ vqmovun.s16 d19, q1 +++ vqmovun.s16 d18, q3 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ vst2.8 {d18, d19}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_16x16_v_neon_8, export=1 +++ mov r12, #16 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! 
+++ subs r12, #1 +++ vmovl.u8 q10, d18 +++ vmovl.u8 q11, d19 +++ vaddw.u8 q2, q15, d16 +++ vaddw.u8 q3, q15, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqmovun.s16 d16, q2 +++ vqmovun.s16 d17, q3 +++ vqmovun.s16 d18, q0 +++ vqmovun.s16 d19, q1 +++ vst2.8 {q8, q9}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ U & V add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_4x4_c_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_8x8_c_neon_8, export=1 +++ mov r12, #8 +++ add r3, r1, #(8*8*2) @ Offset to V +++1: +++ vld2.8 {d16, d17}, [r0, :128] +++ vld1.16 {q0}, [r1, :128]! +++ vld1.16 {q1}, [r3, :128]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vst2.8 {d0, d1}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_16x16_c_neon_8, export=1 +++ mov r12, #16 +++ add r3, r1, #(16*16*2) @ Offset to V +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! +++ vld1.16 {q2, q3}, [r3, :256]! 
+++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vqmovun.s16 d2, q2 +++ vqmovun.s16 d3, q3 +++ vst2.8 {q0, q1}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ 32x32 chroma never occurs so NIF +++ +++@ ============================================================================ + diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S +-index 166bddb..9bd0a42 100644 ++index 166bddb104..15c4329cdb 100644 + --- a/libavcodec/arm/hevcdsp_deblock_neon.S + +++ b/libavcodec/arm/hevcdsp_deblock_neon.S + @@ -15,7 +15,7 @@ +@@ -1128,58 +1745,235 @@ index 166bddb..9bd0a42 100644 + */ + + +-@@ -31,6 +31,9 @@ ++@@ -24,70 +24,238 @@ ++ ++ .macro hevc_loop_filter_chroma_start ++ ldr r12, [r2] ++- ldr r3, [r2, #4] ++- add r2, r3, r12 ++- cmp r2, #0 +++ ldr r2, [r2, #4] +++ orrs r2, r12, r2, lsl #16 ++ it eq + bxeq lr + .endm + ++-.macro hevc_loop_filter_chroma_body ++- vsubl.u8 q3, d4, d2 ++- vsubl.u8 q11, d18, d19 ++- vshl.i16 q3, #2 ++- vadd.i16 q11, q3 ++- vdup.16 d0, r12 ++- vdup.16 d1, r3 ++- vrshr.s16 q11, q11, #3 ++- vneg.s16 q12, q0 + +@ Uses: d2, d4, d18, d19 + +@ Returns: d2, d4 +-+@ Modifies: d0-d7, d22-d25 +- .macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +-@@ -49,6 +52,33 @@ +- vqmovun.s16 d4, q2 +- .endm +- +-+ +-+@ Uses r2[0:7], r2[8:15] +-+@ Modifies: d0-d7, d22-d25 +-+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 +-+ vsubl.u8 q3, \Q0, \P0 +-+ vsubl.u8 q11, \P1, \Q1 +-+ vshl.i16 q3, #2 +-+ vadd.i16 q11, q3 +-+ +-+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) +-+ vdup.16 d0, r2 +-+ vmovl.u8 q0, d0 +-+ vuzp.16 d0, d1 +-+ +-+ vrshr.s16 q11, q11, #3 +-+ vneg.s16 q12, q0 +++@ Modifies: d0-d7, d22-d25, r12 +++ +++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 +++ vsubl.u8 q0, \Q0, \P0 +++ vsubl.u8 q1, \P1, \Q1 +++ vdup.16 d4, r2 +++ lsr r2, r2, #16 +++ vshl.i16 q0, #2 +++ ldr r12, [sp, #0] @ r12 = &no_q +++ vadd.i16 q0, q1 +++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] +++ vdup.16 d5, r2 +++ +++ vrshr.s16 q0, q0, #3 +++ ldrh r12, [r12] +++ vneg.s16 q3, q2 +++ vmin.s16 q0, q0, q2 + + vmovl.u8 q2, \Q0 +-+ vmin.s16 q11, q11, q0 +-+ vmax.s16 q11, q11, q12 +-+ vaddw.u8 q1, q11, \P0 +-+ vsub.i16 q2, q11 +++ vmax.s16 q0, q0, q3 +++ vaddw.u8 q1, q0, \P0 +++ vsub.i16 q2, q0 +++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + + vqmovun.s16 \P0, q1 + + vqmovun.s16 \Q0, q2 + +.endm + + +++@ Uses r2 (tc a;b) +++@ Modifies: q0-q3 +++@ On exit +++@ r12 (and flags) contain no_p;no_q +++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth +++ vsub.i16 q0, \Q0, \P0 +++ lsl r12, r2, #(\bit_depth - 8) +++ vsub.i16 q1, \P1, \Q1 +++ vshl.i16 q0, #2 +++ vdup.16 d4, r12 +++ lsr r12, r12, #16 +++ vadd.i16 q0, q1 +++ ldrh r3, [r3] +++ vdup.16 d5, r12 +++ +++ vrshr.s16 q0, q0, #3 +++ vneg.s16 q3, q2 +++ movw r12, #(1 << \bit_depth) - 1 +++ vmin.s16 q0, q0, q2 +++ vmax.s16 q0, q0, q3 +++ vdup.i16 q3, r12 +++ ldr r12, [sp, #0] +++ +++ vadd.i16 \P0, q0, \P0 +++ vsub.i16 \Q0, q0 +++ +++ vmov.i64 q2, #0 +++ ldrh r12, [r12] +++ vmin.s16 \P0, q3 +++ vmin.s16 \Q0, q3 +++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] +++ vmax.s16 \P0, q2 +++ vmax.s16 \Q0, q2 +++.endm +++ +++ +++@ Preserves r12 +++@ Clobbers r2 +++.macro 
hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v
+++ vsubl.u8 q0, \Q0u, \P0u
+++ vsubl.u8 q1, \Q0v, \P0v
+++ vsubl.u8 q2, \P1u, \Q1u
+++ vsubl.u8 q3, \P1v, \Q1v
+++ vshl.i16 q0, #2
+++ vshl.i16 q1, #2
+++ vadd.i16 q0, q2
+++ vdup.16 d4, r2
+++ lsr r2, #16
+++ vadd.i16 q1, q3
+++
+++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16 (all)
+++ vrshr.s16 q0, #3
+++ vdup.16 d6, r2
++ vmovl.u8 q2, d4
++- vmin.s16 q11, q11, q0
++- vmax.s16 q11, q11, q12
++- vaddw.u8 q1, q11, d2
++- vsub.i16 q2, q11
++- vqmovun.s16 d2, q1
++- vqmovun.s16 d4, q2
+++ vmovl.u8 q3, d6
+++ vuzp.16 d4, d5
+++ vrshr.s16 q1, #3
+++ vuzp.16 d6, d7
+++
+++ vmin.s16 q0, q2
+++ vneg.s16 q2, q2
+++ vmin.s16 q1, q3
+++ vneg.s16 q3, q3
+++ vmax.s16 q0, q2
+++ vaddw.u8 q2, q0, \P0u
+++ vmax.s16 q1, q3
+++ vaddw.u8 q3, q1, \P0v
+++
+++ vqmovun.s16 \P0u, q2
+++ vmovl.u8 q2, \Q0u
+++ vqmovun.s16 \P0v, q3
+++ vmovl.u8 q3, \Q0v
+++ vsub.i16 q2, q0
+++ vsub.i16 q3, q1
+++
+++ vqmovun.s16 \Q0u, q2
+++ vqmovun.s16 \Q0v, q3
++ .endm
++
+++@ Preserves r12
+++@ Clobbers r2
+++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth
+++ vsub.i16 q0, \Q0u, \P0u
+++ vsub.i16 q1, \Q0v, \P0v
+++ vsub.i16 q2, \P1u, \Q1u
+++ vsub.i16 q3, \P1v, \Q1v
+++ vshl.i16 q0, #2
+++ vshl.i16 q1, #2
+++ vadd.i16 q0, q2
+++ vdup.16 d4, r2
+++ lsr r2, #16
+++ vadd.i16 q1, q3
+++
+++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16 (all)
+++ vrshr.s16 q0, #3
+++ vdup.16 d6, r2
+++ vshll.u8 q2, d4, #\bit_depth - 8
+++ vshll.u8 q3, d6, #\bit_depth - 8
+++ vuzp.16 d4, d5
+++ vrshr.s16 q1, #3
+++ vuzp.16 d6, d7
+++
+++ movw r2, #(1 << \bit_depth) - 1
+++ vmin.s16 q0, q2
+++ vneg.s16 q2, q2
+++ vmin.s16 q1, q3
+++ vneg.s16 q3, q3
+++ vmax.s16 q0, q2
+++ vmov.i64 q2, #0
+++ vmax.s16 q1, q3
+++ vdup.i16 q3, r2
+++ vadd.i16 \P0u, q0
+++ vsub.i16 \Q0u, q0
+++ vadd.i16 \P0v, q1
+++ vsub.i16 \Q0v, q1
+++
+++ vmax.s16 \P0u, q2
+++ vmax.s16 \Q0u, q2
+++ vmax.s16 \P0v, q2
+++ vmax.s16 \Q0v, q2
+++ vmin.s16 \P0u, q3
+++ vmin.s16 \Q0u, q3
+++ vmin.s16 \P0v, q3
+++ vmin.s16 \Q0v, q3
+++.endm
+++
+
+
+
+ .macro hevc_loop_filter_luma_start
+ ldr r12, [r3]
+ ldr r3, [r3, #4]
+-@@ -60,15 +90,17 @@
+- lsr r3, #16
++- lsl r3, #16
++- orr r3, r12
++- cmp r3, #0
+++ orrs r3, r12, r3, lsl #16
++ it eq
++ bxeq lr
++- lsr r3, #16
+ .endm
+
+ -.macro hevc_loop_filter_luma_body
++- vmovl.u8 q8, d16
++- vmovl.u8 q9, d18
++- vmovl.u8 q10, d20
++- vmovl.u8 q11, d22
++- vmovl.u8 q12, d24
++- vmovl.u8 q13, d26
++- vmovl.u8 q14, d28
++- vmovl.u8 q15, d30
+ +@ Uses: r2, r3, r12
+ +@ Modifies: r5, r6, r7, r8, r9
+-+function hevc_loop_filter_luma_body
+++
+++@ Input:
+++@ r2 beta (raw: needs shift for bitdepth > 8)
+++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
+++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
+++@ [sp,#96] &no_p[0]
+++@ [sp,#100] &no_q[0]
+++@
+++@ Input & output
+++@ 8-bit: d16-d23
+++@ 16-bit: q8-q15
+++@
+++@ Output
+++@ Z r10==0
+++@ r10[ 0:7 ] no_p[0]
+++@ r10[ 8:15] no_p[1]
+++@ r10[16:23] no_q[0]
+++@ r10[24:31] no_q[1]
+++
++
+++.macro m_filter_luma bit_depth
+++.if \bit_depth == 8
+ + vmovl.u8 q15, d23
+ + vmovl.u8 q14, d22
+ + vmovl.u8 q13, d21
+@@ -1187,54 +1981,72 @@ index 166bddb..9bd0a42 100644
+ + vmovl.u8 q11, d19
+ + vmovl.u8 q10, d18
+ + vmovl.u8 q9, d17
+- vmovl.u8 q8, d16
+-- vmovl.u8 q9, d18
+-- vmovl.u8 q10, d20
+-- vmovl.u8 q11, d22
+-- vmovl.u8 q12, d24
+-- vmovl.u8 q13, d26
+-- vmovl.u8 q14, d28
+-- vmovl.u8 q15, d30
+-
+++ vmovl.u8 q8, d16
+++.endif
+ vadd.i16 q7, q9, q11
+++.if \bit_depth > 8
+++ lsl r2, r2, #(\bit_depth - 8)
+++.endif
+ vadd.i16 q6, q14, q12
+-@@ -77,7 +109,6 @@
+++.if \bit_depth > 8
+++ lsl r3, r3, #(\bit_depth - 8)
+++.endif
++ vsub.i16 q7, q10
+++ ldr r5, [sp, #96] @ Bolt no_x values together into r10
++ vsub.i16 q6, q13
+ vabd.s16 q7, q7, q10
+ vabd.s16 q6, q6, q13
+-
+ -
+++ ldrh r10, [r5]
++
+ vdup.16 q0, r2
+ vmov q4, q7
+ vmov q5, q6
+-@@ -152,7 +183,7 @@
++- vdup.16 d4, r12
+++ ldr r5, [sp, #100]
+++ vdup.16 d4, r3
+++ lsr r3, r3, #16
++ vtrn.16 q7, q4
+++ ldrh r5, [r5]
++ vtrn.16 q6, q5
++
++ vshl.u64 q7, #32
++ vshr.u64 q4, #32
++ vshl.u64 q6, #32
+++ orr r10, r10, r5, lsl #16
++ vshr.u64 q5, #32
++ vshr.u64 q7, #32
++ vshr.u64 q6, #32
++@@ -152,7 +320,7 @@
+
+ and r9, r8, r7
+ cmp r9, #0
+ - beq weakfilter_\@
+-+ beq weakfilter_
+++ beq 1f
+
+ vadd.i16 q2, q11, q12
+ vadd.i16 q4, q9, q8
+-@@ -210,11 +241,11 @@
++@@ -210,11 +378,11 @@
+ vbit q13, q3, q5
+ vbit q14, q2, q5
+
+ -weakfilter_\@:
+-+weakfilter_:
+++1:
+ mvn r8, r8
+ and r9, r8, r7
+ cmp r9, #0
+ - beq ready_\@
+-+ beq ready_
+++ beq 2f
+
+ vdup.16 q4, r2
+
+-@@ -275,75 +306,345 @@ weakfilter_\@:
++@@ -275,111 +443,1041 @@ weakfilter_\@:
+ vbit q11, q0, q5
+ vbit q12, q4, q5
+
+ -ready_\@:
+-+ready_:
+++2:
+++.if \bit_depth == 8
+ vqmovun.s16 d16, q8
+ - vqmovun.s16 d18, q9
+ - vqmovun.s16 d20, q10
+@@ -1243,7 +2055,7 @@ index 166bddb..9bd0a42 100644
+ - vqmovun.s16 d26, q13
+ - vqmovun.s16 d28, q14
+ - vqmovun.s16 d30, q15
+--.endm
+++ cmp r10, #0
+ + vqmovun.s16 d17, q9
+ + vqmovun.s16 d18, q10
+ + vqmovun.s16 d19, q11
+ + vqmovun.s16 d21, q13
+ + vqmovun.s16 d22, q14
+ + vqmovun.s16 d23, q15
+++.else
+++ movw r12, #(1 << \bit_depth - 1)
+++ vmov.i64 q0, #0
+++ vdup.i16 q1, r12
+++ @ q8 & q15 should be unaltered and so don't require clipping
+++ vmax.s16 q9, q0
+++ cmp r10, #0
+++ vmax.s16 q10, q0
+++ vmax.s16 q11, q0
+++ vmax.s16 q12, q0
+++ vmax.s16 q13, q0
+++ vmax.s16 q14, q0
+++ vmin.s16 q9, q1
+++ vmin.s16 q10, q1
+++ vmin.s16 q11, q1
+++ vmin.s16 q12, q1
+++ vmin.s16 q13, q1
+++ vmin.s16 q14, q1
+++.endif
+ + mov pc, lr
++ .endm
++
+++function hevc_loop_filter_luma_body
+++ m_filter_luma 8
+ +endfunc
+ +
+ +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), no_p (sp[0]), no_q (sp[4]), src2 (sp[8]))
+@@ -1263,7 +2098,16 @@ index 166bddb..9bd0a42 100644
+ + b v_loop_luma_common
+ +endfunc
+ +
+-
+++
+++@ void ff_hevc_v_loop_filter_luma_neon(
+++@ uint8_t *_pix, [r0]
+++@ ptrdiff_t _stride, [r1]
+++@ int _beta, [r2]
+++@ int *_tc, [r3]
+++@ uint8_t *_no_p, [sp+0]
+++@ uint8_t *_no_q) [sp+4]
+++
+++
+ function ff_hevc_v_loop_filter_luma_neon, export=1
+ hevc_loop_filter_luma_start
+ - push {r5-r11}
+@@ -1271,14 +2115,6 @@ index 166bddb..9bd0a42 100644
+ +
+ + sub r4, r0, #4
+ +v_loop_luma_common:
+-+ @ Why this isn't a bitmask to start with I have no idea...
+-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+-+ ldr r5, [sp, #32]
+-+ ldrh r10, [r5]
+-+ ldr r5, [sp, #36]
+-+ ldrh r5, [r5]
+-+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1]
+-+
+ vpush {d8-d15}
+ - sub r0, #4
+ - vld1.8 {d16}, [r0], r1
+@@ -1335,44 +2171,38 @@ index 166bddb..9bd0a42 100644
+ +
+ + @ no_p[1]
+ + tst r10, #0xff00
+-+ itt ne
+-+ addne r4, r4, r1, lsl #2
+++ add r2, r4, r1, lsl #2
+ + bne 1f
+ + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
+ + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1
+ + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
+-+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1
+-+
+++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32]
+++1:
+++ @ no_p[0]
+++ tst r10, #0xff
+++ bne 1f
+++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1
+++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1
+++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1
+++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32]
+ +1:
+ + @ no_q[1]
+ + tst r10, #0xff000000
+-+ itt ne
+-+ addne r0, r0, r1, lsl #2
+-+ bne 2f
+++ add r2, r0, r1, lsl #2
+++ bne 1f
+ + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
+ + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1
+ + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
+-+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1
+-+
+-+2:
+-+ @ no_p[0]
+-+ tst r10, #0xff
+-+ bne 3f
+-+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
+-+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1
+-+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
+-+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32]
+-+
+-+3:
+++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32]
+++1:
+ + @ no_q[0]
+ + tst r10, #0xff0000
+-+ bne 4f
+-+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
+-+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1
+-+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
+-+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32]
+-+
+-+4:
+++ bne 1f
+++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1
+++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
+++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1
+++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
+++1:
+ +bypasswrite:
+ vpop {d8-d15}
+ - pop {r5-r11}
+@@ -1380,6 +2210,81 @@ index 166bddb..9bd0a42 100644
+ + pop {r4-r10,pc}
+ endfunc
+
+++.macro m_filter_v_luma_common_16 bit_depth
+++ vpush {d8-d15}
+++
+++ @ Uses slightly fewer instructions to do laned loads than unlaned
+++ @ and transpose. This also means that we can use the same code for
+++ @ both split & unsplit deblock
+++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
+++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+++
+++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
+++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+++
+++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
+++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+++
+++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
+++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+++
+++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
+++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+++
+++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+++
+++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+++
+++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
+++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+++
+++ bl hevc_loop_filter_luma_body_\bit_depth
+++
+++ neg r1, r1
+++
+++ @ p[1]
+++ tst r10, #0xff00
+++ add r2, r4, r1, lsl #2
+++ bne 1f
+++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
+++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
+++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
+++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4]
+++1:
+++ @ p[0]
+++ tst r10, #0xff
+++ bne 1f
+++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1
+++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1
+++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1
+++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2]
+++1:
+++ @ q[1]
+++ tst r10, #0xff000000
+++ add r2, r0, r1, lsl #2
+++ bne 1f
+++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
+++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0]
+++1:
+++ @ q[0]
+++ tst r10, #0xff0000
+++ bne 1f
+++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1
+++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
+++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1
+++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
+++1:
+++ vpop {d8-d15}
+++ pop {r4-r10,pc}
+++.endm
+++
+++
+++
+++
+ +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
+ +@ ptrdiff_t stride, [r1]
+ +@ int beta, [r2]
+@@ -1429,13 +2334,6 @@ index 166bddb..9bd0a42 100644
+ + neg r1, r1
+ + add r0, r0, r1
+ +
+-+ @ Why this isn't a bitmask to start with I have no idea...
+-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0
+-+ ldr r5, [sp, #32]
+-+ ldrh r10, [r5]
+-+ ldr r5, [sp, #36]
+-+ ldrh r5, [r5]
+-+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1]
+ + bne 1f
+ +
+ + vst1.8 {d22}, [r0], r1
+@@ -1486,8 +2384,81 @@ index 166bddb..9bd0a42 100644
+ +
+ + pop {r4-r10,pc}
+ +
+- endfunc
+-
+++endfunc
+++
+++
+++.macro m_filter_h_luma_16 bit_depth
+++ hevc_loop_filter_luma_start
+++ push {r4-r10,lr}
+++
+++ vpush {d8-d15}
+++ sub r0, r0, r1, lsl #2
+++
+++ vld1.16 { q8}, [r0], r1
+++ vld1.16 { q9}, [r0], r1
+++ vld1.16 {q10}, [r0], r1
+++ vld1.16 {q11}, [r0], r1
+++ vld1.16 {q12}, [r0], r1
+++ vld1.16 {q13}, [r0], r1
+++ vld1.16 {q14}, [r0], r1
+++ vld1.16 {q15}, [r0]
+++
+++ bl hevc_loop_filter_luma_body_\bit_depth
+++
+++ vpop {d8-d15}
+++
+++ sub r0, r1
+++ neg r1, r1
+++ bne 1f
+++
+++ vst1.16 {q14}, [r0], r1
+++ vst1.16 {q13}, [r0], r1
+++ vst1.16 {q12}, [r0], r1
+++ vst1.16 {q11}, [r0], r1
+++ vst1.16 {q10}, [r0], r1
+++ vst1.16 { q9}, [r0]
+++ pop {r4-r10,pc}
+++
+++@ Partial write
+++1:
+++ tst r10, #0xff0000
+++ mov r2, r0
+++ bne 1f
+++ vst1.16 {d28}, [r2], r1
+++ vst1.16 {d26}, [r2], r1
+++ vst1.16 {d24}, [r2]
+++
+++1:
+++ tst r10, #0xff000000
+++ add r2, r0, #8
+++ bne 1f
+++ vst1.16 {d29}, [r2], r1
+++ vst1.16 {d27}, [r2], r1
+++ vst1.16 {d25}, [r2]
+++
+++1:
+++ tst r10, #0xff
+++ @ r0 = r0 + r1 * 3
+++ add r0, r0, r1
+++ add r0, r0, r1, lsl # 1
+++ add r2, r0, #8
+++ bne 1f
+++ vst1.16 {d22}, [r0], r1
+++ vst1.16 {d20}, [r0], r1
+++ vst1.16 {d18}, [r0]
+++
+++1:
+++ tst r10, #0xff00
+++ bne 1f
+++ vst1.16 {d23}, [r2], r1
+++ vst1.16 {d21}, [r2], r1
+++ vst1.16 {d19}, [r2]
+++
+++1:
+++ pop {r4-r10,pc}
+++.endm
+++
+++
+ +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0
+ +@ unsigned int stride, // r1
+ +@ uint32_t tc4, // r2
+@@ -1501,9 +2472,7 @@ index 166bddb..9bd0a42 100644
+ + vld2.8 {d26,d27}, [r0], r1
+ + vld2.8 {d28,d29}, [r0]
+ + sub r0, r0, r1, lsl #1
+-+ hevc_loop_filter_uv_body d16, d18, d26, d28
+-+ lsr r2, r2, #16
+-+ hevc_loop_filter_uv_body d17, d19, d27, d29
+++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29
+ + cmp r3, #0
+ + bne 1f
+ + vst2.8 {d18,d19}, [r0], r1
+@@ -1513,122 +2482,509 @@ index 166bddb..9bd0a42 100644
+ + @ At least one no_f bit is set
+ + @ Which means we need to break this apart in an ugly fashion
+ +1: vzip.8 d18, d19
+++ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+ + vzip.8 d26, d27
+ + sub r1, r1, #8
+ +
+-+ tst r3, #1
+-+ bne 1f
+++ bmi 1f
+ + vst1.8 {d18}, [r0]
+ +1: add r0, r0, #8
+-+ tst r3, #2
+-+ bne 2f
+++ bcs 2f
+ + vst1.8 {d19}, [r0]
+-+2: add r0, r0, r1
+++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+++ add r0, r0, r1
+ +
+-+ tst r3, #4
+-+ bne 1f
+++ bmi 1f
+ + vst1.8 {d26}, [r0]
+-+1: add r0, r0, #8
+-+ tst r3, #8
+-+ it ne
+-+ bxne lr
+++1: it cs
+++ bxcs lr
+++ add r0, r0, #8
+ + vst1.8 {d27}, [r0]
+ + bx lr
+ +
+ +endfunc
+ +
+ +
+++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
+++@ unsigned int stride, // r1
+++@ uint32_t tc4, // r2
+++@ unsigned int no_f); // r3
+++@
+++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++@
+++@ Macro here; actual function near bottom
+++
+++.macro m_filter_h_uv_16 bit_depth
+++ sub r0, r0, r1, lsl #1
+++ vld2.16 {q8, q9 }, [r0], r1
+++ vld2.16 {q10, q11}, [r0], r1
+++ vld2.16 {q12, q13}, [r0], r1
+++ vld2.16 {q14, q15}, [r0]
+++ sub r0, r0, r1, lsl #1
+++
+++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+++
+++ cmp r3, #0
+++ bne 1f
+++ vst2.16 {q10, q11}, [r0], r1
+++ vst2.16 {q12, q13}, [r0]
+++ bx lr
+++
+++ @ At least one no_f bit is set
+++ @ Which means we need to break this apart in an ugly fashion
+++1: vzip.16 q10, q11
+++ lsls r2, r3, #31 @ b0 -> N, b1 -> C
+++ vzip.16 q12, q13
+++ sub r1, r1, #16
+++
+++ bmi 1f
+++ vst1.16 {q10}, [r0]
+++1: add r0, r0, #16
+++ bcs 2f
+++ vst1.16 {q11}, [r0]
+++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C
+++ add r0, r0, r1
+++
+++ bmi 1f
+++ vst1.16 {q12}, [r0]
+++1: it cs
+++ bxcs lr
+++ add r0, r0, #16
+++ vst1.16 {q13}, [r0]
+++ bx lr
+++.endm
+++
+++
+ +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+ +@ unsigned int stride, // r1
+ +@ uint32_t tc4, // r2
+ +@ uint8_t * src_l, // r3
+ +@ unsigned int no_f); // sp[0]
+ +@
+-+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++
+ +function ff_hevc_v_loop_filter_uv2_neon_8, export=1
+ + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1
+-+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1
+++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1
+++ sub r12, r0, r3
+ +
+ + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+-+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1
+++ cmp r12, #4
+ +
+ + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+-+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1
+ +
+ + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+-+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1
+ +
+ + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+-+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1
+ +
+ + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+-+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1
+ +
+ + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+-+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1
+ +
+ + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3]
+-+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0]
+++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0]
+++ it eq
+++ ldreq r12, [sp, #0]
+++
+++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23
+++ cmp r12, #0
+++ add r3, #2
+++ neg r1, r1
+++ bne 1f
+++
+++@ Much/most of the time r0 == r3 + 4 and no_f == 0
+++@ so it is worth having this special case
+++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1
+++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1
+++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1
+++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1
+++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1
+++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1
+++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1
+++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3]
+++ bx lr
+++
+++@ Either split or partial
+++1:
+++ ldr r12, [sp, #0]
+++ lsls r12, #29 @ b2 -> N, b3 -> C
+++ add r2, r0, r1, lsl #2
+++ bcs 1f
+++ vst2.8 {d20[7], d21[7]}, [r0], r1
+++ vst2.8 {d20[6], d21[6]}, [r0], r1
+++ vst2.8 {d20[5], d21[5]}, [r0], r1
+++ vst2.8 {d20[4], d21[4]}, [r0]
+++1:
+++ bmi 2f
+++ vst2.8 {d20[3], d21[3]}, [r2], r1
+++ vst2.8 {d20[2], d21[2]}, [r2], r1
+++ vst2.8 {d20[1], d21[1]}, [r2], r1
+++ vst2.8 {d20[0], d21[0]}, [r2]
+++
+++2:
+++ lsls r12, #2
+++ add r2, r3, r1, lsl #2
+++ bcs 3f
+++ vst2.8 {d18[7], d19[7]}, [r3], r1
+++ vst2.8 {d18[6], d19[6]}, [r3], r1
+++ vst2.8 {d18[5], d19[5]}, [r3], r1
+++ vst2.8 {d18[4], d19[4]}, [r3]
+++3:
+++ it mi
+++ bxmi lr
+++ vst2.8 {d18[3], d19[3]}, [r2], r1
+++ vst2.8 {d18[2], d19[2]}, [r2], r1
+++ vst2.8 {d18[1], d19[1]}, [r2], r1
+++ vst2.8 {d18[0], d19[0]}, [r2]
+++ bx lr
++ endfunc
++
+++
+++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
+++@ unsigned int stride, // r1
+++@ uint32_t tc4, // r2
+++@ uint8_t * src_l, // r3
+++@ unsigned int no_f); // sp[0]
+++@
+++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
+++.macro m_filter_v_uv2_16 bit_depth
+++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1
+++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
+++ sub r12, r0, r3
+++
+++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1
+++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
+++ cmp r12, #8
+++
+++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1
+++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
+++
+++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1
+++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
+ +
+-+ hevc_loop_filter_uv_body d16, d18, d26, d28
+-+ lsr r2, r2, #16
+-+ hevc_loop_filter_uv_body d17, d19, d27, d29
+++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1
+++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
+ +
+++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1
+++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
+++
+++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1
+++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
+++
+++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3]
+++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
+++ it eq
+++ ldreq r12, [sp, #0]
+++
+++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth
+++ cmp r12, #0
+++ add r3, #4
+ + neg r1, r1
+++ bne 1f
+ +
+-+ ldr r2, [sp, #0]
+++@ Much/most of the time r0 == r3 + 8 and no_f == 0
+++@ so it is worth having this special case
+++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1
+++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1
+++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1
+++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1
+++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1
+++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1
+++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1
+++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1
+++ bx lr
+ +
+-+ @ p[1]
+-+ tst r2, #2
+-+ itt ne
+-+ addne r3, r3, r1, lsl #2
+++@ Either split or partial
+++1:
+++ ldr r12, [sp, #0]
+++ lsls r12, #29 @ b2 -> N, b3 -> C
+++ add r2, r0, r1, lsl #2
+++ bcs 1f
+++ vst2.16 {d25[3], d27[3]}, [r0], r1
+++ vst2.16 {d25[2], d27[2]}, [r0], r1
+++ vst2.16 {d25[1], d27[1]}, [r0], r1
+++ vst2.16 {d25[0], d27[0]}, [r0]
+++1:
+++ bmi 2f
+++ vst2.16 {d24[3], d26[3]}, [r2], r1
+++ vst2.16 {d24[2], d26[2]}, [r2], r1
+++ vst2.16 {d24[1], d26[1]}, [r2], r1
+++ vst2.16 {d24[0], d26[0]}, [r2]
+++
+++2:
+++ lsls r12, #2
+++ add r2, r3, r1, lsl #2
+++ bcs 3f
+++ vst2.16 {d21[3], d23[3]}, [r3], r1
+++ vst2.16 {d21[2], d23[2]}, [r3], r1
+++ vst2.16 {d21[1], d23[1]}, [r3], r1
+++ vst2.16 {d21[0], d23[0]}, [r3]
+++3:
+++ it mi
+++ bxmi lr
+++ vst2.16 {d20[3], d22[3]}, [r2], r1
+++ vst2.16 {d20[2], d22[2]}, [r2], r1
+++ vst2.16 {d20[1], d22[1]}, [r2], r1
+++ vst2.16 {d20[0], d22[0]}, [r2]
+++ bx lr
+++.endm
+++
+++
+++
++ function ff_hevc_v_loop_filter_chroma_neon, export=1
++ hevc_loop_filter_chroma_start
+++
+++ sub r0, #2
+++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1
+++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0], r1
+++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1
+++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1
+++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1
+++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1
+++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1
+++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1
+++
+++ sub r0, r0, r1, lsl #3
+++ add r0, r0, #1
+++ hevc_loop_filter_chroma_body d16, d17, d18, d19
+ + bne 1f
+-+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1
+-+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1
+-+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1
+-+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1
+++
+++ vst2.8 {d17[0], d18[0]}, [r0], r1
+++ vst2.8 {d17[1], d18[1]}, [r0], r1
+++ vst2.8 {d17[2], d18[2]}, [r0], r1
+++ vst2.8 {d17[3], d18[3]}, [r0], r1
+++ vst2.8 {d17[4], d18[4]}, [r0], r1
+++ vst2.8 {d17[5], d18[5]}, [r0], r1
+++ vst2.8 {d17[6], d18[6]}, [r0], r1
+++ vst2.8 {d17[7], d18[7]}, [r0], r1
+++ bx lr
+ +
+ +1:
+-+ @ q[1]
+-+ tst r2, #8
+-+ itt ne
+-+ addne r0, r0, r1, lsl #2
+-+ bne 2f
+-+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1
+-+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1
+-+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1
+-+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1
+++ tst r12, #0xff @ P0a
+++ bne 2f
+++
+++ vst1.8 {d17[0]}, [r0], r1
+++ vst1.8 {d17[1]}, [r0], r1
+++ vst1.8 {d17[2]}, [r0], r1
+++ vst1.8 {d17[3]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+ +
+ +2:
+-+ @ p[0]
+-+ tst r2, #1
+++ tst r12, #0xff0000 @ Q0a
+++ add r0, #1
+ + bne 3f
+-+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1
+-+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1
+-+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1
+-+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3]
+++ vst1.8 {d18[0]}, [r0], r1
+++ vst1.8 {d18[1]}, [r0], r1
+++ vst1.8 {d18[2]}, [r0], r1
+++ vst1.8 {d18[3]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+ +
+ +3:
+-+ @ q[0]
+-+ tst r2, #4
+++ tst r12, #0xff000000 @ Q0b
+++ add r0, r0, r1, lsl #2
+++ bne 4f
+++ vst1.8 {d18[4]}, [r0], r1
+++ vst1.8 {d18[5]}, [r0], r1
+++ vst1.8 {d18[6]}, [r0], r1
+++ vst1.8 {d18[7]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+++
+++4:
+++ tst r12, #0xff00 @ P0b
+ + it ne
+ + bxne lr
+-+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1
+-+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1
+-+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1
+-+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0]
+ +
+++ sub r0, #1
+++ vst1.8 {d17[4]}, [r0], r1
+++ vst1.8 {d17[5]}, [r0], r1
+++ vst1.8 {d17[6]}, [r0], r1
+++ vst1.8 {d17[7]}, [r0], r1
+ + bx lr
+++
+ +endfunc
+ +
+ +
+- function ff_hevc_v_loop_filter_chroma_neon, export=1
+- hevc_loop_filter_chroma_start
+++.macro m_filter_v_chroma_16 bit_depth
+++ hevc_loop_filter_chroma_start
+++
+ sub r0, #4
+-@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1
+- vst1.8 {d4}, [r0]
+++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1
+++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1
+++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1
+++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1
+++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1
+++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1
+++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1
+++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1
+++
+++ sub r0, r0, r1, lsl #3
+++ add r0, r0, #2
+++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+++ bne 1f
+++
+++ vst2.16 {d18[0], d20[0]}, [r0], r1
+++ vst2.16 {d18[1], d20[1]}, [r0], r1
+++ vst2.16 {d18[2], d20[2]}, [r0], r1
+++ vst2.16 {d18[3], d20[3]}, [r0], r1
+++ vst2.16 {d19[0], d21[0]}, [r0], r1
+++ vst2.16 {d19[1], d21[1]}, [r0], r1
+++ vst2.16 {d19[2], d21[2]}, [r0], r1
+++ vst2.16 {d19[3], d21[3]}, [r0], r1
+++ bx lr
+++
+++1:
+++ tst r12, #0xff @ P0a
+++ bne 2f
+++
+++ vst1.16 {d18[0]}, [r0], r1
+++ vst1.16 {d18[1]}, [r0], r1
+++ vst1.16 {d18[2]}, [r0], r1
+++ vst1.16 {d18[3]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+++
+++2:
+++ tst r12, #0xff0000 @ Q0a
+++ add r0, #1
+++ bne 3f
+++ vst1.16 {d20[0]}, [r0], r1
+++ vst1.16 {d20[1]}, [r0], r1
+++ vst1.16 {d20[2]}, [r0], r1
+++ vst1.16 {d20[3]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+++
+++3:
+++ tst r12, #0xff000000 @ Q0b
+++ add r0, r0, r1, lsl #2
+++ bne 4f
+++ vst1.16 {d21[0]}, [r0], r1
+++ vst1.16 {d21[1]}, [r0], r1
+++ vst1.16 {d21[2]}, [r0], r1
+++ vst1.16 {d21[3]}, [r0], r1
+++ sub r0, r0, r1, lsl #2
+++
+++4:
+++ tst r12, #0xff00 @ P0b
+++ it ne
+++ bxne lr
+++
+++ sub r0, #1
+++ vst1.16 {d19[0]}, [r0], r1
+++ vst1.16 {d19[1]}, [r0], r1
+++ vst1.16 {d19[2]}, [r0], r1
+++ vst1.16 {d19[3]}, [r0], r1
+++ bx lr
+++.endm
+++
+++
+++@ void ff_hevc_h_loop_filter_chroma_neon(
+++@ uint8_t *_pix, [r0]
+++@ ptrdiff_t _stride, [r1]
+++@ int *_tc, [r2]
+++@ uint8_t *_no_p, [r3]
+++@ uint8_t *_no_q); [sp+0]
+++
+++function ff_hevc_h_loop_filter_chroma_neon, export=1
+++ hevc_loop_filter_chroma_start
+++ sub r0, r0, r1, lsl #1
++ vld1.8 {d16}, [r0], r1
++ vld1.8 {d17}, [r0], r1
++ vld1.8 {d18}, [r0], r1
++- vld1.8 {d2}, [r0], r1
++- vld1.8 {d4}, [r0], r1
++- vld1.8 {d19}, [r0], r1
++- vld1.8 {d20}, [r0], r1
++- vld1.8 {d21}, [r0], r1
++- sub r0, r0, r1, lsl #3
++- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
++- hevc_loop_filter_chroma_body
++- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21
++- vst1.8 {d16}, [r0], r1
+++ vld1.8 {d19}, [r0]
+++ sub r0, r0, r1, lsl #1
+++ hevc_loop_filter_chroma_body d16, d17, d18, d19
+++ bne 1f @ Partial write
++ vst1.8 {d17}, [r0], r1
++- vst1.8 {d18}, [r0], r1
++- vst1.8 {d2}, [r0], r1
++- vst1.8 {d4}, [r0], r1
++- vst1.8 {d19}, [r0], r1
++- vst1.8 {d20}, [r0], r1
++- vst1.8 {d21}, [r0]
+++ vst1.8 {d18}, [r0]
+++ bx lr
+++1:
+++ tst r12, #0xff
+++ vmov r2, r3, d17
+++ it eq
+++ streq r2, [r0]
+++ tst r12, #0xff00
+++ it eq
+++ streq r3, [r0, #4]
+++
+++ add r0, r1
+++ tst r12, #0xff0000
+++ vmov r2, r3, d18
+++ it eq
+++ streq r2, [r0]
+++ tst r12, #0xff000000
+++ it eq
+++ streq r3, [r0, #4]
+++
+ bx lr
+ endfunc
++
++-function ff_hevc_h_loop_filter_chroma_neon, export=1
+++.macro m_filter_h_chroma_16 bit_depth
++ hevc_loop_filter_chroma_start
++ sub r0, r0, r1, lsl #1
++- vld1.8 {d18}, [r0], r1
++- vld1.8 {d2}, [r0], r1
++- vld1.8 {d4}, [r0], r1
++- vld1.8 {d19}, [r0]
+++ vld1.16 {q8}, [r0], r1
+++ vld1.16 {q9}, [r0], r1
+++ vld1.16 {q10}, [r0], r1
+++ vld1.16 {q11}, [r0]
++ sub r0, r0, r1, lsl #1
++- hevc_loop_filter_chroma_body
++- vst1.8 {d2}, [r0], r1
++- vst1.8 {d4}, [r0]
+++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth
+++ bne 1f @ Partial write
+++ vst1.16 {q9}, [r0], r1
+++ vst1.16 {q10}, [r0]
+++ bx lr
+++1:
+++ tst r12, #0xff
+++ bne 2f
+++ vst1.16 {d18}, [r0]
+++2:
+++ tst r12, #0xff00
+++ bne 3f
+++ add r0, #8
+++ vst1.16 {d19}, [r0]
+++ sub r0, #8
+++3:
+++ tst r12, #0xff0000
+++ add r0, r1
+++ bne 4f
+++ vst1.16 {d20}, [r0]
+++4:
+++ tst r12, #0xff000000
+++ it ne
+++ bxne lr
+++ add r0, #8
+++ vst1.16 {d21}, [r0]
+++
++ bx lr
+++.endm
+++
+ +
+ +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i
+ + * int *curr_rpl0, int *curr_
+@@ -1754,9 +3110,54 @@ index 166bddb..9bd0a42 100644
+ + b 11b
+ +endfunc
+ +
+++@ =============================================================================
+++@
+++@ 10 bit
+++
+++function hevc_loop_filter_luma_body_10
+++ m_filter_luma 10
+++endfunc
+++
+++function ff_hevc_h_loop_filter_luma_neon_10, export=1
+++ m_filter_h_luma_16 10
+++endfunc
+++
+++function ff_hevc_v_loop_filter_luma2_neon_10, export=1
+++ hevc_loop_filter_luma_start
+++ push {r4-r10,lr} @ 8 regs = 32 bytes
+++
+++ ldr r4, [sp, #40]
+++ b v_loop_luma_common_10
+++endfunc
+++
+++function ff_hevc_v_loop_filter_luma_neon_10, export=1
+++ hevc_loop_filter_luma_start
+++ push {r4-r10,lr}
+++
+++ sub r4, r0, #8
+++v_loop_luma_common_10:
+++ m_filter_v_luma_common_16 10
+++endfunc
+++
+++function ff_hevc_h_loop_filter_uv_neon_10, export=1
+++ m_filter_h_uv_16 10
+++endfunc
+++
+++function ff_hevc_v_loop_filter_uv2_neon_10, export=1
+++ m_filter_v_uv2_16 10
+++endfunc
+++
+++function ff_hevc_h_loop_filter_chroma_neon_10, export=1
+++ m_filter_h_chroma_16 10
+++endfunc
+++
+++function ff_hevc_v_loop_filter_chroma_neon_10, export=1
+++ m_filter_v_chroma_16 10
++ endfunc
+++
+ diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S
+ new file mode 100644
+-index 0000000..00eab9e
++index 0000000000..00eab9eeee
+ --- /dev/null
+ +++ b/libavcodec/arm/hevcdsp_epel_neon.S
+ @@ -0,0 +1,337 @@
+@@ -2097,70 +3498,620 @@ index 0000000..00eab9e
+ + .byte 4, 28, 46, 6
+ + .byte 2, 16, 54, 4
+ + .byte 2, 10, 58, 2
++diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S
++index 13d540e5ff..9b6d745556 100644
++--- a/libavcodec/arm/hevcdsp_idct_neon.S
+++++ b/libavcodec/arm/hevcdsp_idct_neon.S
++@@ -21,82 +21,6 @@
++ #include "libavutil/arm/asm.S"
++ #include "neon.S"
++
++-function ff_hevc_idct_4x4_dc_neon_8, export=1
++- ldrsh r1, [r0]
++- ldr r2, =0x20
++- add r1, #1
++- asr r1, #1
++- add r1, r2
++- asr r1, #6
++- vdup.16 q0, r1
++- vdup.16 q1, r1
++- vst1.16 {q0, q1}, [r0]
++- bx lr
++-endfunc
++-
++-function ff_hevc_idct_8x8_dc_neon_8, export=1
++- ldrsh r1, [r0]
++- ldr r2, =0x20
++- add r1, #1
++- asr r1, #1
++- add r1, r2
++- asr r1, #6
++- vdup.16 q8, r1
++- vdup.16 q9, r1
++- vmov.16 q10, q8
++- vmov.16 q11, q8
++- vmov.16 q12, q8
++- vmov.16 q13, q8
++- vmov.16 q14, q8
++- vmov.16 q15, q8
++- vstm r0, {q8-q15}
++- bx lr
++-endfunc
++-
++-function ff_hevc_idct_16x16_dc_neon_8, export=1
++- ldrsh r1, [r0]
++- ldr r2, =0x20
++- add r1, #1
++- asr r1, #1
++- add r1, r2
++- asr r1, #6
++- vdup.16 q8, r1
++- vdup.16 q9, r1
++- vmov.16 q10, q8
++- vmov.16 q11, q8
++- vmov.16 q12, q8
++- vmov.16 q13, q8
++- vmov.16 q14, q8
++- vmov.16 q15, q8
++- vstm r0!, {q8-q15}
++- vstm r0!, {q8-q15}
++- vstm r0!, {q8-q15}
++- vstm r0, {q8-q15}
++- bx lr
++-endfunc
++-
++-function ff_hevc_idct_32x32_dc_neon_8, export=1
++- ldrsh r1, [r0]
++- ldr r2, =0x20
++- add r1, #1
++- asr r1, #1
++- add r1, r2
++- asr r1, #6
++- mov r3, #16
++- vdup.16 q8, r1
++- vdup.16 q9, r1
++- vmov.16 q10, q8
++- vmov.16 q11, q8
++- vmov.16 q12, q8
++- vmov.16 q13, q8
++- vmov.16 q14, q8
++- vmov.16 q15, q8
++-1: subs r3, #1
++- vstm r0!, {q8-q15}
++- bne 1b
++- bx lr
++-endfunc
++-
++ function ff_hevc_transform_add_4x4_neon_8, export=1
++ vldm r1, {q0-q1}
++ vld1.32 d4[0], [r0], r2
++@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1
++ bx lr
++ endfunc
++
+ +
+++@ ff_hevc_add_residual_4x4_dc_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+++
+++function ff_hevc_add_residual_4x4_dc_neon_8, export=1
+++ vdup.16 q15, r2
+++
+++ vld1.32 d4[0], [r0], r1
+++ vld1.32 d4[1], [r0], r1
+++ vld1.32 d5[0], [r0], r1
+++ vld1.32 d5[1], [r0], r1
+++ sub r0, r0, r1, lsl #2
+++ vaddw.u8 q0, q15, d4
+++ vaddw.u8 q1, q15, d5
+++ vqmovun.s16 d0, q0
+++ vqmovun.s16 d1, q1
+++ vst1.32 d0[0], [r0], r1
+++ vst1.32 d0[1], [r0], r1
+++ vst1.32 d1[0], [r0], r1
+++ vst1.32 d1[1], [r0], r1
+++ bx lr
+++endfunc
+ +
+++@ ff_hevc_add_residual_4x4_dc_c_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+ +
+++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1
+++ vdup.32 q15, r2
+++ mov r3, #4
+++ b 1f
+++endfunc
+ +
+++@ ff_hevc_add_residual_8x8_dc_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+++
+++function ff_hevc_add_residual_8x8_dc_neon_8, export=1
+++ vdup.16 q15, r2
+++ mov r3, #8
+++
+++1: subs r3, #1
+++ vld1.8 d16, [r0]
+++ vaddw.u8 q0, q15, d16
+++ vqmovun.s16 d0, q0
+++ vst1.32 d0, [r0], r1
+++ bne 1b
+++ bx lr
+++endfunc
+ +
+ +
+++@ ff_hevc_add_residual_8x8_dc_c_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+ +
+++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1
+++ vdup.32 q15, r2
+++ mov r3, #8
+++ b 1f
+++endfunc
+++
+++@ ff_hevc_add_residual_16x16_dc_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+++
+++function ff_hevc_add_residual_16x16_dc_neon_8, export=1
+++ vdup.16 q15, r2
+++ mov r3, #16
+++
+++1: subs r3, #1
+++ vld1.8 {q8}, [r0]
+++ vaddw.u8 q0, q15, d16
+++ vaddw.u8 q1, q15, d17
+++ vqmovun.s16 d0, q0
+++ vqmovun.s16 d1, q1
+++ vst1.8 {q0}, [r0], r1
+++ bne 1b
+++ bx lr
+++endfunc
+++
+++
+++@ ff_hevc_add_residual_16x16_dc_c_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+++
+++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1
+++ vdup.32 q15, r2
+++ mov r3, #16
+++ b 1f
+++endfunc
+++
+++@ ff_hevc_add_residual_32x32_dc_neon_8(
+++@ uint8_t * dst, // [r0]
+++@ unsigned int stride, // [r1]
+++@ int dc) // [r2]
+++
+++function ff_hevc_add_residual_32x32_dc_neon_8, export=1
+++ vdup.16 q15, r2
+++ mov r3, #32
+++
+++1: subs r3, #1
+++ vld1.8 {q8, q9}, [r0]
+++ vaddw.u8 q0, q15, d16
+++ vaddw.u8 q1, q15, d17
+++ vaddw.u8 q2, q15, d18
+++ vaddw.u8 q3, q15, d19
+++ vqmovun.s16 d0, q0
+++ vqmovun.s16 d1, q1
+++ vqmovun.s16 d2, q2
+++ vqmovun.s16 d3, q3
+++ vst1.8 {q0, q1}, [r0], r1
+++ bne 1b
+++ bx lr
+++endfunc
+++
+++
+++
++ .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7
++ vtrn.64 \r0, \r4
++ vtrn.64 \r1, \r5
++@@ -263,55 +312,6 @@ endfunc
++ vqrshrn.s32 \r3, q3, \shift
++ .endm
++
++-function ff_hevc_transform_4x4_neon_8, export=1
++- vpush {d8-d15}
++- vld1.16 {q14, q15}, [r0] // coeffs
++- ldr r3, =0x00240053 // 36 and 83
++- vmov.32 d0[0], r3
++-
++- tr4_shift d28, d29, d30, d31, #7
++-
++- vtrn.16 d28, d29
++- vtrn.16 d30, d31
++- vtrn.32 q14, q15
++-
++- tr4_shift d28, d29, d30, d31, #12
++-
++- vtrn.16 d28, d29
++- vtrn.16 d30, d31
++- vtrn.32 q14, q15
++-
++- vst1.16 {q14, q15}, [r0]
++- vpop {d8-d15}
++- bx lr
++-endfunc
++-
++-function ff_hevc_transform_luma_4x4_neon_8, export=1
++- vpush {d8-d15}
++- vld1.16 {q14, q15}, [r0] // coeffs
++- ldr r3, =0x4a // 74
++- vmov.32 d0[0], r3
++- ldr r3, =0x1d // 29
++- vmov.32 d0[1], r3
++- ldr r3, =0x37 // 55
++- vmov.32 d1[0], r3
++-
++- tr4_luma_shift d28, d29, d30, d31, #7
++-
++- vtrn.16 d28, d29
++- vtrn.16 d30, d31
++- vtrn.32 q14, q15
++-
++- tr4_luma_shift d28, d29, d30, d31, #12
++-
++- vtrn.16 d28, d29
++- vtrn.16 d30, d31
++- vtrn.32 q14, q15
++- vst1.16 {q14, q15}, [r0]
++- vpop {d8-d15}
++- bx lr
++-endfunc
++-
++ .macro tr8_begin in0, in1, in2, in3
++ vmull.s16 q7, \in0, d1[1] // 89 * src1
++ vmull.s16 q8, \in0, d1[0] // 75 * src1
++@@ -356,100 +356,6 @@ endfunc
++ vqrshrn.s32 d8, q5, \shift
++ .endm
++
++-function ff_hevc_transform_8x8_neon_8, export=1
++- push {r4-r8}
++- vpush {d8-d15}
++- mov r5, #16
++-
++- adr r3, tr4f
++- vld1.16 {d0, d1}, [r3]
++-
++- // left half
++- vld1.16 {d24}, [r0], r5
++- vld1.16 {d25}, [r0], r5
++- vld1.16 {d26}, [r0], r5
++- vld1.16 {d27}, [r0], r5
++- vld1.16 {d28}, [r0], r5
++- vld1.16 {d29}, [r0], r5
++- vld1.16 {d30}, [r0], r5
++- vld1.16 {d31}, [r0], r5
++- sub r0, #128
++- tr8_begin d25, d27, d29, d31
++- tr4 d24, d26, d28, d30
++- tr8_end #7
++- vst1.16 {d2}, [r0], r5
++- vst1.16 {d3}, [r0], r5
++- vst1.16 {d4}, [r0], r5
++- vst1.16 {d5}, [r0], r5
++- vst1.16 {d6}, [r0], r5
++- vst1.16 {d7}, [r0], r5
++- vst1.16 {d8}, [r0], r5
++- vst1.16 {d9}, [r0], r5
++- sub r0, #128
++- //skip right half if col_limit in r1 is less than 4
++- cmp r1, #4
++- blt 1f
++- //right half
++- add r0, #8
++- vld1.16 {d24}, [r0], r5
++- vld1.16 {d25}, [r0], r5
++- vld1.16 {d26}, [r0], r5
++- vld1.16 {d27}, [r0], r5
++- vld1.16 {d28}, [r0], r5
++- vld1.16 {d29}, [r0], r5
++- vld1.16 {d30}, [r0], r5
++- vld1.16 {d31}, [r0], r5
++- sub r0, #128
++- tr8_begin d25, d27, d29, d31
++- tr4 d24, d26, d28, d30
++- tr8_end #7
++- vst1.16 {d2}, [r0], r5
++- vst1.16 {d3}, [r0], r5
++- vst1.16 {d4}, [r0], r5
++- vst1.16 {d5}, [r0], r5
++- vst1.16 {d6}, [r0], r5
++- vst1.16 {d7}, [r0], r5
++- vst1.16 {d8}, [r0], r5
++- vst1.16 {d9}, [r0], r5
++- sub r0, #136
++-1:
++- // top half
++- vldm r0, {q12-q15} // coeffs
++- transpose_16b_4x4 d24, d26, d28, d30
++- transpose_16b_4x4 d25, d27, d29, d31
++- tr8_begin d26, d30, d27, d31
++- tr4 d24, d28, d25, d29
++- tr8_end #12
++- transpose_16b_4x4 d2, d3, d4, d5
++- transpose_16b_4x4 d6, d7, d8, d9
++- vswp d7, d5
++- vswp d7, d8
++- vswp d3, d6
++- vswp d6, d4
++- vstm r0!, {q1-q4}
++-
++- // bottom half
++- vldm r0, {q12-q15} // coeffs
++- transpose_16b_4x4 d24, d26, d28, d30
++- transpose_16b_4x4 d25, d27, d29, d31
++- tr8_begin d26, d30, d27, d31
++- tr4 d24, d28, d25, d29
++- tr8_end #12
++- transpose_16b_4x4 d2, d3, d4, d5
++- transpose_16b_4x4 d6, d7, d8, d9
++- vswp d7, d5
++- vswp d7, d8
++- vswp d3, d6
++- vswp d6, d4
++- //vstm r0, {q1-q4}
++- vst1.16 {q1-q2}, [r0]
++- add r0, #32
++- vst1.16 {q3-q4}, [r0]
++- sub r0, #32
++- vpop {d8-d15}
++- pop {r4-r8}
++- bx lr
++-endfunc
++
++ .align 4
++ tr4f:
++@@ -463,3 +369,11 @@ tr16:
++ .word 0x00500046 // 80, d2[2] = 70
++ .word 0x0039002b // 57, d2[0] = 43
++ .word 0x00190009 // 25, d2[2] = 9
+++
+++#define BIT_DEPTH 8
+++#include "hevc_idct_fn_neon.S"
+++
+++#undef BIT_DEPTH
+++#define BIT_DEPTH 10
+++#include "hevc_idct_fn_neon.S"
+++
++diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c
++index 55918077e2..e708b7c074 100644
++--- a/libavcodec/arm/hevcdsp_init_neon.c
+++++ b/libavcodec/arm/hevcdsp_init_neon.c
++@@ -22,11 +22,41 @@
++ #include "libavutil/arm/cpu.h"
++ #include "libavcodec/hevcdsp.h"
++ #include "hevcdsp_arm.h"
+++#include "libavcodec/avcodec.h"
+++#include "libavcodec/bit_depth_template.c"
++
++ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++ void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++
+++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
+++
+++#ifdef RPI
+++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
+++ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++ const uint8_t no_p[2], const uint8_t no_q[2],
+++ uint8_t * _pix_l);
+++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
+++ unsigned int no_f);
+++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++ uint8_t * src_l,
+++ unsigned int no_f);
+++
+++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
+++ unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++ const uint8_t no_p[2], const uint8_t no_q[2],
+++ uint8_t * _pix_l);
+++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
+++ unsigned int no_f);
+++void ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++ uint8_t * src_l,
+++ unsigned int no_f);
+++#endif
+++
++ void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++ void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs);
++@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs);
++ void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs);
++ void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs);
++ void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs);
+++
+++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
+++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
+++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs);
+++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs);
+++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs);
+++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs);
+++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs);
+++
++ void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++- ptrdiff_t stride);
+++ ptrdiff_t stride);
++ void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++- ptrdiff_t stride);
+++ ptrdiff_t stride);
++ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++- ptrdiff_t stride);
+++ ptrdiff_t stride);
++ void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++- ptrdiff_t stride);
+++ ptrdiff_t stride);
+++
+++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
+++
+++
+++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
+++ ptrdiff_t stride);
+++
+++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
+++
+++
+++#if RPI_HEVC_SAND
+++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++
+++
+++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_v);
+++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride, int dc_u);
+++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
+++ ptrdiff_t stride);
+++void ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
+++#endif
+++
+++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++
+++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
+++
+++#if RPI_HEVC_SAND
+++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++
+++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height);
+++
+++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++
+++void ff_hevc_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height);
+++#endif
+++
+++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++
+++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++
++
+ #define PUT_PIXELS(name) \
+ void name(int16_t *dst, uint8_t *src, \
+ ptrdiff_t srcstride, int height, \
++@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8);
+ PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8);
+ PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8);
+ #undef PUT_PIXELS
+@@ -2176,227 +4127,110 @@ index 5591807..b6c48ee 100644
+ static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
+ int height, int width);
+-@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
++@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t
+ put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE);
+ }
+
+++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc,
+++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++ MvField *curr, MvField *neigh, uint8_t *bs);
+++
+++
+++static void ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height);
+++}
+++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
+++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height);
+++}
+++
+++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+++}
+++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height);
+++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+++}
+++
+++#if SAO_FILTER_N == 6
+++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height);
+++}
+++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
+++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height);
+++}
+++
+++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+++}
+++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+++{
+++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height);
+++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height);
+++}
+++
+++#if RPI_HEVC_SAND
+++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+++}
+++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+++ int eo, int width, int height)
+++{
+++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
+++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
+++}
+++
+++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height)
+++{
+++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
+++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
+++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+++}
+++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
+++ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+++ const int16_t *sao_offset_val_u, int sao_left_class_u,
+++ const int16_t *sao_offset_val_v, int sao_left_class_v,
+++ int width, int height)
+++{
+++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
+++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height);
+++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
+++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height);
+++}
+++#endif
+++#endif
+++
+++
+-+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
+-+ int16_t *sao_offset_val, int sao_left_class, int width, int height)
+-+{
+-+ pixel *dst = (pixel *)_dst;
+-+ pixel *src = (pixel *)_src;
+-+ int8_t offset_table[32] = { 0 };
+-+ int k, y, x;
+-+ int shift = 3; // BIT_DEPTH - 5
+-+ int cwidth = 0;
+-+
+-+ stride_src /= sizeof(pixel);
+-+ stride_dst /= sizeof(pixel);
+-+
+-+ for (k = 0; k < 4; k++)
+-+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
+-+
+-+ if (height % 8 == 0)
+-+ cwidth = width;
+-+
+-+ switch(cwidth){
+-+ case 8:
+-+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+-+ break;
+-+ case 16:
+-+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+-+ break;
+-+ case 32:
+-+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+-+ break;
+-+ case 64:
+-+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height);
+-+ break;
+-+ default:
+-+ for (y = 0; y < height; y++) {
+-+ for (x = 0; x < width; x++)
+-+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
+-+ dst += stride_dst;
+-+ src += stride_src;
+-+ }
+-+ }
+-+}
+-+
+-+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src,
+-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
+-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
+-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
+-+ int width, int height)
+-+{
+-+ // Width 32 already dealt with
+-+ // width 16 code works in double lines
+-+ if (width == 16 && (height & 1) == 0) {
+-+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst,
+-+ sao_offset_val_u, sao_left_class_u,
+-+ sao_offset_val_v, sao_left_class_v,
+-+ width, height);
+-+ }
+-+ else
+-+ {
+-+ const int shift = 3; // BIT_DEPTH - 5
+-+ int k, y, x;
+-+ pixel *dst = (pixel *)_dst;
+-+ pixel *src = (pixel *)_src;
+-+ int8_t offset_table_u[32] = { 0 };
+-+ int8_t offset_table_v[32] = { 0 };
+-+
+-+ stride_src /= sizeof(pixel);
+-+ stride_dst /= sizeof(pixel);
+-+
+-+ for (k = 0; k < 4; k++)
+-+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
+-+ for (k = 0; k < 4; k++)
+-+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
+-+
+-+ for (y = 0; y < height; y++) {
+-+ for (x = 0; x < width * 2; x += 2)
+-+ {
+-+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]);
+-+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]);
+-+ }
+-+ dst += stride_dst;
+-+ src += stride_src;
+-+
+-+ }
+-+ }
+-+}
+-+
+-+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 0 : -1))
+-+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+-+ int16_t *_sao_offset_val, int eo, int width, int height)
+-+{
+-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
+-+ static const int8_t pos[4][2][2] = {
+-+ { { -1, 0 }, { 1, 0 } }, // horizontal
+-+ { { 0, -1 }, { 0, 1 } }, // vertical
+-+ { { -1, -1 }, { 1, 1 } }, // 45 degree
+-+ { { 1, -1 }, { -1, 1 } }, // 135 degree
+-+ };
+-+ int8_t sao_offset_val[8]; // padding of 3 for vld
+-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE);
+-+ pixel *dst = (pixel *)_dst;
+-+ pixel *src = (pixel *)_src;
+-+ int a_stride, b_stride;
+-+ int x, y;
+-+ int cwidth = 0;
+-+
+-+ for (x = 0; x < 5; x++) {
+-+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]];
+-+ }
+-+
+-+ if (height % 8 == 0)
+-+ cwidth = width;
+-+
+-+ stride_src /= sizeof(pixel);
+-+ stride_dst /= sizeof(pixel);
+-+
+-+ switch (cwidth) {
+-+ case 32:
+-+ switch(eo) {
+-+ case 0:
+-+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 1:
+-+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 2:
+-+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 3:
+-+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ }
+-+ break;
+-+ case 64:
+-+ switch(eo) {
+-+ case 0:
+-+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 1:
+-+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 2:
+-+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ case 3:
+-+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val);
+-+ break;
+-+ }
+-+ break;
+-+ default:
+-+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
+-+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
+-+ for (y = 0; y < height; y++) {
+-+ for (x = 0; x < width; x++) {
+-+ int diff0 = CMP(src[x], src[x + a_stride]);
+-+ int diff1 = CMP(src[x], src[x + b_stride]);
+-+ int idx = diff0 + diff1;
+-+ if (idx)
+-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]);
+-+ }
+-+ src += stride_src;
+-+ dst += stride_dst;
+-+ }
+-+ }
+-+}
+-+
+-+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
+-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
+-+ int eo, int width, int height)
+-+{
+-+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel);
+-+
+-+ if (width == 32 && (height & 7) == 0) {
+-+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height,
_sao_offset_val_u, _sao_offset_val_v, eo); +-+ } +-+ else +-+ { +-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +-+ static const int8_t pos[4][2][2] = { +-+ { { -1, 0 }, { 1, 0 } }, // horizontal +-+ { { 0, -1 }, { 0, 1 } }, // vertical +-+ { { -1, -1 }, { 1, 1 } }, // 45 degree +-+ { { 1, -1 }, { -1, 1 } }, // 135 degree +-+ }; +-+ int8_t sao_offset_val_u[8]; // padding of 3 for vld +-+ int8_t sao_offset_val_v[8]; // padding of 3 for vld +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int a_stride, b_stride; +-+ int x, y; +-+ +-+ for (x = 0; x < 5; x++) { +-+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; +-+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; +-+ } +-+ +-+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +-+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width * 2; x += 2) { +-+ int diff0u = CMP(src[x], src[x + a_stride]); +-+ int diff1u = CMP(src[x], src[x + b_stride]); +-+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +-+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); +-+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); +-+ } +-+ src += stride_src; +-+ dst += stride_dst; +-+ } +-+ } +-+} +-+#undef CMP + + +-+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs); +++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 +++#error SAO edge src stride not 160 - value used in .S +++#endif + + + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + { +@@ -2407,7 +4241,9 @@ index 5591807..b6c48ee 100644 + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; +++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; +++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; + +#ifdef RPI + + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; +@@ -2416,21 +4252,68 @@ index 5591807..b6c48ee 100644 + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +-@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8; ++- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; ++- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; ++- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; ++- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; +++ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; +++ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; +++ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +++ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; +++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; +++ c->add_residual_dc[1] = 
ff_hevc_add_residual_8x8_dc_neon_8; +++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; +++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; +++#if RPI_HEVC_SAND +++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; +++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; +++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; +++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; +++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; +++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; +++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; +++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; +++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; +++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; +++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; +++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; +++#endif + c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; +-+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { +-+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; +-+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; +-+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; +-+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; +-+ } +-+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 +++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; +++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; +++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; +++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; +++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; +++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; +++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; +++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; +++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; +++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; +++#if SAO_FILTER_N == 6 +++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; +++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; +++#endif +++#if RPI_HEVC_SAND +++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; +++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; +++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; +++ +++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; +++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; +++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; +++ +++#if SAO_FILTER_N == 6 +++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; +++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; +++#endif +++#endif + put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; + put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; + put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; +-@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) ++@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; +@@ -2452,22 +4335,711 @@ index 5591807..b6c48ee 100644 + c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; + c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; + c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; +-@@ -221,4 +516,9 @@ av_cold void 
ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) ++@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; + } +-+ +-+ assert(offsetof(MvField, mv) == 0); +-+ assert(offsetof(MvField, ref_idx) == 8); +-+ assert(offsetof(MvField, pred_flag) == 10); +-+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; +++ else if (bit_depth == 10) { +++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; +++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; +++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; +++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; +++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; +++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; +++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; +++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; +++#ifdef RPI +++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; +++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; +++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; +++#endif +++ c->idct[0] = ff_hevc_transform_4x4_neon_10; +++ c->idct[1] = ff_hevc_transform_8x8_neon_10; +++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; +++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; +++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; +++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; +++ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10; +++ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10; +++ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10; +++ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10; +++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; +++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; +++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; +++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; +++#if RPI_HEVC_SAND +++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; +++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; +++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; +++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; +++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; +++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; +++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; +++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; +++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; +++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; +++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; +++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; +++#endif +++ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; +++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; +++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; +++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; +++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; +++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; +++ +++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; +++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; +++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; +++ c->sao_edge_filter[3] = 
ff_hevc_sao_edge_48_neon_10;
+++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10;
+++#if SAO_FILTER_N == 6
+++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10;
+++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10;
+++#endif
+++#if RPI_HEVC_SAND
+++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10;
+++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10;
+++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10;
+++
+++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10;
+++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10;
+++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10;
+++
+++#if SAO_FILTER_N == 6
+++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10;
+++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10;
+++#endif
+++#endif
+++ }
+++
+++ assert(offsetof(MvField, mv) == 0);
+++ assert(offsetof(MvField, ref_idx) == 8);
+++ assert(offsetof(MvField, pred_flag) == 10);
+++ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
 }
++diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S
++new file mode 100644
++index 0000000000..7cc5cd5e5c
++--- /dev/null
+++++ b/libavcodec/arm/hevcdsp_res16_neon.S
++@@ -0,0 +1,610 @@
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++#define BIT_DEPTH 10
+++
+++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+++ vmax.s16 \Q0, \Q_MIN
+++ vmax.s16 \Q1, \Q_MIN
+++ vmax.s16 \Q2, \Q_MIN
+++ vmax.s16 \Q3, \Q_MIN
+++ vmin.s16 \Q0, \Q_MAX
+++ vmin.s16 \Q1, \Q_MAX
+++ vmin.s16 \Q2, \Q_MAX
+++ vmin.s16 \Q3, \Q_MAX
+++.endm
+++
+++@ add_residual4x4(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
+++ vld1.16 {q10, q11}, [r1]
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vld1.16 {d0}, [r0, :64], r2
+++ vld1.16 {d1}, [r0, :64], r2
+++ vld1.16 {d2}, [r0, :64], r2
+++ vld1.16 {d3}, [r0, :64], r2
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q11
+++ sub r0, r0, r2, lsl #2
+++ vmax.s16 q0, q0, q8
+++ vmax.s16 q1, q1, q8
+++ vmin.s16 q0, q0, q9
+++ vmin.s16 q1, q1, q9
+++ vst1.16 {d0}, [r0, :64], r2
+++ vst1.16 {d1}, [r0, :64], r2
+++ vst1.16 {d2}, [r0, :64], r2
+++ vst1.16 {d3}, [r0, :64], r2
+++ bx lr
+++
+++endfunc
+++
+++@ add_residual4x4_dc(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc) [r2]
+++
+++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vdup.i16 q9, r3
+++ vld1.16 {d0}, [r0, :64], r1
+++ vld1.16 {d1}, [r0, :64], r1
+++ vdup.16 q15, r2
+++ vld1.16 {d2}, [r0, :64], r1
+++ vld1.16 {d3}, [r0, :64], r1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ sub r0, r0, r1, lsl #2
+++ vmax.s16 q0, q0, q8
+++ vmax.s16 q1, q1, q8
+++ vmin.s16 q0, q0, q9
+++ vmin.s16 q1, q1, q9
+++ vst1.16 {d0}, [r0, :64], r1
+++ vst1.16 {d1}, [r0, :64], r1
+++ vst1.16 {d2}, [r0, :64], r1
+++ vst1.16 {d3}, [r0, :64], r1
+++ bx lr
+++
+++endfunc
+++
+++
+++@ add_residual8x8(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ mov r12, #2
+++1:
+++ vldm r1!, {q10-q13}
+++ vld1.16 {q0}, [r0, :128], r2
+++ subs r12, #1
+++ vld1.16 {q1}, [r0, :128], r2
+++ vqadd.s16 q0, q10
+++ vld1.16 {q2}, [r0, :128], r2
+++ vqadd.s16 q1, q11
+++ vld1.16 {q3}, [r0, :128],
r2 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r0, r2, lsl #2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vst1.16 {q0}, [r0, :128], r2 +++ vmin.s16 q2, q2, q9 +++ vst1.16 {q1}, [r0, :128], r2 +++ vmin.s16 q3, q3, q9 +++ vst1.16 {q2}, [r0, :128], r2 +++ vst1.16 {q3}, [r0, :128], r2 +++ bne 1b +++ bx lr +++ +++endfunc +++ +++@ add_residual4x4_dc_c( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc_uv) [r2] +++ +++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 +++ mov r12, #1 +++ vdup.32 q15, r2 +++ b 9f +++endfunc +++ +++@ add_residual8x8_dc( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 +++ mov r12, #2 +++ vdup.16 q15, r2 +++9: +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++1: +++ vld1.16 {q0}, [r0, :128], r1 +++ subs r12, #1 +++ vld1.16 {q1}, [r0, :128], r1 +++ vqadd.s16 q0, q15 +++ vld1.16 {q2}, [r0, :128], r1 +++ vqadd.s16 q1, q15 +++ vld1.16 {q3}, [r0, :128], r1 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q15 +++ sub r0, r0, r1, lsl #2 +++ vmax.s16 q0, q8 +++ vmax.s16 q1, q8 +++ vmax.s16 q2, q8 +++ vmax.s16 q3, q8 +++ vmin.s16 q0, q9 +++ vmin.s16 q1, q9 +++ vst1.16 {q0}, [r0, :128], r1 +++ vmin.s16 q2, q9 +++ vst1.16 {q1}, [r0, :128], r1 +++ vmin.s16 q3, q9 +++ vst1.16 {q2}, [r0, :128], r1 +++ vst1.16 {q3}, [r0, :128], r1 +++ bne 1b +++ bx lr +++ +++endfunc +++ +++@ add_residual16x16( +++@ uint8_t *_dst, [r0] +++@ int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ mov r12, #8 +++1: +++ vldm r1!, {q10-q13} +++ @ For RPI Sand we could guarantee :256 but not for general +++ @ non-RPI allocation. :128 is as good as we can claim +++ vld1.16 {q0, q1}, [r0, :128], r2 +++ subs r12, #1 +++ vld1.16 {q2, q3}, [r0, :128] +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst1.16 {q0, q1}, [r0, :128], r2 +++ vst1.16 {q2, q3}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual8x8_dc_c( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc_uv) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 +++ mov r12, #4 +++ vdup.32 q15, r2 +++ b 9f +++endfunc +++ +++@ add_residual16x16_dc( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 +++ vdup.i16 q15, r2 +++ mov r12, #8 +++9: +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++1: +++ @ For RPI Sand we could guarantee :256 but not for general +++ @ non-RPI allocation. 
:128 is as good as we can claim
+++ vld1.16 {q0, q1}, [r0, :128], r1
+++ subs r12, #1
+++ vld1.16 {q2, q3}, [r0, :128]
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q15
+++ vqadd.s16 q3, q15
+++ sub r0, r1
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vst1.16 {q0, q1}, [r0, :128], r1
+++ vst1.16 {q2, q3}, [r0, :128], r1
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++
+++@ add_residual32x32(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ mov r12, #32
+++1:
+++ vldm r1!, {q10-q13}
+++ vldm r0, {q0-q3}
+++ subs r12, #1
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q11
+++ vqadd.s16 q2, q12
+++ vqadd.s16 q3, q13
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vstm r0, {q0-q3}
+++ add r0, r2
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++@ add_residual16x16_dc_c(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc_uv) [r2]
+++
+++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
+++ mov r12, #16
+++ vdup.32 q15, r2
+++ b 9f
+++endfunc
+++
+++@ add_residual32x32_dc(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc) [r2]
+++
+++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
+++ vdup.i16 q15, r2
+++ mov r12, #32
+++9:
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++1:
+++ vldm r0, {q0-q3}
+++ subs r12, #1
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q15
+++ vqadd.s16 q3, q15
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vstm r0, {q0-q3}
+++ add r0, r1
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++@ ============================================================================
+++@ U add
+++
+++@ add_residual4x4_u(
+++@ uint8_t *_dst, [r0]
+++@ const int16_t *res, [r1]
+++@ ptrdiff_t stride, [r2]
+++@ int dc) [r3]
+++
+++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
+++ vld1.16 {q10, q11}, [r1, :256]
+++ vdup.16 q15, r3
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++
+++ vld2.16 {d0, d2}, [r0, :128], r2
+++ vld2.16 {d1, d3}, [r0, :128], r2
+++ vld2.16 {d4, d6}, [r0, :128], r2
+++ vld2.16 {d5, d7}, [r0, :128], r2
+++
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q11
+++ vqadd.s16 q3, q15
+++ sub r0, r0, r2, lsl #2
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++
+++ vst2.16 {d0, d2}, [r0, :128], r2
+++ vst2.16 {d1, d3}, [r0, :128], r2
+++ vst2.16 {d4, d6}, [r0, :128], r2
+++ vst2.16 {d5, d7}, [r0, :128]
+++ bx lr
+++endfunc
+++
+++@ add_residual8x8_u(
+++@ uint8_t *_dst, [r0]
+++@ const int16_t *res, [r1]
+++@ ptrdiff_t stride, [r2]
+++@ int dc) [r3]
+++
+++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
+++ vdup.16 q15, r3
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ mov r12, #4
+++ vdup.i16 q9, r3
+++1:
+++ vld2.16 {q0, q1}, [r0, :256], r2
+++ vld2.16 {q2, q3}, [r0, :256]
+++ vld1.16 {q10, q11}, [r1, :256]!
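@ (Editor's sketch, not part of the original patch.)  In the interleaved
@ chroma layout used here, vld2.16 de-interleaves the row, so the U
@ samples land in q0/q2 and the V samples in q1/q3: only U receives the
@ coded residual, while the co-sited V samples get the flat DC value
@ passed in r3.  As a C reference model of one row (names hypothetical):
@
@   static inline int clip(int v, int max) { return v < 0 ? 0 : v > max ? max : v; }
@
@   for (int i = 0; i < 8; i++) {
@       u[i] = clip(u[i] + res[i], (1 << BIT_DEPTH) - 1);
@       v[i] = clip(v[i] + dc_v,   (1 << BIT_DEPTH) - 1);
@   }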
+++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q15 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q3, q15 +++ sub r0, r2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_u( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q15 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q3, q15 +++ sub r0, #32 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256]! +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ V add +++ +++@ add_residual4x4_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 +++ vld1.16 {q10, q11}, [r1, :256] +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ +++ vld2.16 {d0, d2}, [r0, :128], r2 +++ vld2.16 {d1, d3}, [r0, :128], r2 +++ vld2.16 {d4, d6}, [r0, :128], r2 +++ vld2.16 {d5, d7}, [r0, :128], r2 +++ +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, r0, r2, lsl #2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ +++ vst2.16 {d0, d2}, [r0, :128], r2 +++ vst2.16 {d1, d3}, [r0, :128], r2 +++ vst2.16 {d4, d6}, [r0, :128], r2 +++ vst2.16 {d5, d7}, [r0, :128] +++ bx lr +++endfunc +++ +++@ add_residual8x8_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #4 +++ vdup.i16 q9, r3 +++1: +++ vld2.16 {q0, q1}, [r0, :256], r2 +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, r2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, #32 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256]! 
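@ (Editor's note, sketch.)  Each iteration covers one 64-byte row with a
@ pair of vld2/vst2 ops: the first store post-increments r0 by 32 ("!")
@ and the second adds r2, which was pre-reduced by 32 at function entry
@ ("sub r2, #32"), so the pair advances exactly one row:
@   r0 += 32 + (stride - 32)  ==  r0 += stride
@ The "sub r0, #32" above similarly rewinds r0 after the two loads.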
+++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ U & V add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 +++ vldm r1, {q10-q13} +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ +++ vld2.16 {d0, d2}, [r0, :128], r2 +++ vld2.16 {d1, d3}, [r0, :128], r2 +++ vld2.16 {d4, d6}, [r0, :128], r2 +++ vld2.16 {d5, d7}, [r0, :128], r2 +++ +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r0, r2, lsl #2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ +++ vst2.16 {d0, d2}, [r0, :128], r2 +++ vst2.16 {d1, d3}, [r0, :128], r2 +++ vst2.16 {d4, d6}, [r0, :128], r2 +++ vst2.16 {d5, d7}, [r0, :128] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #4 +++ vdup.i16 q9, r3 +++ add r3, r1, #(8*8*2) @ Offset to V +++1: +++ vld2.16 {q0, q1}, [r0, :256], r2 +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ vld1.16 {q12, q13}, [r3, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ add r3, r1, #(16*16*2) @ Offset to V +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ vld1.16 {q12, q13}, [r3, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, #32 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst2.16 {q0, q1}, [r0, :256]! 
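@ (Editor's sketch, not from the original patch.)  The *_c variants that
@ follow apply separate coded residuals to both chroma planes of an
@ interleaved row; the V residual block simply follows the U block in
@ memory, which is what "add r3, r1, #(N*N*2)" computes.  In C terms,
@ with illustrative names and the clip() helper sketched earlier:
@
@   const int16_t *res_u = res;          /* N*N U residuals...      */
@   const int16_t *res_v = res + N * N;  /* ...then N*N V residuals */
@   for (int i = 0; i < N; i++) {
@       u[i] = clip(u[i] + res_u[i], (1 << BIT_DEPTH) - 1);
@       v[i] = clip(v[i] + res_v[i], (1 << BIT_DEPTH) - 1);
@   }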
+++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ + diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S + new file mode 100644 +-index 0000000..08a021d ++index 0000000000..30113d9c93 + --- /dev/null + +++ b/libavcodec/arm/hevcdsp_sao_neon.S +-@@ -0,0 +1,862 @@ ++@@ -0,0 +1,1882 @@ + +/* + + * Copyright (c) 2014 - 2015 Seppo Tomperi + + * +@@ -2491,124 +5063,211 @@ index 0000000..08a021d + +#include "libavutil/arm/asm.S" + +#include "neon.S" + + +-+.macro init_sao_band +-+ pld [r1] +-+ vld1.8 {q0, q1}, [r2] // offset table +-+ ldr r2, [sp, #0] // stride_dst +-+ ldr r12, [sp, #4] // height +-+ vmov.u8 q3, #128 +-+.endm +++.set EDGE_SRC_STRIDE, 160 +++ +++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 +++ vshr.u8 q12, q8, #3 +++ vadd.s8 q8, \Q_K128 +++ vshr.u8 q13, q9, #3 +++ vadd.s8 q9, \Q_K128 +++ +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT0, d25 +++ vtbl.8 d26, \XLAT1, d26 +++ vtbl.8 d27, \XLAT1, d27 + + +-+// 128 in q3 +-+// input q8 - q11 +-+.macro sao_band_64 +-+ vtbl.8 d24, {d0, d1, d2, d3}, d24 +-+ vadd.s8 q8, q3 +-+ vtbl.8 d25, {d0, d1, d2, d3}, d25 +-+ vadd.s8 q9, q3 +-+ vtbl.8 d26, {d0, d1, d2, d3}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d0, d1, d2, d3}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + + vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0, d1, d2, d3}, d29 +++ vshr.u8 q12, q10, #3 +++ vadd.s8 q10, \Q_K128 + + vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d0, d1, d2, d3}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d0, d1, d2, d3}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 +++ vshr.u8 q13, q11, #3 +++ vadd.s8 q11, \Q_K128 +++ +++ vsub.s8 q8, \Q_K128 +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT0, d25 +++ vsub.s8 q9, \Q_K128 +++ vtbl.8 d26, \XLAT1, d26 +++ vtbl.8 d27, \XLAT1, d27 +++ vqadd.s8 q10, q12 +++ vqadd.s8 q11, q13 +++ vsub.s8 q10, \Q_K128 +++ vsub.s8 q11, \Q_K128 + +.endm + + +-+function ff_hevc_sao_band_w8_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #8 +-+ vld1.8 {d16}, [r1, :64], r3 +-+ vld1.8 {d17}, [r1, :64], r3 +-+ vshr.u8 q12, q8, #3 +-+ vld1.8 {d18}, [r1, :64], r3 +-+ vld1.8 {d19}, [r1, :64], r3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {d20}, [r1, :64], r3 +-+ vld1.8 {d21}, [r1, :64], r3 +-+ vshr.u8 q14, q10, #3 +-+ vld1.8 {d22}, [r1, :64], r3 +-+ vld1.8 {d23}, [r1, :64], r3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {d16}, [r0, :64], r2 +-+ vst1.8 {d17}, [r0, :64], r2 +-+ vst1.8 {d18}, [r0, :64], r2 +-+ vst1.8 {d19}, [r0, :64], r2 +-+ vst1.8 {d20}, [r0, :64], r2 +-+ vst1.8 {d21}, [r0, :64], r2 +-+ vst1.8 {d22}, [r0, :64], r2 +-+ vst1.8 {d23}, [r0, :64], r2 +-+ bne 1b +++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 +++ vshr.u8 q12, q8, #3 +++ vadd.s8 q8, \Q_K128 + + +-+ bx lr +-+endfunc +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 + + +-+function ff_hevc_sao_band_w16_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #4 +-+ vld1.8 {q8}, [r1, :128], r3 +-+ vshr.u8 q12, q8, #3 +-+ vld1.8 {q9}, [r1, :128], r3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {q10}, [r1, :128], r3 +-+ vshr.u8 q14, q10, #3 +-+ vld1.8 {q11}, [r1, :128], r3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {q8}, [r0, :128], r2 +-+ vst1.8 {q9}, [r0, :128], r2 +-+ vst1.8 {q10}, [r0, :128], r2 +-+ vst1.8 {q11}, [r0, :128], r2 +-+ bne 1b +++ vqadd.s8 q8, q12 +++ vsub.s8 q8, \Q_K128 +++.endm + + +-+ bx lr +++ +++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX +++ vmax.s16 \Q0, \Q_MIN +++ vmax.s16 \Q1, \Q_MIN +++ vmax.s16 \Q2, \Q_MIN +++ vmax.s16 \Q3, \Q_MIN +++ 
vmin.s16 \Q0, \Q_MAX +++ vmin.s16 \Q1, \Q_MAX +++ vmin.s16 \Q2, \Q_MAX +++ vmin.s16 \Q3, \Q_MAX +++.endm +++ +++@ Clobbers q12, q13 +++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth +++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) +++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) +++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) +++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 +++ vtbl.8 d26, \XLAT0, d26 +++ vtbl.8 d27, \XLAT1, d27 +++ vaddw.s8 \Q0, d24 +++ vaddw.s8 \Q1, d25 +++ vaddw.s8 \Q2, d26 +++ vaddw.s8 \Q3, d27 +++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX +++.endm +++ +++@ Clobbers q12 +++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth +++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) +++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 +++ vaddw.s8 \Q0, d24 +++ vaddw.s8 \Q1, d25 +++ vmax.s16 \Q0, \Q_MIN +++ vmax.s16 \Q1, \Q_MIN +++ vmin.s16 \Q0, \Q_MAX +++ vmin.s16 \Q1, \Q_MAX +++.endm +++ +++ +++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) +++@ so we are quite safe stuffing it into a byte array +++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma +++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of +++@ precision +++ +++@ This, somewhat nasty, bit of code builds the {d0-d3} translation +++@ array via the stack +++@ Given that sao_left_class > 28 can cause wrap we can't just poke +++@ all 4 bytes in at once +++@ +++@ It also loads other common regs +++ +++function band_load_y +++ vmov.i64 q0, #0 +++ ldr r12, [sp, #8] @ &sao_offset_val[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ vmov.i64 q1, #0 +++ ldr r12, [sp, #12] @ sao_left_class +++ +++ mov r4, sp +++ sub sp, #32 +++ and sp, #~63 @ Align stack so we can wrap with a simple AND +++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array +++ mov sp, r4 +++ +++ ldr r12, [sp, #20] @ height +++ pld [r1] +++ +++ sub r12, #1 +++ add r4, r1, r3 +++ bx lr + +endfunc + + +-+function ff_hevc_sao_band_w32_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #2 +-+ vld1.8 {q8-q9}, [r1, :128], r3 +-+ vshr.u8 q12, q8, #3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {q10-q11}, [r1, :128], r3 +-+ vshr.u8 q14, q10, #3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {q8-q9}, [r0, :128], r2 +-+ vst1.8 {q10-q11}, [r0, :128], r2 +-+ bne 1b + + +-+ bx lr +++function band_load_c +++ vmov.i64 q2, #0 +++ ldr r12, [sp, #8] @ &sao_offset_val1[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ vmov.i64 q3, #0 +++ ldr r12, [sp, #12] @ sao_left_class +++ +++ mov r4, sp @ Remember SP +++ sub sp, #32 +++ and sp, #~63 @ Align stack so we can wrap with a simple AND +++ +++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! 
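@ (Editor's sketch.)  The stack trick in band_load_y/band_load_c is just
@ building the usual 32-entry SAO band translation table with
@ wrap-around, i.e. in C:
@
@   int8_t tab[32] = { 0 };
@   for (int k = 0; k < 4; k++)
@       tab[(sao_left_class + k) & 31] = sao_offset_val[k + 1];
@
@ The 64-byte-aligned stack plus the "and r12, #~32" between the stores
@ implements the "& 31" without any conditional code.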
+++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array +++ +++ @ And again for the 2nd set +++ ldr r12, [r4, #16] @ &sao_offset_val2[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ ldr r12, [r4, #20] @ sao_left_class2 +++ +++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array +++ +++ mov sp, r4 +++ +++ ldr r12, [sp, #28] @ height +++ pld [r1] +++ +++ subs r12, #1 +++ add r4, r1, r3 +++ bx lr + +endfunc + + +-+function ff_hevc_sao_band_w64_neon_8, export=1 +-+ init_sao_band + + +++@ ff_hevc_sao_band_64_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_64_neon_8, export=1 + + push {r4, lr} +-+ subs r12, #1 +-+ mov r4, r1 +-+ it ne +-+ addne r4, r3 +++ bl band_load_y +++ vmov.u8 q15, #128 + + + +1: subs r12, #1 + + vldm r1, {q8-q11} + + pld [r4] +-+ vshr.u8 q12, q8, #3 +-+ vshr.u8 q13, q9, #3 + + add r1, r3 +-+ vshr.u8 q14, q10, #3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ + + it ne + + addne r4, r3 + + vstm r0, {q8-q11} +@@ -2618,8 +5277,113 @@ index 0000000..08a021d + + pop {r4, pc} + +endfunc + + +++@ ff_hevc_sao_band_32_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_32_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ vmov.u8 q15, #128 +++ +++1: subs r12, #2 +++ vld1.8 { q8, q9 }, [r1, :128], r3 +++ vld1.8 {q10, q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 { q8, q9 }, [r0, :128], r2 +++ vst1.8 {q10, q11}, [r0, :128], r2 +++ bpl 1b +++ +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_16_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_16_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ vmov.u8 q15, #128 +++ +++1: subs r12, #4 +++ vld1.8 { q8}, [r1, :128], r3 +++ vld1.8 { q9}, [r1, :128], r3 +++ vld1.8 {q10}, [r1, :128], r3 +++ vld1.8 {q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 { q8}, [r0, :128], r2 +++ vst1.8 { q9}, [r0, :128], r2 +++ vst1.8 {q10}, [r0, :128], r2 +++ vst1.8 {q11}, [r0, :128], r2 +++ bpl 1b +++ +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_8_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_8_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ ldr lr, [sp, #16] @ width +++ vmov.u8 q15, #128 +++ cmp lr, #8 +++ blt 4f +++ +++1: subs r12, #2 
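@ (Editor's sketch.)  Per pixel, one band-filter iteration computes
@
@   delta = tab[src >> (BIT_DEPTH - 5)];                /* vshr + vtbl */
@   dst   = clip(src + delta, 0, (1 << BIT_DEPTH) - 1);
@
@ In the 8-bit kernels the clip comes for free: sao_band_16b_8 re-biases
@ the pixels by 128 (vadd.s8), adds the offset with signed saturation
@ (vqadd.s8) and removes the bias again (vsub.s8), so the result
@ saturates at 0/255 without explicit vmax/vmin.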
+++ vld1.8 {d16}, [r1, :64], r3 +++ vld1.8 {d17}, [r1, :64], r3 + + +-+@ ff_hevc_sao_band_c_w64_neon_8( +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 {d16}, [r0, :64], r2 +++ vst1.8 {d17}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} +++ +++4: +++1: subs r12, #4 +++ vld1.32 {d16[0]}, [r1, :32], r3 +++ vld1.32 {d16[1]}, [r1, :32], r3 +++ vld1.32 {d17[0]}, [r1, :32], r3 +++ vld1.32 {d17[1]}, [r1, :32], r3 +++ +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.32 {d16[0]}, [r0, :32], r2 +++ vst1.32 {d16[1]}, [r0, :32], r2 +++ vst1.32 {d17[0]}, [r0, :32], r2 +++ vst1.32 {d17[1]}, [r0, :32], r2 +++ bpl 1b +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_c_32_neon_8( + +@ uint8_t * dst [r0] + +@ uint8_t * src [r1] + +@ uint32_t dst_stride [r2] +@@ -2631,10868 +5395,9596 @@ index 0000000..08a021d + +@ int width sp[16] + +@ int height sp[20] + + +-+@ As this is often done in-place on the frame buffer it is worth preloading +-+@ the pixel values but we want to beware of loading ouside our buffer to avoid +-+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) +-+ +-+function ff_hevc_sao_band_c_neon_8, export=1 +-+ mov r12, sp +-+ push {r4-r8, lr} // 24 bytes +-+ +-+ ldm r12, {r4-r7} +-+ +-+ add r4, #2 +-+ add r6, #2 +-+ vld1.16 {d16}, [r4] @ Unaligned +-+ lsl r5, r5, #3 +-+ vld1.16 {d18}, [r6] +-+ pld [r1] +-+ vmov.i8 d17, #0 +-+ mov r4, r1 +-+ vmov.i8 d19, #0 +-+ lsl r7, r7, #3 +-+ vdup.8 q1, r5 +-+ ldr r5, [r12, #16] @ width +-+ vdup.8 q2, r7 +-+ ldr r12, [r12, #20] +-+ vqmovn.s16 d0, q8 +-+ cmp r5, #16 @ At some point we may want a table lookup +-+ vqmovn.s16 d1, q9 +-+ vmov.i8 q3, #128 +-+ beq 16f +-+ +-+ @ d0 U lookup +-+ @ d1 V lookup +-+ @ q1 U raw offset +-+ @ q2 V raw offset +-+ @ q3 #128 +-+ +-+ @ r4 = r1 = src - Inteded for preload pointer +-+ @ r12 = height +-+ +-+ @ Might (unlikely) be called with height == 1 +-+ subs r12, #1 +-+ it ne +-+ addne r4, r3 +++function ff_hevc_sao_band_c_32_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ +++ vmov.i8 q15, #128 +++ sub r3, #32 +++ sub r2, #32 +++ +++1: subs r12, #1 +++ vld2.8 { q8, q9 }, [r1, :128]! +++ vld2.8 {q10, q11}, [r1, :128], r3 + + +-+1: +-+ subs r12, #1 +-+ vld2.8 {q8-q9}, [r1, :128]! +-+ vsub.u8 q12, q8, q1 +-+ vld2.8 {q10-q11}, [r1, :128], r3 +-+ vsub.u8 q14, q10, q1 +-+ vsub.u8 q13, q9, q2 +-+ sub r1, #32 +-+ vsub.u8 q15, q11, q2 + + pld [r4] +-+ vshr.u8 q12, #3 +-+ vadd.s8 q8, q3 +-+ vshr.u8 q13, #3 +-+ vadd.s8 q9, q3 +-+ +-+ vtbl.8 d24, {d0}, d24 +-+ vshr.u8 q14, #3 +-+ vtbl.8 d25, {d0}, d25 +-+ vshr.u8 q15, #3 +-+ vtbl.8 d26, {d1}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d1}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0}, d28 +-+ vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0}, d29 +-+ vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d1}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d1}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 + + +-+ it ne +-+ addne r4, r3 @ Do not inc on final pass +-+ vst2.8 {q8-q9}, [r0, :128]! +-+ vst2.8 {q10-q11}, [r0, :128], r2 +-+ sub r0, #32 +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 +++ +++ vst2.8 { q8, q9 }, [r0, :128]! 
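@ (Editor's note, sketch.)  For interleaved chroma the vld2/vst2 pair
@ de-interleaves the row, so U bytes end up in q8/q10 and V bytes in
@ q9/q11; sao_band_64b_8 can then apply a separate translation table per
@ plane ({d0-d3} for U, {d4-d7} for V), matching what the removed C
@ wrapper did with offset_table_u[] and offset_table_v[].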
+++ vst2.8 {q10, q11}, [r0, :128], r2 +++ +++ itt ne +++ addne r4, r3 +++ addne r4, #32 +++ + + bpl 1b + + +-+ pop {r4-r8, pc} +++ pop {r4, pc} +++endfunc + + +-+@ -- width 16 (UV pairs) -- +-+16: +-+ subs r12, #2 +-+ it ne +-+ addne r4, r4, r3, lsl #1 +++@ ff_hevc_sao_band_c_16_neon_8( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+1: +-+ subs r12, #2 +-+ vld2.8 {q8-q9}, [r1, :128], r3 +-+ vsub.u8 q12, q8, q1 +-+ vld2.8 {q10-q11}, [r1, :128], r3 +-+ vsub.u8 q14, q10, q1 +-+ vsub.u8 q13, q9, q2 +-+ pld [r4] +-+ vsub.u8 q15, q11, q2 +-+ pld [r4, r3] +-+ vshr.u8 q12, #3 +-+ vadd.s8 q8, q3 +-+ vshr.u8 q13, #3 +-+ vadd.s8 q9, q3 +-+ +-+ vtbl.8 d24, {d0}, d24 +-+ vshr.u8 q14, #3 +-+ vtbl.8 d25, {d0}, d25 +-+ vshr.u8 q15, #3 +-+ vtbl.8 d26, {d1}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d1}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0}, d28 +-+ vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0}, d29 +-+ vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d1}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d1}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 +++function ff_hevc_sao_band_c_16_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ vmov.i8 q15, #128 + + +-+ it ne +-+ addne r4, r4, r3, lsl #1 +-+ vst2.8 {q8-q9}, [r0, :128], r2 +-+ vst2.8 {q10-q11}, [r0, :128], r2 +-+ bpl 1b +++1: subs r12, #2 +++ vld2.8 { q8, q9 }, [r1, :128], r3 +++ vld2.8 {q10, q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+ pop {r4-r8, pc} +++ vst2.8 { q8, q9 }, [r0, :128], r2 +++ vst2.8 {q10, q11}, [r0, :128], r2 + + +++ bpl 1b +++ pop {r4, pc} + +endfunc + + +++@ ff_hevc_sao_band_c_8_neon_8( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] +++ +++function ff_hevc_sao_band_c_8_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ ldr lr, [sp, #16] @ width +++ vmov.u8 q15, #128 +++ cmp lr, #8 +++ blt 4f + + +-+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 +-+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 +-+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 +-+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 +-+ vsub.s8 \out0, \tmp0, \out0 // diff0 +-+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +-+.endm +++1: subs r12, #1 +++ vld2.8 {d16, d17}, [r1, :128], r3 + + +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+// input +-+// a in q0 - q3 +-+// c in q4 - q7 +-+// b in q8 - q11 +-+// offset table r4,r5 and r6,r7 +-+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +-+// output in q0 - q3 +-+// clobbers q12 - q15 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ bpl 1b +++ pop {r4, pc} + + +-+@ a <- c <- b +-+@ +-+@ It appears that Neon can stall if you try and use results too soon so we try to +-+@ spread our instruction out +-+ +-+.macro edgeidx64 +-+ +-+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 q13, q5, q1 +-+ vcgt.u8 q14, q6, q2 +-+ vcgt.u8 q15, q7, q3 +-+ +-+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 +-+ vcgt.u8 q1, q1, 
q5 +-+ vcgt.u8 q2, q2, q6 +-+ vcgt.u8 q3, q3, q7 +-+ +-+ vsub.s8 q0, q0, q12 // a = sign(c-a) +-+ vsub.s8 q1, q1, q13 +-+ vsub.s8 q2, q2, q14 +-+ vsub.s8 q3, q3, q15 +-+ +-+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 +-+ vcgt.u8 q13, q5, q9 +-+ vcgt.u8 q14, q6, q10 +-+ vcgt.u8 q15, q7, q11 +-+ +-+ vsub.s8 q0, q0, q12 +-+ vsub.s8 q1, q1, q13 +-+ vsub.s8 q2, q2, q14 +-+ vsub.s8 q3, q3, q15 +-+ +-+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 +-+ vcgt.u8 q13, q9, q5 +-+ vcgt.u8 q14, q10, q6 +-+ vcgt.u8 q15, q11, q7 +-+ +-+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) +-+ vadd.s8 q1, q1, q13 +-+ vmov.u8 q12, #2 +-+ vadd.s8 q2, q2, q14 +-+ vadd.s8 q3, q3, q15 +-+ +-+ vadd.s8 q0, q0, q12 +-+ vadd.s8 q1, q1, q12 +-+ @ whilst vmov dn, rm, rn exists it is a vfp instruction +-+ @ and causes a stall till neon pipe empty - so don't do that! +-+ vmov d26[0], r4 +-+ vmov d26[1], r5 +-+ vmov d27[0], r6 +-+ vmov d27[1], r7 +-+ vadd.s8 q2, q2, q12 +-+ vuzp.8 q0, q1 +-+ vmov.u8 q15, #128 +-+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) +-+ +-+ vtbl.8 d0, {d26}, d0 +-+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d1, {d26}, d1 +-+ vadd.s8 q14, q5, q15 +-+ +-+ vtbl.8 d2, {d27}, d2 +-+ vuzp.8 q2, q3 +-+ +-+ vtbl.8 d3, {d27}, d3 +-+ +-+ vtbl.8 d4, {d26}, d4 +-+ vzip.8 q0, q1 +-+ +-+ vtbl.8 d5, {d26}, d5 +-+ vqadd.s8 q0, q0, q12 +-+ vqadd.s8 q1, q1, q14 +-+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d6, {d27}, d6 +-+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d7, {d27}, d7 +-+ vzip.8 q2, q3 +-+ +-+ vsub.s8 q0, q0, q15 +-+ vqadd.s8 q2, q2, q12 +-+ vqadd.s8 q3, q3, q14 +-+ vsub.s8 q1, q1, q15 +-+ vsub.s8 q2, q2, q15 +-+ vsub.s8 q3, q3, q15 +++4: +++1: subs r12, #1 +++ vld1.8 {d16}, [r1, :64], r3 +++ vld1.8 {d17}, [r1, :64], r3 +++ vuzp.8 d16, d17 + + +-+.endm +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+function edge_w64_body +-+ edgeidx64 +-+ vstm r0, {q0-q3} +-+ add r0, r0, r2 +-+ bx lr +++ vzip.8 d16, d17 +++ vst1.8 {d16}, [r0, :64], r2 +++ vst1.8 {d17}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} + +endfunc + + +-+.macro init_edge_64 +-+ push {r4-r8,lr} +-+ ldr r12, [sp, #24] // height +-+ ldr r5, [sp, #28] // sao_offset_val_table +-+ ldrd r4, r5, [r5] +-+ mov r6, r4 +-+ mov r7, r5 +++ +++@ ff_hevc_sao_band_64_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_64_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q2, #0 +++ vdup.i16 q3, lr +++ bl band_load_y +++ vpush {q4-q7} +++ +++1: subs r12, #1 +++ vldm r1, {q4-q11} +++ add r1, r3 +++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ vstm r0, {q4-q11} +++ add r0, r2 +++ bpl 1b +++ +++ vpop {q4-q7} +++ pop {r4, pc} + +.endm + + +-+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+ sub r1, #8 +-+1: subs r12, #1 +-+ vld1.64 {d7}, [r1, :64]! +-+ vld1.64 {q4-q5}, [r1, :128]! // load c +-+ vld1.64 {q6-q7}, [r1, :128]! 
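@ (Editor's sketch of the classification that the edgeidx64/edge_w64_body
@ code above implements, matching the C fallback wrapper earlier in this
@ patch.)  For each pixel c with neighbours a and b along the chosen
@ edge-offset direction:
@
@   int idx = 2 + sign(c - a) + sign(c - b);            /* 0..4 */
@   dst = clip(src + tab[idx], 0, (1 << BIT_DEPTH) - 1);
@
@ where tab[] is the 5-entry offset table pre-permuted through
@ edge_idx[] = { 1, 2, 0, 3, 4 }, so the centre entry (idx == 2) is the
@ "no edge" case (sao_offset_val[0], which the decoder keeps at 0).
@ Each sign() is a pair of vcgt.u8 compares and a vsub.s8.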
+-+ vld1.64 {d24}, [r1, :64], r3 +-+ sub r1, #72 +-+ // load a +-+ vext.8 q0, q3, q4, #15 +-+ vext.8 q1, q4, q5, #15 +-+ vext.8 q2, q5, q6, #15 +-+ vext.8 q3, q6, q7, #15 +-+ // load b +-+ vext.8 q8, q4, q5, #1 +-+ vext.8 q9, q5, q6, #1 +-+ vext.8 q10, q6, q7, #1 +-+ vext.8 q11, q7, q12, #1 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++function ff_hevc_sao_band_64_neon_10, export=1 +++ band_64_16 10 + +endfunc + + +-+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+ sub r1, r3 +-+ // load a +-+ vld1.8 {q0-q1}, [r1, :128]! +-+ vld1.8 {q2-q3}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q8-q9}, [r1, :128]! +-+ vld1.8 {q10-q11}, [r1, :128], r3 +-+ sub r1, #32 +-+ bl edge_w64_body +-+ // copy c to a +-+ vmov.64 q0, q4 +-+ vmov.64 q1, q5 +-+ vmov.64 q2, q6 +-+ vmov.64 q3, q7 +-+ // copy b to c +-+ vmov.64 q4, q8 +-+ vmov.64 q5, q9 +-+ vmov.64 q6, q10 +-+ vmov.64 q7, q11 +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+endfunc +++@ ff_hevc_sao_band_32_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_32_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q2, #0 +++ vdup.i16 q3, lr +++ bl band_load_y + + +-+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ sub r1, #1 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #31 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ add r1, #1 +-+ vld1.8 {q8-q9}, [r1]! +-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #33 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+endfunc +++1: subs r12, #1 +++ vldm r1, {q8-q11} +++ add r1, r3 +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ vstm r0, {q8-q11} +++ add r0, r2 +++ bpl 1b + + +-+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ add r1, #1 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #33 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ sub r1, #1 +-+ vld1.8 {q8-q9}, [r1]! 
+-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #31 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++ pop {r4, pc} +++.endm +++ +++function ff_hevc_sao_band_32_neon_10, export=1 +++ band_32_16 10 + +endfunc + + +++@ ff_hevc_sao_band_16_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_16_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ bl band_load_y +++ +++1: subs r12, #2 +++ vld1.16 { q8, q9 }, [r1, :128], r3 +++ vld1.16 {q10, q11}, [r1, :128], r3 +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 { q8, q9 }, [r0, :128], r2 +++ vst1.16 {q10, q11}, [r0, :128], r2 +++ bpl 1b + + +-+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( +-+@ uint8_t *_dst, r0 +-+@ uint8_t *_src, r1 +-+@ ptrdiff_t stride_dst, r2 +-+@ ptrdiff_t stride_src, r3 +-+@ int height, sp[0] +-+@ int16_t *sao_offset_table_u, sp[4] +-+@ int16_t *sao_offset_table_v); sp[8] +-+@ int eo sp[12] +-+ +-+function ff_hevc_sao_edge_c_w64_neon_8, export=1 +-+ push {r4-r8,lr} // 6 reg = 24 +-+ ldr r5, [sp, #28] // sao_offset_val_table_u +-+ ldr r7, [sp, #32] // sao_offset_val_table_v +-+ +-+ @ Load and rearrange offsets +-+ @ Also "convert" from 16bit to 8bit +-+ ldrb r4, [r5, #2] +-+ ldrb r8, [r5, #4] +-+ ldrb r6, [r7, #2] +-+ ldrb r12, [r7, #4] +-+ orr r4, r4, r8, lsl #8 +-+ orr r6, r6, r12, lsl #8 +-+ ldrb r8, [r5, #6] +-+ ldrb r12, [r7, #6] +-+ orr r4, r4, r8, lsl #24 +-+ orr r6, r6, r12, lsl #24 +-+ ldrb r5, [r5, #8] +-+ ldrb r7, [r7, #8] +-+ +-+ ldr r12, [sp, #36] // e0 +-+ adr r8, edge_c_tbl_w64 +-+ ldr r8, [r8, r12, lsl #2] +-+ +-+ ldr r12, [sp, #24] // height +-+ vpush {d8-d15} +-+ mov pc, r8 +-+ +-+edge_c_tbl_w64: +-+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 +-+ +-+ff_hevc_sao_edge_c_eo0_w64_neon_8: +-+ sub r1, #8 +-+1: subs r12, #1 +-+ vld1.64 {d7}, [r1, :64]! +-+ vld1.64 {q4-q5}, [r1, :128]! // load c +-+ vld1.64 {q6-q7}, [r1, :128]! +-+ vld1.64 {d24}, [r1, :64], r3 +-+ sub r1, #72 +-+ // load a +-+ vext.8 q0, q3, q4, #14 +-+ vext.8 q1, q4, q5, #14 +-+ vext.8 q2, q5, q6, #14 +-+ vext.8 q3, q6, q7, #14 +-+ // load b +-+ vext.8 q8, q4, q5, #2 +-+ vext.8 q9, q5, q6, #2 +-+ vext.8 q10, q6, q7, #2 +-+ vext.8 q11, q7, q12, #2 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo1_w64_neon_8: +-+ sub r1, r3 +-+ // load a +-+ vldm r1, {q0-q3} +-+ add r1, r3 +-+ // load c +-+ vldm r1, {q4-q7} +-+ add r1, r3 +-+1: subs r12, #1 +-+ // load b +-+ vldm r1, {q8-q11} +-+ add r1, r3 +-+ bl edge_w64_body +-+ // copy c to a +-+ vmov.64 q0, q4 +-+ vmov.64 q1, q5 +-+ vmov.64 q2, q6 +-+ vmov.64 q3, q7 +-+ // copy b to c +-+ vmov.64 q4, q8 +-+ vmov.64 q5, q9 +-+ vmov.64 q6, q10 +-+ vmov.64 q7, q11 +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo2_w64_neon_8: +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ sub r1, #2 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #30 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ add r1, #2 +-+ vld1.8 {q8-q9}, [r1]! 
+-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #34 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo3_w64_neon_8: +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ add r1, #2 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #34 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ sub r1, #2 +-+ vld1.8 {q8-q9}, [r1]! +-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #30 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++ pop {r4, pc} +++.endm +++ +++function ff_hevc_sao_band_16_neon_10, export=1 +++ band_16_16 10 + +endfunc + + +++@ ff_hevc_sao_band_8_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_8_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ bl band_load_y +++ ldr lr, [sp, #16] +++ cmp lr, #8 +++ blt 4f +++ +++1: subs r12, #2 +++ vld1.16 { q8}, [r1, :128], r3 +++ vld1.16 { q9}, [r1, :128], r3 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 { q8}, [r0, :128], r2 +++ vst1.16 { q9}, [r0, :128], r2 +++ bpl 1b +++ pop {r4, pc} + + +-+.macro init_edge_32 +-+ ldr r12, [sp, #4] // sao_offset_val_table +-+ vld1.32 {d31}, [r12] +-+ ldr r12, [sp] // height +++4: +++1: subs r12, #4 +++ vld1.16 {d16}, [r1, :64], r3 +++ vld1.16 {d17}, [r1, :64], r3 +++ vld1.16 {d18}, [r1, :64], r3 +++ vld1.16 {d19}, [r1, :64], r3 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 {d16}, [r0, :64], r2 +++ vst1.16 {d17}, [r0, :64], r2 +++ vst1.16 {d18}, [r0, :64], r2 +++ vst1.16 {d19}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} + +.endm + + +-+.macro diff out0, tmp0, in0, in1 +-+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 +-+ vsub.s8 \out0, \tmp0, \out0 // diff0 +-+.endm +++function ff_hevc_sao_band_8_neon_10, export=1 +++ band_8_16 10 +++endfunc + + +-+.macro table32 +-+ vmov.s8 q10, #2 +-+ vadd.s8 q0, q10 +-+ vadd.s8 q1, q10 +-+ vmov.s8 q10, #128 +-+ vtbl.8 d0, {d31}, d0 +-+ vadd.s8 q11, q2, q10 +-+ vtbl.8 d1, {d31}, d1 +-+ vadd.s8 q12, q3, q10 +-+ vtbl.8 d2, {d31}, d2 +-+ vqadd.s8 q11, q0 +-+ vtbl.8 d3, {d31}, d3 +-+ vqadd.s8 q12, q1 +-+ vsub.s8 q0, q11, q10 +-+ vsub.s8 q1, q12, q10 +-+ vst1.8 {q0-q1}, [r0, :128], r2 +-+.endm + + +-+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {q4-q7} +-+ sub r1, #4 +-+1: subs r12, #1 +-+ vld1.8 {q13-q14}, [r1]! 
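In the chroma variant above, the vld2.16/vst2.16 pairs de-interleave the plaited CbCr plane so a single pass applies table1 to U and table2 to V; that is what the "{d0,d1,d2,d3}" versus "{d4,d5,d6,d7}" arguments to sao_band_64b_16 select. A scalar view of one interleaved row, reusing clip0 from the earlier sketch:

    /* One row of an interleaved CbCr plane: even samples are U and use
     * tab_u, odd samples are V and use tab_v. */
    static void sao_band_c_row_ref(uint16_t *dst, const uint16_t *src,
                                   const int8_t tab_u[32],
                                   const int8_t tab_v[32],
                                   int width, int bd)
    {
        const int max = (1 << bd) - 1;
        for (int x = 0; x < width; x++) {
            dst[2*x + 0] = clip0(src[2*x + 0] + tab_u[src[2*x + 0] >> (bd - 5)], max);
            dst[2*x + 1] = clip0(src[2*x + 1] + tab_v[src[2*x + 1] >> (bd - 5)], max);
        }
    }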
+-+ vld1.32 d30, [r1], r3 +-+ sub r1, #32 +-+ // a +-+ vext.8 q0, q13, q14, #3 +-+ vext.8 q1, q14, q15, #3 +-+ vshr.u64 d24, d30, #24 +-+ // c +-+ vext.8 q2, q13, q14, #4 +-+ vext.8 q3, q14, q15, #4 +-+ vshr.u64 d16, d30, #32 +-+ // diff0 +-+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 +-+ diff d18, d25, d24, d16 +-+ // -diff1 +-+ vext.s8 q0, q13, q14, #1 +-+ vext.s8 q1, q14, q9, #1 +-+ +-+ vsub.s8 q0, q13, q0 //diff0 + diff1 +-+ vsub.s8 q1, q14, q1 +-+ table32 +-+ bne 1b +-+ vpop {q4-q7} +++@ ff_hevc_sao_band_c_32_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+ bx lr +-+endfunc +++.macro band_c_32_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ vpush {q4-q7} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ sub r2, #96 + + +-+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {q4-q7} +-+ // load a +-+ sub r1, r3 +-+ vld1.8 {q0-q1}, [r1, :128], r3 +-+ // load c +-+ vld1.8 {q2-q3}, [r1, :128], r3 +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q8-q9}, [r1, :128], r3 +-+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) +-+ vadd.s8 q0, q4, q12 //diff0 + diff1 +-+ vadd.s8 q1, q5, q13 +-+ table32 +-+ // CMP ( c, a ) +-+ vneg.s8 q12, q4 +-+ vneg.s8 q13, q5 +-+ // c +-+ vmov.64 q2, q8 +-+ vmov.64 q3, q9 +-+ bne 1b +-+ vpop {q4-q7} +-+ bx lr +-+endfunc +++1: subs r12, #1 + + +-+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {d8-d15} +-+ // load a +-+ sub r1, r3 +-+ sub r1, #8 +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {d24}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q0, q10, q11, #7 +-+ vext.8 q1, q11, q12, #7 +-+ // load c +-+ vld1.8 {d9}, [r1, :64]! +-+ vld1.8 {q2-q3}, [r1, :64], r3 +-+ sub r1, #8 +-+ vext.8 q4, q4, q2, #15 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {q12}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q8, q10, q11, #9 +-+ vext.8 q9, q11, q12, #9 +-+ vext.8 q6, q10, q11, #8 +-+ vext.8 q7, q11, q12, #8 +-+ vext.8 q5, q10, q11, #7 +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 +-+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 +-+ vadd.s8 q0, q12 //diff0 + diff1 +-+ vadd.s8 q1, q13 +-+ table32 +-+ // inputs for next loop iteration +-+ // a +-+ vmov.8 q0, q4 +-+ vext.8 q1, q2, q3, #15 +-+ // c +-+ vmov.8 q2, q6 +-+ vmov.8 q3, q7 +-+ vmov.8 q4, q5 +-+ bne 1b +-+ vpop {d8-d15} +-+ bx lr +-+endfunc +++ vld2.16 { q4, q5 }, [r1, :128]! +++ vld2.16 { q6, q7 }, [r1, :128]! +++ vld2.16 { q8, q9 }, [r1, :128]! +++ vld2.16 {q10, q11}, [r1, :128], r3 + + +-+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 +-+ init_edge_32 +-+ sub r1, r3 +-+ // load a +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {d24}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q0, q10, q11, #1 +-+ vext.8 q1, q11, q12, #1 +-+ // load c +-+ vld1.8 {q2-q3}, [r1, :64]! +-+ vld1.8 {d30}, [r1, :64], r3 +-+ sub r1, #40 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q10-q11}, [r1, :64]! 
+-+ vld1.8 {q12}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q8, q10, q11, #7 +-+ vext.8 q9, q11, q12, #7 +-+ vext.8 q14, q12, q10, #7 +-+ +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 +-+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 +-+ +-+ vadd.s8 q0, q12 //diff0 + diff1 +-+ vadd.s8 q1, q13 +-+ table32 +-+ +-+ // inputs for next loop iteration +-+ // a +-+ vext.8 q0, q2, q3, #1 +-+ vext.8 q1, q3, q15, #1 +-+ // c +-+ vext.8 q2, q8, q9, #1 +-+ vext.8 q3, q9, q14, #1 +-+ vext.8 d30, d28, d2, #1 +-+ bne 1b +-+ bx lr +-+endfunc +++ pld [r4] +++ sub r1, #96 + + +-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +-index 39713ed..25eb52b 100644 +---- a/libavcodec/avcodec.h +-+++ b/libavcodec/avcodec.h +-@@ -410,6 +410,8 @@ enum AVCodecID { +- AV_CODEC_ID_SHEERVIDEO, +- AV_CODEC_ID_YLC, +- +-+ AV_CODEC_ID_H264_MVC, +++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +- /* various PCM "codecs" */ +- AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs +- AV_CODEC_ID_PCM_S16LE = 0x10000, +-@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext { +- #define FF_BUG_DC_CLIP 4096 +- #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. +- #define FF_BUG_TRUNCATED 16384 +-+#define FF_BUG_GMC_UNSUPPORTED 32768 +- +- /** +- * strictly follow the standard (MPEG-4, ...). +-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +- #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 +- #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) +- #define FF_PROFILE_H264_CAVLC_444 44 +-+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 +-+#define FF_PROFILE_H264_STEREO_HIGH 128 +-+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 +- +- #define FF_PROFILE_VC1_SIMPLE 0 +- #define FF_PROFILE_VC1_MAIN 1 +-@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { +- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 +- #endif +- +-+ /** +-+ * Opaque pointer for use by replacement get_buffer2 code +-+ * +-+ * @author jc (08/02/2016) +-+ */ +-+ void * get_buffer_context; +- } AVCodecContext; +- +- AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); +-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h +-index 1bf1c62..ccfa991 100644 +---- a/libavcodec/cabac.h +-+++ b/libavcodec/cabac.h +-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; +- typedef struct CABACContext{ +- int low; +- int range; +-- int outstanding_count; +-+ union +-+ { +-+ int outstanding_count; +-+ struct { +-+ uint16_t bits; +-+ uint16_t range; +-+ } by22; +-+ }; +- const uint8_t *bytestream_start; +- const uint8_t *bytestream; +- const uint8_t *bytestream_end; +-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c +-index 9d94b72..535ebf0 100644 +---- a/libavcodec/codec_desc.c +-+++ b/libavcodec/codec_desc.c +-@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { +- .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), +- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, +- }, +-+ { +-+ .id = AV_CODEC_ID_H264_MVC, +-+ .type = AVMEDIA_TYPE_VIDEO, +-+ .name = "h264_mvc", +-+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), +-+ .props = AV_CODEC_PROP_LOSSY, +-+ }, +- +- /* various PCM "codecs" */ +- { +-diff --git a/libavcodec/h264.h b/libavcodec/h264.h +-index efe3555..16358aa 100644 +---- a/libavcodec/h264.h +-+++ b/libavcodec/h264.h +-@@ -126,7 +126,9 @@ enum { +- 
NAL_END_STREAM = 11, +- NAL_FILLER_DATA = 12, +- NAL_SPS_EXT = 13, +-+ NAL_SPS_SUBSET = 15, +- NAL_AUXILIARY_SLICE = 19, +-+ NAL_SLICE_EXT = 20, +- NAL_FF_IGNORE = 0xff0f001, +- }; +- +-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c +-index ce4bab2..b9b0c78 100644 +---- a/libavcodec/h264_parser.c +-+++ b/libavcodec/h264_parser.c +-@@ -58,6 +58,8 @@ typedef struct H264ParseContext { +- uint8_t parse_history[6]; +- int parse_history_count; +- int parse_last_mb; +-+ int is_mvc; +-+ int slice_ext; +- } H264ParseContext; +- +- +-@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, +- } else if (state <= 5) { +- int nalu_type = buf[i] & 0x1F; +- if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || +-- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { +-+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || +-+ nalu_type == NAL_SPS_SUBSET) { +- if (pc->frame_start_found) { +- i++; +- goto found; +- } +- } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || +-- nalu_type == NAL_IDR_SLICE) { +-+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { +- state += 8; +++ it ne +++ addne r4, r3 + + +-+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); +- continue; +- } +- state = 7; +- } else { +- p->parse_history[p->parse_history_count++] = buf[i]; +-- if (p->parse_history_count > 5) { +-+ if (p->parse_history_count > 8) { +- unsigned int mb, last_mb = p->parse_last_mb; +- GetBitContext gb; +- +-- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); +-+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); +- p->parse_history_count = 0; +- mb= get_ue_golomb_long(&gb); +- p->parse_last_mb = mb; +-@@ -145,7 +150,7 @@ found: +- pc->frame_start_found = 0; +- if (p->is_avc) +- return next_avc; +-- return i - (state & 5) - 5 * (state > 7); +-+ return i - (state & 5) - 8 * (state > 7); +- } +- +- static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, +-@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, +- } +- } +- +-- parse_nal_units(s, avctx, buf, buf_size); +-+ if (!p->is_mvc) +-+ parse_nal_units(s, avctx, buf, buf_size); +- +- if (avctx->framerate.num) +- avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); +-@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, +- if ((state & 0xFFFFFF00) != 0x100) +- break; +- nalu_type = state & 0x1F; +-- if (nalu_type == NAL_SPS) { +-+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { +- has_sps = 1; +- } else if (nalu_type == NAL_PPS) +- has_pps = 1; +-@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { +- .parser_close = h264_close, +- .split = h264_split, +- }; +++ vst2.16 { q4, q5 }, [r0, :128]! +++ vst2.16 { q6, q7 }, [r0, :128]! +++ vst2.16 { q8, q9 }, [r0, :128]! 
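The h264_parser.c hunk above grows the parse history threshold from 5 to 8 bytes and offsets the bit reader because an MVC slice (NAL_SLICE_EXT) carries a 3-byte nal_unit_header_mvc_extension ahead of the ordinary slice header, so first_mb_in_slice starts 3 bytes later. A condensed view of the read as the hunk leaves it (not the literal patch code, but the same logic):

    /* p->slice_ext is set in h264_find_frame_end when the NAL type is
     * NAL_SLICE_EXT; skipping 3 * slice_ext bytes puts the reader on
     * first_mb_in_slice for both plain and MVC slices. */
    static unsigned first_mb_from_history(H264ParseContext *p)
    {
        GetBitContext gb;
        const int skip = 3 * p->slice_ext;
        init_get_bits8(&gb, p->parse_history + skip,
                       p->parse_history_count - skip);
        return get_ue_golomb_long(&gb);   /* 0 => start of a new frame */
    }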
+++ vst2.16 {q10, q11}, [r0, :128], r2 + + +-+static av_cold int init_mvc(AVCodecParserContext *s) +-+{ +-+ H264ParseContext *p = s->priv_data; +-+ int ret = init(s); +-+ if (ret < 0) +-+ return ret; +++ bpl 1b + + +-+ p->is_mvc = 1; +-+ return 0; +-+} +++ vpop {q4-q7} +++ pop {r4, pc} +++.endm + + +-+AVCodecParser ff_h264_mvc_parser = { +-+ .codec_ids = { AV_CODEC_ID_H264_MVC }, +-+ .priv_data_size = sizeof(H264ParseContext), +-+ .parser_init = init_mvc, +-+ .parser_parse = h264_parse, +-+ .parser_close = h264_close, +-+ .split = h264_split, +-+}; +-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c +-index b478065..955e426 100644 +---- a/libavcodec/hevc.c +-+++ b/libavcodec/hevc.c +-@@ -41,8 +41,196 @@ +- #include "hevc.h" +- #include "profiles.h" +- +-+#ifdef RPI +-+ #include "rpi_qpu.h" +-+ #include "rpi_shader.h" +-+ #include "rpi_shader_cmd.h" +-+ #include "rpi_zc.h" +++function ff_hevc_sao_band_c_32_neon_10, export=1 +++ band_c_32_16 10 +++endfunc + + +-+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory +-+ #define RPI_CACHE_UNIF_MVS 1 + + +-+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) +-+ //#define RPI_SIMULATE_QPUS +-+ #ifdef RPI_WORKER +-+ #include "pthread.h" +-+ #endif +++@ ff_hevc_sao_band_c_16_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+ static void worker_core(HEVCContext * const s); +++.macro band_c_16_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ sub r2, #32 +++ sub r3, #32 + + +-+ // We can pred any block height but annoyingly if we we do then the TMU cache +-+ // explodes and it goes even slower :-( +-+ #if 0 +-+ #define Y_P_MAX_H 16 +-+ #define Y_B_MAX_H 16 +-+ #else +-+ #define Y_P_MAX_H 64 +-+ #define Y_B_MAX_H 64 +-+ #endif +-+#endif +++1: subs r12, #1 + + +-+// #define DISABLE_MC +++ vld2.16 { q8, q9 }, [r1, :128]! +++ vld2.16 {q10, q11}, [r1, :128], r3 + + +-+#define DISABLE_CHROMA 0 +-+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) +++ vst2.16 { q8, q9 }, [r0, :128]! 
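Two equivalent addressing schemes appear in the band_c macros above: band_c_32_16 rewinds the source post-increments each iteration ("sub r1, #96") while pre-biasing the destination stride once ("sub r2, #96" before the loop), and band_c_16_16 pre-biases both strides ("sub r2, #32" / "sub r3, #32"). A sketch of the net effect per row, under my reading of the code:

    #include <stdint.h>
    #include <stddef.h>

    /* Three 32-byte post-incremented vld2.16 loads plus one load that
     * adds the stride, then a 96-byte rewind (or a stride biased down
     * by 96 up front): either way the pointer advances one row. */
    static const uint16_t *next_row(const uint16_t *src,
                                    ptrdiff_t stride_src_bytes)
    {
        const uint8_t *p = (const uint8_t *)src;
        p += 3 * 32;            /* vld2.16 {..}! three times        */
        p += stride_src_bytes;  /* final vld2.16 {..}, r3           */
        p -= 96;                /* sub r1, #96, or the stride bias  */
        return (const uint16_t *)p;  /* net: exactly one source row */
    }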
+++ vst2.16 {q10, q11}, [r0, :128], r2 + + +-+#ifndef av_mod_uintp2 +-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) +-+{ +-+ return a & ((1 << p) - 1); +-+} +-+# define av_mod_uintp2 av_mod_uintp2_c +-+#endif +++ bpl 1b +++ pop {r4, pc} +++.endm + + +-+#define Y_B_ONLY 0 +++function ff_hevc_sao_band_c_16_neon_10, export=1 +++ band_c_16_16 10 +++endfunc + + +- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; +- + + +-+#if RPI_INTER +++@ ff_hevc_sao_band_c_8_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+#define MC_DUMMY_X (-32) +-+#define MC_DUMMY_Y (-32) +++.macro band_c_8_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ ldr lr, [sp, #24] @ width +++ cmp lr, #8 +++ blt 4f + + +-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks +-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks +-+// For each block of 64*64 the smallest block size is 8x4 +-+// We also need an extra command for the setup information +++1: subs r12, #1 +++ vld2.16 { q8, q9 }, [r1, :128], r3 + + +-+#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +-+// The QPU code for UV blocks only works up to a block width of 8 +-+#define RPI_CHROMA_BLOCK_WIDTH 8 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) +++ vst2.16 { q8, q9 }, [r0, :128], r2 + + +-+// TODO Chroma only needs 4 taps +++ bpl 1b +++ pop {r4, pc} + + +-+// Actual filter goes -ve, +ve, +ve, -ve using these values +-+static const uint32_t rpi_filter_coefs[8] = { +-+ ENCODE_COEFFS( 0, 64, 0, 0), +-+ ENCODE_COEFFS( 2, 58, 10, 2), +-+ ENCODE_COEFFS( 4, 54, 16, 2), +-+ ENCODE_COEFFS( 6, 46, 28, 4), +-+ ENCODE_COEFFS( 4, 36, 36, 4), +-+ ENCODE_COEFFS( 4, 28, 46, 6), +-+ ENCODE_COEFFS( 2, 16, 54, 4), +-+ ENCODE_COEFFS( 2, 10, 58, 2) +-+}; +++4: +++1: subs r12, #2 +++ vld2.16 {d16, d17}, [r1, :128], r3 +++ vld2.16 {d18, d19}, [r1, :128], r3 + + +-+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#endif +++ vst2.16 {d16, d17}, [r0, :128], r2 +++ vst2.16 {d18, d19}, [r0, :128], r2 + + +++ bpl 1b +++ pop {r4, pc} +++.endm + + +-+#ifdef RPI_WORKER +++function ff_hevc_sao_band_c_8_neon_10, export=1 +++ band_c_8_16 10 +++endfunc + + +-+typedef struct worker_global_env_s +-+{ +-+ volatile int arm_load; +-+ pthread_mutex_t lock; + + +-+ unsigned int arm_y; +-+ unsigned int arm_c; +-+ unsigned int gpu_y; +-+ unsigned int gpu_c; +-+} worker_global_env_t; +++@ ============================================================================= +++@ SAO EDGE + + +-+static worker_global_env_t worker_global_env = +-+{ +-+ .lock = PTHREAD_MUTEX_INITIALIZER +-+}; +++@ r0 destination address +++@ r2 stride to post-increment r0 with +++@ [r5] translate values +++@ +++@ a <- c <- b +++@ a in q0 - q3 +++@ c in q4 - q7 +++@ b in q8 - q11 +++@ +++@ q12-15 used as temp +++@ +++@ Can be used for both Y & C as we unzip/zip the 
deltas and +++@ transform "u/v" separately via d26/d27. For Y d26=d27 + + +++function edge_64b_body_8 + + +-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u8 q13, q5, q1 +++ vcgt.u8 q14, q6, q2 +++ vcgt.u8 q15, q7, q3 + + +-+#define LOG_ENTER +-+#define LOG_EXIT +++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 +++ vcgt.u8 q1, q5 +++ vcgt.u8 q2, q6 +++ vcgt.u8 q3, q7 + + +-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job +-+static void worker_submit_job(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ s->worker_tail++; +-+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +-+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vsub.s8 q0, q12 @ a = sign(c-a) +++ vsub.s8 q1, q13 +++ vsub.s8 q2, q14 +++ vsub.s8 q3, q15 + + +-+// Call this to say we have completed pass1 +-+static void worker_complete_job(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ s->worker_head++; +-+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +-+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 +++ vcgt.u8 q13, q5, q9 +++ vcgt.u8 q14, q6, q10 +++ vcgt.u8 q15, q7, q11 + + +-+// Call this to wait for all jobs to have completed at the end of a frame +-+static void worker_wait(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ while( s->worker_head !=s->worker_tail) +-+ { +-+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vsub.s8 q0, q12 +++ vsub.s8 q1, q13 +++ vsub.s8 q2, q14 +++ vsub.s8 q3, q15 + + +-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +-+// available to receive the next job. 
+-+static void worker_pass0_ready(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ // tail is number of submitted jobs +-+ // head is number of completed jobs +-+ // tail-head is number of outstanding jobs in the queue +-+ // we need to ensure there is at least 1 space left for us to use +-+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) +-+ { +-+ // Wait until another job is completed +-+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 +++ vcgt.u8 q13, q9, q5 +++ vcgt.u8 q14, q10, q6 +++ vcgt.u8 q15, q11, q7 + + +-+static void *worker_start(void *arg) +-+{ +-+ HEVCContext *s = (HEVCContext *)arg; +-+ while(1) { +-+ pthread_mutex_lock(&s->worker_mutex); +++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) +++ vadd.s8 q1, q13 +++ vmov.u8 q12, #2 +++ vadd.s8 q2, q14 +++ vadd.s8 q3, q15 + + +-+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) +-+ { +-+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +++ vadd.s8 q0, q12 +++ vadd.s8 q1, q12 + + +-+ if (s->kill_worker) { +-+ break; +-+ } +-+ LOG_ENTER +-+ worker_core(s); +++ vld1.8 {d26, d27}, [r5] + + +-+ worker_complete_job(s); +-+ LOG_EXIT +-+ } +-+ return NULL; +-+} +++ vadd.s8 q2, q12 +++ vuzp.8 q0, q1 +++ vmov.u8 q15, #128 +++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + + +-+#endif +++ vtbl.8 d0, {d26}, d0 +++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + + +- /** +- * NOTE: Each function hls_foo correspond to the function foo in the +- * specification (HLS stands for High Level Syntax). +-@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +- /* free everything allocated by pic_arrays_init() */ +- static void pic_arrays_free(HEVCContext *s) +- { +-+#ifdef RPI +-+ int job; +-+ for(job=0;jobcoeffs_buf_arm[job][0]) { +-+ gpu_free(&s->coeffs_buf_default[job]); +-+ s->coeffs_buf_arm[job][0] = 0; +-+ } +-+ if (s->coeffs_buf_arm[job][2]) { +-+ gpu_free(&s->coeffs_buf_accelerated[job]); +-+ s->coeffs_buf_arm[job][2] = 0; +-+ } +-+ } +-+#endif +-+#ifdef RPI_DEBLOCK_VPU +-+ { +-+ int i; +-+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { +-+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +++ vtbl.8 d1, {d26}, d1 +++ vadd.s8 q14, q5, q15 + + +-+ if (dvq->vpu_cmds_arm) { +-+ gpu_free(&dvq->deblock_vpu_gmem); +-+ dvq->vpu_cmds_arm = 0; +-+ } +-+ } +-+ } +-+#endif +- av_freep(&s->sao); +- av_freep(&s->deblock); +- +-@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +- int ctb_count = sps->ctb_width * sps->ctb_height; +- int min_pu_size = sps->min_pu_width * sps->min_pu_height; +- +-+#ifdef RPI +-+ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); +-+ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; +-+ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; +-+ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; +-+ int job; +++ vtbl.8 d2, {d27}, d2 +++ vuzp.8 q2, q3 + + +-+ av_assert0(sps); +-+// s->max_ctu_count = sps->ctb_width; +-+// printf("CTB with=%d\n", sps->ctb_width); +-+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; +-+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); +-+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; +-+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; +-+ 
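The worker_* helpers removed in this hunk implement a bounded producer/consumer queue keyed by two monotonic counters: worker_tail counts submitted jobs, worker_head completed ones, so tail - head is the queue depth and RPI_MAX_JOBS bounds it. Condensed from the removed code:

    #include <pthread.h>

    /* Pass 0 publishes a finished job slot to the worker thread. */
    static void worker_submit(HEVCContext *s)
    {
        pthread_mutex_lock(&s->worker_mutex);
        s->worker_tail++;
        pthread_cond_broadcast(&s->worker_cond_tail);  /* wake consumer */
        pthread_mutex_unlock(&s->worker_mutex);
    }

    /* Pass 0 blocks until a slot is free before reusing it. */
    static void worker_wait_for_space(HEVCContext *s)
    {
        pthread_mutex_lock(&s->worker_mutex);
        while (s->worker_tail - s->worker_head >= RPI_MAX_JOBS)
            pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex);
        pthread_mutex_unlock(&s->worker_mutex);
    }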
+-+ for(job=0;jobcoeffs_buf_default[job]); +-+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; +-+ if (!s->coeffs_buf_arm[job][0]) +-+ goto fail; +-+ +-+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data +-+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; +-+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; +-+ if (!s->coeffs_buf_arm[job][2]) +-+ goto fail; +-+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. +-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; +-+ } +-+ } +-+#endif +-+#ifdef RPI_DEBLOCK_VPU +-+ { +-+ int i; +-+ s->enable_rpi_deblock = !sps->sao_enabled; +-+ s->setup_width = (sps->width+15) / 16; +-+ s->setup_height = (sps->height+15) / 16; +-+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; +-+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; +++ vtbl.8 d3, {d27}, d3 + + +-+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) +-+ { +-+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +-+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; +-+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; +-+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; +-+ const unsigned int total_size =- cmd_size + y_size + uv_size; +-+ int p_vc; +-+ uint8_t * p_arm; +-+ #if RPI_VPU_DEBLOCK_CACHED +-+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); +-+ #else +-+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); +-+ #endif +-+ p_vc = dvq->deblock_vpu_gmem.vc; +-+ p_arm = dvq->deblock_vpu_gmem.arm; +++ vtbl.8 d4, {d26}, d4 +++ vzip.8 q0, q1 + + +-+ // Zap all +-+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); +++ vtbl.8 d5, {d26}, d5 +++ vqadd.s8 q0, q12 +++ vqadd.s8 q1, q14 +++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + + +-+ // Subdivide +-+ dvq->vpu_cmds_arm = (void*)p_arm; +-+ dvq->vpu_cmds_vc = p_vc; +++ vtbl.8 d6, {d27}, d6 +++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + + +-+ p_arm += cmd_size; +-+ p_vc += cmd_size; +++ vtbl.8 d7, {d27}, d7 +++ vzip.8 q2, q3 + + +-+ dvq->y_setup_arm = (void*)p_arm; +-+ dvq->y_setup_vc = (void*)p_vc; +++ vsub.s8 q0, q15 +++ vqadd.s8 q2, q12 +++ vqadd.s8 q3, q14 +++ vsub.s8 q1, q15 +++ vsub.s8 q2, q15 +++ vsub.s8 q3, q15 + + +-+ p_arm += y_size; +-+ p_vc += y_size; +++ bx lr +++endfunc + + +-+ dvq->uv_setup_arm = (void*)p_arm; +-+ dvq->uv_setup_vc = (void*)p_vc; +-+ } +++@ r0 destination address +++@ r2 stride to post-increment r0 with +++@ r4 upper clip value +++@ [r5] translate values +++@ +++@ a <- c <- b +++@ a in q0 - q3 +++@ c in q4 - q7 +++@ b in q8 - q11 +++@ +++@ q12-15 used as temp +++@ +++@ Can be used for both Y & C as we unzip/zip the deltas and +++@ transform "u/v" separately via d26/d27. 
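edge_64b_body_8 above computes, per sample, the edge-offset class 2 + sign(c-a) + sign(c-b), looks the offset up with vtbl, and applies it using saturating signed bytes: the q15 = #128 bias shifts unsigned pixels into [-128,127] so vqadd.s8 clamps for free once 128 is added back (the "Add -128" comments). A scalar model of one sample:

    #include <stdint.h>

    static inline int sign3(int d) { return (d > 0) - (d < 0); }

    /* xlat[] is the 5-entry table loaded through d26/d27; a and b are
     * the two neighbours selected by the EO direction. */
    static uint8_t edge_ref_8(uint8_t a, uint8_t c, uint8_t b,
                              const int8_t xlat[5])
    {
        int idx = 2 + sign3((int)c - a) + sign3((int)c - b);  /* 0..4 */
        int v   = c + xlat[idx];
        return v < 0 ? 0 : v > 255 ? 255 : v;  /* what the bias+vqadd gives */
    }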
For Y d26=d27 + + +-+ s->dvq_n = 0; +-+ s->dvq = s->dvq_ents + s->dvq_n; +-+ } +-+#endif +++function edge_64b_body_16 + + +- s->bs_width = (width >> 2) + 1; +- s->bs_height = (height >> 2) + 1; +- +-@@ -137,6 +434,29 @@ fail: +- return AVERROR(ENOMEM); +- } +- +-+static void default_pred_weight_table(HEVCContext * const s) +-+{ +-+ unsigned int i; +-+ s->sh.luma_log2_weight_denom = 0; +-+ s->sh.chroma_log2_weight_denom = 0; +-+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { +-+ s->sh.luma_weight_l0[i] = 1; +-+ s->sh.luma_offset_l0[i] = 0; +-+ s->sh.chroma_weight_l0[i][0] = 1; +-+ s->sh.chroma_offset_l0[i][0] = 0; +-+ s->sh.chroma_weight_l0[i][1] = 1; +-+ s->sh.chroma_offset_l0[i][1] = 0; +-+ } +-+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { +-+ s->sh.luma_weight_l1[i] = 1; +-+ s->sh.luma_offset_l1[i] = 0; +-+ s->sh.chroma_weight_l1[i][0] = 1; +-+ s->sh.chroma_offset_l1[i][0] = 0; +-+ s->sh.chroma_weight_l1[i][1] = 1; +-+ s->sh.chroma_offset_l1[i][1] = 0; +-+ } +-+} +++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 +++ vcgt.u16 q13, q5, q1 +++ vcgt.u16 q14, q6, q2 +++ vcgt.u16 q15, q7, q3 + + +- static void pred_weight_table(HEVCContext *s, GetBitContext *gb) +- { +- int i = 0; +-@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +- static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) +- { +- #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +-- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +-+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; +- int ret, i; +- +- pic_arrays_free(s); +-@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- switch (sps->pix_fmt) { +- case AV_PIX_FMT_YUV420P: +- case AV_PIX_FMT_YUVJ420P: +-+#if RPI_HEVC_SAND +-+ // Currently geometry calc is stuffed for big sizes +-+ if (sps->width < 2048 && sps->height <= 1088) { +-+ *fmt++ = AV_PIX_FMT_SAND128; +-+ } +-+#endif +- #if CONFIG_HEVC_DXVA2_HWACCEL +- *fmt++ = AV_PIX_FMT_DXVA2_VLD; +- #endif +-@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- ret = ff_thread_get_format(s->avctx, pix_fmts); +- if (ret < 0) +- goto fail; +++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 +++ vcgt.u16 q1, q1, q5 +++ vcgt.u16 q2, q2, q6 +++ vcgt.u16 q3, q3, q7 + + +- s->avctx->pix_fmt = ret; +- } +- else { +-@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- for(c_idx = 0; c_idx < c_count; c_idx++) { +- int w = sps->width >> sps->hshift[c_idx]; +- int h = sps->height >> sps->vshift[c_idx]; +-+ // ******** Very very nasty allocation kludge for plaited Chroma +- s->sao_pixel_buffer_h[c_idx] = +-- av_malloc((w * 2 * sps->ctb_height) << +-+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = +-- av_malloc((h * 2 * sps->ctb_width) << +-+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << +- sps->pixel_shift); +- } +- } +-@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) +- (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { +- pred_weight_table(s, gb); +- } +-+ else +-+ { +-+ // Give us unit weights +-+ default_pred_weight_table(s); +-+ } +- +- sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); +- if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { +-@@ -931,6 +1264,34 @@ static 
int hls_cross_component_pred(HEVCContext *s, int idx) { +- return 0; +- } +- +-+#ifdef RPI +-+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +-+{ +-+ // U & V done on U call in the case of sliced frames +-+ if (rpi_sliced_frame(s->frame) && c_idx > 1) +-+ return; +++ vsub.s16 q0, q0, q12 // a = sign(c-a) +++ vsub.s16 q1, q1, q13 +++ vsub.s16 q2, q2, q14 +++ vsub.s16 q3, q3, q15 + + +-+ if (s->enable_rpi) { +-+ HEVCLocalContext *lc = s->HEVClc; +-+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; +-+ cmd->type = RPI_PRED_INTRA; +-+ cmd->size = log2_trafo_size; +-+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; +-+ cmd->c_idx = c_idx; +-+ cmd->i_pred.x = x0; +-+ cmd->i_pred.y = y0; +-+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; +-+ } +-+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { +-+ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); +-+ } +-+ else { +-+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); +-+ } +++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 +++ vcgt.u16 q13, q5, q9 +++ vcgt.u16 q14, q6, q10 +++ vcgt.u16 q15, q7, q11 + + +-+} +-+#endif +++ vsub.s16 q0, q0, q12 +++ vsub.s16 q1, q1, q13 +++ vsub.s16 q2, q2, q14 +++ vsub.s16 q3, q3, q15 + + +- static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- int xBase, int yBase, int cb_xBase, int cb_yBase, +- int log2_cb_size, int log2_trafo_size, +-@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- int trafo_size = 1 << log2_trafo_size; +- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); +-- +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); +-+#endif +- } +- +- if (cbf_luma || cbf_cb[0] || cbf_cr[0] || +-@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); +-+#endif +- } +- if (cbf_cb[i]) +- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +-@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); +-+#endif +- } +- if (cbf_cr[i]) +- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +-@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); +-+#endif +- } +- if (cbf_cb[i]) +- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +-@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); +-+#endif +- } +- if (cbf_cr[i]) +- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +-@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); +- int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); +- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); +-+#endif +- if (s->ps.sps->chroma_format_idc == 2) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); +-+#endif +- } +- } else if (blk_idx == 3) { +- int trafo_size_h = 1 << (log2_trafo_size + 1); +- int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); +- ff_hevc_set_neighbour_available(s, xBase, yBase, +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); +-+#endif +- if (s->ps.sps->chroma_format_idc == 2) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); +- 
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); +-+#endif +- } +- } +- } +-@@ -1275,47 +1675,120 @@ do { +- return 0; +- } +- +--static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 +++ vcgt.u16 q13, q9, q5 +++ vcgt.u16 q14, q10, q6 +++ vcgt.u16 q15, q11, q7 + + +-+static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) +- { +-- HEVCLocalContext *lc = s->HEVClc; +- GetBitContext gb; +-- int cb_size = 1 << log2_cb_size; +-- int stride0 = s->frame->linesize[0]; +-- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +-- int stride1 = s->frame->linesize[1]; +-- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-- int stride2 = s->frame->linesize[2]; +-- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +-- +-- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + +-- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + +-- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * +-- s->ps.sps->pcm.bit_depth_chroma; +-- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); +- int ret; +- +-- if (!s->sh.disable_deblocking_filter_flag) +-- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +-- +- ret = init_get_bits(&gb, pcm, length); +- if (ret < 0) +- return ret; +- +-- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +-- if (s->ps.sps->chroma_format_idc) { +-- s->hevcdsp.put_pcm(dst1, stride1, +-+#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +-+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), +-+ s->frame->linesize[0], +-+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) +++ vadd.s16 q1, q1, q13 +++ vmov.u8 q12, #2 +++ vadd.s16 q2, q2, q14 +++ vadd.s16 q3, q3, q15 + + +-+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), +-+ s->frame->linesize[1], +- cb_size >> s->ps.sps->hshift[1], +- cb_size >> s->ps.sps->vshift[1], +- &gb, s->ps.sps->pcm.bit_depth_chroma); +-- s->hevcdsp.put_pcm(dst2, stride2, +-- cb_size >> s->ps.sps->hshift[2], +-- cb_size >> s->ps.sps->vshift[2], +-- &gb, s->ps.sps->pcm.bit_depth_chroma); +- } +-+ else +-+#endif +-+ { +-+ const int stride0 = s->frame->linesize[0]; +-+ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +-+ const int stride1 = s->frame->linesize[1]; +-+ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-+ const int stride2 = s->frame->linesize[2]; +-+ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +++ vmovn.s16 d0, q0 +++ vmovn.s16 d1, q1 +++ vmovn.s16 d2, q2 +++ vmovn.s16 d3, q3 + + +-+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +-+ if (s->ps.sps->chroma_format_idc) { +-+ s->hevcdsp.put_pcm(dst1, stride1, +-+ cb_size >> s->ps.sps->hshift[1], +-+ cb_size >> s->ps.sps->vshift[1], +-+ &gb, s->ps.sps->pcm.bit_depth_chroma); +-+ s->hevcdsp.put_pcm(dst2, stride2, +-+ cb_size >> 
s->ps.sps->hshift[2], +-+ cb_size >> s->ps.sps->vshift[2], +-+ &gb, s->ps.sps->pcm.bit_depth_chroma); +-+ } +- +-+ } +- return 0; +- } +- +-+#ifdef RPI +-+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +-+{ +-+ int16_t * const coeffs = (buf_no != 3) ? +-+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : +-+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; +-+ s->num_coeffs[s->pass0_job][buf_no] += n; +-+ return coeffs; +-+} +-+#endif +++ vuzp.8 q0, q1 + + +-+// x * 2^(y*2) +-+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) +-+{ +-+ return x << (y * 2); +-+} +++ vld1.8 {d26, d27}, [r5] + + +-+static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) +-+{ +-+ // Length in bits +-+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + +-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + +-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); +++ vadd.s8 q0, q0, q12 +++ vadd.s8 q1, q1, q12 + + +-+ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); +++ vtbl.8 d0, {d26}, d0 +++ vtbl.8 d1, {d26}, d1 +++ vtbl.8 d2, {d27}, d2 +++ vtbl.8 d3, {d27}, d3 + + +-+ if (!s->sh.disable_deblocking_filter_flag) +-+ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +++ vmov.i64 q12, #0 + + +-+#ifdef RPI +-+ if (s->enable_rpi) { +-+ // Copy coeffs +-+ const int blen = (length + 7) >> 3; +-+ // Round allocated bytes up to nearest 32 to avoid alignment confusion +-+ // Allocation is in int16_t s +-+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per +-+ // sample this rounding doesn't affect the total size we need to allocate for +-+ // the coeff buffer +-+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); +-+ memcpy(coeffs, pcm, blen); +++ vzip.8 q0, q1 + + +-+ // Our coeff stash assumes that any partially allocated 64byte lump +-+ // is zeroed so make that true. 
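rpi_alloc_coeff_buf above carves two coefficient streams out of one GPU buffer: buffers other than 3 grow upwards from the base, while buffer 3 grows downwards from a pointer one past the end (the earlier allocation comment notes "Coefficients fill in backwards"). The same bidirectional-arena idea in isolation:

    #include <stdint.h>

    /* Forward stream: hand out the next n elements from the base. */
    static int16_t *alloc_fwd(int16_t *base, unsigned *used, unsigned n)
    {
        int16_t *p = base + *used;
        *used += n;
        return p;
    }

    /* Backward stream: hand out n elements ending at the previous
     * low-water mark, i.e. end - old_used - n, as in the patch. */
    static int16_t *alloc_bwd(int16_t *end /* one past the end */,
                              unsigned *used, unsigned n)
    {
        *used += n;
        return end - *used;
    }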
+-+ { +-+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; +-+ if ((-(intptr_t)eopcm & 63) != 0) +-+ memset(eopcm, 0, -(intptr_t)eopcm & 63); +-+ } +++ vdup.i16 q13, r4 + + +-+ // Add command +-+ { +-+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; +-+ cmd->type = RPI_PRED_I_PCM; +-+ cmd->size = log2_cb_size; +-+ cmd->i_pcm.src = coeffs; +-+ cmd->i_pcm.x = x0; +-+ cmd->i_pcm.y = y0; +-+ cmd->i_pcm.src_len = length; +-+ } +-+ return 0; +-+ } +-+#endif +++ @ Avoid overwrite whilst widening +++ vaddw.s8 q2, q6, d2 +++ vaddw.s8 q3, q7, d3 +++ vaddw.s8 q1, q5, d1 +++ vaddw.s8 q0, q4, d0 + + +-+ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); +-+} +++ @ now clip +++ clip16_4 q2, q3, q1, q0, q12, q13 + + +- /** +- * 8.5.3.2.2.1 Luma sample unidirectional interpolation process +- * +-@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +- * @param luma_offset additive offset applied to the luma prediction value +- */ +- +-+#if RPI_INTER +-+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+ AVFrame *ref, const Mv *mv, int x_off, int y_off, +-+ int block_w, int block_h, int luma_weight, int luma_offset) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_LUMA_UNI; +-+ cmd->dst = dst; +-+ cmd->dststride = dststride; +-+ cmd->src = ref->data[0]; +-+ cmd->srcstride = ref->linesize[0]; +-+ cmd->mv = *mv; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->weight = luma_weight; +-+ cmd->offset = luma_offset; +-+} +-+ +-+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, +-+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, +-+ const struct MvField * const current_mv) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_LUMA_BI; +-+ cmd->dst = dst; +-+ cmd->dststride = dststride; +-+ cmd->src = ref0->data[0]; +-+ cmd->srcstride = ref0->linesize[0]; +-+ cmd->mv = *mv0; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->src1 = ref1->data[0]; +-+ cmd->srcstride1 = ref1->linesize[0]; +-+ cmd->mv1 = *mv1; +-+ cmd->ref_idx[0] = current_mv->ref_idx[0]; +-+ cmd->ref_idx[1] = current_mv->ref_idx[1]; +-+} +-+ +-+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +-+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, +-+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_CHROMA_UNI; +-+ cmd->dst = dst0; +-+ cmd->dststride = dststride; +-+ cmd->src = src0; +-+ cmd->srcstride = srcstride; +-+ cmd->mv = *mv; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->weight = chroma_weight; +-+ cmd->offset = chroma_offset; +-+} +-+ +-+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, +-+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; +-+ cmd->dst = dst0; 
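edge_64b_body_16, which ends above, cannot use the 8-bit bias/vqadd.s8 trick: the classification runs at 16 bits, the deltas are narrowed to bytes only for the vtbl lookup (vmovn/vuzp), then widened back with vaddw.s8 (ordered q2/q3 first to avoid overwriting inputs) and clamped explicitly against 0 and the bit-depth maximum via clip16_4. Scalar model, reusing sign3 from the earlier sketch:

    /* One 16-bit sample: only the table lookup happens on bytes. */
    static uint16_t edge_ref_16(uint16_t a, uint16_t c, uint16_t b,
                                const int8_t xlat[5], int max)
    {
        int idx = 2 + sign3((int)c - a) + sign3((int)c - b);
        int v   = c + xlat[idx];                /* vaddw.s8           */
        return v < 0 ? 0 : v > max ? max : v;   /* clip16_4 vmax/vmin */
    }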
+-+ cmd->dststride = dststride; +-+ cmd->src = ref0->data[cidx+1]; +-+ cmd->srcstride = ref0->linesize[cidx+1]; +-+ cmd->mv = current_mv->mv[0]; +-+ cmd->mv1 = current_mv->mv[1]; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->src1 = ref1->data[cidx+1]; +-+ cmd->srcstride1 = ref1->linesize[cidx+1]; +-+ cmd->ref_idx[0] = current_mv->ref_idx[0]; +-+ cmd->ref_idx[1] = current_mv->ref_idx[1]; +-+} +-+ +-+#endif +-+ +- static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- AVFrame *ref, const Mv *mv, int x_off, int y_off, +- int block_w, int block_h, int luma_weight, int luma_offset) +-@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); +- int idx = ff_hevc_pel_weight[block_w]; +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++ bx lr +++endfunc + + +- x_off += mv->x >> 2; +- y_off += mv->y >> 2; +- src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +-@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- * @param mv1 motion vector1 (relative to block position) to get pixel data from +- * @param current_mv current motion vector structure +- */ +-- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- AVFrame *ref0, const Mv *mv0, int x_off, int y_off, +- int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) +- { +-@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); +- uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif + + +- if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || +- x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || +- y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { +-@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +- intptr_t _mx = mx << (1 - hshift); +- intptr_t _my = my << (1 - vshift); +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++@ a <- c <- b +++@ a in q0 +++@ c in q1 +++@ b in q2 +++@ Temp q3, q9, q10 +++@ +++@ d16, d17 (q8) xlat U, V +++@ q14.u8 #2 +++@ q15.u8 #128 + + +- x_off += mv->x >> (2 + hshift); +- y_off += mv->y >> (2 + vshift); +- src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +-@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +- int hshift = s->ps.sps->hshift[1]; +- int vshift = s->ps.sps->vshift[1]; +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++function edge_16b_body_8 +++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 +++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 +++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 + + +- intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); +- intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); +- intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); +-@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +- } +- } +- +--static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +-- int nPbW, int nPbH, +-- int log2_cb_size, int partIdx, int idx) +++ vsub.s8 q0, q3 +++ 
vsub.s8 q10, q9 +++ vadd.s8 q0, q10 @ a = sign(c-a) + + +-+#if RPI_INTER +++ vadd.s8 q0, q14 +++ vuzp.8 d0, d1 +++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add + + +-+static HEVCRpiLumaPred * +-+rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) +-+{ +-+ HEVCRpiLumaPred * yp = s->curr_pred_y; +-+ HEVCRpiLumaPred * ypt = yp + 1; +-+ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { +-+ if (ypt->load < yp->load) +-+ yp = ypt; +-+ } +++ vtbl.8 d0, {d16}, d0 +++ vtbl.8 d1, {d17}, d1 + + +-+// yp->load += load_val; +-+ ++yp->load; +-+ return yp; +-+} +++ vzip.8 d0, d1 +++ vqadd.s8 q0, q3 +++ vsub.s8 q0, q15 + + +-+static void +-+rpi_pred_y(HEVCContext *const s, const int x0, const int y0, +-+ const int nPbW, const int nPbH, +-+ const Mv *const mv, +-+ const int weight_mul, +-+ const int weight_offset, +-+ AVFrame *const src_frame) +-+{ +-+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); +++ bx lr +++endfunc + + +-+// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, +-+// mv, x0, y0, nPbW, nPbH, +-+// weight_mul, weight_offset); +++@ a <- c <- b +++@ a in q0 +++@ c in q1 +++@ b in q2 +++@ Temp q3 +++@ +++@ q12, #0 +++@ d16, d17 xlat U, V +++@ q14.u8 #2 +++@ q15.u16 max +++function edge_16b_body_16 +++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 +++ vsub.s16 q0, q3 @ a = sign(c-a) +++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 +++ vsub.s16 q0, q3 +++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 +++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) +++ +++ vmovn.s16 d0, q0 +++ @ d1 will have random contents that we transform but +++ @ that doesn't matter as we then discard them +++ vuzp.8 d0, d1 +++ +++ vadd.s8 q0, q0, q14 +++ +++ vtbl.8 d0, {d16}, d0 +++ vtbl.8 d1, {d17}, d1 +++ +++ vzip.8 d0, d1 +++ +++ vaddw.s8 q0, q1, d0 +++ +++ @ now clip +++ vmax.s16 q0, q12 +++ vmin.s16 q0, q15 +++ bx lr +++endfunc + + +-+ { +-+ const unsigned int mx = mv->x & 3; +-+ const unsigned int my = mv->y & 3; +-+ const unsigned int my_mx = (my << 8) | mx; +-+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; +-+ const int x1_m3 = x0 + (mv->x >> 2) - 3; +-+ const int y1_m3 = y0 + (mv->y >> 2) - 3; +-+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); +-+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; +-+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); + + +-+ // Potentially we could change the assembly code to support taller sizes in one go +-+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) +-+ { +-+ const uint32_t src_yx_y = y1_m3 + start_y; +-+ int start_x = 0; +-+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); +++@ ff_hevc_sao_edge_[c_]xx_neon( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only +++@ int eo, [sp, #sp_base + 0] +++@ int width, [sp, #sp_base + 4] +++@ int height) [sp, #sp_base + 8] +++ +++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 +++ push {r4-r6, lr} @ 16 bytes +++.set sp_base, 16 +++ +++@ Build translate registers +++@ As translate values can only be 0-4 we don't care about junk in the rest +++@ of the register +++ mov r12, #2 +++.if \is_chroma +++ ldr r4, [sp, #16] +++.set sp_base, sp_base + 4 +++.endif +++ vld1.8 {d16[2]}, [r3], r12 +++ vld1.8 
{d16[0]}, [r3], r12 +++ vld1.8 {d16[1]}, [r3], r12 +++ vld1.8 {d16[3]}, [r3], r12 +++ vld1.8 {d16[4]}, [r3] +++.if \is_chroma +++ vld1.8 {d17[2]}, [r4], r12 +++ vld1.8 {d17[0]}, [r4], r12 +++ vld1.8 {d17[1]}, [r4], r12 +++ vld1.8 {d17[3]}, [r4], r12 +++ vld1.8 {d17[4]}, [r4] +++.else +++ vmov d17, d16 +++.endif +++ +++@ Setup constant registers +++.if \bit_depth > 8 +++ movw r4, (1 << \bit_depth) - 1 +++.endif +++.if \setup_16b +++.if \bit_depth > 8 +++ vmov.i64 q12, #0 +++ vdup.16 q15, r4 +++.else +++ vmov.u8 q15, #128 +++.endif +++ vmov.u8 q14, #2 +++.endif +++ movw r3, EDGE_SRC_STRIDE +++ +++@ If setup_64b we need the xlat table on the stack and q4-q7 saved +++.if \setup_64b +++ sub r5, sp, #16 +++ vpush {q4-q8} @ 80 bytes, q8 pushed first +++.set sp_base, sp_base + 80 +++.endif +++ +++@ Get jump address +++@ We have a special case for width 4 as the calling code doesn't detect it +++@ If we may have w4 then we add a 2nd jump table after the 1st +++.if \check_w4 +++ ldr r12, [sp, #sp_base + 4] @ width +++ cmp r12, #8 +++.endif +++ ldr r12, [sp, #sp_base + 0] @ e0 +++ adr r6, \jump_tab +++.if \check_w4 +++ it lt +++ addlt r6, #16 +++.endif +++ ldr r6, [r6, r12, lsl #2] +++ +++ ldr r12, [sp, #sp_base + 8] @ height +++ +++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes +++.if \do2 +++ push {r0, r1, r6, r12} +++ blx r6 +++ pop {r0, r1, r6, r12} +++ +++ add r0, #64 +++ add r1, #64 +++.endif +++ +++ blx r6 +++ +++@ Tidy up & return +++.if \setup_64b +++ vpop {q4-q8} @ spurious but harmless load of q8 +++.endif +++ pop {r4-r6, pc} +++.endm + + +-+#if 1 +-+ // As Y-pred operates on two independant 8-wide src blocks we can merge +-+ // this pred with the previous one if it the previous one is 8 pel wide, +-+ // the same height as the current block, immediately to the left of our +-+ // current dest block and mono-pred. 
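rpi_nxt_pred_y in the removed code above is a simple static load balancer: it scans the QPU_N_GRP_Y per-QPU command queues and appends to the lightest. The cost-weighted accounting (load += load_val) is present but disabled, with a comment that it measured better yet exposed unresolved queue-length problems. Its shape, condensed:

    /* Pick the least-loaded luma QPU queue, then account for the new
     * command; currently one command counts as one unit of load. */
    static HEVCRpiLumaPred *nxt_pred_ref(HEVCContext *const s)
    {
        HEVCRpiLumaPred *yp = s->curr_pred_y;
        for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i)
            if (s->curr_pred_y[i].load < yp->load)
                yp = &s->curr_pred_y[i];
        ++yp->load;   /* "load += load_val" is the disabled variant */
        return yp;
    }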
+ + +-+ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; +-+ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) +-+ { +-+ const int bw = FFMIN(nPbW, 8); +-+ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; +-+ +-+ last_y8_lx->next_src2_x = x1_m3; +-+ last_y8_lx->next_src2_y = src_yx_y; +-+ last_y8_lx->next_src2_base = src_vc_address_y; +-+ last_y8_p->p.w += bw; +-+ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); +-+ last_y8_p->p.wo2 = wo; +-+ +-+ s->last_y8_p = NULL; +-+ s->last_y8_lx = NULL; +-+ start_x = bw; +-+#if RPI_TSTATS +-+ ++s->tstats.y_pred1_y8_merge; +-+#endif +-+ } +-+#endif +++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab +++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 +++.endm + + +-+ for (; start_x < nPbW; start_x += 16) +-+ { +-+ const int bw = FFMIN(nPbW - start_x, 16); +-+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); +-+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; +-+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; +-+#if RPI_TSTATS +-+ { +-+ HEVCRpiStats *const ts = &s->tstats; +-+ if (mx == 0 && my == 0) +-+ ++ts->y_pred1_x0y0; +-+ else if (mx == 0) +-+ ++ts->y_pred1_x0; +-+ else if (my == 0) +-+ ++ts->y_pred1_y0; +-+ else +-+ ++ts->y_pred1_xy; +++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab +++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 +++.endm + + +-+ if (nPbW > 8) +-+ ++ts->y_pred1_wgt8; +-+ else +-+ ++ts->y_pred1_wle8; + + +-+ if (nPbH > 16) +-+ ++ts->y_pred1_hgt16; +-+ else +-+ ++ts->y_pred1_hle16; +-+ } +-+#endif +-+ cmd_y[-1].next_fn = s->qpu_filter; +-+ cmd_lx->next_src1_x = x1_m3 + start_x; +-+ cmd_lx->next_src1_y = src_yx_y; +-+ cmd_lx->next_src1_base = src_vc_address_y; +-+ if (bw <= 8) +-+ { +-+ cmd_lx->next_src2_x = MC_DUMMY_X; +-+ cmd_lx->next_src2_y = MC_DUMMY_Y; +-+ cmd_lx->next_src2_base = s->qpu_dummy_frame; +-+ } +-+ else +-+ { +-+ cmd_lx->next_src2_x = x1_m3 + start_x + 8; +-+ cmd_lx->next_src2_y = src_yx_y; +-+ cmd_lx->next_src2_base = src_vc_address_y; +-+ } +-+ cmd_y->p.w = bw; +-+ cmd_y->p.h = bh; +-+ cmd_y->p.mymx21 = my2_mx2_my_mx; +-+ cmd_y->p.wo1 = wo; +-+ cmd_y->p.wo2 = wo; +-+ cmd_y->p.dst_addr = dst_addr + start_x; +-+ yp->last_lx = cmd_y; +-+ yp->qpu_mc_curr = cmd_y + 1; +-+ +-+ if (bw == 8) { +-+ s->last_y8_lx = cmd_lx; +-+ s->last_y8_p = cmd_y; +-+ } +-+ } +-+ } +-+ } +-+} +++.macro edge_64b_e0, body_fn, pb +++ mov r6, lr +++ sub r1, #8 +++1: vldm r1, {d7-d16} +++ subs r12, #1 +++ add r1, r3 +++ // load a +++ vext.8 q0, q3, q4, #(16 - \pb) +++ vext.8 q1, q4, q5, #(16 - \pb) +++ vext.8 q2, q5, q6, #(16 - \pb) +++ vext.8 q3, q6, q7, #(16 - \pb) +++ // load b +++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite +++ vext.8 q8, q4, q5, #\pb +++ vext.8 q9, q5, q6, #\pb +++ vext.8 q10, q6, q7, #\pb +++ bl \body_fn +++ vstm r0, {q0-q3} +++ add r0, r0, r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+static void +-+rpi_pred_y_b(HEVCContext * const s, +-+ const int x0, const int y0, +-+ const int nPbW, const int nPbH, +-+ const struct MvField *const mv_field, +-+ AVFrame *const src_frame, +-+ AVFrame *const src_frame2) +-+{ +-+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); +-+ const Mv * const mv = mv_field->mv + 0; +-+ const Mv * const mv2 = mv_field->mv + 1; +++.macro edge_32bx2_e0, body_fn, pb +++ mov r6, lr + + +-+// rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, +-+// mv, x0, y0, nPbW, nPbH, +-+// src_frame2, mv2, mv_field); +-+ { +-+ const 
unsigned int mx = mv->x & 3; +-+ const unsigned int my = mv->y & 3; +-+ const unsigned int my_mx = (my<<8) | mx; +-+ const unsigned int mx2 = mv2->x & 3; +-+ const unsigned int my2 = mv2->y & 3; +-+ const unsigned int my2_mx2 = (my2<<8) | mx2; +-+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; +-+ const int x1 = x0 + (mv->x >> 2) - 3; +-+ const int y1 = y0 + (mv->y >> 2) - 3; +-+ const int x2 = x0 + (mv2->x >> 2) - 3; +-+ const int y2 = y0 + (mv2->y >> 2) - 3; +-+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; +-+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; +-+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + +-+ s->sh.luma_offset_l1[ref_idx1] + 1; +-+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); +-+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); +-+ +-+ uint32_t dst = get_vc_address_y(s->frame) + y_off; +-+ const uint32_t src1_base = get_vc_address_y(src_frame); +-+ const uint32_t src2_base = get_vc_address_y(src_frame2); +-+ +-+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) +-+ { +-+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); +++1: subs r12, #2 + + +-+ for (int start_x=0; start_x < nPbW; start_x += 8) +-+ { // B blocks work 8 at a time +-+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); +-+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; +-+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; +-+#if RPI_TSTATS +-+ { +-+ HEVCRpiStats *const ts = &s->tstats; +-+ const unsigned int mmx = mx | mx2; +-+ const unsigned int mmy = my | my2; +-+ if (mmx == 0 && mmy == 0) +-+ ++ts->y_pred2_x0y0; +-+ else if (mmx == 0) +-+ ++ts->y_pred2_x0; +-+ else if (mmy == 0) +-+ ++ts->y_pred2_y0; +-+ else +-+ ++ts->y_pred2_xy; +-+ +-+ if (nPbH > 16) +-+ ++ts->y_pred2_hgt16; +-+ else +-+ ++ts->y_pred2_hle16; +-+ } +-+#endif +-+ cmd_y[-1].next_fn = s->qpu_filter_b; +-+ cmd_lx->next_src1_x = x1 + start_x; +-+ cmd_lx->next_src1_y = y1 + start_y; +-+ cmd_lx->next_src1_base = src1_base; +-+ cmd_lx->next_src2_x = x2 + start_x; +-+ cmd_lx->next_src2_y = y2 + start_y; +-+ cmd_lx->next_src2_base = src2_base; +-+ cmd_y->p.w = FFMIN(nPbW - start_x, 8); +-+ cmd_y->p.h = bh; +-+ cmd_y->p.mymx21 = my2_mx2_my_mx; +-+ cmd_y->p.wo1 = wo1; +-+ cmd_y->p.wo2 = wo2; +-+ cmd_y->p.dst_addr = dst + start_x; +-+ yp->last_lx = cmd_y; +-+ yp->qpu_mc_curr = cmd_y + 1; +-+ } +-+ dst += s->frame->linesize[0] * 16; +-+ } +-+ } +-+} +++ vld1.8 {q4-q5}, [r1] +++ sub r1, #\pb +++ vld1.8 {q0-q1}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {q8-q9}, [r1], r3 +++ sub r1, #\pb +++ vld1.8 {q6-q7}, [r1] +++ sub r1, #\pb +++ vld1.8 {q2-q3}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {q10-q11}, [r1], r3 +++ sub r1, #\pb +++ +++ bl \body_fn +++ +++ vst1.8 {q0,q1}, [r0], r2 +++ vst1.8 {q2,q3}, [r0], r2 +++ +++ bgt 1b +++ bx r6 +++.endm + + +++.macro edge_16b_e0, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ sub r3, #\pb * 2 + + +-+static HEVCRpiChromaPred * +-+rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) +-+{ +-+ HEVCRpiChromaPred * cp = s->curr_pred_c; +-+ HEVCRpiChromaPred * cpt = cp + 1; +-+ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { +-+ if (cpt->load < cp->load) +-+ cp = cpt; +-+ } +-+ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet +-+ ++cp->load; +-+// cp->load += load_val; +-+ return cp; +-+} +++1: subs r12, #1 + + +-+static void +-+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, +-+ const int nPbW_c, const int nPbH_c, +-+ const Mv * const mv, +-+ 
const int16_t * const c_weights, +-+ const int16_t * const c_offsets, +-+ AVFrame * const src_frame) +-+{ +++ vld1.64 {q0}, [r1] @ load a +++ add r1, #\pb +++ vld1.64 {q1}, [r1, :128] @ load c +++ add r1, #\pb +++ vld1.64 {q2}, [r1], r3 @ load b + + +-+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); +-+#if 0 +-+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv, +-+ c_weights[0], c_offsets[0]); +++.macro edge_8bx2_e0, body_fn, pb +++ mov r6, lr + + +-+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv, +-+ c_weights[1], c_offsets[1]); +-+#endif +-+ { +-+ const int hshift = s->ps.sps->hshift[1]; +-+ const int vshift = s->ps.sps->vshift[1]; +-+ +-+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +-+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +-+ const uint32_t src_base_u = get_vc_address_u(src_frame); +-+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; +-+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; +-+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); +-+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); +-+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; +-+ +-+ for(int start_y=0;start_y < nPbH_c;start_y+=16) +-+ { +-+ const int bh = FFMIN(nPbH_c-start_y, 16); +++1: subs r12, #2 + + +-+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) +-+ { +-+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); +-+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; +-+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; +-+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +-+ +-+ u[-1].next_fn = s->qpu_filter_uv; +-+ last_l0->next_src_x = x1_c + start_x; +-+ last_l0->next_src_y = y1_c + start_y; +-+ last_l0->next_src_base_c = src_base_u; +-+ u[0].p.h = bh; +-+ u[0].p.w = bw; +-+ u[0].p.coeffs_x = x_coeffs; +-+ u[0].p.coeffs_y = y_coeffs; +-+ u[0].p.wo_u = wo_u; +-+ u[0].p.wo_v = wo_v; +-+ u[0].p.dst_addr_c = dst_base_u + start_x * 2; +-+ cp->last_l0 = u; +-+ cp->qpu_mc_curr = u + 1; +-+ } +++ vld1.8 {d2}, [r1, :64] +++ sub r1, #\pb +++ vld1.8 {d0}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {d4}, [r1], r3 +++ sub r1, #\pb +++ vld1.8 {d3}, [r1, :64] +++ sub r1, #\pb +++ vld1.8 {d1}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {d5}, [r1], r3 +++ sub r1, #\pb +++ +++ bl \body_fn +++ +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ +++ bgt 1b +++ bx r6 +++.endm + + +-+ dst_base_u += s->frame->linesize[1] * 16; +-+ } +-+ } +-+ return; +-+} +++.macro edge_4bx4_e0, body_fn, pb +++ mov r6, lr + + +-+static void +-+rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, +-+ const int nPbW_c, const int nPbH_c, +-+ const struct MvField * const mv_field, +-+ const int16_t * const c_weights, +-+ const int16_t * const c_offsets, +-+ const int16_t * const c_weights2, +-+ const int16_t * const c_offsets2, +-+ AVFrame * const src_frame, +-+ AVFrame * const src_frame2) +-+{ +-+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); +-+#if 0 +-+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, +-+ x0_c, 
y0_c, nPbW_c, nPbH_c, mv_field, 0); +++1: subs r12, #4 + + +-+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); +-+#endif +-+ { +-+ const int hshift = s->ps.sps->hshift[1]; +-+ const int vshift = s->ps.sps->vshift[1]; +-+ const Mv * const mv = mv_field->mv + 0; +-+ const Mv * const mv2 = mv_field->mv + 1; +-+ +-+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); +-+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); +-+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; +-+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector +-+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +-+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +-+ +-+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); +-+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); +-+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; +-+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector +-+ +-+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; +-+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; +-+ +-+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; +-+ +-+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { +-+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); +-+ +-+ // We are allowed 3/4 powers of two as well as powers of 2 +-+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); +-+ +-+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { +-+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +-+ +-+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); +-+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; +-+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; +-+ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; +-+ +-+ u[-1].next_fn = s->qpu_filter_uv_b0; +-+ last_l0->next_src_x = x1_c + start_x; +-+ last_l0->next_src_y = y1_c + start_y; +-+ last_l0->next_src_base_c = get_vc_address_u(src_frame); +-+ +-+ u[0].next_fn = 0; // Ignored - 2 block cmd +-+ u[0].next_src_x = x2_c + start_x; +-+ u[0].next_src_y = y2_c + start_y; +-+ u[0].next_src_base_c = get_vc_address_u(src_frame2); +-+ +-+ u[0].b0.h = (bh<16 ? 
bh : 16);
+-+ u[0].b0.w = (bw<RPI_CHROMA_BLOCK_WIDTH ? bw : RPI_CHROMA_BLOCK_WIDTH);
+-+ u[0].b0.coeffs_x = coefs0_x;
+-+ u[0].b0.coeffs_y = coefs0_y;
+-+ last_l1->next_src_x = x2_c + start_x;
+-+ last_l1->next_src_y = y2_c + start_y;
+-+ last_l1->next_src_base_c = get_vc_address_u(src_frame2);
+-+
+-+ u[1].b1.dummy0 = 0; // w,h inherited from b0
+-+ u[1].b1.coeffs_x = coefs1_x;
+-+ u[1].b1.coeffs_y = coefs1_y;
+-+ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]);
+-+ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]);
+-+ u[1].b1.dst_addr_c = dst_base_u + start_x * 2;
+-+
+-+ cp->last_l0 = u;
+-+ cp->last_l1 = u + 1;
+-+ cp->qpu_mc_curr = u + 2;
+-+ }
+-+
+-+ dst_base_u += s->frame->linesize[1] * 16;
+-+ }
+-+ }
+-+}
+-+#endif
+++ vld1.32 {d2[0]}, [r1]
+++ sub r1, #\pb
+++ vld1.32 {d0[0]}, [r1]
+++ add r1, #(\pb * 2)
+++ vld1.32 {d4[0]}, [r1], r3 @ R
+++ vld1.32 {d4[1]}, [r1]
+++ sub r1, #\pb
+++ vld1.32 {d2[1]}, [r1]
+++ sub r1, #\pb
+++ vld1.32 {d0[1]}, [r1], r3 @ L
+++ vld1.32 {d1[0]}, [r1]
+++ add r1, #\pb
+++ vld1.32 {d3[0]}, [r1]
+++ add r1, #\pb
+++ vld1.32 {d5[0]}, [r1], r3 @ R
+++ vld1.32 {d5[1]}, [r1]
+++ sub r1, #(\pb * 2)
+++ vld1.32 {d1[1]}, [r1]
+++ add r1, #\pb
+++ vld1.32 {d3[1]}, [r1], r3 @ M
+++
+++ bl \body_fn
+++
+++ vst1.32 {d0[0]}, [r0], r2
+++ vst1.32 {d0[1]}, [r0], r2
+++ vst1.32 {d1[0]}, [r0], r2
+++ vst1.32 {d1[1]}, [r0], r2
+++
+++ bgt 1b
+++ bx r6
+++.endm
+
+
+
+
+++.macro edge_64b_e1, body_fn
+++ mov r6, lr
+++ sub r1, r3
+++ // load a
+++ vld1.8 {q0-q1}, [r1, :128]!
+++ vld1.8 {q2-q3}, [r1, :128], r3
+++ sub r1, #32
+++ // load c
+++ vld1.8 {q4-q5}, [r1, :128]!
+++ vld1.8 {q6-q7}, [r1, :128], r3
+++ sub r1, #32
+++1: subs r12, #1
+++ // load b
+++ vld1.8 {q8-q9}, [r1, :128]!
+++ vld1.8 {q10-q11}, [r1, :128], r3
+++ sub r1, #32
+++ bl \body_fn
+++ vstm r0, {q0-q3}
+++ add r0, r0, r2
+++ // copy c to a
+++ vmov.64 q0, q4
+++ vmov.64 q1, q5
+++ vmov.64 q2, q6
+++ vmov.64 q3, q7
+++ // copy b to c
+++ vmov.64 q4, q8
+++ vmov.64 q5, q9
+++ vmov.64 q6, q10
+++ vmov.64 q7, q11
+++ bgt 1b
+++ bx r6
+++.endm
+
+
+-+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0,
+-+ const int nPbW, const int nPbH,
+-+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
+- {
+- #define POS(c_idx, x, y) \
+- &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \
+- (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)]
+-- HEVCLocalContext *lc = s->HEVClc;
+-+ HEVCLocalContext * const lc = s->HEVClc;
+- int merge_idx = 0;
+- struct MvField current_mv = {{{ 0 }}};
+-
+-@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- int y_cb = y0 >> log2_min_cb_size;
+- int x_pu, y_pu;
+- int i, j;
+--
+-- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+-+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb);
+-
+- if (!skip_flag)
+- lc->pu.merge_flag = ff_hevc_merge_flag_decode(s);
+-@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+- int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+-
+-- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0,
+-+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
+-+ ref0->frame);
+-+ } else
+-+#endif
+-+ {
+-+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame,
+- &current_mv.mv[0], x0, y0, nPbW, nPbH,
+- s->sh.luma_weight_l0[current_mv.ref_idx[0]],
+- s->sh.luma_offset_l0[current_mv.ref_idx[0]]);
+-+ }
+-
+- if (s->ps.sps->chroma_format_idc) {
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0,
+-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+-+ ref0->frame);
+-+ return;
+-+ }
+-+#endif
+- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1],
+- 0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+- s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]);
+-@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+- int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+-
+-- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1,
+-+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
+-+ ref1->frame);
+-+ } else
+-+#endif
+-+ {
+-+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame,
+- &current_mv.mv[1], x0, y0, nPbW, nPbH,
+- s->sh.luma_weight_l1[current_mv.ref_idx[1]],
+- s->sh.luma_offset_l1[current_mv.ref_idx[1]]);
+-+ }
+-
+- if (s->ps.sps->chroma_format_idc) {
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1,
+-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+-+ ref1->frame);
+-+ return;
+-+ }
+-+#endif
+- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1],
+- 1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv,
+- s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]);
+-@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0,
+- int nPbW_c = nPbW >> s->ps.sps->hshift[1];
+- int nPbH_c = nPbH >> s->ps.sps->vshift[1];
+-
+-- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
+-+ } else
+-+#endif
+-+ {
+-+ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame,
+- &current_mv.mv[0], x0, y0, nPbW, nPbH,
+- ref1->frame, &current_mv.mv[1], &current_mv);
+-+ }
+-
+- if (s->ps.sps->chroma_format_idc) {
+-+#if RPI_INTER
+-+ if (s->enable_rpi) {
+-+ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c,
+-+ &current_mv,
+-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
+-+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
+-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
+-+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
+-+ ref0->frame,
+-+ ref1->frame);
+-+ return;
+-+ }
+-+#endif
+- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame,
+- x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0);
+-
+-@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size)
+- intra_prediction_unit_default_value(s, x0, y0, log2_cb_size);
+- ret = hls_pcm_sample(s, x0, y0, log2_cb_size);
+- if (s->ps.sps->pcm.loop_filter_disable_flag)
+-+ {
+- set_deblocking_bypass(s, x0, y0, log2_cb_size);
+-+ }
+-
+- if (ret < 0)
+- return ret;
+-@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb,
+- lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
} +- +-+#ifdef RPI +-+static void rpi_execute_dblk_cmds(HEVCContext *s) +-+{ +-+ int n; +-+ int job = s->pass1_job; +-+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; +-+ int (*p)[2] = s->dblk_cmds[job]; +-+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { +-+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); +-+ } +-+ s->num_dblk_cmds[job] = 0; +-+} +++.macro edge_32bx2_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {q0-q1}, [r1, :128], r3 +++ vld1.8 {q4-q5}, [r1, :128], r3 + + +-+#if 0 +-+static void rpi_execute_transform(HEVCContext *s) +-+{ +-+ int i=2; +-+ int job = s->pass1_job; +-+ /*int j; +-+ int16_t *coeffs = s->coeffs_buf_arm[job][i]; +-+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { +-+ s->hevcdsp.idct[4-2](coeffs, 16); +-+ } +-+ i=3; +-+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; +-+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { +-+ s->hevcdsp.idct[5-2](coeffs, 32); +-+ }*/ +++1: subs r12, #2 +++ @ Given the data duplication here we could obviously do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vmov q2, q4 +++ vmov q3, q5 +++ vld1.8 {q8-q9}, [r1, :128], r3 +++ vld1.8 {q10-q11}, [r1, :128], r3 +++ vmov q6, q8 +++ vmov q7, q9 + + +-+ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], +-+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +-+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); +-+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); +-+ //gpu_cache_flush(&s->coeffs_buf_accelerated); +-+ //vpu_wait(s->vpu_id); +++ bl \body_fn + + +-+ for(i=0;i<4;i++) +-+ s->num_coeffs[job][i] = 0; +-+} +-+#endif +++ vst1.8 {q0,q1}, [r0], r2 +++ vst1.8 {q2,q3}, [r0], r2 + + +++ // copy c to a +++ vmov.64 q0, q8 +++ vmov.64 q1, q9 + + +-+// I-pred, transform_and_add for all blocks types done here +-+// All ARM +-+static void rpi_execute_pred_cmds(HEVCContext * const s) +-+{ +-+ int i; +-+ int job = s->pass1_job; +-+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra *lc = &s->HEVClcIntra; +-+#else +-+ HEVCLocalContext *lc = s->HEVClc; +-+#endif +++ // copy b to c +++ vmov.64 q4, q10 +++ vmov.64 q5, q11 +++ bgt 1b +++ bx r6 +++.endm + + +-+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { +-+// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +++.macro edge_16b_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {q0}, [r1, :128], r3 +++ // load c +++ vld1.8 {q1}, [r1, :128], r3 +++1: subs r12, #1 +++ // load b +++ vld1.8 {q2}, [r1, :128], r3 +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ // copy c to a +++ vmov.64 q0, q1 +++ // copy b to c +++ vmov.64 q1, q2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ switch (cmd->type) +-+ { +-+ case RPI_PRED_INTRA: +-+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; +-+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; +-+ lc->na.cand_left = (cmd->na >> 3) & 1; +-+ lc->na.cand_up_left = (cmd->na >> 2) & 1; +-+ lc->na.cand_up = (cmd->na >> 1) & 1; +-+ lc->na.cand_up_right = (cmd->na >> 0) & 1; +-+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) +-+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +-+ else +-+ 
s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +-+ break; +++.macro edge_8bx2_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {d0}, [r1, :64], r3 +++ vld1.8 {d2}, [r1, :64], r3 + + +-+ case RPI_PRED_ADD_RESIDUAL: +-+ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+#ifdef RPI_PRECLEAR +-+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache +-+#endif +-+ break; +-+ case RPI_PRED_ADD_RESIDUAL_U: +-+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+ break; +-+ case RPI_PRED_ADD_RESIDUAL_V: +-+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+ break; +++1: subs r12, #2 +++ @ Given the data duplication here we could obviously do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vmov.64 d1, d2 +++ vld1.8 {d4}, [r1, :64], r3 +++ vld1.8 {d5}, [r1, :64], r3 +++ vmov.64 d3, d4 + + +-+ case RPI_PRED_I_PCM: +-+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); +-+ break; +++ bl \body_fn + + +-+ default: +-+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); +-+ abort(); +-+ } +-+ } +-+ s->num_pred_cmds[job] = 0; +-+} +++ vst1.8 {d0}, [r0], r2 +++ vst1.8 {d1}, [r0], r2 + + +-+// Do any inter-pred that we want to do in software +-+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here +-+// All ARM +-+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) +-+{ +-+ unsigned int cidx; +-+ AVFrame myref; +-+ AVFrame myref1; +-+ struct MvField mymv; +++ // copy c to a +++ vmov.64 d0, d4 +++ // copy b to c +++ vmov.64 d2, d5 +++ bgt 1b +++ bx r6 +++.endm + + +-+ for(; n>0 ; n--, cmd++) { +-+ av_assert0(0); +++.macro edge_4bx4_e1, body_fn +++ mov r6, lr +++debug_me: +++ sub r1, r3 +++ // load a +++ vld1.32 {d0[0]}, [r1], r3 +++ vld1.32 {d0[1]}, [r1], r3 + + +-+ switch(cmd->cmd) { +-+ case RPI_CMD_LUMA_UNI: +-+ if (b_only) +-+ break; +-+ myref.data[0] = cmd->src; +-+ myref.linesize[0] = cmd->srcstride; +-+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); +-+ break; +-+ case RPI_CMD_LUMA_BI: +-+ myref.data[0] = cmd->src; +-+ myref.linesize[0] = cmd->srcstride; +-+ myref1.data[0] = cmd->src1; +-+ myref1.linesize[0] = cmd->srcstride1; +-+ mymv.ref_idx[0] = cmd->ref_idx[0]; +-+ mymv.ref_idx[1] = cmd->ref_idx[1]; +-+ luma_mc_bi(s, cmd->dst, cmd->dststride, +-+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, +-+ &myref1, &cmd->mv1, &mymv); +-+ break; +-+ case RPI_CMD_CHROMA_UNI: +-+ if (b_only) +-+ break; +-+ mymv.mv[0] = cmd->mv; +-+ chroma_mc_uni(s, cmd->dst, +-+ cmd->dststride, cmd->src, cmd->srcstride, 0, +-+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); +-+ break; +-+ case RPI_CMD_CHROMA_BI: +-+ case RPI_CMD_CHROMA_BI+1: +-+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; +-+ myref.data[cidx+1] = cmd->src; +-+ myref.linesize[cidx+1] = cmd->srcstride; +-+ myref1.data[cidx+1] = cmd->src1; +-+ myref1.linesize[cidx+1] = cmd->srcstride1; +-+ mymv.ref_idx[0] = cmd->ref_idx[0]; +-+ mymv.ref_idx[1] = cmd->ref_idx[1]; +-+ mymv.mv[0] = cmd->mv; +-+ mymv.mv[1] = cmd->mv1; +-+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, +-+ cmd->x_off, 
cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); +-+ break; +-+ } +-+ } +-+} +++1: subs r12, #4 +++ @ Given the data duplication here we could probably do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vld1.32 {d4[0]}, [r1], r3 +++ vld1.32 {d4[1]}, [r1], r3 +++ vld1.32 {d5[0]}, [r1], r3 +++ vld1.32 {d5[1]}, [r1], r3 +++ +++ vmov.32 d1, d4 +++ vext.32 d2, d0, d4, #1 +++ vext.32 d3, d4, d5, #1 +++ +++ bl \body_fn +++ +++ vst1.32 {d0[0]}, [r0], r2 +++ vst1.32 {d0[1]}, [r0], r2 +++ vst1.32 {d1[0]}, [r0], r2 +++ vst1.32 {d1[1]}, [r0], r2 +++ +++ vmov.32 d0, d5 +++ bgt 1b +++ bx r6 +++.endm + + +-+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) +-+{ +-+ const int job = s->pass1_job; +++.macro edge_64b_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #32 +++ sub r3, #(32 - \pb) + + +-+ if (!qpu_luma || luma_b_only) +-+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); +-+ s->num_mv_cmds_y[job] = 0; +-+ if (!qpu_chroma || chroma_b_only) +-+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); +-+ s->num_mv_cmds_c[job] = 0; +-+} +++1: sub r1, r3 +++ // load a +++ // TODO: fix unaligned load +++ // don't reload a like in eo1 +++ vld1.8 {q0-q1}, [r1]! +++ vld1.8 {q2-q3}, [r1], r3 +++ subs r12, #1 +++ // load c +++ vld1.8 {q4-q5}, [r1, :128]! +++ vld1.8 {q6-q7}, [r1, :128], r3 +++ // load b +++ vld1.8 {q8-q9}, [r1]! +++ vld1.8 {q10-q11}, [r1] +++ sub r1, #(64 + \pb) +++ bl \body_fn +++ vstm r0, {q0-q3} +++ add r0, r0, r2 +++ bgt 1b +++ +++ add r3, #(32 - \pb) +++ bx r6 +++.endm + + +-+#endif +++.macro edge_32bx2_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ vld1.8 {q0-q1}, [r1], r3 +++ vld1.8 {q2-q3}, [r1] +++ subs r12, #2 +++ // load c +++ add r1, #\pb +++ vld1.8 {q4-q5}, [r1, :128], r3 +++ vld1.8 {q6-q7}, [r1, :128] +++ // load b +++ add r1, #\pb +++ vld1.8 {q8-q9}, [r1], r3 +++ vld1.8 {q10-q11}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.8 {q0-q1}, [r0], r2 +++ vst1.8 {q2-q3}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+#ifdef RPI +-+// Set initial uniform job values & zero ctu_count +-+static void rpi_begin(HEVCContext *s) +-+{ +-+#if RPI_INTER +-+ int job = s->pass0_job; +-+ int i; +++.macro edge_16b_e2, body_fn, pb +++ mov r6, lr +++ add r3, #\pb + + +-+ const uint16_t pic_width_y = s->ps.sps->width; +-+ const uint16_t pic_height_y = s->ps.sps->height; +++1: sub r1, r3 +++ // load a +++ vld1.8 {q0}, [r1], r3 +++ subs r12, #1 +++ // load c +++ vld1.8 {q1}, [r1, :128], r3 +++ // load b +++ vld1.8 {q2}, [r1] +++ sub r1, #\pb +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; +-+ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; +++.macro edge_8bx2_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ vld1.8 {d0}, [r1], r3 +++ vld1.8 {d1}, [r1] +++ subs r12, #2 +++ // load c +++ add r1, #\pb +++ vld1.8 {d2}, [r1, :64], r3 +++ vld1.8 {d3}, [r1, :64] +++ // load b +++ add r1, #\pb +++ vld1.8 {d4}, [r1], r3 +++ vld1.8 {d5}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.8 {d0}, [r0], r2 +++ vst1.8 {d1}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+ for(i=0; i < QPU_N_UV;i++) { +-+ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; +-+ qpu_mc_pred_c_t * u = cp->qpu_mc_base; +++.macro edge_4bx4_e2, 
body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ @ line 0 {d0[0], -, - } r1 lo +++ vld1.32 {d0[0]}, [r1], r3 +++ subs r12, #4 +++ @ Line 1 {d0[1], d2[0], - } r1 lo +++ vld1.32 {d0[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d2[0]}, [r1], r3 +++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid +++ vld1.32 {d2[1]}, [r1] +++ sub r1, #\pb +++ vld1.32 {d1[0]}, [r1] +++ add r1, #\pb * 2 +++ vld1.32 {d4[0]}, [r1], r3 +++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi +++ vld1.32 {d4[1]}, [r1] +++ sub r1, #\pb * 2 +++ vld1.32 {d1[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d3[0]}, [r1], r3 +++ @ Line 3 {-, d3[1], d5[0]} r1 mid +++ vld1.32 {d3[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d5[0]}, [r1], r3 +++ @ Line 4 {-, -, d5[1]} r1 hi +++ vld1.32 {d5[1]}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.32 {d0[0]}, [r0], r2 +++ vst1.32 {d0[1]}, [r0], r2 +++ vst1.32 {d1[0]}, [r0], r2 +++ vst1.32 {d1[1]}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+ // Chroma setup is a double block with L0 fetch +-+ // and other stuff in the 1st block and L1 fetch +-+ // in the 2nd along with a lot of dummy vars +-+ // This could be packed a lot tighter but it would make +-+ // L0, L1 management a lot harder +++.macro edge_64b_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_64b_e2 \body_fn, (-\pb) +++.endm + + +-+ u->next_fn = 0; +-+ u->next_src_x = 0; +-+ u->next_src_y = 0; +-+ u->next_src_base_c = 0; +-+ u->s0.pic_cw = pic_width_c; +-+ u->s0.pic_ch = pic_height_c; +-+ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); +-+ u->s0.stride1 = s->frame->linesize[1]; +-+ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; +-+ u->s0.dummy0 = 0; +-+ cp->last_l0 = u; +-+ ++u; +++.macro edge_32bx2_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_32bx2_e2 \body_fn, (-\pb) +++.endm + + +-+ u->next_fn = 0; +-+ u->next_src_x = 0; +-+ u->next_src_y = 0; +-+ u->next_src_base_c = 0; +-+ u->s1.dummy0 = 0; +-+ u->s1.dummy1 = 0; +-+ u->s1.dummy2 = 0; +-+ u->s1.dummy3 = 0; +-+ u->s1.dummy4 = 0; +-+ u->s1.dummy5 = 0; +-+ cp->last_l1 = u; +-+ ++u; +-+ +-+ cp->load = 0; +-+ cp->qpu_mc_curr = u; +-+ } +-+ s->curr_pred_c = NULL; +-+ +-+ for(i=0;i < QPU_N_Y;i++) { +-+ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; +-+ qpu_mc_pred_y_t * y = yp->qpu_mc_base; +-+ +-+ y->next_src1_x = 0; +-+ y->next_src1_y = 0; +-+ y->next_src1_base = 0; +-+ y->next_src2_x = 0; +-+ y->next_src2_y = 0; +-+ y->next_src2_base = 0; +-+ y->s.pic_h = pic_height_y; +-+ y->s.pic_w = pic_width_y; +-+ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); +-+ y->s.stride1 = s->frame->linesize[0]; +-+ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; +-+ y->s.dummy0 = 0; +-+ y->next_fn = 0; +-+ yp->last_lx = y; +-+ ++y; +++.macro edge_16b_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_16b_e2 \body_fn, (-\pb) +++.endm + + +-+ yp->load = 0; +-+ yp->qpu_mc_curr = y; +-+ } +-+ s->curr_pred_y = NULL; +-+ s->last_y8_p = NULL; +-+ s->last_y8_lx = NULL; +-+#endif +-+ s->ctu_count = 0; +-+} +-+#endif +++.macro edge_8bx2_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_8bx2_e2 \body_fn, (-\pb) +++.endm + + +++.macro edge_4bx4_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_4bx4_e2 \body_fn, (-\pb) +++.endm + + +-+#if RPI_INTER +-+static unsigned int mc_terminate_y(HEVCContext * const s, const int job) +-+{ +-+ unsigned int i; +-+ const uint32_t exit_fn = qpu_fn(mc_exit); +-+ const uint32_t exit_fn2 = 
qpu_fn(mc_interrupt_exit12); +-+ unsigned int tc = 0; +-+ HEVCRpiJob * const jb = s->jobs + job; +++.macro edge_64b_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ // Add final commands to Q +-+ for(i = 0; i != QPU_N_Y; ++i) { +-+ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; +-+ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; +++0: edge_64b_e0 \body_fn, \pb +++10: edge_64b_e1 \body_fn +++20: edge_64b_e2 \body_fn, \pb +++30: edge_64b_e3 \body_fn, \pb +++.endm + + +-+ // We will always have had L0 if we have L1 so only test L0 +-+ if (px != yp->qpu_mc_base) +-+ tc = 1; +++.macro edge_32bx2_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2; // Actual fn ptr +++0: edge_32bx2_e0 \body_fn, \pb +++10: edge_32bx2_e1 \body_fn +++20: edge_32bx2_e2 \body_fn, \pb +++30: edge_32bx2_e3 \body_fn, \pb +++.endm + + +-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +-+ px->next_src1_x = MC_DUMMY_X; +-+ px->next_src1_y = MC_DUMMY_Y; +-+ px->next_src1_base = s->qpu_dummy_frame; +-+ px->next_src2_x = MC_DUMMY_X; +-+ px->next_src2_y = MC_DUMMY_Y; +-+ px->next_src2_base = s->qpu_dummy_frame; +-+ +-+ yp->last_lx = NULL; +-+ } +++.macro edge_16b_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ return tc; +-+} +++0: edge_16b_e0 \body_fn, \pb +++10: edge_16b_e1 \body_fn +++20: edge_16b_e2 \body_fn, \pb +++30: edge_16b_e3 \body_fn, \pb +++.endm + + +-+#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c +-+#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) +++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_32bx2_e0 \body_fn_64b, \pb +++10: edge_32bx2_e1 \body_fn_64b +++20: edge_32bx2_e2 \body_fn_64b, \pb +++30: edge_32bx2_e3 \body_fn_64b, \pb +++5: edge_16b_e0 \body_fn_16b, \pb +++15: edge_16b_e1 \body_fn_16b +++25: edge_16b_e2 \body_fn_16b, \pb +++35: edge_16b_e3 \body_fn_16b, \pb +++.endm + + +-+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) +-+{ +-+ unsigned int i; +-+ const uint32_t exit_fn = qpu_fn(mc_exit_c); +-+ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); +-+ unsigned int tc = 0; +-+ HEVCRpiJob * const jb = s->jobs + job; +++.macro edge_16b_8bx2_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_16b_e0 \body_fn, \pb +++10: edge_16b_e1 \body_fn +++20: edge_16b_e2 \body_fn, \pb +++30: edge_16b_e3 \body_fn, \pb +++5: edge_8bx2_e0 \body_fn, \pb +++15: edge_8bx2_e1 \body_fn +++25: edge_8bx2_e2 \body_fn, \pb +++35: edge_8bx2_e3 \body_fn, \pb +++.endm + + +-+ // Add final commands to Q +-+ for(i = 0; i != QPU_N_UV; ++i) { +-+ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; +-+ qpu_mc_pred_c_t *const p0 = cp->last_l0; +-+ qpu_mc_pred_c_t *const p1 = cp->last_l1; +++.macro edge_8bx2_4bx4_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_8bx2_e0 \body_fn, \pb +++10: edge_8bx2_e1 \body_fn +++20: edge_8bx2_e2 \body_fn, \pb +++30: edge_8bx2_e3 \body_fn, \pb +++5: edge_4bx4_e0 \body_fn, \pb +++15: edge_4bx4_e1 \body_fn +++25: edge_4bx4_e2 \body_fn, \pb +++35: edge_4bx4_e3 \body_fn, \pb +++.endm + + +-+ // We will always have had L0 if we 
have L1 so only test L0 +-+ if (p0 != cp->qpu_mc_base) +-+ tc = 1; +++@ void ff_hevc_sao_edge_8_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_8_neon_8, export=1 +++ edge_16b_init 8, 0, 1, 99f +++99: +++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +++endfunc + + +-+ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2; // Actual fn ptr +++@ void ff_hevc_sao_edge_16_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_16_neon_8, export=1 +++ edge_16b_init 8, 0, 0, 99f +++99: +++ edge_16b_bodies edge_16b_body_8, 1 +++endfunc + + +-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +-+ p0->next_src_x = MC_DUMMY_X; +-+ p0->next_src_y = MC_DUMMY_Y; +-+ p0->next_src_base_c = s->qpu_dummy_frame; +-+ p1->next_src_x = MC_DUMMY_X; +-+ p1->next_src_y = MC_DUMMY_Y; +-+ p1->next_src_base_c = s->qpu_dummy_frame;; +-+ +-+ cp->last_l0 = NULL; +-+ cp->last_l1 = NULL; +-+ } +++@ void ff_hevc_sao_edge_32_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_32_neon_8, export=1 +++ edge_64b_init 8, 0, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_8, 1 +++endfunc + + +-+ return tc; +-+} +-+#endif +++@ void ff_hevc_sao_edge_64_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_64_neon_8, export=1 +++ edge_64b_init 8, 0, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_8, 1 +++endfunc + + +-+#ifdef RPI +++@ ff_hevc_sao_edge_c_8_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_8_neon_8, export=1 +++ edge_16b_init 8, 1, 1, 99f +++99: +++ edge_16b_8bx2_bodies edge_16b_body_8, 2 +++endfunc + + +++@ ff_hevc_sao_edge_c_16_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_16_neon_8, export=1 +++ edge_64b_init 8, 1, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_8, 2 +++endfunc + + +-+static void flush_frame(HEVCContext *s,AVFrame *frame) +-+{ +-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ rpi_cache_flush_finish(rfe); +-+} +++@ ff_hevc_sao_edge_c_32_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_32_neon_8, export=1 +++ edge_64b_init 8, 1, 0, 99f +++99: 
+++ edge_64b_bodies edge_64b_body_8, 2 +++endfunc + + +++@ void ff_hevc_sao_edge_8_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_8_neon_10, export=1 +++ edge_16b_init 10, 0, 1, 99f +++99: +++ edge_16b_8bx2_bodies edge_16b_body_16, 2 +++endfunc + + +-+// Core execution tasks +-+static void worker_core(HEVCContext * const s) +-+{ +-+ worker_global_env_t * const wg = &worker_global_env; +-+ int arm_cost = 0; +-+// vpu_qpu_wait_h sync_c; +-+ vpu_qpu_wait_h sync_y; +-+ int qpu_luma = 0; +-+ int qpu_chroma = 0; +-+ int gpu_load; +-+ int arm_load; +-+ static const int arm_const_cost = 2; +++@ void ff_hevc_sao_edge_16_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_16_neon_10, export=1 +++ edge_64b_init 10, 0, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_16, 2 +++endfunc + + +-+// static int z = 0; +++@ void ff_hevc_sao_edge_64_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++@ We simply split the 32 case into 2 vertical stripes +++@ and call the fns for w32 +++@ +++@ Calling code will always have src != dst so we don't have to worry +++@ about edge effects + + +-+ const int job = s->pass1_job; +-+ unsigned int flush_start = 0; +-+ unsigned int flush_count = 0; +++function ff_hevc_sao_edge_64_neon_10, export=1 +++ edge_64b_init 10, 0, 1, 99f +++endfunc + + +-+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++@ void ff_hevc_sao_edge_32_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_32_neon_10, export=1 +++ edge_64b_init 10, 0, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_16, 2 +++endfunc + + +-+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { +-+ vpu_qpu_job_add_vpu(vqj, +-+ vpu_get_fn(), +-+ vpu_get_constants(), +-+ s->coeffs_buf_vc[job][2], +-+ s->num_coeffs[job][2] >> 8, +-+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +-+ s->num_coeffs[job][3] >> 10, +-+ 0); +-+ +-+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ } +++@ ff_hevc_sao_edge_c_8_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_8_neon_10, export=1 +++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 +++99: +++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 +++endfunc + + +++@ ff_hevc_sao_edge_c_32_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_32_neon_10, export=1 +++ 
edge_64b_init 10, 1, 1, 99f +++endfunc + + +-+#if RPI_INTER +-+ pthread_mutex_lock(&wg->lock); +-+ +-+// ++z; +-+ gpu_load = vpu_qpu_current_load(); +-+ arm_load = avpriv_atomic_int_get(&wg->arm_load); +-+#if 0 // Y_B_ONLY +-+ qpu_luma = gpu_load + 2 < arm_load; +-+ qpu_chroma = gpu_load < arm_load + 8; +-+#elif 0 +-+ qpu_luma = gpu_load < arm_load + 2; +-+ qpu_chroma = gpu_load < arm_load + 8; +-+#else +-+ qpu_chroma = 1; +-+ qpu_luma = 1; +-+#endif + + +-+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); +++@ ff_hevc_sao_edge_c_16_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_16_neon_10, export=1 +++ edge_64b_init 10, 1, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_16, 4 +++endfunc + + +-+ wg->gpu_c += qpu_chroma; +-+ wg->gpu_y += qpu_luma; +-+ wg->arm_c += !qpu_chroma; +-+ wg->arm_y += !qpu_luma; ++diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h ++index 1be52e7a12..bae5df4bc6 100644 ++--- a/libavcodec/avcodec.h +++++ b/libavcodec/avcodec.h ++@@ -410,6 +410,8 @@ enum AVCodecID { ++ AV_CODEC_ID_SHEERVIDEO, ++ AV_CODEC_ID_YLC, ++ +++ AV_CODEC_ID_H264_MVC, + + ++ /* various PCM "codecs" */ ++ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs ++ AV_CODEC_ID_PCM_S16LE = 0x10000, ++@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext { ++ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 ++ #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) ++ #define FF_PROFILE_H264_CAVLC_444 44 +++#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 +++#define FF_PROFILE_H264_STEREO_HIGH 128 +++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 ++ ++ #define FF_PROFILE_VC1_SIMPLE 0 ++ #define FF_PROFILE_VC1_MAIN 1 ++@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext { ++ #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 ++ #endif ++ +++ /** +++ * Opaque pointer for use by replacement get_buffer2 code +++ * +++ * @author jc (08/02/2016) +++ */ +++ void * get_buffer_context; + + +-+// if ((z & 511) == 0) { +-+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); +-+// } ++ } AVCodecContext; ++ ++ AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); ++diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h ++index 1bf1c620d6..ccfa991f60 100644 ++--- a/libavcodec/cabac.h +++++ b/libavcodec/cabac.h ++@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; ++ typedef struct CABACContext{ ++ int low; ++ int range; ++- int outstanding_count; +++ union +++ { +++ int outstanding_count; +++ struct { +++ uint16_t bits; +++ uint16_t range; +++ } by22; +++ }; ++ const uint8_t *bytestream_start; ++ const uint8_t *bytestream; ++ const uint8_t *bytestream_end; ++diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c ++index c1fa67f67b..6f99021339 100644 ++--- a/libavcodec/hevc.c +++++ b/libavcodec/hevc.c ++@@ -41,8 +41,346 @@ ++ #include "hevc.h" ++ #include "profiles.h" ++ +++#ifdef RPI +++ #include "rpi_qpu.h" +++ #include "rpi_shader.h" +++ #include "rpi_shader_cmd.h" +++ #include "rpi_shader_template.h" +++ #include "rpi_zc.h" +++ #include "libavutil/rpi_sand_fns.h" + + +++ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to 
cached memory +++ #define RPI_CACHE_UNIF_MVS 1 + + +-+ { +-+ int (*d)[2] = s->dblk_cmds[job]; +-+ unsigned int high=(*d)[1]; +-+ int n; +-+ +-+ flush_start = high; +-+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { +-+ unsigned int y = (*d)[1]; +-+ flush_start = FFMIN(flush_start, y); +-+ high=FFMAX(high,y); +-+ } +-+ // Avoid flushing past end of frame +-+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; +-+ } +++ #include "pthread.h" +++ #include "libavutil/atomic.h" + + +-+#if !DISABLE_CHROMA +-+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) +-+ { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+ const uint32_t code = qpu_fn(mc_setup_c); +-+ uint32_t * p; +-+ unsigned int i; +-+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; +++ static void worker_core(HEVCContext * const s); +++#endif + + +-+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { +-+ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm); +-+ *p++ = code; +-+ } +++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + + +-+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); +++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + + +-+#if RPI_CACHE_UNIF_MVS +-+ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+#endif +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); +-+ } +++#ifndef av_mod_uintp2 +++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) +++{ +++ return a & ((1 << p) - 1); +++} +++# define av_mod_uintp2 av_mod_uintp2_c + +#endif + + +-+// We can take a sync here and try to locally overlap QPU processing with ARM +-+// but testing showed a slightly negative benefit with noticable extra complexity +-+// vpu_qpu_job_add_sync_this(vqj, &sync_c); ++ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ + + +-+ if (qpu_luma && mc_terminate_y(s, job) != 0) +-+ { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+ const uint32_t code = qpu_fn(mc_setup); +-+ uint32_t * p; +-+ unsigned int i; +-+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; +++#if RPI_INTER + + +-+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { +-+ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm); +-+ *p++ = code; +-+ } +++#define MC_DUMMY_X (-32) +++#define MC_DUMMY_Y (-32) + + +-+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); +++// UV still has min 4x4 pred +++// Allow for even spread +1 for setup, +1 for rounding +++// As we have load sharing this can (in theory) be exceeded so we have to +++// check after each CTU, but it is a good base size + + +-+#if RPI_CACHE_UNIF_MVS +-+ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+#endif +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); +-+ } +++// Worst case (all 4x4) commands per CTU +++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) +++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) + + +-+ pthread_mutex_unlock(&wg->lock); +++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) +++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + + +-+#endif +++// The QPU code for UV blocks only works up to a block width of 8 +++#define RPI_CHROMA_BLOCK_WIDTH 8 + + +-+ 
vpu_qpu_job_add_sync_this(vqj, &sync_y); +++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + + +-+ // Having accumulated some commands - do them +-+ rpi_cache_flush_finish(rfe); +-+ vpu_qpu_job_finish(vqj); + + +-+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller +++// Actual filter goes -ve, +ve, +ve, -ve using these values +++static const uint32_t rpi_filter_coefs[8] = { +++ ENCODE_COEFFS( 0, 64, 0, 0), +++ ENCODE_COEFFS( 2, 58, 10, 2), +++ ENCODE_COEFFS( 4, 54, 16, 2), +++ ENCODE_COEFFS( 6, 46, 28, 4), +++ ENCODE_COEFFS( 4, 36, 36, 4), +++ ENCODE_COEFFS( 4, 28, 46, 6), +++ ENCODE_COEFFS( 2, 16, 54, 4), +++ ENCODE_COEFFS( 2, 10, 58, 2) +++}; + + +-+#if Y_B_ONLY +-+ if (qpu_luma) +-+ vpu_qpu_wait(&sync_y); +-+#endif +-+ // Perform inter prediction +-+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); +++// Function arrays by QPU + + +-+ // Wait for transform completion +++static const int * const inter_pred_setup_c_qpu[12] = { +++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, +++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, +++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn +++}; + + +-+ // Perform intra prediction and residual reconstruction +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); +-+#if Y_B_ONLY +-+ if (!qpu_luma) +-+ vpu_qpu_wait(&sync_y); +-+#else +-+ vpu_qpu_wait(&sync_y); +-+#endif +-+ rpi_execute_pred_cmds(s); +++static const int * const inter_pred_setup_c10_qpu[12] = { +++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, +++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, +++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn +++}; + + +-+ // Perform deblocking for CTBs in this row +-+ rpi_execute_dblk_cmds(s); +++static const int * const inter_pred_setup_y_qpu[12] = { +++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, +++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, +++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn +++}; + + +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +-+} +++static const int * const inter_pred_setup_y10_qpu[12] = { +++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, +++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, +++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn +++}; + + +-+static void rpi_do_all_passes(HEVCContext *s) +-+{ +-+ // Do the various passes - common with the worker code +-+ worker_core(s); +-+ // Prepare next batch +-+ rpi_begin(s); +-+} +++static const int * const inter_pred_sync_qpu[12] = { +++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, +++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, +++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 +++}; + + +++static const int * const inter_pred_sync10_qpu[12] = { +++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, +++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, +++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 +++}; + + +++static const int * const inter_pred_exit_c_qpu[12] = { +++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, +++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, +++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn +++}; + + +-+#endif +++static const int * const inter_pred_exit_c10_qpu[12] = { +++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, 
mc_exit_c10_qn, +++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, +++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn +++}; + + +- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- { +- HEVCContext *s = avctxt->priv_data; +-@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- int y_ctb = 0; +- int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +- +-+#ifdef RPI +-+ s->enable_rpi = s->ps.sps->bit_depth == 8 && +-+ s->frame->format == AV_PIX_FMT_SAND128 && +-+ !s->ps.pps->cross_component_prediction_enabled_flag; +++static const int * const inter_pred_exit_y_qpu[12] = { +++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, +++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, +++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn +++}; + + +-+ if (!s->enable_rpi) { +-+ if (s->ps.pps->cross_component_prediction_enabled_flag) +-+ printf("Cross component\n"); +-+ } +-+#endif +-+ //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); +++static const int * const inter_pred_exit_y10_qpu[12] = { +++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, +++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, +++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn +++}; + + +- if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { +- av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); +- return AVERROR_INVALIDDATA; +-@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- } +- } +- +-+#ifdef RPI_WORKER +-+ s->pass0_job = 0; +-+ s->pass1_job = 0; +-+#endif +-+#ifdef RPI +-+ rpi_begin(s); +-+#endif +++typedef struct ipe_chan_info_s +++{ +++ const unsigned int n; +++ const int * const * setup_fns; +++ const int * const * sync_fns; +++ const int * const * exit_fns; +++} ipe_chan_info_t; + + +- while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- +-@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; +- hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); +- +++typedef struct ipe_init_info_s +++{ +++ ipe_chan_info_t luma; +++ ipe_chan_info_t chroma; +++} ipe_init_info_t; +++ +++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 +++ { // 8 +++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, +++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} +++ }, +++ { // 9 +++ .luma = {0}, +++ .chroma = {0} +++ }, +++ { // 10 +++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, +++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} +++ } + + +- ff_hevc_cabac_init(s, ctb_addr_ts); +- +- hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); +-@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +- s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; +- +-+#if RPI_INTER +-+ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % 
QPU_N_UV;
+-+ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y;
+-+#endif
+++};
+
+
+- more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0);
+++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
+++{
+++ const unsigned int n = ici->n;
+++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
+++
+++ ipe->n = n;
+++ ipe->max_fill = q1_size - ipe->min_gap;
+++ for(unsigned int i = 0; i < n; i++) {
+++ HEVCRpiInterPredQ * const q = ipe->q + i;
+++ q->qpu_mc_curr = q->qpu_mc_base =
+++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
+++ q->code_setup = qpu_fn(ici->setup_fns[i]);
+++ q->code_sync = qpu_fn(ici->sync_fns[i]);
+++ q->code_exit = qpu_fn(ici->exit_fns[i]);
+++ }
+++}
+
+
+-+#ifdef RPI
+-+ if (s->enable_rpi) {
+-+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0);
+-+ //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS);
+-+ //av_assert0(s->pass0_job<RPI_MAX_JOBS);
+-+ //av_assert0(s->pass0_job>=0);
+-+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb;
+-+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb;
+-+ s->ctu_count++;
+-+
+-+ if ( s->ctu_count >= s->max_ctu_count ) {
+-+#ifdef RPI_WORKER
+-+ if (s->used_for_ref)
+-+ {
+-+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb);
+++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth)
+++{
+++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8;
+
+
+-+// worker_wait(s);
+-+ // Split work load onto separate threads so we make as rapid progress as possible with this frame
+-+ // Pass on this job to worker thread
+-+ worker_submit_job(s);
+++ av_assert0(bit_depth >= 8 && bit_depth <= 16);
+
+
+-+ // Make sure we have space to prepare the next job
+-+ worker_pass0_ready(s);
+++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
+++
+++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) {
+++ HEVCRpiJob *const jb = s->jobs + i;
+++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma);
+++ set_ipe_from_ici(&jb->luma_ip, &iii->luma);
+++ }
+++}
+
+
+-+ // Prepare the next batch of commands
+-+ rpi_begin(s);
+-+ } else {
+-+ // Non-ref frame so do it all on this thread
+-+ rpi_do_all_passes(s);
+-+ }
+-+#else
+-+ rpi_do_all_passes(s);
+-+#endif
+-+ }
+
+
+-+ }
+-+#endif
+
+
+
+
+- if (more_data < 0) {
+- s->tab_slice_address[ctb_addr_rs] = -1;
+- return more_data;
+-@@ -2350,9 +3977,42 @@
+-
+- ctb_addr_ts++;
+- ff_hevc_save_states(s, ctb_addr_ts);
+-+#ifdef RPI
+-+ if (s->enable_rpi)
+-+ continue;
+-+#endif
+- ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size);
+- }
+-
+++#ifdef RPI
+
+
+-+#ifdef RPI_WORKER
+-+ // Wait for the worker to finish all its jobs
+-+ if (s->enable_rpi) {
+-+ worker_wait(s);
+-+ }
+-+#endif
+++//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+++//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s);
+
+
+-+ // Finish off any half-completed rows
+-+ if (s->enable_rpi && s->ctu_count) {
+-+ rpi_do_all_passes(s);
+-+ }
+++#define LOG_ENTER
+++#define LOG_EXIT
+
+
+-+#if RPI_TSTATS
+-+ {
+-+ HEVCRpiStats *const ts = &s->tstats;
+++#define USE_SEM 1
+
+
+-+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d
h16gl:%5d/%5d\n", +-+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, +-+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, +-+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, +-+ ts->y_pred2_hgt16, ts->y_pred2_hle16); +-+ memset(ts, 0, sizeof(*ts)); +-+ } +-+#endif +++// Call this when we have completed pass0 and wish to trigger pass1 for the current job +++static void worker_submit_job(HEVCContext * const s) +++{ +++ LOG_ENTER +++ sem_post(&s->jb0->sem_in); +++ s->jb0->pending = 1; +++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +++ s->jb0 = s->jobs + s->pass0_job; +++ LOG_EXIT +++} + + +-+#endif +++// Call this to say we have completed pass1 +++static void worker_complete_job(HEVCContext * const s) +++{ +++ LOG_ENTER +++ sem_t * const sem = &s->jb1->sem_out; +++ // Must set job no before signalling as otherwise rpi_do_all_passes +++ // may call worker_core from the main thread with a bad job number +++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +++ s->jb1 = s->jobs + s->pass1_job; +++ sem_post(sem); +++ LOG_EXIT +++} + + +- if (x_ctb + ctb_size >= s->ps.sps->width && +- y_ctb + ctb_size >= s->ps.sps->height) +- ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); +-@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +- s = s1->sList[self_id]; +- lc = s->HEVClc; +- +-+#ifdef RPI +-+ s->enable_rpi = 0; +-+ //printf("Wavefront\n"); +-+#endif + + +- if(ctb_row) { +- ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); +- +-@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +- if (ret < 0) +- return ret; +- +-+ // The definition of _N unit types is "non-reference for other frames +-+ // with the same temporal_id" so they may/will be ref frames for pics +-+ // with a higher temporal_id. +-+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || +-+ !(s->nal_unit_type == NAL_TRAIL_N || +-+ s->nal_unit_type == NAL_TSA_N || +-+ s->nal_unit_type == NAL_STSA_N || +-+ s->nal_unit_type == NAL_RADL_N || +-+ s->nal_unit_type == NAL_RASL_N); +++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +++// available to receive the next job. 
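+++// available.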
+++static void worker_pass0_ready(HEVCContext *s) +++{ +++ LOG_ENTER +++ HEVCRpiJob * const jb = s->jb0; +++ if (jb->pending) { +++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) +++ /* Loop */; +++ jb->pending = 0; +++ } +++ LOG_EXIT +++} + + +-+#if DEBUG_DECODE_N +-+ { +-+ static int z = 0; +-+ if (IS_IDR(s)) { +-+ z = 1; +-+ } +-+ if (z != 0 && z++ > DEBUG_DECODE_N) { +-+ s->is_decoded = 0; +-+ break; +-+ } +-+ } +-+#endif +-+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { +-+ s->is_decoded = 0; +-+ break; +++// Call this to wait for all jobs to have completed at the end of a frame +++static void worker_wait(HEVCContext * const s) +++{ +++ LOG_ENTER +++ unsigned int i; +++ for (i = 0; i != RPI_MAX_JOBS; ++i) { +++ HEVCRpiJob * const jb = s->jobs + i; +++ if (jb->pending) { +++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) +++ /* Loop */; +++ jb->pending = 0; + + } +- if (s->max_ra == INT_MAX) { +- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +- s->max_ra = s->poc; +-@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +- } +- } +- +--fail: +-- if (s->ref && s->threads_type == FF_THREAD_FRAME) +-+fail: // Also success path +-+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +-+#endif +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +-- +-+ } +-+#if RPI_INTER +-+ else if (s->ref && s->enable_rpi) { +-+ // When running single threaded we need to flush the whole frame +-+ flush_frame(s,s->frame); + + } +-+#endif +- return ret; +- } +- +-@@ -3064,6 +4764,41 @@ fail: +- return AVERROR(ENOMEM); +- } +- +-+#ifdef RPI_WORKER +-+static av_cold void hevc_init_worker(HEVCContext *s) +++ LOG_EXIT +++} +++ +++static void *worker_start(void *arg) + +{ +-+ int err; +-+ pthread_cond_init(&s->worker_cond_head, NULL); +-+ pthread_cond_init(&s->worker_cond_tail, NULL); +-+ pthread_mutex_init(&s->worker_mutex, NULL); +++ HEVCContext * const s = (HEVCContext *)arg; + + +-+ s->worker_tail=0; +-+ s->worker_head=0; +-+ s->kill_worker=0; +-+ err = pthread_create(&s->worker_thread, NULL, worker_start, s); +-+ if (err) { +-+ printf("Failed to create worker thread\n"); +-+ exit(-1); +++ for (;;) +++ { +++ HEVCRpiJob * const jb = s->jb1; +++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) +++ /* Loop */; +++ if (jb->terminate) +++ break; +++ +++ LOG_ENTER +++ worker_core(s); +++ worker_complete_job(s); +++ LOG_EXIT + + } +++ return NULL; + +} + + +-+static av_cold void hevc_exit_worker(HEVCContext *s) +++static void worker_pic_free_all(HEVCContext * const s) + +{ +-+ void *res; +-+ s->kill_worker=1; +-+ pthread_cond_broadcast(&s->worker_cond_tail); +-+ pthread_join(s->worker_thread, &res); +++ unsigned int i; + + +-+ pthread_cond_destroy(&s->worker_cond_head); +-+ pthread_cond_destroy(&s->worker_cond_tail); +-+ pthread_mutex_destroy(&s->worker_mutex); +++ // Free coeff stuff - allocation not the same for all buffers +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ { +++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; + + +-+ s->worker_tail=0; +-+ s->worker_head=0; +-+ s->kill_worker=0; +++ if (cf->s[0].buf != NULL) +++ av_freep(&cf->mptr); +++ if (cf->s[2].buf != NULL) +++ gpu_free(&cf->gptr); +++ memset(cf, 0, sizeof(*cf)); +++ } + +} +-+#endif + + +- static av_cold int hevc_decode_free(AVCodecContext *avctx) +- { +- HEVCContext *s = avctx->priv_data; +-@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext 
*avctx)
+-
+- av_freep(&s->cabac_state);
+-
+-+#ifdef RPI
+++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count)
+++{
+++ unsigned int i;
+ +
+-+#ifdef RPI_WORKER
+-+ hevc_exit_worker(s);
+-+#endif
+++ // Free coeff stuff - allocation not the same for all buffers
+++ for(i = 0; i < RPI_MAX_JOBS; i++)
+++ {
+++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs;
+ +
+-+ for(i=0;i<RPI_MAX_JOBS;i++) {
+++// av_assert0(cf->s[0].n == 0 && cf->s[0].buf == NULL);
+++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL);
+++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL);
+++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL);
+ +
+-+ av_freep(&s->unif_mv_cmds_y[i]);
+-+ av_freep(&s->unif_mv_cmds_c[i]);
+-+ av_freep(&s->univ_pred_cmds[i]);
+++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
+++ goto fail;
+++ cf->s[2].buf = (int16_t *)cf->gptr.arm;
+++ cf->s[3].buf = cf->s[2].buf + coeff_count;
+ +
+-+#if RPI_INTER
+-+ gpu_free(&s->jobs[i].chroma_mvs_gptr);
+-+ gpu_free(&s->jobs[i].luma_mvs_gptr);
+-+#endif
+++ // Must be 64 byte aligned for our zero zapping code so over-allocate &
+++ // round
+++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
+++ goto fail;
+++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
+ + }
+++ return 0;
+ +
+-+ vpu_qpu_term();
+++fail:
+++ printf("%s: **** Failed\n", __func__);
+++ worker_pic_free_all(s);
+++ return -1;
+++}
+ +
+-+ av_rpi_zc_uninit(avctx);
+-+#endif
+-+
+- for (i = 0; i < 3; i++) {
+- av_freep(&s->sao_pixel_buffer_h[i]);
+- av_freep(&s->sao_pixel_buffer_v[i]);
+-@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx)
+- return 0;
+- }
+-
+-+#ifdef RPI
+-+#ifdef RPI_PRECLEAR
+-+static av_cold void memclear16(int16_t *p, int n)
+++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
+ +{
+-+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1);
+-+ //int i;
+-+ //for(i=0;i<n;i++) p[i] = 0;
+++ unsigned int i;
+++ for (i = 0; i != 4; ++i) {
+++ cf->s[i].n = 0;
+++ }
+ +}
+ +#endif
+-+#endif
+ +
+- static av_cold int hevc_init_context(AVCodecContext *avctx)
+++
++ /**
++ * NOTE: Each function hls_foo correspond to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12
++ /* free everything allocated by pic_arrays_init() */
++ static void pic_arrays_free(HEVCContext *s)
+ {
+- HEVCContext *s = avctx->priv_data;
+- int i;
+ +#ifdef RPI
+-+ unsigned int job;
+++ worker_pic_free_all(s);
+ +#endif
+++
+++#ifdef RPI_DEBLOCK_VPU
+++ {
+++ int i;
+++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) {
+++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+++
+++ if (dvq->vpu_cmds_arm) {
+++ gpu_free(&dvq->deblock_vpu_gmem);
+++ dvq->vpu_cmds_arm = 0;
+++ }
+++ }
+++ }
+++#endif
++ av_freep(&s->sao);
++ av_freep(&s->deblock);
+ +
+- s->avctx = avctx;
+-
+-@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx)
+- s->HEVClcList[0] = s->HEVClc;
+- s->sList[0] = s;
++@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps)
++ int ctb_count = sps->ctb_width * sps->ctb_height;
++ int min_pu_size = sps->min_pu_width * sps->min_pu_height;
+ +
+ +#ifdef RPI
+-+ // Whilst FFmpegs init fn is only called once the close fn is called as
+-+ // many times as we have threads (init_thread_copy is called for the
+-+ // threads). 
So to match init & term put the init here where it will be
+-+ // called by both init & copy
+-+ av_rpi_zc_init(avctx);
+++ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size);
+++ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS;
+++ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1];
+++ const int coefs_per_row = coefs_per_luma + coefs_per_chroma;
+ +
+-+ if (vpu_qpu_init() != 0)
+++ av_assert0(sps);
+++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb;
+++#if RPI_ROUND_TO_LINES
+++ // Round down to an integral quantity of lines
+++ if (s->max_ctu_count > sps->ctb_width)
+++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width;
+++#endif
+++
+++ if (worker_pic_alloc_all(s, coefs_per_row) != 0)
+ + goto fail;
+++#endif
+++#ifdef RPI_DEBLOCK_VPU
+++ {
+++ int i;
+++ s->enable_rpi_deblock = !sps->sao_enabled;
+++ s->setup_width = (sps->width+15) / 16;
+++ s->setup_height = (sps->height+15) / 16;
+++ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16;
+++ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16;
+ +
+-+ for(job = 0; job < RPI_MAX_JOBS; job++) {
+-+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y);
+-+ if (!s->unif_mv_cmds_y[job])
+-+ goto fail;
+-+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C);
+-+ if (!s->unif_mv_cmds_c[job])
+-+ goto fail;
+-+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS);
+-+ if (!s->univ_pred_cmds[job])
+-+ goto fail;
+-+ }
+++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i)
+++ {
+++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i;
+++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15;
+++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15;
+++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15;
+++ const unsigned int total_size = cmd_size + y_size + uv_size;
+++ int p_vc;
+++ uint8_t * p_arm;
+++ #if RPI_VPU_DEBLOCK_CACHED
+++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem);
+++ #else
+++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem);
+++ #endif
+++ p_vc = dvq->deblock_vpu_gmem.vc;
+++ p_arm = dvq->deblock_vpu_gmem.arm;
+ +
+-+#if RPI_INTER
+-+ // We divide the image into blocks 256 wide and 64 high
+-+ // We support up to 2048 widths
+-+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted
+-+ // Also add space for the startup command for each stream. 
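/*
 * [Editor's sketch - not part of the patch itself. It illustrates the
 * "(x + 15) & ~15" idiom used by the RPI_DEBLOCK_VPU sizing code just
 * above: each region (VPU command block, luma setup, chroma setup) is
 * rounded up to a 16-byte multiple so that, when the single GPU
 * allocation is carved up by the "Subdivide" code that follows, every
 * sub-buffer stays 16-byte aligned. round_up_16() is a hypothetical
 * helper name used only to make the arithmetic concrete.]
 */
static inline unsigned int round_up_16(const unsigned int n)
{
    /* Works for any power-of-2 alignment: add (align - 1), then clear
     * the low bits. E.g. round_up_16(33) == 48, round_up_16(48) == 48. */
    return (n + 15) & ~15u;
}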
+++ // Zap all +++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); + + +-+ for (job = 0; job < RPI_MAX_JOBS; job++) { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+#if RPI_CACHE_UNIF_MVS +-+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); +-+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +-+#else +-+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); +-+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +-+#endif +++ // Subdivide +++ dvq->vpu_cmds_arm = (void*)p_arm; +++ dvq->vpu_cmds_vc = p_vc; + + +-+ { +-+ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; +-+ for(i = 0; i < QPU_N_UV; i++) { +-+ jb->chroma_mvs[i].qpu_mc_base = p; +-+ jb->chroma_mvs[i].qpu_mc_curr = p; +-+ p += UV_COMMANDS_PER_QPU; +-+ } +-+ } +-+ { +-+ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; +-+ for(i = 0; i < QPU_N_Y; i++) { +-+ jb->luma_mvs[i].qpu_mc_base = p; +-+ jb->luma_mvs[i].qpu_mc_curr = p; +-+ p += Y_COMMANDS_PER_QPU; +-+ } +++ p_arm += cmd_size; +++ p_vc += cmd_size; +++ +++ dvq->y_setup_arm = (void*)p_arm; +++ dvq->y_setup_vc = (void*)p_vc; +++ +++ p_arm += y_size; +++ p_vc += y_size; +++ +++ dvq->uv_setup_arm = (void*)p_arm; +++ dvq->uv_setup_vc = (void*)p_vc; + + } +++ +++ s->dvq_n = 0; +++ s->dvq = s->dvq_ents + s->dvq_n; + + } +-+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); +-+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); +-+ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame +-+ s->qpu_filter = qpu_fn(mc_filter); +-+ s->qpu_filter_b = qpu_fn(mc_filter_b); + +#endif +-+ //gpu_malloc_uncached(2048*64,&s->dummy); + + +-+ s->enable_rpi = 0; ++ s->bs_width = (width >> 2) + 1; ++ s->bs_height = (height >> 2) + 1; ++ ++@@ -137,6 +560,29 @@ fail: ++ return AVERROR(ENOMEM); ++ } ++ +++static void default_pred_weight_table(HEVCContext * const s) +++{ +++ unsigned int i; +++ s->sh.luma_log2_weight_denom = 0; +++ s->sh.chroma_log2_weight_denom = 0; +++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { +++ s->sh.luma_weight_l0[i] = 1; +++ s->sh.luma_offset_l0[i] = 0; +++ s->sh.chroma_weight_l0[i][0] = 1; +++ s->sh.chroma_offset_l0[i][0] = 0; +++ s->sh.chroma_weight_l0[i][1] = 1; +++ s->sh.chroma_offset_l0[i][1] = 0; +++ } +++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { +++ s->sh.luma_weight_l1[i] = 1; +++ s->sh.luma_offset_l1[i] = 0; +++ s->sh.chroma_weight_l1[i][0] = 1; +++ s->sh.chroma_offset_l1[i][0] = 0; +++ s->sh.chroma_weight_l1[i][1] = 1; +++ s->sh.chroma_offset_l1[i][1] = 0; +++ } +++} + + +-+#ifdef RPI_WORKER +-+ hevc_init_worker(s); ++ static void pred_weight_table(HEVCContext *s, GetBitContext *gb) ++ { ++ int i = 0; ++@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, ++ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) ++ { ++ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) ++- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++- int ret, i; +++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; +++ int ret; ++ ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_YUVJ420P: +++#if RPI_HEVC_SAND +++ // 
Currently geometry calc is stuffed for big sizes
+++ if (sps->width < 2048 && sps->height <= 1088) {
+++ *fmt++ = AV_PIX_FMT_SAND128;
+++ }
+ +#endif
++ #if CONFIG_HEVC_DXVA2_HWACCEL
++ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
++ #endif
++@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++ #endif
++ break;
++ case AV_PIX_FMT_YUV420P10:
+++#if RPI_HEVC_SAND
+++ // Currently geometry calc is stuffed for big sizes
+++ if (sps->width < 2048 && sps->height <= 1088) {
+++ *fmt++ = AV_PIX_FMT_SAND64_10;
+++ }
+ +#endif
++ #if CONFIG_HEVC_DXVA2_HWACCEL
++ *fmt++ = AV_PIX_FMT_DXVA2_VLD;
++ #endif
++@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++ ret = ff_thread_get_format(s->avctx, pix_fmts);
++ if (ret < 0)
++ goto fail;
+ +
++ s->avctx->pix_fmt = ret;
++ }
++ else {
++@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm
++ ff_hevc_pred_init(&s->hpc, sps->bit_depth);
++ ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth);
++ ff_videodsp_init (&s->vdsp, sps->bit_depth);
+++#ifdef RPI
+++ rpi_hevc_qpu_set_fns(s, sps->bit_depth);
+++#endif
++- for (i = 0; i < 3; i++) {
++- av_freep(&s->sao_pixel_buffer_h[i]);
++- av_freep(&s->sao_pixel_buffer_v[i]);
++- }
+++ av_freep(&s->sao_pixel_buffer_h[0]);
+++ av_freep(&s->sao_pixel_buffer_v[0]);
++ if (sps->sao_enabled && !s->avctx->hwaccel) {
++- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
++- int c_idx;
+++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 3 : 1;
+++ unsigned int c_idx;
+++ size_t vsize[3] = {0};
+++ size_t hsize[3] = {0};
++ for(c_idx = 0; c_idx < c_count; c_idx++) {
++ int w = sps->width >> sps->hshift[c_idx];
++ int h = sps->height >> sps->vshift[c_idx];
++- s->sao_pixel_buffer_h[c_idx] =
++- av_malloc((w * 2 * sps->ctb_height) <<
++- sps->pixel_shift);
++- s->sao_pixel_buffer_v[c_idx] =
++- av_malloc((h * 2 * sps->ctb_width) <<
++- sps->pixel_shift);
+++ // ctb height & width are a min of 8 so this must be a multiple of 16
+++ // so no point rounding up! 
+++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; +++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; ++ } + + +- #include "libavutil/buffer.h" +- #include "libavutil/md5.h" +++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] +++ // when we have plaited chroma +++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); +++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); +++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; +++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; +++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; +++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; ++ } + +-@@ -37,6 +40,45 @@ +- #include "thread.h" +- #include "videodsp.h" ++ s->ps.sps = sps; ++@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s) ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { ++ pred_weight_table(s, gb); ++ } +++ else +++ { +++ // Give us unit weights +++ default_pred_weight_table(s); +++ } + +-+// define RPI to split the CABAC/prediction/transform into separate stages +-+#ifndef RPI +-+ +-+ #define RPI_INTER 0 +-+ #define RPI_TSTATS 0 +-+ #define RPI_HEVC_SAND 0 +-+ +-+#else +-+ +-+ #include "rpi_qpu.h" +-+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); ++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { ++@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { ++ return 0; ++ } ++ +++#ifdef RPI +++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) +++{ +++ return s->jb0->intra.cmds + s->jb0->intra.n++; +++} + + +-+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks +-+ #define RPI_WORKER +-+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames +-+ // This has no effect unless RPI_WORKER is defined +-+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as +-+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one +-+ // free for the foreground to fill in. +-+ #define RPI_MAX_JOBS 2 +++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +++{ +++ // U & V done on U call in the case of sliced frames +++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) +++ return; + + +-+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs +-+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK +-+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) +-+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +-+// #define RPI_DEBLOCK_VPU +-+ +-+ #define RPI_VPU_DEBLOCK_CACHED 1 +-+ +-+ #if HAVE_NEON +-+ #define RPI_HEVC_SAND 1 +-+ #else +-+ // Sand bust on Pi1 currently - reasons unknown +-+ #define RPI_HEVC_SAND 0 +-+ #endif +++ if (s->enable_rpi) { +++ HEVCLocalContext *lc = s->HEVClc; +++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); +++ cmd->type = RPI_PRED_INTRA; +++ cmd->size = log2_trafo_size; +++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; +++ cmd->c_idx = c_idx; +++ cmd->i_pred.x = x0; +++ cmd->i_pred.y = y0; +++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; +++ } +++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { +++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); +++ } +++ else { +++ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); +++ } + + +-+ #define RPI_TSTATS 0 +++} + +#endif + + +- #define MAX_DPB_SIZE 16 // A.4.1 +- #define MAX_REFS 16 ++ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ int xBase, int yBase, int cb_xBase, int cb_yBase, ++ int log2_cb_size, int log2_trafo_size, ++@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ int trafo_size = 1 << log2_trafo_size; ++ ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); ++- +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); +++#endif ++ } + +-@@ -660,17 +702,6 @@ typedef struct CodingUnit { +- uint8_t cu_transquant_bypass_flag; +- } CodingUnit; ++ if (cbf_luma || cbf_cb[0] || cbf_cr[0] || ++@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); +++#endif ++ } ++ if (cbf_cb[i]) ++ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), ++@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); +++#endif ++ } ++ if (cbf_cr[i]) ++ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), ++@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); +++#endif ++ } ++ if (cbf_cb[i]) ++ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), ++@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); +++#endif ++ } ++ if (cbf_cr[i]) ++ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), ++@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); ++ int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ++ ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); +++#endif ++ if (s->ps.sps->chroma_format_idc == 2) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); +++#endif ++ } ++ } else if (blk_idx == 3) { ++ int trafo_size_h = 1 << (log2_trafo_size + 1); ++ int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); ++ ff_hevc_set_neighbour_available(s, xBase, yBase, ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); +++#endif ++ if (s->ps.sps->chroma_format_idc == 2) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); ++ 
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); +++#endif ++ } ++ } ++ } ++@@ -1281,47 +1827,119 @@ do { ++ return 0; ++ } + +--typedef struct Mv { +-- int16_t x; ///< horizontal component of motion vector +-- int16_t y; ///< vertical component of motion vector +--} Mv; ++-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +++ +++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) ++ { ++- HEVCLocalContext *lc = s->HEVClc; ++ GetBitContext gb; ++- int cb_size = 1 << log2_cb_size; ++- int stride0 = s->frame->linesize[0]; ++- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++- int stride1 = s->frame->linesize[1]; ++- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++- int stride2 = s->frame->linesize[2]; ++- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; + - +--typedef struct MvField { +-- DECLARE_ALIGNED(4, Mv, mv)[2]; +-- int8_t ref_idx[2]; +-- int8_t pred_flag; +--} MvField; ++- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + ++- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + ++- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * ++- s->ps.sps->pcm.bit_depth_chroma; ++- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); ++ int ret; ++ ++- if (!s->sh.disable_deblocking_filter_flag) ++- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); + - +- typedef struct NeighbourAvailable { +- int cand_bottom_left; +- int cand_left; +-@@ -747,7 +778,17 @@ typedef struct HEVCFrame { +- uint8_t flags; +- } HEVCFrame; ++ ret = init_get_bits(&gb, pcm, length); ++ if (ret < 0) ++ return ret; + +-+#ifdef RPI_WORKER +-+typedef struct HEVCLocalContextIntra { +-+ TransformUnit tu; +-+ NeighbourAvailable na; +-+} HEVCLocalContextIntra; +-+#endif ++- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++- if (s->ps.sps->chroma_format_idc) { ++- s->hevcdsp.put_pcm(dst1, stride1, +++#if RPI_HEVC_SAND +++ if (av_rpi_is_sand_frame(s->frame)) { +++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), +++ s->frame->linesize[0], +++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + + +- typedef struct HEVCLocalContext { +-+ TransformUnit tu; +-+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra +++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), +++ s->frame->linesize[1], ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++- s->hevcdsp.put_pcm(dst2, stride2, ++- cb_size >> s->ps.sps->hshift[2], ++- cb_size >> s->ps.sps->vshift[2], ++- &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } +++ else +++#endif +++ { +++ const int stride0 = s->frame->linesize[0]; +++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +++ const int stride1 = s->frame->linesize[1]; +++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +++ const int stride2 = s->frame->linesize[2]; +++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 
>> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; + + +- uint8_t cabac_state[HEVC_CONTEXTS]; +- +- uint8_t stat_coeff[4]; +-@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { +- +- int qPy_pred; +- +-- TransformUnit tu; +- +- uint8_t ctb_left_flag; +- uint8_t ctb_up_flag; +-@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { +- int ct_depth; +- CodingUnit cu; +- PredictionUnit pu; +-- NeighbourAvailable na; +++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +++ if (s->ps.sps->chroma_format_idc) { +++ s->hevcdsp.put_pcm(dst1, stride1, +++ cb_size >> s->ps.sps->hshift[1], +++ cb_size >> s->ps.sps->vshift[1], +++ &gb, s->ps.sps->pcm.bit_depth_chroma); +++ s->hevcdsp.put_pcm(dst2, stride2, +++ cb_size >> s->ps.sps->hshift[2], +++ cb_size >> s->ps.sps->vshift[2], +++ &gb, s->ps.sps->pcm.bit_depth_chroma); +++ } + +- #define BOUNDARY_LEFT_SLICE (1 << 0) +- #define BOUNDARY_LEFT_TILE (1 << 1) +-@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { +- int boundary_flags; +- } HEVCLocalContext; +++ } ++ return 0; ++ } + +-+ + +#ifdef RPI +++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +++{ +++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; +++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); +++ cfe->n += n; +++ return coeffs; +++} +++#endif + + +-+// The processing is done in chunks +-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) +-+// This is a distance of 1536 pixels across the screen +-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +-+// but allocate more memory and increase the latency before data in the next frame can be processed +-+#define RPI_NUM_CHUNKS 4 +-+#define RPI_CHUNK_SIZE 12 +-+ +-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) +++// x * 2^(y*2) +++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) +++{ +++ return x << (y * 2); +++} + + +-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi +-+#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) +-+#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +-+// Each block can have an intra prediction and a transform_add command +-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +-+// Worst case is 16x16 CTUs +-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) +-+ +-+#define RPI_CMD_LUMA_UNI 0 +-+#define RPI_CMD_CHROMA_UNI 1 +-+#define RPI_CMD_LUMA_BI 2 +-+#define RPI_CMD_CHROMA_BI 3 +-+#define RPI_CMD_V_BI 4 +-+ +-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? 
+-+// #define RPI_PRECLEAR +-+ +-+// Command for inter prediction +-+typedef struct HEVCMvCmd { +-+ uint8_t cmd; +-+ uint8_t block_w; +-+ uint8_t block_h; +-+ int8_t ref_idx[2]; +-+ uint16_t dststride; +-+ uint16_t srcstride; +-+ uint16_t srcstride1; +-+ int16_t weight; +-+ int16_t offset; +-+ int16_t x_off; +-+ int16_t y_off; +-+ uint8_t *src; +-+ uint8_t *src1; +-+ uint8_t *dst; +-+ Mv mv; +-+ Mv mv1; +-+} HEVCMvCmd; +-+ +-+ +-+// Command for intra prediction and transform_add of predictions to coefficients +-+enum rpi_pred_cmd_e +++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) + +{ +-+ RPI_PRED_ADD_RESIDUAL, +-+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx +-+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx +-+ RPI_PRED_INTRA, +-+ RPI_PRED_I_PCM, +-+ RPI_PRED_CMD_MAX +-+}; +++ // Length in bits +++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + +++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + +++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); + + +-+typedef struct HEVCPredCmd { +-+ uint8_t type; +-+ uint8_t size; // log2 "size" used by all variants +-+ uint8_t na; // i_pred - but left here as they pack well +-+ uint8_t c_idx; // i_pred +-+ union { +-+ struct { // TRANSFORM_ADD +-+ uint8_t * dst; +-+ const int16_t * buf; +-+ uint32_t stride; +-+ } ta; +-+ struct { // INTRA +-+ uint16_t x; +-+ uint16_t y; +-+ enum IntraPredMode mode; +-+ } i_pred; +-+ struct { // I_PCM +-+ uint16_t x; +-+ uint16_t y; +-+ const void * src; +-+ uint32_t src_len; +-+ } i_pcm; +-+ }; +-+} HEVCPredCmd; +++ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); + + +-+#endif +++ if (!s->sh.disable_deblocking_filter_flag) +++ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); + + + +#ifdef RPI +++ if (s->enable_rpi) { +++ // Copy coeffs +++ const int blen = (length + 7) >> 3; +++ // Round allocated bytes up to nearest 32 to avoid alignment confusion +++ // Allocation is in int16_t s +++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per +++ // sample this rounding doesn't affect the total size we need to allocate for +++ // the coeff buffer +++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); +++ memcpy(coeffs, pcm, blen); + + +-+struct qpu_mc_pred_c_s; +-+struct qpu_mc_pred_y_s; +-+ +-+typedef struct HEVCRpiLumaPred +-+{ +-+ struct qpu_mc_pred_y_s *qpu_mc_base; +-+ struct qpu_mc_pred_y_s *qpu_mc_curr; +-+ struct qpu_mc_pred_y_s *last_lx; +-+ unsigned int load; +-+} HEVCRpiLumaPred; +-+ +-+typedef struct HEVCRpiChromaPred +-+{ +-+ struct qpu_mc_pred_c_s *qpu_mc_base; +-+ struct qpu_mc_pred_c_s *qpu_mc_curr; +-+ struct qpu_mc_pred_c_s *last_l0; +-+ struct qpu_mc_pred_c_s *last_l1; +-+ unsigned int load; +-+} HEVCRpiChromaPred; +-+ +-+typedef struct HEVCRpiJob { +-+ GPU_MEM_PTR_T chroma_mvs_gptr; +-+ GPU_MEM_PTR_T luma_mvs_gptr; +-+ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; +-+ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; +-+} HEVCRpiJob; +++ // Our coeff stash assumes that any partially allocated 64byte lump +++ // is zeroed so make that true. 
+++ { +++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; +++ if ((-(intptr_t)eopcm & 63) != 0) +++ memset(eopcm, 0, -(intptr_t)eopcm & 63); +++ } + + +-+#if RPI_TSTATS +-+typedef struct HEVCRpiStats { +-+ int y_pred1_y8_merge; +-+ int y_pred1_xy; +-+ int y_pred1_x0; +-+ int y_pred1_y0; +-+ int y_pred1_x0y0; +-+ int y_pred1_wle8; +-+ int y_pred1_wgt8; +-+ int y_pred1_hle16; +-+ int y_pred1_hgt16; +-+ int y_pred2_xy; +-+ int y_pred2_x0; +-+ int y_pred2_y0; +-+ int y_pred2_x0y0; +-+ int y_pred2_hle16; +-+ int y_pred2_hgt16; +-+} HEVCRpiStats; +++ // Add command +++ { +++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); +++ cmd->type = RPI_PRED_I_PCM; +++ cmd->size = log2_cb_size; +++ cmd->i_pcm.src = coeffs; +++ cmd->i_pcm.x = x0; +++ cmd->i_pcm.y = y0; +++ cmd->i_pcm.src_len = length; +++ } +++ return 0; +++ } + +#endif + + +-+#endif +++ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); +++} + + +- typedef struct HEVCContext { +- const AVClass *c; // needed by private avoptions +- AVCodecContext *avctx; +-@@ -798,13 +978,103 @@ typedef struct HEVCContext { ++ /** ++ * 8.5.3.2.2.1 Luma sample unidirectional interpolation process ++ * ++@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); ++ int idx = ff_hevc_pel_weight[block_w]; + +- HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; +- HEVCLocalContext *HEVClc; +-- +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra HEVClcIntra; +++#ifdef DISABLE_MC +++ return; + +#endif +- uint8_t threads_type; +- uint8_t threads_number; +- +- int width; +- int height; +- +-+ int used_for_ref; +-+ +-+#ifdef RPI +-+ int enable_rpi; +-+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; +-+ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; +-+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; +-+ int buf_width; +-+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; +-+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; +-+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; +-+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; +-+ int num_coeffs[RPI_MAX_JOBS][4]; +-+ int num_xfm_cmds[RPI_MAX_JOBS]; +-+ int num_mv_cmds_y[RPI_MAX_JOBS]; +-+ int num_mv_cmds_c[RPI_MAX_JOBS]; +-+ int num_pred_cmds[RPI_MAX_JOBS]; +-+ int num_dblk_cmds[RPI_MAX_JOBS]; +-+ int vpu_id; +-+ int pass0_job; // Pass0 does coefficient decode +-+ int pass1_job; // Pass1 does pixel processing +-+ int ctu_count; // Number of CTUs done in pass0 so far +-+ int max_ctu_count; // Number of CTUs when we trigger a round of processing +-+ int ctu_per_y_chan; // Number of CTUs per luma QPU +-+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + + +-+ HEVCRpiJob jobs[RPI_MAX_JOBS]; +-+#if RPI_TSTATS +-+ HEVCRpiStats tstats; ++ x_off += mv->x >> 2; ++ y_off += mv->y >> 2; ++ src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); ++@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ * @param mv1 motion vector1 (relative to block position) to get pixel data from ++ * @param current_mv current motion vector structure ++ */ ++- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, ++ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) ++ { ++@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ uint8_t *src0 = ref0->data[0] + y_off0 * 
src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); ++ uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); ++ +++#ifdef DISABLE_MC +++ return; + +#endif +-+#if RPI_INTER +-+ HEVCRpiChromaPred * curr_pred_c; +-+ HEVCRpiLumaPred * curr_pred_y; +-+ struct qpu_mc_pred_y_s * last_y8_p; +-+ struct qpu_mc_pred_y_s * last_y8_lx; + + +-+ // Function pointers +-+ uint32_t qpu_filter_uv; +-+ uint32_t qpu_filter_uv_b0; +-+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory +-+ uint32_t qpu_filter; +-+ uint32_t qpu_filter_b; ++ if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || ++ x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || ++ y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { ++@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, ++ intptr_t _mx = mx << (1 - hshift); ++ intptr_t _my = my << (1 - vshift); ++ +++#ifdef DISABLE_MC +++ return; + +#endif + + +-+#ifdef RPI_WORKER +-+ pthread_t worker_thread; +-+ pthread_cond_t worker_cond_head; +-+ pthread_cond_t worker_cond_tail; +-+ pthread_mutex_t worker_mutex; +-+ +-+ int worker_tail; // Contains the number of posted jobs +-+ int worker_head; // Contains the number of completed jobs +-+ int kill_worker; // set to 1 to terminate the worker ++ x_off += mv->x >> (2 + hshift); ++ y_off += mv->y >> (2 + vshift); ++ src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); ++@@ -1572,6 +2202,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF ++ int hshift = s->ps.sps->hshift[1]; ++ int vshift = s->ps.sps->vshift[1]; ++ +++#ifdef DISABLE_MC +++ return; + +#endif + + +-+#define RPI_DEBLOCK_VPU_Q_COUNT 2 ++ intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); ++ intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); ++ intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); ++@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF ++ _mx1, _my1, block_w); ++ } ++ ++-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, ++- const Mv *mv, int y0, int height) +++#ifdef RPI +++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int val, const int field) ++ { ++- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); +++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { +++ HEVCContext *const fs = ref->tf.owner->priv_data; +++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; +++ sem_t * sem = NULL; +++ +++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); +++ if (((volatile int *)ref->tf.progress->data)[field] < val) { +++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; +++ +++ av_assert0(pwait->req == -1 && pwait->next == NULL); ++ ++- if (s->threads_type == FF_THREAD_FRAME ) ++- ff_thread_await_progress(&ref->tf, y, 0); +++ pwait->req = val; +++ pwait->next = NULL; +++ if (pstate->first == NULL) +++ pstate->first = pwait; +++ else +++ pstate->last->next = pwait; +++ pstate->last = pwait; +++ sem = &pwait->sem; +++ } +++ pthread_mutex_unlock(&pstate->lock); + + +-+#ifdef RPI_DEBLOCK_VPU +-+ int enable_rpi_deblock; +++ if (sem != NULL) { +++ while (sem_wait(sem) != 0) +++ av_assert0(errno == EINTR); +++ } +++ } +++} + + +-+ int uv_setup_width; +-+ int uv_setup_height; +-+ int setup_width; // Number of 16x16 blocks across the image +-+ int setup_height; // Number of 16x16 blocks down the image +++void 
ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) +++{ +++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; + + +-+ struct dblk_vpu_q_s +-+ { +-+ GPU_MEM_PTR_T deblock_vpu_gmem; +++ ((int *)s->ref->tf.progress->data)[field] = val; + + +-+ uint8_t (*y_setup_arm)[2][2][2][4]; +-+ uint8_t (*y_setup_vc)[2][2][2][4]; +++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); +++ { +++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; +++ HEVCRPiFrameProgressWait * pwait; + + +-+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned +-+ uint8_t (*uv_setup_vc)[2][2][2][4]; +++ while ((pwait = *ppwait) != NULL) { +++ if (pwait->req > val) +++ { +++ ppwait = &pwait->next; +++ pstate->last = pwait; +++ } +++ else +++ { +++ *ppwait = pwait->next; +++ pwait->req = -1; +++ pwait->next = NULL; +++ sem_post(&pwait->sem); +++ } +++ } +++ } +++ pthread_mutex_unlock(&pstate->lock); +++} + + +-+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command +-+ int vpu_cmds_vc; +++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) +++{ +++ pstate->first = NULL; +++ pstate->last = NULL; +++ pthread_mutex_init(&pstate->lock, NULL); +++} + + +-+ vpu_qpu_wait_h cmd_id; +-+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; +++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) +++{ +++ pwait->req = -1; +++ pwait->next = NULL; +++ sem_init(&pwait->sem, 0, 0); +++} + + +-+ struct dblk_vpu_q_s * dvq; +-+ unsigned int dvq_n; +++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) +++{ +++ av_assert0(pstate->first == NULL); +++ pthread_mutex_destroy(&pstate->lock); +++} + + +++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) +++{ +++ sem_destroy(&pwait->sem); +++} + +#endif + + +-+#endif +++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, +++ const Mv * const mv, const int y0, const int height) +++{ +++ if (s->threads_type == FF_THREAD_FRAME) { +++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); + + +- uint8_t *cabac_state; +- +- /** 1 if the independent slice segment header was successfully parsed */ +-@@ -922,6 +1192,9 @@ typedef struct HEVCContext { +- uint32_t max_mastering_luminance; +- uint32_t min_mastering_luminance; +- + +#ifdef RPI +-+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; +++ if (s->enable_rpi) { +++ int16_t *const pr = s->jb0->progress + ref->dpb_no; +++ if (*pr < y) { +++ *pr = y; +++ } +++ } +++ else + +#endif +- } HEVCContext; +++ // It is a const ThreadFrame but the prototype isn't +++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); +++ } ++ } + +- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, +-@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int log2_trafo_size, enum ScanType scan_idx, +- int c_idx); ++ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, ++@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, ++ } ++ } + +-+#if RPI_INTER +-+extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +-+#endif +-+ +- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); +- +- +-@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; +- extern const uint8_t 
ff_hevc_diag_scan8x8_x[64]; +- extern const uint8_t ff_hevc_diag_scan8x8_y[64]; +- +-+#ifdef RPI +-+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); +-+ +-+// arm/hevc_misc_neon.S +-+// Neon coeff zap fn +-+#if HAVE_NEON +-+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +-+#endif +-+ +-+#endif +-+ +- #endif /* AVCODEC_HEVC_H */ +-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c +-index 05b2821..733efde 100644 +---- a/libavcodec/hevc_cabac.c +-+++ b/libavcodec/hevc_cabac.c +-@@ -21,14 +21,76 @@ +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +- +-+#define UNCHECKED_BITSTREAM_READER 1 +-+ +- #include "libavutil/attributes.h" +- #include "libavutil/common.h" +- +--#include "cabac_functions.h" +- #include "hevc.h" +-+#include "cabac_functions.h" +-+ +-+#ifdef RPI +-+#include "rpi_zc.h" +-+#endif +-+ +-+// BY22 is probably faster than simple bypass if the processor has +-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +-+// x86 has fast int divide +-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply +-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used +-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) +-+// Use native divide if we have a fast one - otherwise use mpy 1/x +-+// x86 has a fast integer divide - arm doesn't - unsure about other +-+// architectures +-+#define USE_BY22_DIV ARCH_X86 +-+ +-+// Special case blocks with a single significant ceoff +-+// Decreases the complexity of the code for a common case but increases the +-+// code size. +-+#define USE_N_END_1 1 +-+ +-+#if ARCH_ARM +-+#include "arm/hevc_cabac.h" +-+#endif +- +- #define CABAC_MAX_BIN 31 +- +-+ +-+#if USE_BY22 && !USE_BY22_DIV +-+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) +-+ +-+static const uint32_t cabac_by22_inv_range[256] = { +-+ 0, I(257), I(258), I(259), +-+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), +-+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), +-+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), +-+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), +-+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), +-+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), +-+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), +-+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), +-+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), +-+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), +-+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), +-+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), +-+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), +-+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), +-+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), +-+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), +-+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), +-+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), +-+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), 
I(447), I(448), I(449), +-+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), +-+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), +-+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), +-+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), +-+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), +-+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), +-+ I(510), I(511) +-+}; +-+#undef I +-+#endif // USE_BY22 ++-static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++- int nPbW, int nPbH, ++- int log2_cb_size, int partIdx, int idx) + + +- /** +- * number of bin by SyntaxElement. +- */ +-@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +- { 28, 36, 43, 49, 54, 58, 61, 63, }, +- }; +- +++#if RPI_INTER + + +-+typedef struct +++static HEVCRpiInterPredQ * +++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) + +{ +-+ uint16_t coeff; +-+ uint16_t scale; +-+} xy_off_t; +-+ +-+#define XYT_C(x,y,t) ((x) + ((y) << (t))) +-+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t)) +-+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) +-+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) +++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; +++ HEVCRpiInterPredQ * ypt = yp + 1; +++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { +++ if (ypt->load < yp->load) +++ yp = ypt; +++ } + + +-+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} +++ yp->load += load_val; +++ ipe->used_grp = 1; +++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd + + +-+#define OFF_DIAG(t) {\ +-+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ +-+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ +-+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ +-+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ +++ return yp; + +} + + +-+#define OFF_HORIZ(t) {\ +-+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ +-+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ +-+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ +-+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ +-+} + + +-+#define OFF_VERT(t) {\ +-+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ +-+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ +-+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ +-+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ +++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) +++{ +++ for (unsigned int i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i; +++ q->qpu_mc_curr->data[-1] = q->code_sync; +++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); +++ q->load = 0; +++ } + +} + + +-+static const xy_off_t off_xys[3][4][16] = +++// Returns 0 on success, -1 if Q is dangerously full +++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) + +{ +-+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, +-+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, +-+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} +-+}; +-+ +++ if (!ipe->used_grp) +++ return 0; + + +-+// Helper fns +-+#ifndef hevc_mem_bits32 +-+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) +-+{ +-+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); +-+} +-+#endif +++ if ((ipe->curr += ipe->n_grp) >= ipe->n) +++ { +++ ipe->curr = 0; +++ rpi_inter_pred_sync(ipe); +++ } +++ 
ipe->used = 1; +++ ipe->used_grp = 0; + + +-+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) +-+#define hevc_clz32 hevc_clz32_builtin +-+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) +-+{ +-+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long +-+ return __builtin_clz(x) - (sizeof(int) * 8 - 32); +++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; +++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { +++ return -1; +++ } +++ } +++ return 0; + +} +-+#endif + + +-+// It is unlikely that we will ever need this but include for completeness +-+#ifndef hevc_clz32 +-+static inline unsigned int hevc_clz32(unsigned int x) +++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) + +{ +-+ unsigned int n = 1; +-+ if ((x & 0xffff0000) == 0) { +-+ n += 16; +-+ x <<= 16; +-+ } +-+ if ((x & 0xff000000) == 0) { +-+ n += 8; +-+ x <<= 8; +-+ } +-+ if ((x & 0xf0000000) == 0) { +-+ n += 4; +-+ x <<= 4; +-+ } +-+ if ((x & 0xc0000000) == 0) { +-+ n += 2; +-+ x <<= 2; +++ unsigned int i; +++ ipe->curr = 0; +++ ipe->used = 0; +++ ipe->used_grp = 0; +++ for (i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i; +++ q->qpu_mc_curr = q->qpu_mc_base; +++ q->load = 0; +++ q->last_l0 = NULL; +++ q->last_l1 = NULL; + + } +-+ return n - ((x >> 31) & 1); + +} +-+#endif + + +++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, +++ const unsigned int n_max, const unsigned int n_grp, +++ const unsigned int total_size, const unsigned int min_gap) +++{ +++ memset(ipe, 0, sizeof(*ipe)); +++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); +++ ipe->n_grp = n_grp; +++ ipe->min_gap = min_gap; + + +-+#if !USE_BY22 +-+// If no by22 then _by22 functions will revert to normal and so _peek/_flush +-+// will no longer be called but the setup calls will still exist and we want +-+// to null them out +-+#define bypass_start(s) +-+#define bypass_finish(s) +++#if RPI_CACHE_UNIF_MVS +++ gpu_malloc_cached(total_size, &ipe->gptr); + +#else +-+// Use BY22 for residual bypass block +-+ +-+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc) +-+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc) +-+ +-+// BY22 notes that bypass is simply a divide into the bitstream and so we +-+// can peek out large quantities of bits at once and treat the result as if +-+// it was VLC. In many cases this will lead to O(1) processing rather than +-+// O(n) though the setup and teardown is sufficiently expensive that it is +-+// only worth using if we expect to be dealing with more than a few bits +-+// The definition of "a few bits" will vary from platform to platform but +-+// tests on ARM show that it probably isn't worth it for a single coded +-+// residual, but is for >1 - it also seems likely that if there are +-+// more residuals then they are likely to be bigger and this will make the +-+// O(1) nature of the code more worthwhile. 
+++ gpu_malloc_uncached(total_size, &ipe->gptr); +++#endif +++} + + + + +-+#if !USE_BY22_DIV +-+// * 1/x @ 32 bits gets us 22 bits of accuracy +-+#define CABAC_BY22_PEEK_BITS 22 +++#if RPI_QPU_EMU_Y +++#define get_mc_address_y(f) ((f)->data[0]) + +#else +-+// A real 32-bit divide gets us another bit +-+// If we have a 64 bit int & a unit time divider then we should get a lot +-+// of bits (55) but that is untested and it is unclear if it would give +-+// us a large advantage +-+#define CABAC_BY22_PEEK_BITS 23 +++#define get_mc_address_y(f) get_vc_address_y(f) + +#endif +-+ +-+// Bypass block start +-+// Must be called before _by22_peek is used as it sets the CABAC environment +-+// into the correct state. _by22_finish must be called to return to 'normal' +-+// (i.e. non-bypass) cabac decoding +-+static inline void get_cabac_by22_start(CABACContext * const c) +-+{ +-+ const unsigned int bits = __builtin_ctz(c->low); +-+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); +-+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); +-+#if !USE_BY22_DIV +-+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; +++#if RPI_QPU_EMU_C +++#define get_mc_address_u(f) ((f)->data[1]) +++#else +++#define get_mc_address_u(f) get_vc_address_u(f) + +#endif + + +-+ c->bytestream -= (CABAC_BITS / 8); +-+ c->by22.bits = bits; +-+#if !USE_BY22_DIV +-+ c->by22.range = c->range; +-+ c->range = inv; +-+#endif +-+ c->low = x; +++static inline int offset_depth_adj(const HEVCContext *const s, const int wt) +++{ +++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt : +++ wt << (s->ps.sps->bit_depth - 8); + +} + + +-+// Bypass block finish +-+// Must be called at the end of the bypass block to return to normal operation +-+static inline void get_cabac_by22_finish(CABACContext * const c) +++static void +++rpi_pred_y(HEVCContext *const s, const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const Mv *const mv, +++ const int weight_mul, +++ const int weight_offset, +++ AVFrame *const src_frame) + +{ +-+ unsigned int used = c->by22.bits; +-+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); +-+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 
15 : 7);
+++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+++ const unsigned int mx = mv->x & 3;
+++ const unsigned int my = mv->y & 3;
+++ const unsigned int my_mx = (my << 8) | mx;
+++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
+++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
+++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
+++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+++
+++ if (my_mx == 0)
+++ {
+++ const int x1 = x0 + (mv->x >> 2);
+++ const int y1 = y0 + (mv->y >> 2);
+++ const int bh = nPbH;
+++
+++ for (int start_x = 0; start_x < nPbW; start_x += 16)
+++ {
+++ const int bw = FFMIN(nPbW - start_x, 16);
+++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
+++ qpu_mc_src_t *const src1 = yp->last_l0;
+++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
+++
+++#if RPI_TSTATS
+++ {
+++ HEVCRpiStats *const ts = &s->tstats;
+++ ++ts->y_pred1_x0y0;
+++
+++ if (nPbW > 8)
+++ ++ts->y_pred1_wgt8;
+++ else
+++ ++ts->y_pred1_wle8;
+++
+++ if (nPbH > 16)
+++ ++ts->y_pred1_hgt16;
+++ else
+++ ++ts->y_pred1_hle16;
+++ }
+++#endif
+ +
+-+ c->bytestream += bytes_used + (CABAC_BITS / 8);
+-+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
+-+#if !USE_BY22_DIV
+-+ c->range = c->by22.range;
+-+#endif
+-+}
+++ src1->x = x1 + start_x;
+++ src1->y = y1;
+++ src1->base = src_vc_address_y;
+++ cmd_y->w = bw;
+++ cmd_y->h = bh;
+++ cmd_y->wo1 = wo;
+++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+++ yp->last_l0 = &cmd_y->next_src1;
+++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+++ }
+++ }
+++ else
+++ {
+++ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+++ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++ const unsigned int bh = nPbH;
+++ int start_x = 0;
+ +
+-+// Peek bypass bits
+-+// _by22_start must be called before _by22_peek is called and _by22_flush
+-+// must be called afterwards to flush any used bits
+-+// The actual number of valid bits returned is
+-+// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
+-+// will be at least 22 which should be long enough for any prefix or suffix
+-+// though probably not long enough for the worst case combination
+-+#ifndef get_cabac_by22_peek
+-+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
+-+{
+-+#if USE_BY22_DIV
+-+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+-+#else
+-+ uint32_t x = c->low & ~1U;
+-+ const uint32_t inv = c->range;
+ +
+-+ if (inv != 0)
+-+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
+++#if 1
+++ // As Y-pred operates on two independent 8-wide src blocks we can merge
+++ // this pred with the previous one if the previous one is 8 pel wide,
+++ // the same height as the current block, immediately to the left of our
+++ // current dest block and mono-pred.
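Restated as a standalone predicate (a sketch written against the patch's own types, not code the patch adds), the merge condition that the following code tests via s->last_y8_p is:

// Sketch only: mirrors the test applied below.
static inline int can_merge_with_prev_y8(const qpu_mc_pred_y_p_t * const prev,
                                         const unsigned int bh,
                                         const qpu_mc_dst_addr_t dst_addr,
                                         const unsigned int xshl)
{
    return prev != NULL &&                           // an 8-wide mono-pred cmd is pending
           prev->h == bh &&                          // same block height
           prev->dst_addr + (8 << xshl) == dst_addr; // dst starts exactly 8 pels to its right
}

When the predicate holds, the earlier command simply gains this block as its second source instead of a new command being queued.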
+ + +-+ return x << 1; +++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; +++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) +++ { +++ const int bw = FFMIN(nPbW, 8); +++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; +++ +++ last_y8_src2->x = x1_m3; +++ last_y8_src2->y = y1_m3; +++ last_y8_src2->base = src_vc_address_y; +++ last_y8_p->w += bw; +++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); +++ last_y8_p->wo2 = wo; +++ +++ s->last_y8_p = NULL; +++ s->last_y8_l1 = NULL; +++ start_x = bw; +++#if RPI_TSTATS +++ ++s->tstats.y_pred1_y8_merge; + +#endif +-+} +++ } + +#endif + + +-+// Flush bypass bits peeked by _by22_peek +-+// Flush n bypass bits. n must be >= 1 to guarantee correct operation +-+// val is an unmodified copy of whatever _by22_peek returned +-+#ifndef get_cabac_by22_flush +-+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) +-+{ +-+ // Subtract the bits used & reshift up to the top of the word +-+#if USE_BY22_DIV +-+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); +++ for (; start_x < nPbW; start_x += 16) +++ { +++ const int bw = FFMIN(nPbW - start_x, 16); +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ if (mx == 0 && my == 0) +++ ++ts->y_pred1_x0y0; +++ else if (mx == 0) +++ ++ts->y_pred1_x0; +++ else if (my == 0) +++ ++ts->y_pred1_y0; +++ else +++ ++ts->y_pred1_xy; +++ +++ if (nPbW > 8) +++ ++ts->y_pred1_wgt8; +++ else +++ ++ts->y_pred1_wle8; +++ +++ if (nPbH > 16) +++ ++ts->y_pred1_hgt16; +++ else +++ ++ts->y_pred1_hle16; +++ } +++#endif +++ src1->x = x1_m3 + start_x; +++ src1->y = y1_m3; +++ src1->base = src_vc_address_y; +++ if (bw <= 8) +++ { +++ src2->x = MC_DUMMY_X; +++ src2->y = MC_DUMMY_Y; +++#if RPI_QPU_EMU_Y +++ src2->base = s->qpu_dummy_frame_emu; + +#else +-+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); +++ src2->base = s->qpu_dummy_frame_qpu; + +#endif +-+ +-+ // and refill lower bits +-+ // We will probably OR over some existing bits but that doesn't matter +-+ c->by22.bits += n; +-+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); +++ } +++ else +++ { +++ src2->x = x1_m3 + start_x + 8; +++ src2->y = y1_m3; +++ src2->base = src_vc_address_y; +++ } +++ cmd_y->w = bw; +++ cmd_y->h = bh; +++ cmd_y->mymx21 = my2_mx2_my_mx; +++ cmd_y->wo1 = wo; +++ cmd_y->wo2 = wo; +++ cmd_y->dst_addr = dst_addr + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ +++ if (bw == 8) { +++ s->last_y8_l1 = src2; +++ s->last_y8_p = cmd_y; +++ } +++ } +++ } + +} +-+#endif + + +-+#endif // USE_BY22 +++static void +++rpi_pred_y_b(HEVCContext * const s, +++ const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const struct MvField *const mv_field, +++ AVFrame *const src_frame, +++ AVFrame *const src_frame2) +++{ +++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); +++ const Mv * const mv = mv_field->mv + 0; +++ const Mv * const mv2 = mv_field->mv + 1; + + +++ const unsigned int mx = mv->x & 3; +++ const unsigned int my = mv->y & 3; +++ const unsigned int my_mx = (my<<8) | mx; 
+++ const unsigned int mx2 = mv2->x & 3; +++ const unsigned int my2 = mv2->y & 3; +++ const unsigned int my2_mx2 = (my2<<8) | mx2; +++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; +++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; +++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; +++ const uint32_t wt_offset = +++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; +++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); +++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); +++ +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); +++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; +++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); +++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; +++ +++ if (my2_mx2_my_mx == 0) +++ { +++ const int x1 = x0 + (mv->x >> 2); +++ const int y1 = y0 + (mv->y >> 2); +++ const int x2 = x0 + (mv2->x >> 2); +++ const int y2 = y0 + (mv2->y >> 2); +++ const int bh = nPbH; +++ +++ // Can do chunks a full 16 wide if we don't want the H filter +++ for (int start_x=0; start_x < nPbW; start_x += 16) +++ { +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ ++ts->y_pred2_x0y0; + + +- void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) +- { +- if (s->ps.pps->entropy_coding_sync_enabled_flag && +-@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +- return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); +- } +- +--static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) +-+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); +- } +- +--static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx) +-+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); +- } +- +--static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx) +-+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); +- } +- +- int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { +-@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +- return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); +- } +- +--static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx, +-+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz, +- int log2_size, int *last_scx_prefix, int *last_scy_prefix) +- { +- int i = 0; +- int max = (log2_size << 1) - 1; +- int ctx_offset, ctx_shift; +- +-- if (!c_idx) { +-+ if (!c_idx_nz) { +- ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); +- ctx_shift = (log2_size + 1) >> 2; +- } else { +-@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +- return 
value; +- } +- +--static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg) +-+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg) +- { +- int inc; +- +-- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0); +-+ inc = (ctx_cg != 0) + (c_idx_nz << 1); +- +- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); +- } +--static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, +-- int offset, const uint8_t *ctx_idx_map) +--{ +-- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; +-- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc); +--} +- +--static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) +-+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset) +- { +- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); +- } +-@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +- return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); +- } +- +--static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) +++ if (nPbH > 16) +++ ++ts->y_pred2_hgt16; +++ else +++ ++ts->y_pred2_hle16; +++ } +++#endif +++ src1->x = x1 + start_x; +++ src1->y = y1; +++ src1->base = src1_base; +++ src2->x = x2 + start_x; +++ src2->y = y2; +++ src2->base = src2_base; +++ cmd_y->w = FFMIN(nPbW - start_x, 16); +++ cmd_y->h = bh; +++ cmd_y->mymx21 = 0; +++ cmd_y->wo1 = wo1; +++ cmd_y->wo2 = wo2; +++ cmd_y->dst_addr = dst + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ } +++ } +++ else +++ { +++ // Filter requires a run-up of 3 +++ const int x1 = x0 + (mv->x >> 2) - 3; +++ const int y1 = y0 + (mv->y >> 2) - 3; +++ const int x2 = x0 + (mv2->x >> 2) - 3; +++ const int y2 = y0 + (mv2->y >> 2) - 3; +++ const int bh = nPbH; +++ +++ for (int start_x=0; start_x < nPbW; start_x += 8) +++ { // B blocks work 8 at a time +++ // B weights aren't doubled as the QPU code does the same +++ // amount of work as it does for P +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ const unsigned int mmx = mx | mx2; +++ const unsigned int mmy = my | my2; +++ if (mmx == 0 && mmy == 0) +++ ++ts->y_pred2_x0y0; +++ else if (mmx == 0) +++ ++ts->y_pred2_x0; +++ else if (mmy == 0) +++ ++ts->y_pred2_y0; +++ else +++ ++ts->y_pred2_xy; + + +-+#if !USE_BY22 +-+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +++ if (nPbH > 16) +++ ++ts->y_pred2_hgt16; +++ else +++ ++ts->y_pred2_hle16; +++ } + +#endif +++ src1->x = x1 + start_x; +++ src1->y = y1; +++ src1->base = src1_base; +++ src2->x = x2 + start_x; +++ src2->y = y2; +++ src2->base = src2_base; +++ cmd_y->w = FFMIN(nPbW - start_x, 8); +++ cmd_y->h = bh; +++ cmd_y->mymx21 = my2_mx2_my_mx; +++ cmd_y->wo1 = wo1; +++ cmd_y->wo2 = wo2; +++ cmd_y->dst_addr = dst + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ } +++ } +++} + + +++// h/v shifts fixed at one as that is all the qasm copes with 
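Because the shifts are pinned to one (4:2:0), the fractional-phase computation used in rpi_pred_c() below collapses to a mask. A minimal sketch of that reduction (hypothetical helper name, not patch code):

// av_mod_uintp2(v, 2 + hshift) << (1 - hshift) with hshift == 1 reduces to
// (v & 7) << 0: the low three bits of the chroma MV component select one of
// the eight eighth-pel filter phases used to index rpi_filter_coefs[].
static inline unsigned int chroma_phase_420_sketch(const int mv_component)
{
    return (unsigned int)mv_component & 7u;
}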
+++static void +++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, +++ const int nPbW_c, const int nPbH_c, +++ const Mv * const mv, +++ const int16_t * const c_weights, +++ const int16_t * const c_offsets, +++ AVFrame * const src_frame) +++{ +++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); +++ const int hshift = 1; // = s->ps.sps->hshift[1]; +++ const int vshift = 1; // = s->ps.sps->vshift[1]; +++ +++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); +++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; +++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; +++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); +++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); +++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; +++ const unsigned int bh = nPbH_c; +++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; +++ +++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) +++ { +++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); +++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; +++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; +++ qpu_mc_src_t * const last_lx = *plast_lx; +++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +++ +++ last_lx->x = x1_c + start_x; +++ last_lx->y = y1_c; +++ last_lx->base = src_base_u; +++ cmd_c->h = bh; +++ cmd_c->w = bw; +++ cmd_c->coeffs_x = x_coeffs; +++ cmd_c->coeffs_y = y_coeffs; +++ cmd_c->wo_u = wo_u; +++ cmd_c->wo_v = wo_v; +++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); +++ *plast_lx = &cmd_c->next_src; +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); +++ } +++ return; +++} + + +-+#ifndef coeff_abs_level_remaining_decode_bypass +-+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) +++// h/v shifts fixed at one as that is all the qasm copes with +++static void +++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, +++ const int nPbW_c, const int nPbH_c, +++ const struct MvField * const mv_field, +++ const int16_t * const c_weights, +++ const int16_t * const c_offsets, +++ const int16_t * const c_weights2, +++ const int16_t * const c_offsets2, +++ AVFrame * const src_frame, +++ AVFrame * const src_frame2) + +{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint32_t y; +-+ unsigned int prefix; +-+ unsigned int last_coeff_abs_level_remaining; +-+ unsigned int n; +++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); +++ const int hshift = 1; // s->ps.sps->hshift[1]; +++ const int vshift = 1; // s->ps.sps->vshift[1]; +++ const Mv * const mv = mv_field->mv + 0; +++ const Mv * const mv2 = mv_field->mv + 1; + + +-+ y = get_cabac_by22_peek(c); +-+ prefix = hevc_clz32(~y); +-+ // y << prefix will always have top bit 0 +++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); +++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); +++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; +++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; 
// Fractional part of motion vector +++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; + + +-+ if (prefix < 3) { +-+ const unsigned int suffix = (y << prefix) >> (31 - rice_param); +-+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; +-+ n = prefix + 1 + rice_param; +-+ } +-+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) +-+ { +-+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); +++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); +++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); +++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; +++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + + +-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +-+ n = prefix * 2 + rice_param - 2; +-+ } +-+ else { +-+ unsigned int suffix; +++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; +++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; + + +-+ get_cabac_by22_flush(c, prefix, y); +-+ y = get_cabac_by22_peek(c); +++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); +++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); + + +-+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); +-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +-+ n = prefix + rice_param - 2; +++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; +++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); +++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; +++ const unsigned int bh = nPbH_c; +++ +++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) +++ { +++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +++ +++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); +++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; +++ qpu_mc_src_t * const src_l0 = cp->last_l0; +++ qpu_mc_src_t * const src_l1 = cp->last_l1; +++ +++ src_l0->x = x1_c + start_x; +++ src_l0->y = y1_c; +++ src_l0->base = src1_base; +++ src_l1->x = x2_c + start_x; +++ src_l1->y = y2_c; +++ src_l1->base = src2_base; +++ +++ u[0].h = bh; +++ u[0].w = bw; +++ u[0].coeffs_x1 = coefs0_x; +++ u[0].coeffs_y1 = coefs0_y; +++ u[0].weight_u1 = c_weights[0]; // Weight L0 U +++ u[0].weight_v1 = c_weights[1]; // Weight L0 V +++ u[0].coeffs_x2 = coefs1_x; +++ u[0].coeffs_y2 = coefs1_y; +++ u[0].wo_u2 = wo_u2; +++ u[0].wo_v2 = wo_v2; +++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); +++ +++ cp->last_l0 = &u[0].next_src1; +++ cp->last_l1 = &u[0].next_src2; +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + + } +++} + + +-+ get_cabac_by22_flush(c, n, y); + + +-+ return last_coeff_abs_level_remaining; +-+} + +#endif + + +-+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) +++ +++ +++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) + { +-+ CABACContext * const c = &s->HEVClc->cc; +- int prefix = 0; +- int suffix = 0; +- int last_coeff_abs_level_remaining; +- int i; +- +-- while (prefix < 
CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) +-+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) +- prefix++; +- if (prefix == CABAC_MAX_BIN) { +- av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); +- return 0; +- } +-+ +- if (prefix < 3) { +- for (i = 0; i < rc_rice_param; i++) +-- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +-+ suffix = (suffix << 1) | get_cabac_bypass(c); +- last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; +- } else { +- int prefix_minus3 = prefix - 3; +- for (i = 0; i < prefix_minus3 + rc_rice_param; i++) +-- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +-+ suffix = (suffix << 1) | get_cabac_bypass(c); +- last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) +- << rc_rice_param) + suffix; +- } +-+ +- return last_coeff_abs_level_remaining; ++ #define POS(c_idx, x, y) \ ++ &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ ++ (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] ++- HEVCLocalContext *lc = s->HEVClc; +++ HEVCLocalContext * const lc = s->HEVClc; ++ int merge_idx = 0; ++ struct MvField current_mv = {{{ 0 }}}; ++ ++@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int y_cb = y0 >> log2_min_cb_size; ++ int x_pu, y_pu; ++ int i, j; ++- ++- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); +++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); ++ ++ if (!skip_flag) ++ lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); ++@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, +++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], +++ ref0->frame); +++ } else +++#endif +++ { +++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, ++ ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.luma_offset_l0[current_mv.ref_idx[0]]); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, +++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +++ ref0->frame); +++ return; +++ } +++#endif ++ chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], ++ 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); ++@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, +++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], +++ ref1->frame); +++ } else +++#endif +++ { +++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, ++ ¤t_mv.mv[1], x0, y0, nPbW, nPbH, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.luma_offset_l1[current_mv.ref_idx[1]]); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ 
rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, +++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +++ ref1->frame); +++ return; +++ } +++#endif ++ chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], ++ 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); ++@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); +++ } else +++#endif +++ { +++ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, ++ ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ++ ref1->frame, ¤t_mv.mv[1], ¤t_mv); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, +++ ¤t_mv, +++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], +++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], +++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +++ ref0->frame, +++ ref1->frame); +++ return; +++ } +++#endif ++ chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, ++ x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); ++ ++@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ++ ret = hls_pcm_sample(s, x0, y0, log2_cb_size); ++ if (s->ps.sps->pcm.loop_filter_disable_flag) +++ { ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); +++ } ++ ++ if (ret < 0) ++ return ret; ++@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, ++ lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); + } + +--static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) +-+#if !USE_BY22 +-+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +-+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb) +- { +-- int i; +-- int ret = 0; +-+ CABACContext * const c = &s->HEVClc->cc; +++#ifdef RPI +++static void rpi_execute_dblk_cmds(HEVCContext *s) +++{ +++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; +++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; + + unsigned int i; +-+ uint32_t ret = 0; +- +- for (i = 0; i < nb; i++) +-- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc); +-- return ret; +-+ ret = (ret << 1) | get_cabac_bypass(c); + + +-+ return ret << (32 - nb); +++ for (i = 0; i != de->n; ++i) +++ { +++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); +++ } +++ de->n = 0; + +} +-+#endif + + +-+#ifndef coeff_sign_flag_decode_bypass +-+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb) +++#if 0 +++static void rpi_execute_transform(HEVCContext *s) + +{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint32_t y; +-+ y = get_cabac_by22_peek(c); +-+ get_cabac_by22_flush(c, nb, y); +-+ return y & ~(0xffffffffU >> nb); +-+} +-+#endif +++ int i=2; +++ int job = s->pass1_job; +++ /*int j; +++ 
int16_t *coeffs = s->coeffs_buf_arm[job][i]; +++ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { +++ s->hevcdsp.idct[4-2](coeffs, 16); +++ } +++ i=3; +++ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; +++ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { +++ s->hevcdsp.idct[5-2](coeffs, 32); +++ }*/ + + +++ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +++ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], +++ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +++ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); +++ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); +++ //gpu_cache_flush(&s->coeffs_buf_accelerated); +++ //vpu_wait(s->vpu_id); + + +-+#ifndef get_cabac_greater1_bits +-+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, +-+ uint8_t * const state0) +-+{ +-+ unsigned int i; +-+ unsigned int rv = 0; +-+ for (i = 0; i != n; ++i) { +-+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; +-+ const unsigned int b = get_cabac(c, state0 + idx); +-+ rv = (rv << 1) | b; +-+ } +-+ return rv; +++ for(i=0;i<4;i++) +++ s->num_coeffs[job][i] = 0; + +} + +#endif + + + + +-+// N.B. levels returned are the values assuming coeff_abs_level_remaining +-+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +-+// this version of events. +-+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels, +-+ int * const pprev_subset_coded, int * const psum, +-+ const unsigned int idx0_gt1, const unsigned int idx_gt2) +-+{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1; +-+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2; +-+ unsigned int rv; +-+ unsigned int i; +-+ const unsigned int n = FFMIN(n_end, 8); +-+ +-+ // Really this is i != n but the simple unconditional loop is cheaper +-+ // and faster +-+ for (i = 0; i != 8; ++i) +-+ levels[i] = 1; +++#define RPI_OPT_SEP_PRED 0 + + +-+ rv = get_cabac_greater1_bits(c, n, state0); + + +-+ *pprev_subset_coded = 0; +-+ *psum = n; +++// I-pred, transform_and_add for all blocks types done here +++// All ARM +++#if RPI_OPT_SEP_PRED +++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) +++#else +++static void rpi_execute_pred_cmds(HEVCContext * const s) +++#endif +++{ +++ int i; +++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; +++ const HEVCPredCmd *cmd = iap->cmds; +++#ifdef RPI +++ HEVCLocalContextIntra *lc = &s->HEVClcIntra; +++#else +++ HEVCLocalContext *lc = s->HEVClc; +++#endif + + +-+ rv <<= (32 - n); +-+ if (rv != 0) +-+ { +-+ *pprev_subset_coded = 1; +-+ *psum = n + 1; +-+ i = hevc_clz32(rv); +-+ levels[i] = 2; +-+ if (get_cabac(c, state_gt2) == 0) +-+ { +-+ // Unset first coded bit +-+ rv &= ~(0x80000000U >> i); +-+ } +-+ } +++ for(i = iap->n; i > 0; i--, cmd++) { +++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +++#if RPI_OPT_SEP_PRED +++ if (!(cmd->c_idx == 0 ? 
do_luma : do_chroma)) { +++ continue; +++ } +++#endif + + +-+ if (n_end > 8) { +-+ const unsigned int g8 = n_end - 8; +-+ rv |= ((1 << g8) - 1) << (24 - g8); +-+ for (i = 0; i != g8; ++i) { +-+ levels[i + 8] = 0; +-+ } +-+ } +++ switch (cmd->type) +++ { +++ case RPI_PRED_INTRA: +++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; +++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; +++ lc->na.cand_left = (cmd->na >> 3) & 1; +++ lc->na.cand_up_left = (cmd->na >> 2) & 1; +++ lc->na.cand_up = (cmd->na >> 1) & 1; +++ lc->na.cand_up_right = (cmd->na >> 0) & 1; +++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) +++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +++ else +++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +++ break; + + +-+ return rv; +-+} +++ case RPI_PRED_ADD_RESIDUAL: +++ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +++ break; +++ case RPI_PRED_ADD_DC: +++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); +++ break; +++#if RPI_HEVC_SAND +++ case RPI_PRED_ADD_RESIDUAL_U: +++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); +++ break; +++ case RPI_PRED_ADD_RESIDUAL_V: +++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); +++ break; +++ case RPI_PRED_ADD_RESIDUAL_C: +++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +++ break; +++ case RPI_PRED_ADD_DC_U: +++ case RPI_PRED_ADD_DC_V: +++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); +++ break; +++#endif + + +-+// extended_precision_processing_flag must be false given we are +-+// putting the result into a 16-bit array +-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) +-+// scale_m is uint8_t +-+// +-+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) +-+// or it can be 2 (if we have transquant_bypass) +-+// shift is set to one less than we really want but would normally be +-+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
+-+// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6 +-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) +-+// to achieve it +++ case RPI_PRED_I_PCM: +++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); +++ break; + + +-+#ifndef trans_scale_sat +-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +-+{ +-+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); +++ default: +++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); +++ abort(); +++ } +++ } +++#if RPI_OPT_SEP_PRED +++ if (do_luma) +++#endif +++ { +++ iap->n = 0; +++ } + +} +++ +++ + +#endif + + +++#ifdef RPI + + +-+#ifndef update_rice +-+static inline void update_rice(uint8_t * const stat_coeff, +-+ const unsigned int last_coeff_abs_level_remaining, +-+ const unsigned int c_rice_param) +++// Set initial uniform job values & zero ctu_count +++static void rpi_begin(HEVCContext *s) + +{ +-+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; +-+ if (x >= 6) +-+ (*stat_coeff)++; +-+ else if (x == 0 && *stat_coeff > 0) +-+ (*stat_coeff)--; +++#if RPI_INTER +++ unsigned int i; +++ HEVCRpiJob * const jb = s->jb0; +++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; +++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; +++ +++ const uint16_t pic_width_y = s->ps.sps->width; +++ const uint16_t pic_height_y = s->ps.sps->height; +++ +++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; +++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; +++ +++ rpi_inter_pred_reset(cipe); +++ for (i = 0; i < cipe->n; i++) { +++ HEVCRpiInterPredQ * const cp = cipe->q + i; +++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; +++ +++ u->next_src1.x = 0; +++ u->next_src1.y = 0; +++ u->next_src1.base = 0; +++ u->pic_cw = pic_width_c; +++ u->pic_ch = pic_height_c; +++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); +++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); +++ u->wdenom = s->sh.chroma_log2_weight_denom; +++ cp->last_l0 = &u->next_src1; +++ +++ u->next_fn = 0; +++ u->next_src2.x = 0; +++ u->next_src2.y = 0; +++ u->next_src2.base = 0; +++ cp->last_l1 = &u->next_src2; +++ +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); +++ } +++ +++ rpi_inter_pred_reset(yipe); +++ for (i = 0; i < yipe->n; i++) { +++ HEVCRpiInterPredQ * const yp = yipe->q + i; +++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; +++ +++ y->next_src1.x = 0; +++ y->next_src1.y = 0; +++ y->next_src1.base = 0; +++ y->next_src2.x = 0; +++ y->next_src2.y = 0; +++ y->next_src2.base = 0; +++ y->pic_h = pic_height_y; +++ y->pic_w = pic_width_y; +++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); +++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); +++ y->wdenom = s->sh.luma_log2_weight_denom; +++ y->next_fn = 0; +++ yp->last_l0 = &y->next_src1; +++ yp->last_l1 = &y->next_src2; +++ +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); +++ } +++ +++ s->last_y8_p = NULL; +++ s->last_y8_l1 = NULL; +++ +++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { +++ jb->progress[i] = -1; +++ } +++ +++#endif +++ s->ctu_count = 0; + +} + +#endif + + + + +-+// n must be > 0 on entry +-+#ifndef get_cabac_sig_coeff_flag_idxs +-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, +-+ unsigned int n, +-+ const 
uint8_t const * ctx_map, +-+ uint8_t * p) +++#if RPI_INTER +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, +++ const vpu_qpu_job_h vqj, +++ rpi_cache_flush_env_t * const rfe, +++ HEVCRpiInterPredEnv * const ipe) + +{ +-+ do { +-+ if (get_cabac(c, state0 + ctx_map[n])) +-+ *p++ = n; +-+ } while (--n != 0); +-+ return p; +-+} +++ unsigned int i; +++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; +++ unsigned int max_block = 0; +++ +++ if (!ipe->used) { +++ return 0; +++ } +++ +++ if (ipe->curr != 0) { +++ rpi_inter_pred_sync(ipe); +++ } +++ +++ // Add final commands to Q +++ for(i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const yp = ipe->q + i; +++ qpu_mc_src_t *const p0 = yp->last_l0; +++ qpu_mc_src_t *const p1 = yp->last_l1; +++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; +++ +++ if (block_size > max_block) +++ max_block = block_size; +++ +++ yp->qpu_mc_curr->data[-1] = yp->code_exit; +++ +++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +++ p0->x = MC_DUMMY_X; +++ p0->y = MC_DUMMY_Y; +++ p0->base = s->qpu_dummy_frame_qpu; +++ p1->x = MC_DUMMY_X; +++ p1->y = MC_DUMMY_Y; +++ p1->base = s->qpu_dummy_frame_qpu; +++ +++ yp->last_l0 = NULL; +++ yp->last_l1 = NULL; +++ +++ // Add to mailbox list +++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); +++ mail[i][1] = yp->code_setup; +++ } +++ +++#if RPI_CACHE_UNIF_MVS +++ // We don't need invalidate here as the uniforms aren't changed by the QPU +++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing +++ // new values which seems to give us a small performance advantage +++ // +++ // In most cases we will not have a completely packed set of uniforms and as +++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the +++ // fullest +++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, +++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, +++ ipe->n, ipe->max_fill + ipe->min_gap); + +#endif +++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); + + +++ return 1; +++} +++#endif + + +-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, +-+ unsigned int n, +-+ const uint8_t const * ctx_map, +-+ uint8_t * const flag_idx) +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++static unsigned int mc_terminate_add_emu(HEVCContext * const s, +++ const vpu_qpu_job_h vqj, +++ rpi_cache_flush_env_t * const rfe, +++ HEVCRpiInterPredEnv * const ipe) + +{ +-+ int rv; +++ unsigned int i; +++ if (!ipe->used) { +++ return 0; +++ } + + +-+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; +++ if (ipe->curr != 0) { +++ rpi_inter_pred_sync(ipe); +++ } + + +-+ return rv; +++ // Add final commands to Q +++ for(i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const yp = ipe->q + i; +++ qpu_mc_src_t *const p0 = yp->last_l0; +++ qpu_mc_src_t *const p1 = yp->last_l1; +++ +++ yp->qpu_mc_curr->data[-1] = yp->code_exit; +++ +++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +++ p0->x = MC_DUMMY_X; +++ p0->y = MC_DUMMY_Y; +++ p0->base = s->qpu_dummy_frame_emu; +++ p1->x = MC_DUMMY_X; +++ p1->y = MC_DUMMY_Y; +++ p1->base = s->qpu_dummy_frame_emu; +++ +++ yp->last_l0 = NULL; +++ yp->last_l1 = NULL; +++ } +++ +++ return 1; + +} +++#endif + + +-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x1, x2, x3,\ 
+-+ x4, x5, x6, x7,\ +-+ x8, x9, x10, x11,\ +-+ x12, x13, x14, x15} + + +-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x4, x8, x12,\ +-+ x1, x5, x9, x13,\ +-+ x2, x6, x10, x14,\ +-+ x3, x7, x11, x15} +++#if RPI_QPU_EMU_Y +++#define mc_terminate_add_y mc_terminate_add_emu +++#else +++#define mc_terminate_add_y mc_terminate_add_qpu +++#endif +++#if RPI_QPU_EMU_C +++#define mc_terminate_add_c mc_terminate_add_emu +++#else +++#define mc_terminate_add_c mc_terminate_add_qpu +++#endif +++#endif + + +-+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x4, x1, x8,\ +-+ x5, x2, x12, x9,\ +-+ x6, x3, x13, x10,\ +-+ x7, x14, x11, x15} +++#ifdef RPI + + + + +-+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz, +-+ uint8_t * const significant_coeff_group_flag, +-+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, +-+ int * const pPrev_sig) +++static void flush_frame(HEVCContext *s,AVFrame *frame) + +{ +-+ while (--i >= 0) { +-+ unsigned int x_cg = scan_x_cg[i]; +-+ unsigned int y_cg = scan_y_cg[i]; +++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); +++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +++ rpi_cache_flush_finish(rfe); +++} + + +-+ // For the flag decode we only care about Z/NZ but +-+ // we use the full Right + Down * 2 when calculating +-+ // significant coeff flags so we obtain it here +-+ //. +-+ // The group flag array is one longer than it needs to +-+ // be so we don't need to check for y_cg limits +-+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | +-+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); + + +-+ if (i == 0 || +-+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) +++// Core execution tasks +++static void worker_core(HEVCContext * const s) +++{ +++#if RPI_OPT_SEP_PRED +++ vpu_qpu_wait_h sync_c; +++#endif +++ vpu_qpu_wait_h sync_y; +++ +++ HEVCRpiJob * const jb = s->jb1; +++ int pred_y, pred_c; +++ +++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++ +++ { +++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; +++ if (cf->s[3].n + cf->s[2].n != 0) + + { +-+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); +-+ *pPrev_sig = prev_sig; +-+ break; +++ const unsigned int csize = sizeof(cf->s[3].buf[0]); +++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; +++ vpu_qpu_job_add_vpu(vqj, +++ vpu_get_fn(s->ps.sps->bit_depth), +++ vpu_get_constants(), +++ cf->gptr.vc, +++ cf->s[2].n >> 8, +++ cf->gptr.vc + offset32, +++ cf->s[3].n >> 10, +++ 0); +++ +++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); +++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + + } + + } + + +-+ return i; +-+} +++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + + +-+#ifdef RPI +-+static void rpi_add_residual(HEVCContext * const s, +-+ const unsigned int log2_trafo_size, const unsigned int c_idx, +-+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +-+{ +-+ const AVFrame * const frame = s->frame; +-+ unsigned int stride = frame->linesize[c_idx]; +-+ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +-+ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +-+ const int is_sliced = rpi_sliced_frame(frame); +-+ uint8_t * dst = 
!is_sliced ?
+-+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+-+ c_idx == 0 ?
+-+ rpi_sliced_frame_pos_y(frame, x, y) :
+-+ rpi_sliced_frame_pos_c(frame, x, y);
+++// We can take a sync here and try to locally overlap QPU processing with ARM
+++// but testing showed a slightly negative benefit with noticeable extra complexity
+++#if RPI_OPT_SEP_PRED
+++ vpu_qpu_job_add_sync_this(vqj, &sync_c);
+++#endif
+ +
+-+// if (c_idx != 0) {
+-+// return;
+-+// }
+-+ if (s->enable_rpi) {
+-+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+-+ cmd->size = log2_trafo_size;
+-+ cmd->c_idx = c_idx;
+-+ cmd->ta.buf = coeffs;
+-+ cmd->ta.dst = dst;
+-+ cmd->ta.stride = stride;
+-+ }
+-+ else if (!is_sliced || c_idx == 0) {
+-+ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
+++
+++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
+++
+++
+++ // We are expecting a contiguous Z-shaped set of blocks
+++ // So generate up to 3 blocks:
+++ // 1st line
+++ // body
+++ // last line
+++ // This will work even if we don't have the expected geometry
+++ if (pred_y || pred_c)
+++ {
+++ const HEVCRpiDeblkEnv *const de = &jb->deblk;
+++ const HEVCRpiDeblkBlk * db = de->blks + 0;
+++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+++ unsigned int x0 = db->x_ctb;
+++ unsigned int xx = x0 + ctb_size;
+++ unsigned int y0 = db->y_ctb;
+++
+++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}};
+++ unsigned int b = 0;
+++ unsigned int i;
+++
+++ for (i = 1, ++db; i < de->n; ++i, ++db)
+++ {
+++ if (db->x_ctb == xx && db->y_ctb == y0) {
+++ xx += ctb_size;
+++ }
+++ else
+++ {
+++ unsigned int * const tlbr = blks_tlbr[b];
+++ if (tlbr[0] > y0)
+++ tlbr[0] = y0;
+++ if (tlbr[1] > x0)
+++ tlbr[1] = x0;
+++ if (tlbr[2] < y0 + ctb_size)
+++ tlbr[2] = y0 + ctb_size;
+++ if (tlbr[3] < xx)
+++ tlbr[3] = xx;
+++ x0 = db->x_ctb;
+++ xx = x0 + ctb_size;
+++ y0 = db->y_ctb;
+++ b = 1;
+++ }
+++ }
+++
+++ if (blks_tlbr[b][0] != ~0U)
+++ ++b;
+++
+++ {
+++ unsigned int * const tlbr = blks_tlbr[b];
+++ tlbr[0] = y0;
+++ tlbr[1] = x0;
+++ tlbr[2] = y0 + ctb_size;
+++ tlbr[3] = xx;
+++ }
+++
+++ // ??? Coalesce blocks ???
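A worked example of the three-rectangle split (editorial illustration; the 64-pel CTB size and picture geometry are assumed, not taken from the patch):

// With a 512-pel-wide picture and 64-pel CTBs, a run that starts at CTB
// (320,0), covers the rows at y=64 and y=128 in full, and stops after CTB
// (192,192) reduces to three clip rectangles {top,left,bottom,right}:
//   blks_tlbr[0] = {   0, 320,  64, 512 }  // partial first line
//   blks_tlbr[1] = {  64,   0, 192, 512 }  // full-width body
//   blks_tlbr[2] = { 192,   0, 256, 256 }  // partial last line
// These are the regions the loop below hands to rpi_cache_flush_add_frame_block().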
+++ for (i = 0; i <= b; ++i) { +++ const unsigned int * const tlbr = blks_tlbr[i]; +++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, +++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); +++ } + + } +-+ else { +-+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); +++ +++ +++ // Having accumulated some commands - do them +++ rpi_cache_flush_finish(rfe); +++ +++ // Await progress as required +++ { +++ unsigned int i; +++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { +++ if (jb->progress[i] >= 0) { +++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); +++ } +++ } + + } +- } +++ +++ vpu_qpu_job_finish(vqj); +++ +++ worker_pic_reset(&jb->coeffs); +++ +++ // If we have emulated VPU ops - do it here +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ if (av_rpi_is_sand8_frame(s->frame)) +++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C +++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); +++#elif RPI_QPU_EMU_Y +++ rpi_shader_c8(s, &jb->luma_ip, NULL); +++#else +++ rpi_shader_c8(s, NULL, &jb->chroma_ip); + +#endif +- +- void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int log2_trafo_size, enum ScanType scan_idx, +- int c_idx) +++ else +++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C +++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); +++#elif RPI_QPU_EMU_Y +++ rpi_shader_c16(s, &jb->luma_ip, NULL); +++#else +++ rpi_shader_c16(s, NULL, &jb->chroma_ip); +++#endif +++#endif +++ +++#if RPI_OPT_SEP_PRED +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_c); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s, 0, 1); +++ +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_y); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s, 1, 0); +++#else +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_y); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s); +++#endif +++ +++ // Perform deblocking for CTBs in this row +++ rpi_execute_dblk_cmds(s); +++} +++ +++static void rpi_do_all_passes(HEVCContext *s) +++{ +++ // Called from main thread - must be no pending background jobs +++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); +++ +++ // Do the various passes - common with the worker code +++ worker_core(s); +++ // Prepare next batch +++ rpi_begin(s); +++} +++ +++ +++#endif +++ ++ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + { +--#define GET_COORD(offset, n) \ +-- do { \ +-- x_c = (x_cg << 2) + scan_x_off[n]; \ +-- y_c = (y_cg << 2) + scan_y_off[n]; \ +-- } while (0) +-- HEVCLocalContext *lc = s->HEVClc; +-- int transform_skip_flag = 0; +-+ HEVCLocalContext * const lc = s->HEVClc; +-+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; +- +- int last_significant_coeff_x, last_significant_coeff_y; +-- int last_scan_pos; +-- int n_end; +- int num_coeff = 0; +-- int greater1_ctx = 1; +-+ int prev_subset_coded = 0; +- +- int num_last_subset; +- int x_cg_last_sig, y_cg_last_sig; +- +-- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off; +-+ const uint8_t *scan_x_cg, *scan_y_cg; +-+ const xy_off_t * scan_xy_off; ++ HEVCContext *s = avctxt->priv_data; ++@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ int y_ctb = 0; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; + +-+#ifndef RPI +- ptrdiff_t stride 
= s->frame->linesize[c_idx]; +- int hshift = s->ps.sps->hshift[c_idx]; +- int vshift = s->ps.sps->vshift[c_idx]; +-- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +-+ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- ((x0 >> hshift) << s->ps.sps->pixel_shift)]; +-- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +-- uint8_t significant_coeff_group_flag[8][8] = {{0}}; +++#ifdef RPI +++ // * We don't support cross_component_prediction_enabled_flag but as that +++ // must be 0 unless we have 4:4:4 there is no point testing for it as we +++ // only deal with sand which is never 4:4:4 +++ // [support wouldn't be hard] +++ s->enable_rpi = +++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || +++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); + +#endif +++ //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); +++ ++ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { ++ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); ++ return AVERROR_INVALIDDATA; ++@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ } ++ } ++ + +#ifdef RPI +-+ int use_vpu; +++ // Worker must be idle at start +++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); +++ rpi_begin(s); + +#endif +-+ int16_t *coeffs; +-+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero +- int explicit_rdpcm_flag = 0; +- int explicit_rdpcm_dir_flag; +++ ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { ++- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + +- int trafo_size = 1 << log2_trafo_size; +- int i; +-- int qp,shift,add,scale,scale_m; +-+ int qp,shift,scale; +- static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 }; +- const uint8_t *scale_matrix = NULL; +- uint8_t dc_scale; +- int pred_mode_intra = (c_idx == 0) ? 
lc->tu.intra_pred_mode : +- lc->tu.intra_pred_mode_c; ++ x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; ++ y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; ++@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; + +-- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); +-+ int prev_sig = 0; +-+ const int c_idx_nz = (c_idx != 0); ++ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + + +-+ int may_hide_sign; +++#ifdef RPI +++ // Report progress so we can use our MVs in other frames +++ // If we are tiled then this isn't really optimal but given that tiling +++ // can change on a per pic basis (described in PPS) other schemes are +++ // quite a lot harder +++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { +++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); +++ } + + +- +- // Derive QP for dequant +- if (!lc->cu.cu_transquant_bypass_flag) { +-- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +-+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +- static const uint8_t rem6[51 + 4 * 6 + 1] = { +- 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, +- 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, +-@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- }; +- int qp_y = lc->qp_y; +- +-+ may_hide_sign = s->ps.pps->sign_data_hiding_flag; +++ if (s->enable_rpi) { +++ int q_full = (++s->ctu_count >= s->max_ctu_count); + + +- if (s->ps.pps->transform_skip_enabled_flag && +- log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { +-- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx); +-+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz); +-+ if (transform_skip_flag) { +-+ trans_skip_or_bypass = 1; +-+ if (lc->cu.pred_mode == MODE_INTRA && +-+ s->ps.sps->implicit_rdpcm_enabled_flag && +-+ (pred_mode_intra == 10 || pred_mode_intra == 26)) { +-+ may_hide_sign = 0; +++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) +++ q_full = 1; +++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) +++ q_full = 1; +++ +++ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; +++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; +++ +++ if (q_full) { +++ if (s->used_for_ref) +++ { +++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); +++ +++// worker_wait(s); +++ // Split work load onto separate threads so we make as rapid progress as possible with this frame +++ // Pass on this job to worker thread +++ worker_submit_job(s); +++ +++ // Make sure we have space to prepare the next job +++ worker_pass0_ready(s); +++ +++ // Prepare the next batch of commands +++ rpi_begin(s); +++ } else { +++ // Non-ref frame so do it all on this thread +++ rpi_do_all_passes(s); + + } + + } +- } +- +- if (c_idx == 0) { +-@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- qp += s->ps.sps->qp_bd_offset; +- } +- +-- shift = s->ps.sps->bit_depth + log2_trafo_size - 5; +-- add = 1 << (shift-1); +-- scale = level_scale[rem6[qp]] << (div6[qp]); +-- scale_m = 16; // default when no 
custom scaling lists. +-- dc_scale = 16; +-+ // Shift is set to one less than will actually occur as the scale +-+ // and saturate step adds 1 and then shifts right again +-+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6; +-+ scale = level_scale[rem6[qp]]; +-+ if (div6[qp] >= shift) { +-+ scale <<= (div6[qp] - shift); +-+ shift = 0; +-+ } else { +-+ shift -= div6[qp]; +-+ } +- +-- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +-+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { +- const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? +-- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +-+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +- int matrix_id = lc->cu.pred_mode != MODE_INTRA; +- +- matrix_id = 3 * matrix_id + c_idx; +- +- scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; +-+ dc_scale = scale_matrix[0]; +- if (log2_trafo_size >= 4) +- dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; +- } +-+ else +-+ { +-+ static const uint8_t sixteen_scale[64] = { +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16 +-+ }; +-+ scale_matrix = sixteen_scale; +-+ dc_scale = 16; +++ + + } +- } else { +-+ static const uint8_t unit_scale[64] = { +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ }; +-+ scale_matrix = unit_scale; +- shift = 0; +-- add = 0; +-- scale = 0; +-- dc_scale = 0; +-+ scale = 2; // We will shift right to kill this +-+ dc_scale = 1; +++#endif + + +-+ may_hide_sign = 0; +++ ++ if (more_data < 0) { ++ s->tab_slice_address[ctb_addr_rs] = -1; ++ return more_data; ++@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ ++ ctb_addr_ts++; ++ ff_hevc_save_states(s, ctb_addr_ts); +++#ifdef RPI +++ if (s->enable_rpi) +++ continue; +++#endif ++ ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); + } + +++#ifdef RPI + + +++ // Wait for the worker to finish all its jobs +++ if (s->enable_rpi) { +++ worker_wait(s); +++ } + + +++ // Finish off any half-completed rows +++ if (s->enable_rpi && s->ctu_count) { +++ rpi_do_all_passes(s); +++ } + + +- if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); +-+ trans_skip_or_bypass) { +-+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz); +- if (explicit_rdpcm_flag) { +-- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx); +-+ may_hide_sign = 0; +-+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz); +- } +- } +- +-- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size, +-+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size, +- &last_significant_coeff_x, &last_significant_coeff_y); +- +- if (last_significant_coeff_x > 3) { +-@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int last_x_c = last_significant_coeff_x & 3; +- int last_y_c = last_significant_coeff_y & 3; +- +-- scan_x_off = ff_hevc_diag_scan4x4_x; 
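For reference, the folded dequant shift introduced above (shift = bit_depth + log2_trafo_size - 6, with div6[qp] absorbed into either the scale or the shift) can be checked in isolation. A minimal standalone C sketch — not part of the patch; the bit depth, transform size and test level are invented for illustration — comparing the folded scale/shift pair against the unfolded scale = level_scale[rem6[qp]] << div6[qp] form at the same total shift:

#include <stdio.h>
#include <stdint.h>

static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 };

int main(void)
{
    const int bit_depth = 8, log2_trafo_size = 3; // invented example values
    const int level = 7;                          // arbitrary test coefficient

    for (int qp = 0; qp <= 51; qp++) {
        const int rem6 = qp % 6, div6 = qp / 6;

        // Folded form, as in the patch: keep the multiplier small by
        // trading left-shifts of the scale against the right-shift.
        int shift = bit_depth + log2_trafo_size - 6;
        int scale = level_scale[rem6];
        if (div6 >= shift) {
            scale <<= div6 - shift;
            shift = 0;
        } else {
            shift -= div6;
        }

        // Unfolded form at the same total shift.
        const int64_t folded   = ((int64_t)level * scale) >> shift;
        const int64_t unfolded = ((int64_t)level * (level_scale[rem6] << div6))
                                     >> (bit_depth + log2_trafo_size - 6);
        if (folded != unfolded)
            printf("mismatch at qp=%d\n", qp); // never taken
    }
    return 0;
}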
+-- scan_y_off = ff_hevc_diag_scan4x4_y; +- num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; +-- if (trafo_size == 4) { +-+ +-+ switch (log2_trafo_size) { +-+ case 2: +- scan_x_cg = scan_1x1; +- scan_y_cg = scan_1x1; +-- } else if (trafo_size == 8) { +-+ break; +-+ case 3: +- num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = diag_scan2x2_x; +- scan_y_cg = diag_scan2x2_y; +-- } else if (trafo_size == 16) { +-+ break; +-+ case 4: +- num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = ff_hevc_diag_scan4x4_x; +- scan_y_cg = ff_hevc_diag_scan4x4_y; +-- } else { // trafo_size == 32 +-+ break; +-+ case 5: +-+ default: +- num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = ff_hevc_diag_scan8x8_x; +- scan_y_cg = ff_hevc_diag_scan8x8_y; +-+ break; +- } +- break; +- } +- case SCAN_HORIZ: +- scan_x_cg = horiz_scan2x2_x; +- scan_y_cg = horiz_scan2x2_y; +-- scan_x_off = horiz_scan4x4_x; +-- scan_y_off = horiz_scan4x4_y; +- num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; +- break; +- default: //SCAN_VERT +- scan_x_cg = horiz_scan2x2_y; +- scan_y_cg = horiz_scan2x2_x; +-- scan_x_off = horiz_scan4x4_y; +-- scan_y_off = horiz_scan4x4_x; +- num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; +- break; +- } +- num_coeff++; +- num_last_subset = (num_coeff - 1) >> 4; +- +-- for (i = num_last_subset; i >= 0; i--) { +-- int n, m; +-- int x_cg, y_cg, x_c, y_c, pos; +-+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; + + +-+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; +++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", +++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, +++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, +++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, +++ ts->y_pred2_hgt16, ts->y_pred2_hle16); +++ memset(ts, 0, sizeof(*ts)); +++ } +++#endif + + +-+ { +-+ const unsigned int ccount = 1 << (log2_trafo_size * 2); +++#endif +++ ++ if (x_ctb + ctb_size >= s->ps.sps->width && ++ y_ctb + ctb_size >= s->ps.sps->height) ++ ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); ++@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int ++ s = s1->sList[self_id]; ++ lc = s->HEVClc; ++ + +#ifdef RPI +-+ use_vpu = 0; +-+ if (s->enable_rpi) { +-+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; +-+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); +-+#if HAVE_NEON +-+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); +-+#else +-+ memset(coeffs, 0, ccount * sizeof(int16_t)); +++ s->enable_rpi = 0; +++ //printf("Wavefront\n"); + +#endif +++ ++ if(ctb_row) { ++ ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); ++ ++@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) ++ if (ret < 0) ++ return ret; ++ ++- if (s->max_ra == INT_MAX) { ++- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { ++- s->max_ra = s->poc; +++ // The definition of _N unit types is "non-reference for other frames +++ // with the same temporal_id" so they may/will be ref frames for pics +++ // with a higher temporal_id. 
+++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || +++ !(s->nal_unit_type == NAL_TRAIL_N || +++ s->nal_unit_type == NAL_TSA_N || +++ s->nal_unit_type == NAL_STSA_N || +++ s->nal_unit_type == NAL_RADL_N || +++ s->nal_unit_type == NAL_RASL_N); +++ +++#if DEBUG_DECODE_N +++ { +++ static int z = 0; +++ if (IS_IDR(s)) { +++ z = 1; +++ } +++ if (z != 0 && z++ > DEBUG_DECODE_N) { +++ s->is_decoded = 0; +++ break; +++ } + + } +-+ else + +#endif +-+ { +-+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +-+ memset(coeffs, 0, ccount * sizeof(int16_t)); +++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { +++ s->is_decoded = 0; +++ break; + + } +-+ } + + +-+ i = num_last_subset; +-+ do { +- int implicit_non_zero_coeff = 0; +-- int64_t trans_coeff_level; +-- int prev_sig = 0; +-- int offset = i << 4; +-- int rice_init = 0; +-+ int n_end; +++ if (s->sh.first_slice_in_pic_flag) { +++ if (s->max_ra == INT_MAX) { +++ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +++ s->max_ra = s->poc; +++ } else { +++ if (IS_IDR(s)) +++ s->max_ra = INT_MIN; +++ } +++ } +++ +++ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) && +++ s->poc <= s->max_ra) { +++ s->is_decoded = 0; +++ break; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ } ++ } + +- uint8_t significant_coeff_flag_idx[16]; +-- uint8_t nb_significant_coeff_flag = 0; +-- +-- x_cg = scan_x_cg[i]; +-- y_cg = scan_y_cg[i]; +-- +-- if ((i < num_last_subset) && (i > 0)) { +-- int ctx_cg = 0; +-- if (x_cg < (1 << (log2_trafo_size - 2)) - 1) +-- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; +-- if (y_cg < (1 << (log2_trafo_size - 2)) - 1) +-- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; +-- +-- significant_coeff_group_flag[x_cg][y_cg] = +-- significant_coeff_group_flag_decode(s, c_idx, ctx_cg); +-- implicit_non_zero_coeff = 1; +-- } else { +-- significant_coeff_group_flag[x_cg][y_cg] = +-- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || +-- (x_cg == 0 && y_cg == 0)); +-- } ++-fail: ++- if (s->ref && s->threads_type == FF_THREAD_FRAME) ++- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); + - +-- last_scan_pos = num_coeff - offset - 1; +-+ unsigned int nb_significant_coeff_flag = 0; +++fail: // Also success path +++ if (s->ref != NULL) { +++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { +++#ifdef RPI +++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +++#endif +++ ff_hevc_progress_signal_all_done(s); +++ } +++#ifdef RPI +++ // * Flush frame will become confused if we pass it something +++ // that doesn't have an expected number of planes (e.g. 400) +++ // So only flush if we are sure we can. 
+++ else if (s->enable_rpi) { +++ // Flush frame to real memory as we expect to be able to pass +++ // it straight on to mmal +++ flush_frame(s, s->frame); +++ } +++#endif +++ } ++ return ret; ++ } + +- if (i == num_last_subset) { +-+ // First time through +-+ int last_scan_pos = num_coeff - (i << 4) - 1; +- n_end = last_scan_pos - 1; +- significant_coeff_flag_idx[0] = last_scan_pos; +- nb_significant_coeff_flag = 1; +- } else { +- n_end = 15; +-+ implicit_non_zero_coeff = (i != 0); +- } ++@@ -3070,6 +5056,83 @@ fail: ++ return AVERROR(ENOMEM); ++ } + +-- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2) +-- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg]; +-- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2) +-- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1); +-- +-- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) { +-- static const uint8_t ctx_idx_map[] = { +-- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2 +-- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0 +-- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1 +-- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2 +-- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default +-+ if (n_end >= 0) { +-+ static const uint8_t ctx_idx_maps_ts2[3][16] = { +-+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 +-+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 +-+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 +-+ }; +-+ static const uint8_t ctx_idx_maps[3][4][16] = { +-+ { +-+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ }, +-+ { +-+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ }, +-+ { +-+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ } +- }; +- const uint8_t *ctx_idx_map_p; +- int scf_offset = 0; +-- if (s->ps.sps->transform_skip_context_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16]; +-- if (c_idx == 0) { +-- scf_offset = 40; +-- } else { +-- scf_offset = 14 + 27; +-- } +++#ifdef RPI +++static av_cold void hevc_init_worker(HEVCContext * const s) +++{ +++ int err; + + +-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +-+ ctx_idx_map_p = ctx_idx_maps[0][3]; +-+ scf_offset = 40 + c_idx_nz; +- } else { +-- if (c_idx != 0) +-+ if (c_idx_nz != 0) +- scf_offset = 27; +++ memset(s->jobs, 0, sizeof(s->jobs)); + + +- if (log2_trafo_size == 2) { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0]; +-+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; +- } else { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4]; +-- if (c_idx == 0) { +-- if ((x_cg > 0 || y_cg > 0)) +-+ 
ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; +-+ if (!c_idx_nz) { +-+ if (i != 0) +- scf_offset += 3; +++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { +++ HEVCRpiJob * const jb = s->jobs + job; + + +- if (log2_trafo_size == 3) { +- scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; +- } else { +-@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- } +- } +- } +-- for (n = n_end; n > 0; n--) { +-- x_c = scan_x_off[n]; +-- y_c = scan_y_off[n]; +-- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) { +-- significant_coeff_flag_idx[nb_significant_coeff_flag] = n; +-- nb_significant_coeff_flag++; +++ sem_init(&jb->sem_in, 0, 0); +++ sem_init(&jb->sem_out, 0, 0); +++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); + + +-+ if (n_end > 0) { +-+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc, +-+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, +-+ n_end, ctx_idx_map_p, +-+ significant_coeff_flag_idx + nb_significant_coeff_flag); +++ jb->intra.n = 0; +++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); + + +-+ nb_significant_coeff_flag += cnt; +-+ if (cnt != 0) { +- implicit_non_zero_coeff = 0; +- } +- } +++ // ** Sizeof the union structure might be overkill but at the moment it +++ // is correct (it certainly isn't going to be too small) + + +- if (implicit_non_zero_coeff == 0) { +-- if (s->ps.sps->transform_skip_context_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- if (c_idx == 0) { +-- scf_offset = 42; +-- } else { +-- scf_offset = 16 + 27; +-- } +-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +-+ scf_offset = 42 + c_idx_nz; +- } else { +- if (i == 0) { +-- if (c_idx == 0) +-- scf_offset = 0; +-- else +-- scf_offset = 27; +-+ scf_offset = c_idx_nz ? 27 : 0; +- } else { +- scf_offset = 2 + scf_offset; +- } +- } +-- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) { +-+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) { +- significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; +- nb_significant_coeff_flag++; +- } +-@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- } +- } +- +-- n_end = nb_significant_coeff_flag; +-+ if (nb_significant_coeff_flag != 0) { +-+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | +-+ ((i != 0 && !c_idx_nz) ? 
2 : 0) |
+-+ prev_subset_coded;
+-+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+-+ (gt1_idx_delta << 2);
+-+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+-+ gt1_idx_delta;
+++ rpi_inter_pred_alloc(&jb->chroma_ip,
+++ QPU_N_MAX, QPU_N_GRP,
+++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t),
+++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t));
+++ rpi_inter_pred_alloc(&jb->luma_ip,
+++ QPU_N_MAX, QPU_N_GRP,
+++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t),
+++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t));
+
+
+-+ const unsigned int x_cg = scan_x_cg[i];
+-+ const unsigned int y_cg = scan_y_cg[i];
+-+ int16_t * const blk_coeffs = coeffs +
+-+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+-+ // This calculation is 'wrong' for log2_trafo_size == 2
+-+ // but that doesn't matter as in this case x_cg & y_cg
+-+ // are always 0 so result is correct (0) anyway
+-+ const uint8_t * const blk_scale = scale_matrix +
+-+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+++ jb->deblk.n = 0;
+++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS);
+++ }
+++ s->pass0_job = 0;
+++ s->pass1_job = 0;
+++ s->jb0 = s->jobs + 0;
+++ s->jb1 = s->jobs + 0;
+
+
+-+ // * The following code block doesn't deal with these flags:
+-+ // (nor did the one it replaces)
+-+ //
+-+ // cabac_bypass_alignment_enabled_flag
+-+ // This should be easy but I can't find a test case
+-+ // extended_precision_processing_flag
+-+ // This can extend the required precision past 16bits
+-+ // so is probably tricky - also no example found yet
+++ err = pthread_create(&s->worker_thread, NULL, worker_start, s);
+++ if (err) {
+++ printf("Failed to create worker thread\n");
+++ exit(-1);
+++ }
+++}
+
+
+-+#if USE_N_END_1
+-+ if (nb_significant_coeff_flag == 1) {
+-+ // There is a small gain to be had from special casing the single
+-+ // transform coefficient case. The reduction in complexity
+-+ // makes up for the code duplication.
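The coefficient writes in this single-coefficient special case (and in the scale loop further down) apply the decoded sign branchlessly as (level ^ k) - k. A standalone sketch of the idiom — illustration only, assuming the arithmetic right shift on signed integers that the patch itself relies on:

#include <assert.h>
#include <stdint.h>

// Broadcast a 0/1 sign flag to a 0 / -1 mask, then negate via the
// two's-complement identity -x == (x ^ -1) - (-1).
static int apply_sign(int level, unsigned int sign_flag)
{
    const int k = (int32_t)(sign_flag << 31) >> 31; // 0 or -1
    return (level ^ k) - k;
}

int main(void)
{
    assert(apply_sign(5, 0) ==  5);
    assert(apply_sign(5, 1) == -5);
    assert(apply_sign(0, 1) ==  0);
    return 0;
}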
+++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) +++{ +++ av_freep(&ipe->q); +++ gpu_free(&ipe->gptr); +++} + + +-+ int trans_coeff_level = 1; +-+ int coeff_sign_flag; +-+ int coded_val = 0; +++static av_cold void hevc_exit_worker(HEVCContext *s) +++{ +++ void *res; +++ unsigned int i; + + +-+ // initialize first elem of coeff_bas_level_greater1_flag +-+ prev_subset_coded = 0; +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ s->jobs[i].terminate = 1; +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ sem_post(&s->jobs[i].sem_in); +++ pthread_join(s->worker_thread, &res); + + +-+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) { +-+ trans_coeff_level = 2; +-+ prev_subset_coded = 1; +-+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); +-+ } +- +-+ // Probably not worth the overhead of starting by22 for just one value +-+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); +- +-- if (n_end) { +-- int first_nz_pos_in_cg; +-- int last_nz_pos_in_cg; +-- int c_rice_param = 0; +-- int first_greater1_coeff_idx = -1; +-- uint8_t coeff_abs_level_greater1_flag[8]; +-- uint16_t coeff_sign_flag; +-- int sum_abs = 0; +-- int sign_hidden; +-- int sb_type; +-+ if (coded_val) +-+ { +-+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { +-+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0); +-+ } else { +-+ uint8_t * const stat_coeff = +-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +-+ const unsigned int c_rice_param = *stat_coeff >> 2; +-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- +-+ trans_coeff_level = 3 + last_coeff_abs_level_remaining; +-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +-+ } +-+ } +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ { +++ HEVCRpiJob * const jb = s->jobs + i; +++ +++ sem_destroy(&jb->sem_in); +++ sem_destroy(&jb->sem_out); +++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); +++ av_freep(&jb->intra.cmds); +++ av_freep(&jb->deblk.blks); +++ rpi_free_inter_pred(&jb->chroma_ip); +++ rpi_free_inter_pred(&jb->luma_ip); +++ } +++} +++ +++#endif +++ ++ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ { ++ HEVCContext *s = avctx->priv_data; ++@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) + +-- // initialize first elem of coeff_bas_level_greater1_flag +-- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; +-+ { +-+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; +-+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; +-+ const unsigned int scale_m = blk_scale[xy_off->scale]; ++ av_freep(&s->cabac_state); + +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { +-- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) +-- sb_type = 2 * (c_idx == 0 ? 1 : 0); +-- else +-- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; +-- c_rice_param = lc->stat_coeff[sb_type] / 4; +-- } +-- +-- if (!(i == num_last_subset) && greater1_ctx == 0) +-- ctx_set++; +-- greater1_ctx = 1; +-- last_nz_pos_in_cg = significant_coeff_flag_idx[0]; +-- +-- for (m = 0; m < (n_end > 8 ? 
8 : n_end); m++) { +-- int inc = (ctx_set << 2) + greater1_ctx; +-- coeff_abs_level_greater1_flag[m] = +-- coeff_abs_level_greater1_flag_decode(s, c_idx, inc); +-- if (coeff_abs_level_greater1_flag[m]) { +-- greater1_ctx = 0; +-- if (first_greater1_coeff_idx == -1) +-- first_greater1_coeff_idx = m; +-- } else if (greater1_ctx > 0 && greater1_ctx < 3) { +-- greater1_ctx++; +-+ blk_coeffs[xy_off->coeff] = trans_scale_sat( +-+ (trans_coeff_level ^ k) - k, // Apply sign +-+ scale, +-+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m, +-+ shift); +- } +- } +-- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1]; +-- +-- if (lc->cu.cu_transquant_bypass_flag || +-- (lc->cu.pred_mode == MODE_INTRA && +-- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag && +-- (pred_mode_intra == 10 || pred_mode_intra == 26 )) || +-- explicit_rdpcm_flag) +-- sign_hidden = 0; +- else +-- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4); ++- for (i = 0; i < 3; i++) { ++- av_freep(&s->sao_pixel_buffer_h[i]); ++- av_freep(&s->sao_pixel_buffer_v[i]); +++#ifdef RPI +++ +++ hevc_exit_worker(s); +++ vpu_qpu_term(); +++ for (i = 0; i != 2; ++i) { +++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); ++ } +++ +++ av_rpi_zc_uninit(avctx); + +#endif +-+ { +-+ int sign_hidden = may_hide_sign; +-+ int levels[16]; // Should be able to get away with int16_t but that fails some tests +-+ uint32_t coeff_sign_flags; +-+ uint32_t coded_vals = 0; +-+ // Sum(abs(level[])) +-+ // In fact we only need the bottom bit and in some future +-+ // version that may be all we calculate +-+ unsigned int sum_abs; + + +-+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels, +-+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); +++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] +++ av_freep(&s->sao_pixel_buffer_v[0]); ++ av_frame_free(&s->output_frame); ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ return 0; ++ } ++ + + +-+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) +-+ sign_hidden = 0; ++ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ { ++ HEVCContext *s = avctx->priv_data; ++@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ s->HEVClcList[0] = s->HEVClc; ++ s->sList[0] = s; ++ +++#ifdef RPI +++ // Whilst FFmpegs init fn is only called once the close fn is called as +++ // many times as we have threads (init_thread_copy is called for the +++ // threads). So to match init & term put the init here where it will be +++ // called by both init & copy +++ av_rpi_zc_init(avctx); + + +-+ // -- Start bypass block +++ if (vpu_qpu_init() != 0) +++ goto fail; + + +-+ bypass_start(s); +++#if RPI_INTER +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ { +++ static const uint32_t dframe[1] = {0x80808080}; +++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; +++ } +++#endif +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame +++#endif +++#endif +++ //gpu_malloc_uncached(2048*64,&s->dummy); + + +-+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden); +++ s->enable_rpi = 0; + + +-+ if (coded_vals != 0) +-+ { +-+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; +-+ uint8_t * stat_coeff = !rice_adaptation_enabled ? 
NULL : +-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +-+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; +-+ int * level = levels - 1; +++ for (i = 0; i != 2; ++i) { +++ ff_hevc_rpi_progress_init_state(s->progress_states + i); +++ } +++ hevc_init_worker(s); +++#endif + + +-+ do { +-+ { +-+ const unsigned int z = hevc_clz32(coded_vals) + 1; +-+ level += z; +-+ coded_vals <<= z; +-+ } ++ s->cabac_state = av_malloc(HEVC_CONTEXTS); ++ if (!s->cabac_state) ++ goto fail; ++@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ if (!s->DPB[i].frame) ++ goto fail; ++ s->DPB[i].tf.f = s->DPB[i].frame; +++ s->DPB[i].dpb_no = i; ++ } + +-- if (first_greater1_coeff_idx != -1) { +-- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set); +-- } +-- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) { +-- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag); +-- } else { +-- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1)); +-- } +-+ { +-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); +-+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; +-+ +-+ sum_abs += last_coeff_abs_level_remaining + 1; +-+ *level = trans_coeff_level; +- +-- for (m = 0; m < n_end; m++) { +-- n = significant_coeff_flag_idx[m]; +-- GET_COORD(offset, n); +-- if (m < 8) { +-- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m]; +-- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) { +-- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +-- +-- trans_coeff_level += last_coeff_abs_level_remaining; +-- if (trans_coeff_level > (3 << c_rice_param)) +-- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { +-- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; +-- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) +-- lc->stat_coeff[sb_type]++; +-- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) +-- if (lc->stat_coeff[sb_type] > 0) +-- lc->stat_coeff[sb_type]--; +-- rice_init = 1; +-+ if (stat_coeff != NULL) +-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +-+ stat_coeff = NULL; +-+ +-+ if (trans_coeff_level > (3 << c_rice_param) && +-+ (c_rice_param < 4 || rice_adaptation_enabled)) +-+ ++c_rice_param; +- } +-- } +-- } else { +-- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +-- +-- trans_coeff_level = 1 + last_coeff_abs_level_remaining; +-- if (trans_coeff_level > (3 << c_rice_param)) +-- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? 
c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { +-- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; +-- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) +-- lc->stat_coeff[sb_type]++; +-- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) +-- if (lc->stat_coeff[sb_type] > 0) +-- lc->stat_coeff[sb_type]--; +-- rice_init = 1; +-- } +-+ } while (coded_vals != 0); +- } +-- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) { +-- sum_abs += trans_coeff_level; +-- if (n == first_nz_pos_in_cg && (sum_abs&1)) +-- trans_coeff_level = -trans_coeff_level; +-+ +-+ // sign_hidden = 0 or 1 so we can combine the tests +-+ if ((sign_hidden & sum_abs) != 0) { +-+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; +- } +-- if (coeff_sign_flag >> 15) +-- trans_coeff_level = -trans_coeff_level; +-- coeff_sign_flag <<= 1; +-- if(!lc->cu.cu_transquant_bypass_flag) { +-- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +-- if(y_c || x_c || log2_trafo_size < 4) { +-- switch(log2_trafo_size) { +-- case 3: pos = (y_c << 3) + x_c; break; +-- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break; +-- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break; +-- default: pos = (y_c << 2) + x_c; break; +-- } +-- scale_m = scale_matrix[pos]; +-- } else { +-- scale_m = dc_scale; +-- } +-+ +-+ bypass_finish(s); +-+ +-+ // -- Finish bypass block +-+ +-+ // Scale loop +-+ { +-+ int m = nb_significant_coeff_flag - 1; +-+ +-+ // Deal with DC component (if any) first +-+ if (i == 0 && significant_coeff_flag_idx[m] == 0) +-+ { +-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +-+ blk_coeffs[0] = trans_scale_sat( +-+ (levels[m] ^ k) - k, scale, dc_scale, shift); +-+ --m; +- } +-- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift; +-- if(trans_coeff_level < 0) { +-- if((~trans_coeff_level) & 0xFffffffffff8000) +-- trans_coeff_level = -32768; +-- } else { +-- if(trans_coeff_level & 0xffffffffffff8000) +-- trans_coeff_level = 32767; +-+ +-+#if !USE_N_END_1 +-+ // If N_END_1 set then m was at least 1 initially +-+ if (m >= 0) +-+#endif +-+ { +-+ do { +-+ const xy_off_t * const xy_off = scan_xy_off + +-+ significant_coeff_flag_idx[m]; +-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +-+ +-+ blk_coeffs[xy_off->coeff] = trans_scale_sat( +-+ (levels[m] ^ k) - k, +-+ scale, +-+ blk_scale[xy_off->scale], +-+ shift); +-+ } while (--m >= 0); +- } +- } +-- coeffs[y_c * trafo_size + x_c] = trans_coeff_level; +-+ +- } +- } +-- } +-+ } while ((i = next_subset(s, i, c_idx_nz, +-+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); ++ s->max_ra = INT_MAX; ++@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) ++ } + +- if (lc->cu.cu_transquant_bypass_flag) { +- if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && +-@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +- } +- } else { +-- if (transform_skip_flag) { +-+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass +- int rot = s->ps.sps->transform_skip_rotation_enabled_flag && +- log2_trafo_size == 2 && +- lc->cu.pred_mode == MODE_INTRA; +-@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- for (i = 0; i < 8; i++) +- FFSWAP(int16_t, 
coeffs[i], coeffs[16 - i - 1]); +- } +-- +- s->hevcdsp.transform_skip(coeffs, log2_trafo_size); ++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) ++- s->threads_type = FF_THREAD_FRAME; ++- else ++- s->threads_type = FF_THREAD_SLICE; +++ s->threads_type = FF_THREAD_FRAME; +++ else +++ s->threads_type = FF_THREAD_SLICE; + +- if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && +-@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +- } +- } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { +-- s->hevcdsp.idct_4x4_luma(coeffs); +-+ s->hevcdsp.idct_4x4_luma(coeffs); +- } else { +-+#ifdef RPI +-+ if (!use_vpu) { +-+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); +-+ if (max_xy == 0) { +-+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +-+ } else { +-+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; +-+ if (max_xy < 4) +-+ col_limit = FFMIN(4, col_limit); +-+ else if (max_xy < 8) +-+ col_limit = FFMIN(8, col_limit); +-+ else if (max_xy < 12) +-+ col_limit = FFMIN(24, col_limit); +-+ +-+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); +-+ } +-+ } +-+#else +- int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); +- if (max_xy == 0) +- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +-@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- col_limit = FFMIN(24, col_limit); +- s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); +- } +-+#endif +- } +- } +- if (lc->tu.cross_pf) { +-@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); +- } +- } +-+#ifdef RPI +-+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); +-+#else +- s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); +-+#endif ++ return 0; + } ++@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = { ++ .update_thread_context = hevc_update_thread_context, ++ .init_thread_copy = hevc_init_thread_copy, ++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | +++// 0, +++// AV_CODEC_CAP_FRAME_THREADS, ++ AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++ }; ++diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h ++index 162ca0e582..d647232638 100644 ++--- a/libavcodec/hevc.h +++++ b/libavcodec/hevc.h ++@@ -23,6 +23,7 @@ ++ #ifndef AVCODEC_HEVC_H ++ #define AVCODEC_HEVC_H + +- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) +-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c +-index 1f33b0c..3143b4f 100644 +---- a/libavcodec/hevc_filter.c +-+++ b/libavcodec/hevc_filter.c +-@@ -22,6 +22,12 @@ +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +++#include "rpi_opts.h" ++ #include "libavutil/buffer.h" ++ #include "libavutil/md5.h" + +-+//#define DISABLE_SAO +-+//#define DISABLE_DEBLOCK +-+//#define DISABLE_STRENGTHS +-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +-+//#define DISABLE_DEBLOCK_NONREF ++@@ -37,6 +38,10 @@ ++ #include "thread.h" ++ #include "videodsp.h" ++ +++#ifdef RPI +++#include "rpi_qpu.h" +++#endif + + +- #include "libavutil/common.h" +- #include "libavutil/internal.h" ++ #define MAX_DPB_SIZE 16 // A.4.1 ++ #define MAX_REFS 16 + +-@@ 
-31,6 +37,11 @@ ++@@ -463,6 +468,7 @@ typedef struct HEVCSPS { ++ int implicit_rdpcm_enabled_flag; ++ int explicit_rdpcm_enabled_flag; ++ int intra_smoothing_disabled_flag; +++ int high_precision_offsets_enabled_flag; ++ int persistent_rice_adaptation_enabled_flag; + +- #include "bit_depth_template.c" ++ ///< coded frame dimension in various units ++@@ -660,6 +666,7 @@ typedef struct CodingUnit { ++ uint8_t cu_transquant_bypass_flag; ++ } CodingUnit; + +-+#ifdef RPI +-+#include "rpi_qpu.h" +-+#include "rpi_zc.h" +++#if 0 ++ typedef struct Mv { ++ int16_t x; ///< horizontal component of motion vector ++ int16_t y; ///< vertical component of motion vector ++@@ -670,6 +677,7 @@ typedef struct MvField { ++ int8_t ref_idx[2]; ++ int8_t pred_flag; ++ } MvField; + +#endif ++ ++ typedef struct NeighbourAvailable { ++ int cand_bottom_left; ++@@ -745,9 +753,23 @@ typedef struct HEVCFrame { ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; + + +- #define LUMA 0 +- #define CB 1 +- #define CR 2 +-@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +- return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; +- } +++ // Entry no in DPB - can be used as a small unique +++ // frame identifier (within the current thread) +++ uint8_t dpb_no; ++ } HEVCFrame; + +-+static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +-+{ + +#ifdef RPI +-+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; +-+#else +-+ return s->ps.sps->pixel_shift; +++typedef struct HEVCLocalContextIntra { +++ TransformUnit tu; +++ NeighbourAvailable na; +++} HEVCLocalContextIntra; + +#endif +-+} + + +- static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, +- intptr_t stride_dst, intptr_t stride_src) +- { +-@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +- int stride_src, int x, int y, int width, int height, +- int c_idx, int x_ctb, int y_ctb) +- { +-- int sh = s->ps.sps->pixel_shift; +-+ const unsigned int sh = pixel_shift(s, c_idx); +- int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; +- int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; ++ typedef struct HEVCLocalContext { +++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) 
+++ NeighbourAvailable na; +++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; + +-@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, +- int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); +- int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); +- int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +-- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; +-+ const unsigned int sh = pixel_shift(s, c_idx); +-+ int len = (min_pu_size >> hshift) << sh; +- for (y = y_min; y < y_max; y++) { +- for (x = x_min; x < x_max; x++) { +- if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { +- int n; +-- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +-- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +-+ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +-+ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +- for (n = 0; n < (min_pu_size >> vshift); n++) { +- memcpy(src, dst, len); +- src += stride_src; +-@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, +- +- static void sao_filter_CTB(HEVCContext *s, int x, int y) +- { +-- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; +-+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +- HEVCLocalContext *lc = s->HEVClc; +- int c_idx; +- int edges[4]; // 0 left 1 top 2 right 3 bottom +-@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- uint8_t right_tile_edge = 0; +- uint8_t up_tile_edge = 0; +- uint8_t bottom_tile_edge = 0; +-+#ifdef RPI +-+ const int sliced = rpi_sliced_frame(s->frame); +-+ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +-+#else +-+ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); +-+#endif ++ uint8_t stat_coeff[4]; ++@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext { + +- edges[0] = x_ctb == 0; +- edges[1] = y_ctb == 0; +- edges[2] = x_ctb == s->ps.sps->ctb_width - 1; +- edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ int qPy_pred; + +-+#ifdef DISABLE_SAO +-+ return; +-+#endif +-+ +- if (restore) { +- if (!edges[0]) { +- left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; +-@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- } +- } ++- TransformUnit tu; ++- ++ uint8_t ctb_left_flag; ++ uint8_t ctb_up_flag; ++ uint8_t ctb_up_right_flag; ++@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext { ++ int ct_depth; ++ CodingUnit cu; ++ PredictionUnit pu; ++- NeighbourAvailable na; + +-- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 
3 : 1); c_idx++) { +-+ for (c_idx = 0; c_idx < plane_count; c_idx++) { +- int x0 = x >> s->ps.sps->hshift[c_idx]; +- int y0 = y >> s->ps.sps->vshift[c_idx]; +- int stride_src = s->frame->linesize[c_idx]; +-@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); +- int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); +- int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +-- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +-- int stride_dst; +-+ ptrdiff_t stride_dst; +- uint8_t *dst; ++ #define BOUNDARY_LEFT_SLICE (1 << 0) ++ #define BOUNDARY_LEFT_TILE (1 << 1) ++@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext { ++ int boundary_flags; ++ } HEVCLocalContext; + + +#ifdef RPI +-+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; +-+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +-+ uint8_t * const src = !sliced ? +-+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0, y0); +-+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : +-+ !sliced ? src - (1 << sh) : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); +-+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : +-+ !sliced ? src + (width << sh) : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); + + +++// The processing is done in chunks +++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +++// but allocate more memory and increase the latency before data in the next frame can be processed +++#define RPI_NUM_CHUNKS 4 +++#define RPI_CHUNK_SIZE 12 +++#define RPI_ROUND_TO_LINES 0 + + +-+ if (sliced && c_idx > 1) { +-+ break; +-+ } +-+#else +-+ const unsigned int sh = s->ps.sps->pixel_shift; +-+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +-+ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +-+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); +-+ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); +-+#endif +++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) +++ +++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi +++#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) +++#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +++// Each block can have an intra prediction and a transform_add command +++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +++// Worst case is 16x16 CTUs +++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) +++ +++#define RPI_CMD_LUMA_UNI 0 +++#define RPI_CMD_CHROMA_UNI 1 +++#define RPI_CMD_LUMA_BI 2 +++#define RPI_CMD_CHROMA_BI 3 +++#define RPI_CMD_V_BI 4 +++ +++// Command for inter prediction +++typedef struct HEVCMvCmd { +++ uint8_t cmd; +++ uint8_t block_w; +++ uint8_t block_h; +++ int8_t ref_idx[2]; +++ uint16_t dststride; +++ uint16_t srcstride; +++ uint16_t srcstride1; +++ int16_t weight; +++ int16_t offset; +++ int16_t x_off; +++ int16_t y_off; +++ uint8_t *src; +++ uint8_t *src1; +++ uint8_t *dst; +++ Mv mv; +++ Mv mv1; +++} HEVCMvCmd; +++ +++ +++// Command for intra prediction and transform_add of predictions to coefficients +++enum rpi_pred_cmd_e +++{ +++ RPI_PRED_ADD_RESIDUAL, +++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx +++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx +++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V +++ RPI_PRED_ADD_DC, +++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C +++ RPI_PRED_ADD_DC_V, +++ RPI_PRED_INTRA, +++ RPI_PRED_I_PCM, +++ RPI_PRED_CMD_MAX +++}; +++ +++typedef struct HEVCPredCmd { +++ uint8_t type; +++ uint8_t size; // log2 "size" used by all variants +++ uint8_t na; // i_pred - but left here as they pack well +++ uint8_t c_idx; // i_pred +++ union { +++ struct { // TRANSFORM_ADD +++ uint8_t * dst; +++ const int16_t * buf; +++ uint16_t stride; // Should be good enough for all pic fmts we use +++ int16_t dc; +++ } ta; +++ struct { +++ uint8_t * dst; +++ uint32_t stride; +++ int dc; +++ } dc; +++ struct { // INTRA +++ uint16_t x; +++ uint16_t y; +++ enum IntraPredMode mode; +++ } i_pred; +++ struct { // I_PCM +++ uint16_t x; +++ uint16_t y; +++ const void * src; +++ uint32_t src_len; +++ } i_pcm; +++ }; +++} HEVCPredCmd; + + +- switch (sao->type_idx[c_idx]) { +- case SAO_BAND: +- copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, +- x_ctb, y_ctb); +- if (s->ps.pps->transquant_bypass_enable_flag || +- (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +-- dst = lc->edge_emu_buffer; +-- stride_dst = 2*MAX_PB_SIZE; +-- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +-- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +-- sao->offset_val[c_idx], sao->band_position[c_idx], +-- width, height); +-- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +-- x, y, width, height, c_idx); +-+ dst = lc->edge_emu_buffer; +-+ stride_dst = 2*MAX_PB_SIZE; +-+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); +-+#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, +-+ sao->offset_val[1], sao->band_position[1], +-+ sao->offset_val[2], sao->band_position[2], +-+ width, height); +-+ } +-+ else + +#endif +-+ { +-+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +-+ sao->offset_val[c_idx], 
sao->band_position[c_idx], +-+ width, height); +-+ } +-+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +-+ x, y, width, height, c_idx); +- } else { +-- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +-- sao->offset_val[c_idx], sao->band_position[c_idx], +-- width, height); +++ + +#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, +-+ sao->offset_val[1], sao->band_position[1], +-+ sao->offset_val[2], sao->band_position[2], +-+ width, height); +-+ } +-+ else +-+#endif +-+ { +-+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +-+ sao->offset_val[c_idx], sao->band_position[c_idx], +-+ width, height); +-+ } +- } +- sao->type_idx[c_idx] = SAO_APPLIED; +- break; +-@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- { +- int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; +- int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +-- int left_edge = edges[0]; +- int top_edge = edges[1]; +-- int right_edge = edges[2]; +- int bottom_edge = edges[3]; +-- int sh = s->ps.sps->pixel_shift; +-- int left_pixels, right_pixels; +- +- stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; +- dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; +- +- if (!top_edge) { +-- int left = 1 - left_edge; +-- int right = 1 - right_edge; +-- const uint8_t *src1[2]; +- uint8_t *dst1; +-- int src_idx, pos; +-+ int src_idx; +-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); +- +-- dst1 = dst - stride_dst - (left << sh); +-- src1[0] = src - stride_src - (left << sh); +-- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +-- pos = 0; +-- if (left) { +-+ dst1 = dst - stride_dst; +++#include + + +-+ if (src_l != NULL) { +- src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1, src1[src_idx], sh); +-- pos += (1 << sh); +-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); +- } +++union qpu_mc_pred_cmd_s; +++struct qpu_mc_pred_y_p_s; +++struct qpu_mc_src_s; + + +- src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +-- if (right) { +-- pos += width << sh; +-+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); +++typedef struct HEVCRpiInterPredQ +++{ +++ union qpu_mc_pred_cmd_u *qpu_mc_base; +++ union qpu_mc_pred_cmd_u *qpu_mc_curr; +++ struct qpu_mc_src_s *last_l0; +++ struct qpu_mc_src_s *last_l1; +++ unsigned int load; +++ uint32_t code_setup; +++ uint32_t code_sync; +++ uint32_t code_exit; +++} HEVCRpiInterPredQ; + + +-+ if (src_r != NULL) { +- src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +-+ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r - stride_src, sh); +- } +- } +- if (!bottom_edge) { +-- int left = 1 - left_edge; +-- int right = 1 - right_edge; +-- const uint8_t *src1[2]; +-- uint8_t *dst1; +-- int src_idx, pos; +-+ uint8_t * const dst1 = dst + height * stride_dst; +-+ int src_idx; +-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); +-+ const unsigned int hoff = height * stride_src; +- +-- dst1 = dst + height * stride_dst - (left << sh); +-- src1[0] = src + height * stride_src - (left << sh); +-- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); +-- pos = 0; +-- if (left) { +-+ if (src_l != NULL) { +- src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1, src1[src_idx], sh); +-- pos += (1 << sh); +-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); +- } +++typedef struct HEVCRpiInterPredEnv +++{ +++ HEVCRpiInterPredQ * q; +++ unsigned int n; // Number of Qs +++ unsigned int n_grp; // Number of Q in a group +++ unsigned int curr; // Current Q number (0..n-1) +++ int used; // 0 if nothing in any Q, 1 otherwise +++ int used_grp; // 0 if nothing in any Q in the current group +++ unsigned int max_fill; +++ unsigned int min_gap; +++ GPU_MEM_PTR_T gptr; +++} HEVCRpiInterPredEnv; +++ +++typedef struct HEVCRpiIntraPredEnv { +++ unsigned int n; // Number of commands +++ HEVCPredCmd * cmds; +++} HEVCRpiIntraPredEnv; +++ +++typedef struct HEVCRpiCeoffEnv { +++ unsigned int n; +++ uint16_t * buf; +++} HEVCRpiCoeffEnv; + + +- src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == +- SAO_APPLIED); +-- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +-- if (right) { +-- pos += width << sh; +-+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); +++typedef struct HEVCRpiCeoffsEnv { +++ HEVCRpiCoeffEnv s[4]; +++ GPU_MEM_PTR_T gptr; +++ void * mptr; +++} HEVCRpiCoeffsEnv; + + +-+ if (src_r != NULL) { +- src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +-+ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); +- } +- } +-- left_pixels = 0; +-- if (!left_edge) { +-+ if (src_l != NULL) { +- if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { +- copy_vert(dst - (1 << sh), +- s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), +- sh, height, stride_dst, 1 << sh); +- } else { +-- left_pixels = 1; +-+ copy_vert(dst - (1 << sh), +-+ src_l, +-+ sh, height, stride_dst, stride_src); +- } +- } +-- right_pixels = 0; +-- if (!right_edge) { +-+ if (src_r != NULL) { +- if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { +- copy_vert(dst + (width << sh), +- s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), +- sh, height, stride_dst, 1 << sh); +- } else { +-- right_pixels = 1; +-+ copy_vert(dst + (width << sh), +-+ src_r, +-+ sh, height, stride_dst, stride_src); +- } +- } +- +-- copy_CTB(dst - (left_pixels << sh), +-- src - (left_pixels << sh), +-- (width + left_pixels + right_pixels) << sh, +-+ copy_CTB(dst, +-+ src, +-+ width << sh, +- height, stride_dst, stride_src); +- +- copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, +- x_ctb, y_ctb); +-- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +-- sao->eo_class[c_idx], width, height); +-- s->hevcdsp.sao_edge_restore[restore](src, dst, +-- stride_src, stride_dst, +-- sao, +-- edges, width, +-- height, c_idx, +-- vert_edge, +-- horiz_edge, +-- diag_edge); +-+#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ // Class always the same for both U & V (which is just as well :-)) +-+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, +-+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], +-+ width, height); +-+ s->hevcdsp.sao_edge_restore_c[restore](src, dst, +-+ stride_src, stride_dst, +-+ sao, +-+ edges, width, +-+ height, c_idx, +-+ vert_edge, +-+ horiz_edge, +-+ diag_edge); +-+ } +-+ else +-+#endif +-+ { +-+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +-+ sao->eo_class[c_idx], width, height); +-+ s->hevcdsp.sao_edge_restore[restore](src, dst, +-+ stride_src, stride_dst, +-+ sao, +-+ edges, width, +-+ height, c_idx, +-+ vert_edge, +-+ horiz_edge, +-+ diag_edge); +-+ } +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); +- sao->type_idx[c_idx] = SAO_APPLIED; +-@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- } +- } +- +-+// Returns 2 or 0. 
+- static int get_pcm(HEVCContext *s, int x, int y) +- { +- int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +-@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- uint8_t *src; +- int x, y; +- int chroma, beta; +-- int32_t c_tc[2], tc[2]; +-+ int32_t c_tc[4], tc[2]; +- uint8_t no_p[2] = { 0 }; +- uint8_t no_q[2] = { 0 }; +- +-@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->ps.sps->pcm.loop_filter_disable_flag) || +- s->ps.pps->transquant_bypass_enable_flag; +- +-+#ifdef DISABLE_DEBLOCK_NONREF +-+ if (!s->used_for_ref) +-+ return; // Don't deblock non-reference frames +++typedef struct HEVCRpiDeblkBlk { +++ uint16_t x_ctb; +++ uint16_t y_ctb; +++} HEVCRpiDeblkBlk; +++ +++typedef struct HEVCRpiDeblkEnv { +++ unsigned int n; +++ HEVCRpiDeblkBlk * blks; +++} HEVCRpiDeblkEnv; +++ +++typedef struct HEVCRPiFrameProgressWait { +++ int req; +++ struct HEVCRPiFrameProgressWait * next; +++ sem_t sem; +++} HEVCRPiFrameProgressWait; +++ +++typedef struct HEVCRPiFrameProgressState { +++ struct HEVCRPiFrameProgressWait * first; +++ struct HEVCRPiFrameProgressWait * last; +++ pthread_mutex_t lock; +++} HEVCRPiFrameProgressState; +++ +++typedef struct HEVCRpiJob { +++ volatile int terminate; +++ int pending; +++ sem_t sem_in; // set by main +++ sem_t sem_out; // set by worker +++ HEVCRpiInterPredEnv chroma_ip; +++ HEVCRpiInterPredEnv luma_ip; +++ int16_t progress[32]; // index by dpb_no +++ HEVCRpiIntraPredEnv intra; +++ HEVCRpiCoeffsEnv coeffs; +++ HEVCRpiDeblkEnv deblk; +++ HEVCRPiFrameProgressWait progress_wait; +++} HEVCRpiJob; +++ +++#if RPI_TSTATS +++typedef struct HEVCRpiStats { +++ int y_pred1_y8_merge; +++ int y_pred1_xy; +++ int y_pred1_x0; +++ int y_pred1_y0; +++ int y_pred1_x0y0; +++ int y_pred1_wle8; +++ int y_pred1_wgt8; +++ int y_pred1_hle16; +++ int y_pred1_hgt16; +++ int y_pred2_xy; +++ int y_pred2_x0; +++ int y_pred2_y0; +++ int y_pred2_x0y0; +++ int y_pred2_hle16; +++ int y_pred2_hgt16; +++} HEVCRpiStats; + +#endif +-+#ifdef DISABLE_DEBLOCK +-+ return; +++ + +#endif +-+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF) +-+ return; +- if (x0) { +- left_tc_offset = s->deblock[ctb - 1].tc_offset; +- left_beta_offset = s->deblock[ctb - 1].beta_offset; +-@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +++ ++ typedef struct HEVCContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++@@ -805,6 +1025,69 @@ typedef struct HEVCContext { ++ int width; ++ int height; + +- tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; +- tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; +-- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x - 1, y); +- no_p[1] = get_pcm(s, x - 1, y + 4); +- no_q[0] = get_pcm(s, x, y); +- no_q[1] = get_pcm(s, x, y + 4); +-- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +-- s->frame->linesize[LUMA], +-- beta, tc, no_p, no_q); +-- } else +-- s->hevcdsp.hevc_v_loop_filter_luma(src, +-- s->frame->linesize[LUMA], +-- beta, tc, no_p, no_q); +-+ } +++ int used_for_ref; // rpi + +#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +++ int enable_rpi; +++ unsigned int pass0_job; // Pass0 does coefficient decode +++ unsigned int pass1_job; // Pass1 does pixel processing +++ int ctu_count; // Number of CTUs done in pass0 so far +++ int max_ctu_count; // Number of CTUs when we trigger a round of processing + + +-+ // This copes properly with no_p/no_q +-+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q, +-+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); +-+ } +-+ else +++ HEVCRpiJob * jb0; +++ HEVCRpiJob * jb1; +++ HEVCRpiJob jobs[RPI_MAX_JOBS]; +++#if RPI_TSTATS +++ HEVCRpiStats tstats; + +#endif +-+ { +-+ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +-+ if (pcmf) { +-+ // Standard DSP code is broken if no_p / no_q is set +-+ s->hevcdsp.hevc_v_loop_filter_luma_c(src, +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q); +-+ } +-+ else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int num16 = (y>>4)*s->setup_width + (x>>4); +-+ int a = ((y>>3) & 1) << 1; +-+ int b = (x>>3) & 1; +-+ setup = s->dvq->y_setup_arm[num16]; +-+ setup[0][b][0][a] = beta; +-+ setup[0][b][0][a + 1] = beta; +-+ setup[0][b][1][a] = tc[0]; +-+ setup[0][b][1][a + 1] = tc[1]; +-+ } else +++#if RPI_INTER +++ struct qpu_mc_pred_y_p_s * last_y8_p; +++ struct qpu_mc_src_s * last_y8_l1; +++ +++ // Function pointers +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ const uint8_t * qpu_dummy_frame_emu; + +#endif +-+ { +-+ s->hevcdsp.hevc_v_loop_filter_luma(src, +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q); +-+ } +-+ } +- } +- } +- +-@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; +- tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; +- tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +-- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +-+ src = +-+#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? 
+-+ rpi_sliced_frame_pos_y(s->frame, x, y) : +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory + +#endif +-+ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x, y - 1); +- no_p[1] = get_pcm(s, x + 4, y - 1); +-@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +++ HEVCRpiQpu qpu; +++#endif +++ +++ pthread_t worker_thread; +++ + +#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int num16 = (y>>4)*s->setup_width + (x>>4); +-+ int a = ((x>>3) & 1) << 1; +-+ int b = (y>>3) & 1; +-+ setup = s->dvq->y_setup_arm[num16]; +-+ setup[1][b][0][a] = beta; +-+ setup[1][b][0][a + 1] = beta; +-+ setup[1][b][1][a] = tc[0]; +-+ setup[1][b][1][a + 1] = tc[1]; +-+ } else +++#define RPI_DEBLOCK_VPU_Q_COUNT 2 +++ int enable_rpi_deblock; +++ +++ int uv_setup_width; +++ int uv_setup_height; +++ int setup_width; // Number of 16x16 blocks across the image +++ int setup_height; // Number of 16x16 blocks down the image +++ +++ struct dblk_vpu_q_s +++ { +++ GPU_MEM_PTR_T deblock_vpu_gmem; +++ +++ uint8_t (*y_setup_arm)[2][2][2][4]; +++ uint8_t (*y_setup_vc)[2][2][2][4]; +++ +++ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned +++ uint8_t (*uv_setup_vc)[2][2][2][4]; +++ +++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command +++ int vpu_cmds_vc; +++ +++ vpu_qpu_wait_h cmd_id; +++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; +++ +++ struct dblk_vpu_q_s * dvq; +++ unsigned int dvq_n; +++ + +#endif +- s->hevcdsp.hevc_h_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +-@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- } +++ HEVCLocalContextIntra HEVClcIntra; +++ HEVCRPiFrameProgressState progress_states[2]; +++#endif +++ ++ uint8_t *cabac_state; + +- if (s->ps.sps->chroma_format_idc) { +-+#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +-+ const int v = 2; +-+ const int h = 2; ++ /** 1 if the independent slice segment header was successfully parsed */ ++@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); ++ ++ int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id, ++ uint8_t *buf, int buf_size); +++#if RPI_INTER +++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +++#endif + + +-+ // vertical filtering chroma +-+ for (y = y0; y < y_end; y += 8 * v) { +-+ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { +-+ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; +-+ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ /** ++ * Reset SEI values that are stored on the Context. 
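The HEVCRpiJob fields introduced above (sem_in "set by main", sem_out "set by worker", the jobs[RPI_MAX_JOBS] array and worker_thread) imply a classic two-semaphore pipeline: pass 0 (coefficient decode) runs on the main thread while pass 1 (pixel processing) runs on the worker, so decode can run ahead of reconstruction. A minimal stand-alone sketch of that handoff follows; job_t, worker_thread, MAX_JOBS and the four-job loop are illustrative names for this example, not the patch's own symbols.

    #include <pthread.h>
    #include <semaphore.h>

    #define MAX_JOBS 2                  /* illustrative stand-in for RPI_MAX_JOBS */

    typedef struct job {
        volatile int terminate;         /* mirrors HEVCRpiJob's volatile flag */
        sem_t sem_in;                   /* posted by main: pass0 output is ready */
        sem_t sem_out;                  /* posted by worker: slot is free again */
    } job_t;

    static void *worker_thread(void *v)
    {
        job_t *const jobs = v;
        unsigned int n = 0;
        for (;;) {
            job_t *const jb = jobs + n;
            sem_wait(&jb->sem_in);      /* block until main hands over a job */
            if (jb->terminate)
                break;
            /* ... pass1 work: prediction, transform add, deblock setup ... */
            sem_post(&jb->sem_out);     /* give the slot back to main */
            n = (n + 1) % MAX_JOBS;     /* slots are consumed in submission order */
        }
        return NULL;
    }

    int main(void)
    {
        job_t jobs[MAX_JOBS];
        pthread_t t;
        int i;

        for (i = 0; i < MAX_JOBS; i++) {
            jobs[i].terminate = 0;
            sem_init(&jobs[i].sem_in, 0, 0);
            sem_init(&jobs[i].sem_out, 0, 1);   /* every slot starts free */
        }
        pthread_create(&t, NULL, worker_thread, jobs);

        for (i = 0; i < 4; i++) {               /* main loop plays pass0 */
            job_t *const jb = &jobs[i % MAX_JOBS];
            sem_wait(&jb->sem_out);             /* wait for the slot to come free */
            /* ... pass0 work: CABAC/coefficient decode into the job ... */
            sem_post(&jb->sem_in);              /* wake the worker on this slot */
        }

        for (i = 0; i < MAX_JOBS; i++)          /* drain all outstanding jobs */
            sem_wait(&jobs[i].sem_out);
        jobs[0].terminate = 1;                  /* after 4 jobs the worker waits on slot 0 */
        sem_post(&jobs[0].sem_in);
        pthread_join(t, NULL);
        return 0;
    }

With two slots, coefficient decode of one batch of CTUs overlaps pixel reconstruction of the previous batch, which is the point of the pass0_job/pass1_job split.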
++@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; ++ extern const uint8_t ff_hevc_diag_scan8x8_x[64]; ++ extern const uint8_t ff_hevc_diag_scan8x8_y[64]; ++ +++#ifdef RPI +++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); + + +-+ if ((bs0 == 2) || (bs1 == 2)) { +-+ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; +-+ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; +-+ unsigned int no_f = 0; +++// arm/hevc_misc_neon.S +++// Neon coeff zap fn +++#if HAVE_NEON +++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +++#endif + + +-+ // tc_offset here should be set to cur_tc_offset I think +-+ const uint32_t tc4 = +-+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | +-+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int val, const int field); + + +-+ if (tc4 == 0) +-+ continue; +++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); + + +-+ if (pcmf) { +-+ no_f = +-+ (get_pcm(s, x - 1, y) ? 1 : 0) | +-+ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | +-+ (get_pcm(s, x, y) ? 4 : 0) | +-+ (get_pcm(s, x, y + 4 * v) ? 8 : 0); +-+ if (no_f == 0xf) +-+ continue; +-+ } +++// All of these expect that s->threads_type == FF_THREAD_FRAME + + +-+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), +-+ s->frame->linesize[1], +-+ tc4, +-+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), +-+ no_f); +-+ } +-+ } +++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int y) +++{ +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); +++ else +++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); +++} + + +-+ if (y == 0) +-+ continue; +++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) +++{ +++ if (s->enable_rpi && s->used_for_ref) +++ ff_hevc_rpi_progress_signal_field(s, y, 1); +++} + + +-+ // horizontal filtering chroma +-+ tc_offset = x0 ? left_tc_offset : cur_tc_offset; +-+ x_end2 = x_end; +-+ if (x_end != s->ps.sps->width) +-+ x_end2 = x_end - 8 * h; +++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int y) +++{ +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); +++ else +++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); +++} + + +-+ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { +-+ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; +-+ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; +-+ if ((bs0 == 2) || (bs1 == 2)) { +-+ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; +-+ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; +-+ const uint32_t tc4 = +-+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | +-+ ((bs1 != 2) ? 
0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +-+ unsigned int no_f = 0; +++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) +++{ +++ if (s->used_for_ref) +++ { +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_signal_field(s, y, 0); +++ else +++ ff_thread_report_progress(&s->ref->tf, y, 0); +++ } +++} + + +-+ if (tc4 == 0) +-+ continue; +++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) +++{ +++ if (s->enable_rpi) +++ { +++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); +++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); +++ } +++ else +++ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +++} + + +-+ if (pcmf) { +-+ no_f = +-+ (get_pcm(s, x, y - 1) ? 1 : 0) | +-+ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | +-+ (get_pcm(s, x, y) ? 4 : 0) | +-+ (get_pcm(s, x + 4 * h, y) ? 8 : 0); +++#else + + +-+ if (no_f == 0xf) +-+ continue; +-+ } +++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world +++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) +++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) +++#define ff_hevc_progress_signal_mv(s, y) +++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) +++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) + + +-+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), +-+ s->frame->linesize[1], +-+ tc4, no_f); +-+ } +-+ } +-+ } +-+ } +-+ else + +#endif +- for (chroma = 1; chroma <= 2; chroma++) { +- int h = 1 << s->ps.sps->hshift[chroma]; +- int v = 1 << s->ps.sps->vshift[chroma]; +-@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +++ +++// Set all done - signal nothing (used in missing refs) +++// Works for both rpi & non-rpi +++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) +++{ +++ if (ref->tf.progress != NULL) +++ { +++ int * const p = (int *)&ref->tf.progress->data; +++ p[0] = INT_MAX; +++ p[1] = INT_MAX; +++ } +++} +++ ++ #endif /* AVCODEC_HEVC_H */ ++diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c ++index 05b2821840..c84886817d 100644 ++--- a/libavcodec/hevc_cabac.c +++++ b/libavcodec/hevc_cabac.c ++@@ -21,14 +21,76 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + +- c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; +- c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +-- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; +-+ src = +-+#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? 
+-+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +-+#endif +-+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x - 1, y); +- no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +-@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +- } else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int xc = x>>s->ps.sps->hshift[chroma]; +-+ int yc = y>>s->ps.sps->vshift[chroma]; +-+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +-+ int a = ((yc>>3) & 1) << 1; +-+ int b = (xc>>3) & 1; +-+ setup = s->dvq->uv_setup_arm[num16]; +-+ setup[0][b][0][a] = c_tc[0]; +-+ setup[0][b][0][a + 1] = c_tc[1]; +-+ } else +-+#endif +- s->hevcdsp.hevc_v_loop_filter_chroma(src, +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +++#define UNCHECKED_BITSTREAM_READER 1 + + +- } +- } ++ #include "libavutil/attributes.h" ++ #include "libavutil/common.h" + +-@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ #include "cabac_functions.h" ++ #include "hevc.h" + +- c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; +- c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +-- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-+ src = + +#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? +-+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +++#include "libavutil/rpi_sand_fns.h" + +#endif +-+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x, y - 1); +- no_p[1] = get_pcm(s, x + (4 * h), y - 1); +-@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +- } else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int xc = x>>s->ps.sps->hshift[chroma]; +-+ int yc = y>>s->ps.sps->vshift[chroma]; +-+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +-+ int a = ((xc>>3) & 1) << 1; +-+ int b = (yc>>3) & 1; +-+ setup = s->dvq->uv_setup_arm[num16]; +-+ setup[1][b][0][a] = c_tc[0]; +-+ setup[1][b][0][a + 1] = c_tc[1]; +-+ } else +++ +++// BY22 is probably faster than simple bypass if the processor has +++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +++// x86 has fast int divide +++// Arm doesn't have divide or general fast 64 bit, but does have the multiply +++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used +++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) +++// Use native divide if we have a fast one - otherwise use mpy 1/x +++// x86 has a fast integer divide - arm doesn't - unsure about other +++// architectures +++#define USE_BY22_DIV ARCH_X86 +++ +++// Special case blocks with a single significant ceoff +++// Decreases the complexity of the code for a common case but increases the +++// code size. 
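As a toy demonstration of the claim in the comments above, namely that a run of bypass bins is literally a division of the coder state, the following self-contained program decodes eight bypass bits bit-by-bit and then recovers the same eight bits with a single divide. The fixed-point layout and the state values are invented for the example (the bytestream refill is omitted, so zeros shift in); the patch's real peek is the low/range divide in get_cabac_by22_peek further down.

    #include <stdint.h>
    #include <stdio.h>

    /* Conventional bypass decode: one renormalise/compare/subtract per bin. */
    static unsigned int bypass_bit(uint32_t *low, uint32_t range16)
    {
        *low <<= 1;                 /* renormalise; refill omitted in this model */
        if (*low >= range16) {
            *low -= range16;
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        const uint32_t range   = 0x1d3;         /* 9-bit range, as in CABAC */
        const uint32_t range16 = range << 16;   /* scaled to the fixed point of low */
        const uint32_t low     = 0x1a3c123;     /* arbitrary state, low < range16 */
        uint32_t l = low;
        unsigned int serial = 0;
        unsigned int by_div;
        int i;

        for (i = 0; i < 8; i++)                 /* eight bins, the slow way */
            serial = (serial << 1) | bypass_bit(&l, range16);

        /* ... and the same eight bins in one go, as a single division */
        by_div = (unsigned int)(((uint64_t)low << 8) / range16);

        printf("%02x %02x\n", serial, by_div);  /* prints the same value twice */
        return 0;
    }

The single-significant-coefficient special case mentioned in the final comment above is controlled by the define that follows.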
+++#define USE_N_END_1 1 +++ +++#if ARCH_ARM +++#include "arm/hevc_cabac.h" + +#endif +- s->hevcdsp.hevc_h_loop_filter_chroma(src, +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +-@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- } +- } +++ ++ #define CABAC_MAX_BIN 31 + +--static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh, +-- RefPicList *neigh_refPicList) +--{ +-- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { +-- // same L0 and L1 +-- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] && +-- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] && +-- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) { +-- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && +-- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) +-- return 1; +-- else +-- return 0; +-- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && +-- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { +-- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && +-- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { +-- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else { +-- return 1; +-- } +-- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV +-- Mv A, B; +-- int ref_A, ref_B; +-- +-- if (curr->pred_flag & 1) { +-- A = curr->mv[0]; +-- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]]; +-- } else { +-- A = curr->mv[1]; +-- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]]; +-- } +-- +-- if (neigh->pred_flag & 1) { +-- B = neigh->mv[0]; +-- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]]; +-- } else { +-- B = neigh->mv[1]; +-- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]]; +-- } +-- +-- if (ref_A == ref_B) { +-- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else +-- return 1; +-- } +-- +-- return 1; +--} +- +- void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- int log2_trafo_size) +-@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- int log2_min_tu_size = s->ps.sps->log2_min_tb_size; +- int min_pu_width = s->ps.sps->min_pu_width; +- int min_tu_width = s->ps.sps->min_tb_width; +-- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width + +-- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA; +- int boundary_upper, boundary_left; +-- int i, j, bs; +-+ int i, j; +-+ RefPicList *rpl = s->ref->refPicList; +-+ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); +-+ 
const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup +-+ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep +-+ int y_pu = y0 >> log2_min_pu_size; +-+ int x_pu = x0 >> log2_min_pu_size; +-+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +-+ int is_intra = curr->pred_flag == PF_INTRA; +-+ int inc = log2_min_pu_size == 2 ? 2 : 1; +-+ uint8_t *bs; + + +-+#ifdef DISABLE_STRENGTHS +-+ return; +-+#endif +- +- boundary_upper = y0 > 0 && !(y0 & 7); +- if (boundary_upper && +-@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) +- boundary_upper = 0; +++#if USE_BY22 && !USE_BY22_DIV +++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) +++ +++static const uint32_t cabac_by22_inv_range[256] = { +++ 0, I(257), I(258), I(259), +++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), +++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), +++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), +++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), +++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), +++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), +++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), +++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), +++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), +++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), +++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), +++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), +++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), +++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), +++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), +++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), +++ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), +++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), +++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), +++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), +++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), +++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), +++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), +++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), +++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), +++ I(510), I(511) +++}; +++#undef I +++#endif // USE_BY22 +++ ++ /** ++ * number of bin by SyntaxElement. ++ */ ++@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++ }; + +-+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2]; + + +- if (boundary_upper) { +- RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? 
+- ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) : +-- s->ref->refPicList; +-- int yp_pu = (y0 - 1) >> log2_min_pu_size; +-- int yq_pu = y0 >> log2_min_pu_size; +-- int yp_tu = (y0 - 1) >> log2_min_tu_size; +-- int yq_tu = y0 >> log2_min_tu_size; +-+ rpl; +-+ MvField *top = curr - min_pu_width; +++typedef struct +++{ +++ uint16_t coeff; +++ uint16_t scale; +++} xy_off_t; + + +-+ if (is_intra) { +-+ for (i = 0; i < (1 << log2_trafo_size); i += 4) +-+ bs[i >> 2] = 2; +++#define XYT_C(x,y,t) ((x) + ((y) << (t))) +++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t)) +++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) +++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) + + +-+ } else { +-+ int y_tu = y0 >> log2_min_tu_size; +-+ int x_tu = x0 >> log2_min_tu_size; +-+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +-+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width; +++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} + + +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +-+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list, +-+ curr, top, bs); +- +- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int x_pu = (x0 + i) >> log2_min_pu_size; +-- int x_tu = (x0 + i) >> log2_min_tu_size; +-- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; +-- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; +-- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu]; +-- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu]; +-- +-- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA) +-- bs = 2; +-- else if (curr_cbf_luma || top_cbf_luma) +-- bs = 1; +-- else +-- bs = boundary_strength(s, curr, top, rpl_top); +-- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs; +-+ int i_pu = i >> log2_min_pu_size; +-+ int i_tu = i >> log2_min_tu_size; +++#define OFF_DIAG(t) {\ +++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ +++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ +++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ +++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ +++} + + +-+ if (top[i_pu].pred_flag == PF_INTRA) +-+ bs[i >> 2] = 2; +-+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu]) +-+ bs[i >> 2] = 1; +- } +-+ } +-+ } +++#define OFF_HORIZ(t) {\ +++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ +++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ +++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ +++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ +++} + + +-+ if (!is_intra) { +-+ for (j = inc; j < trafo_in_min_pus; j += inc) { +-+ MvField *top; +++#define OFF_VERT(t) {\ +++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ +++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ +++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ +++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ +++} + + +-+ curr += min_pu_width * inc; +-+ top = curr - min_pu_width; +-+ bs += s->bs_width * inc << log2_min_pu_size >> 2; +++static const xy_off_t off_xys[3][4][16] = +++{ +++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, +++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, +++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} +++}; + + +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +-+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +-+ curr, top, bs); +-+ } +- } +- +-- // bs for vertical TU boundaries +- boundary_left = x0 > 0 && !(x0 & 7); +- if (boundary_left 
&& +- ((!s->sh.slice_loop_filter_across_slices_enabled_flag && +-@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) +- boundary_left = 0; +- +-+ curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +-+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2]; + + +- if (boundary_left) { +- RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? +- ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) : +-- s->ref->refPicList; +-- int xp_pu = (x0 - 1) >> log2_min_pu_size; +-- int xq_pu = x0 >> log2_min_pu_size; +-- int xp_tu = (x0 - 1) >> log2_min_tu_size; +-- int xq_tu = x0 >> log2_min_tu_size; +-- +-- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int y_pu = (y0 + i) >> log2_min_pu_size; +-- int y_tu = (y0 + i) >> log2_min_tu_size; +-- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; +-- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; +-- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu]; +-- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu]; +-- +-- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA) +-- bs = 2; +-- else if (curr_cbf_luma || left_cbf_luma) +-- bs = 1; +-- else +-- bs = boundary_strength(s, curr, left, rpl_left); +-- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; +-- } +-- } +-- +-- if (log2_trafo_size > log2_min_pu_size && !is_intra) { +-- RefPicList *rpl = s->ref->refPicList; +-+ rpl; +-+ MvField *left = curr - 1; +- +-- // bs for TU internal horizontal PU boundaries +-- for (j = 8; j < (1 << log2_trafo_size); j += 8) { +-- int yp_pu = (y0 + j - 1) >> log2_min_pu_size; +-- int yq_pu = (y0 + j) >> log2_min_pu_size; +-+ if (is_intra) { +-+ for (j = 0; j < (1 << log2_trafo_size); j += 4) +-+ bs[j * s->bs_width >> 2] = 2; +- +-- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int x_pu = (x0 + i) >> log2_min_pu_size; +-- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; +-- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; +-- +-- bs = boundary_strength(s, curr, top, rpl); +-- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; +-+ } else { +-+ int y_tu = y0 >> log2_min_tu_size; +-+ int x_tu = x0 >> log2_min_tu_size; +-+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +-+ uint8_t *left_cbf_luma = curr_cbf_luma - 1; +-+ +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +-+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list, +-+ curr, left, bs); +-+ +-+ for (j = 0; j < (1 << log2_trafo_size); j += 4) { +-+ int j_pu = j >> log2_min_pu_size; +-+ int j_tu = j >> log2_min_tu_size; +-+ +-+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA) +-+ bs[j * s->bs_width >> 2] = 2; +-+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width]) +-+ bs[j * s->bs_width >> 2] = 1; +- } +- } +-+ } +- +-- // bs for TU internal vertical PU boundaries +-- for (j = 0; j < (1 << log2_trafo_size); j += 4) { +-- int y_pu = (y0 + j) >> log2_min_pu_size; +-+ if (!is_intra) { +-+ for (i = inc; i < trafo_in_min_pus; i += inc) { +-+ MvField *left; +- +-- for (i = 8; i < (1 << log2_trafo_size); i += 8) { +-- int xp_pu = (x0 + i - 1) >> log2_min_pu_size; +-- int xq_pu = (x0 + i) >> log2_min_pu_size; +-- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; +-- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; +-+ curr += inc; +-+ left = curr - 1; +-+ bs += inc << 
log2_min_pu_size >> 2; +- +-- bs = boundary_strength(s, curr, left, rpl); +-- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; +-- } +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +-+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +-+ curr, left, bs); +- } +- } +- } +-@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- #undef CB +- #undef CR +- +-+#ifdef RPI_DEBLOCK_VPU +-+// ff_hevc_flush_buffer_lines +-+// flushes and invalidates all pixel rows in [start,end-1] +-+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +++// Helper fns +++#ifndef hevc_mem_bits32 +++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) + +{ +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); +-+ rpi_cache_flush_finish(rfe); +++ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); + +} + +#endif + + +-+#if RPI_INTER +-+ +-+// Flush some lines of a reference frames +-+void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n) +++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) +++#define hevc_clz32 hevc_clz32_builtin +++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) + +{ +-+ if (s->enable_rpi && s->used_for_ref) { +-+ const int d0 = ((int *)f->progress->data)[0]; +-+ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 +++ // __builtin_clz says it works on ints - so adjust if int is >32 bits long +++ return __builtin_clz(x) - (sizeof(int) * 8 - 32); +++} +++#endif + + +-+ if (curr_y < (unsigned int)f->f->height) { +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); +-+ rpi_cache_flush_finish(rfe); +-+ } +++// It is unlikely that we will ever need this but include for completeness +++#ifndef hevc_clz32 +++static inline unsigned int hevc_clz32(unsigned int x) +++{ +++ unsigned int n = 1; +++ if ((x & 0xffff0000) == 0) { +++ n += 16; +++ x <<= 16; +++ } +++ if ((x & 0xff000000) == 0) { +++ n += 8; +++ x <<= 8; +++ } +++ if ((x & 0xf0000000) == 0) { +++ n += 4; +++ x <<= 4; +++ } +++ if ((x & 0xc0000000) == 0) { +++ n += 2; +++ x <<= 2; + + } +++ return n - ((x >> 31) & 1); + +} + +#endif + + +-+#ifdef RPI_DEBLOCK_VPU +-+/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +-+static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +-+{ +-+ // Flush image, 4 lines above to bottom of ctb stripe +-+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1); +-+ // TODO flush buffer of beta/tc setup when it becomes cached + + +-+ // Prepare three commands at once to avoid calling overhead +-+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y; +-+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0]; +-+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width; +-+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) ); +-+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4; +-+ s->dvq->vpu_cmds_arm[0][5] = 2; +++#if !USE_BY22 +++// 
If no by22 then _by22 functions will revert to normal and so _peek/_flush +++// will no longer be called but the setup calls will still exist and we want +++// to null them out +++#define bypass_start(s) +++#define bypass_finish(s) +++#else +++// Use BY22 for residual bypass block + + +-+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]); +-+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1]; +-+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width; +-+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +-+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +-+ s->dvq->vpu_cmds_arm[1][5] = 3; +++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc) +++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc) + + +-+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]); +-+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2]; +-+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width; +-+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +-+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +-+ s->dvq->vpu_cmds_arm[2][5] = 4; +++// BY22 notes that bypass is simply a divide into the bitstream and so we +++// can peek out large quantities of bits at once and treat the result as if +++// it was VLC. In many cases this will lead to O(1) processing rather than +++// O(n) though the setup and teardown is sufficiently expensive that it is +++// only worth using if we expect to be dealing with more than a few bits +++// The definition of "a few bits" will vary from platform to platform but +++// tests on ARM show that it probably isn't worth it for a single coded +++// residual, but is for >1 - it also seems likely that if there are +++// more residuals then they are likely to be bigger and this will make the +++// O(1) nature of the code more worthwhile. + + +-+ // Call VPU +-+ { +-+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +-+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands +-+ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); +-+ vpu_qpu_job_finish(vqj); +-+ } + + +-+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); +-+ s->dvq = s->dvq_ents + s->dvq_n; +++#if !USE_BY22_DIV +++// * 1/x @ 32 bits gets us 22 bits of accuracy +++#define CABAC_BY22_PEEK_BITS 22 +++#else +++// A real 32-bit divide gets us another bit +++// If we have a 64 bit int & a unit time divider then we should get a lot +++// of bits (55) but that is untested and it is unclear if it would give +++// us a large advantage +++#define CABAC_BY22_PEEK_BITS 23 +++#endif + + +-+ vpu_qpu_wait(&s->dvq->cmd_id); +++// Bypass block start +++// Must be called before _by22_peek is used as it sets the CABAC environment +++// into the correct state. _by22_finish must be called to return to 'normal' +++// (i.e. 
non-bypass) cabac decoding +++static inline void get_cabac_by22_start(CABACContext * const c) +++{ +++ const unsigned int bits = __builtin_ctz(c->low); +++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); +++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); +++#if !USE_BY22_DIV +++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; +++#endif +++ +++ c->bytestream -= (CABAC_BITS / 8); +++ c->by22.bits = bits; +++#if !USE_BY22_DIV +++ c->by22.range = c->range; +++ c->range = inv; +++#endif +++ c->low = x; + +} + + +++// Bypass block finish +++// Must be called at the end of the bypass block to return to normal operation +++static inline void get_cabac_by22_finish(CABACContext * const c) +++{ +++ unsigned int used = c->by22.bits; +++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); +++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); +++ +++ c->bytestream += bytes_used + (CABAC_BITS / 8); +++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; +++#if !USE_BY22_DIV +++ c->range = c->by22.range; + +#endif +++} + + +- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +- { +- int x_end = x >= s->ps.sps->width - ctb_size; +-+#ifdef RPI_DEBLOCK_VPU +-+ int done_deblock = 0; +++// Peek bypass bits +++// _by22_start must be called before _by22_peek is called and _by22_flush +++// must be called afterwards to flush any used bits +++// The actual number of valid bits returned is +++// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS +++// will be at least 22 which should be long enough for any prefix or suffix +++// though probably not long enough for the worst case combination +++#ifndef get_cabac_by22_peek +++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) +++{ +++#if USE_BY22_DIV +++ return ((unsigned int)c->low / (unsigned int)c->range) << 9; +++#else +++ uint32_t x = c->low & ~1U; +++ const uint32_t inv = c->range; +++ +++ if (inv != 0) +++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); +++ +++ return x << 1; + +#endif +- if (s->avctx->skip_loop_filter < AVDISCARD_ALL) +- deblocking_filter_CTB(s, x, y); +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock && x_end) +-+ { +-+ int y_at_end = y >= s->ps.sps->height - ctb_size; +-+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls +-+ int y_start = y&~63; +-+ if (y_at_end) height = s->ps.sps->height - y_start; +-+ if ((((y+ctb_size)&63)==0) || y_at_end) { +-+ done_deblock = 1; +-+ rpi_deblock(s, y_start, height); +-+ } +-+ } +-+#endif +- if (s->ps.sps->sao_enabled) { +- int y_end = y >= s->ps.sps->height - ctb_size; +- if (y && x) +-@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +- sao_filter_CTB(s, x - ctb_size, y); +- if (y && x_end) { +- sao_filter_CTB(s, x, y - ctb_size); +-- if (s->threads_type & FF_THREAD_FRAME ) +-+ if (s->threads_type == FF_THREAD_FRAME ) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +-+#endif +- ff_thread_report_progress(&s->ref->tf, y, 0); +-+ } +- } +- if (x_end && y_end) { +- sao_filter_CTB(s, x , y); +-- if (s->threads_type & FF_THREAD_FRAME ) +-+ if (s->threads_type == FF_THREAD_FRAME ) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +-+#endif +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); +-+ } +-+ } +-+ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { +-+ //int newh = y + ctb_size - 4; +-+ //int currh = 
s->ref->tf.progress->data[0]; +-+ //if (((y + ctb_size)&63)==0) +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +-+ if (done_deblock) { +-+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +-+ } +-+ } else { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +++} + +#endif +-+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +- } +-- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +++ +++// Flush bypass bits peeked by _by22_peek +++// Flush n bypass bits. n must be >= 1 to guarantee correct operation +++// val is an unmodified copy of whatever _by22_peek returned +++#ifndef get_cabac_by22_flush +++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) +++{ +++ // Subtract the bits used & reshift up to the top of the word +++#if USE_BY22_DIV +++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); + +#else +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +-+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); + +#endif +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +++ +++ // and refill lower bits +++ // We will probably OR over some existing bits but that doesn't matter +++ c->by22.bits += n; +++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); +++} + +#endif +-+ } +++ +++#endif // USE_BY22 +++ +++ ++ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) ++ { ++ if (s->ps.pps->entropy_coding_sync_enabled_flag && ++@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) ++ return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); + } + +- void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +-diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c +-index 83f2ec2..bcf53dc 100644 +---- a/libavcodec/hevc_ps.c +-+++ b/libavcodec/hevc_ps.c +-@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +- switch (sps->bit_depth) { +- case 8: +- if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; +-+#if RPI_HEVC_SAND +-+ // *** Horrid kludge s.t. we start out with sand format +-+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; +-+#else +- if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; +-+#endif +- if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; +- if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; +- break; +-@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +- sps->amp_enabled_flag = get_bits1(gb); +- sps->sao_enabled = get_bits1(gb); ++-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) +++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); ++ } + +-+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); +-+ +- sps->pcm_enabled_flag = get_bits1(gb); +- if (sps->pcm_enabled_flag) { +- sps->pcm.bit_depth = get_bits(gb, 4) + 1; +-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c +-index 9d773d9..c4d7250 100644 +---- a/libavcodec/hevcdsp.c +-+++ b/libavcodec/hevcdsp.c +-@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { +- #include "hevcdsp_template.c" +- #undef BIT_DEPTH ++-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx) +++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); ++ } + +-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs) +-+{ +-+ for (; pus > 0; pus--) { +-+ int strength, out; +-+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; +-+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; +-+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; +-+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; ++-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx) +++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); ++ } ++ ++ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { ++@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { ++ return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); ++ } ++ ++-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx, +++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++ { ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ ++- if (!c_idx) { +++ if (!c_idx_nz) { ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, ++ return value; ++ } ++ ++-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg) +++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg) ++ { ++ int inc; ++ ++- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 
2 : 0); +++ inc = (ctx_cg != 0) + (c_idx_nz << 1); ++ ++ return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); ++ } ++-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, ++- int offset, const uint8_t *ctx_idx_map) ++-{ ++- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; ++- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc); ++-} ++ ++-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) +++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset) ++ { ++ return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++ } ++@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, ++ return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); ++ } ++ ++-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) + + +-+#if 1 // This more directly matches the original implementation +-+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { +-+ // same L0 and L1 +-+ if (curr_refL0 == neigh_refL0 && +-+ curr_refL0 == curr_refL1 && +-+ neigh_refL0 == neigh_refL1) { +-+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && +-+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else if (neigh_refL0 == curr_refL0 && +-+ neigh_refL1 == curr_refL1) { +-+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else if (neigh_refL1 == curr_refL0 && +-+ neigh_refL0 == curr_refL1) { +-+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else { +-+ strength = 1; +-+ } +-+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV +-+ Mv curr_mv0, neigh_mv0; +++#if !USE_BY22 +++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +++#endif + + +-+ if (curr->pred_flag & 1) { +-+ curr_mv0 = curr->mv[0]; +-+ } else { +-+ curr_mv0 = curr->mv[1]; +-+ curr_refL0 = curr_refL1; +-+ } + + +-+ if (neigh->pred_flag & 1) { +-+ neigh_mv0 = neigh->mv[0]; +-+ } else { +-+ neigh_mv0 = neigh->mv[1]; +-+ neigh_refL0 = neigh_refL1; +-+ } +++#ifndef coeff_abs_level_remaining_decode_bypass +++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) ++ { +++ CABACContext * const c = &s->HEVClc->cc; +++ uint32_t y; +++ unsigned int prefix; +++ unsigned int last_coeff_abs_level_remaining; +++ unsigned int n; + + +-+ if (curr_refL0 == neigh_refL0) { +-+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else +-+ strength = 1; +-+ } else +-+ strength = 1; +-+#else // This has exactly the same effect, but is more suitable for vectorisation +-+ Mv curr_mv[2]; +-+ Mv neigh_mv[2]; +-+ memcpy(curr_mv, curr->mv, sizeof 
curr_mv); +-+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); +++ y = get_cabac_by22_peek(c); +++ prefix = hevc_clz32(~y); +++ // y << prefix will always have top bit 0 + + +-+ if (!(curr->pred_flag & 2)) { +-+ curr_mv[1] = curr_mv[0]; +-+ curr_refL1 = curr_refL0; +-+ } +-+ if (!(neigh->pred_flag & 2)) { +-+ neigh_mv[1] = neigh_mv[0]; +-+ neigh_refL1 = neigh_refL0; +-+ } +-+ if (!(curr->pred_flag & 1)) { +-+ curr_mv[0] = curr_mv[1]; +-+ curr_refL0 = curr_refL1; +-+ } +-+ if (!(neigh->pred_flag & 1)) { +-+ neigh_mv[0] = neigh_mv[1]; +-+ neigh_refL0 = neigh_refL1; +-+ } +-+ +-+ strength = 1; +++ if (prefix < 3) { +++ const unsigned int suffix = (y << prefix) >> (31 - rice_param); +++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; +++ n = prefix + 1 + rice_param; +++ } +++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) +++ { +++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); + + +-+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | +-+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | +-+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); +++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +++ n = prefix * 2 + rice_param - 2; +++ } +++ else { +++ unsigned int suffix; + + +-+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | +-+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | +-+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); +++ get_cabac_by22_flush(c, prefix, y); +++ y = get_cabac_by22_peek(c); + + +-+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); +-+#endif +++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); +++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +++ n = prefix + rice_param - 2; +++ } + + +-+ curr += in_inc / sizeof (MvField); +-+ neigh += in_inc / sizeof (MvField); +++ get_cabac_by22_flush(c, n, y); + + +-+ for (out = dup; out > 0; out--) +-+ { +-+ *bs = strength; +-+ bs += out_inc; +-+ } +-+ } +++ return last_coeff_abs_level_remaining; + +} +-+ +- void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- { +- #undef FUNC +-@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ +- PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) +- +-+#ifndef RPI +-+#define SLICED_LOOP_FILTERS(depth) +-+#else +-+#define SLICED_LOOP_FILTERS(depth)\ +-+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ +-+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ +-+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) + +#endif + + +-+ +- #define HEVC_DSP(depth) \ +- hevcdsp->put_pcm = FUNC(put_pcm, depth); \ +- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +-@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ +- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ +- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ +-+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ +-+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ +-+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ +-+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); 
\ +-+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ +-+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ +-+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ +-+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ +-+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ +- hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ +- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +- hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ +-@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ +- hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ +- \ +-+ hevcdsp->sao_band_filter_c[0] = \ +-+ hevcdsp->sao_band_filter_c[1] = \ +-+ hevcdsp->sao_band_filter_c[2] = \ +-+ hevcdsp->sao_band_filter_c[3] = \ +-+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ +-+ hevcdsp->sao_edge_filter_c[0] = \ +-+ hevcdsp->sao_edge_filter_c[1] = \ +-+ hevcdsp->sao_edge_filter_c[2] = \ +-+ hevcdsp->sao_edge_filter_c[3] = \ +-+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ +-+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ +-+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ +-+ \ +- QPEL_FUNCS(depth); \ +- QPEL_UNI_FUNCS(depth); \ +- QPEL_BI_FUNCS(depth); \ +-@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- EPEL_UNI_FUNCS(depth); \ +- EPEL_BI_FUNCS(depth); \ +- \ +-+ SLICED_LOOP_FILTERS(depth); \ +- hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ +- hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ +- hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +-@@ -257,6 +404,8 @@ int i = 0; +- break; +- } +- +-+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; +-+ +- if (ARCH_X86) +- ff_hevc_dsp_init_x86(hevcdsp, bit_depth); +- if (ARCH_ARM) +-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h +-index 9f1f6dd..639ecf1 100644 +---- a/libavcodec/hevcdsp.h +-+++ b/libavcodec/hevcdsp.h +-@@ -42,11 +42,26 @@ typedef struct SAOParams { +- uint8_t type_idx[3]; ///< sao_type_idx +- } SAOParams; +++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) +++{ +++ CABACContext * const c = &s->HEVClc->cc; ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; + +-+typedef struct Mv { +-+ int16_t x; ///< horizontal component of motion vector +-+ int16_t y; ///< vertical component of motion vector +-+} Mv; ++- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) +++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { ++ av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ return 0; ++ } + + +-+typedef struct MvField { +-+ DECLARE_ALIGNED(4, Mv, mv)[2]; +-+ int8_t ref_idx[2]; +-+ int8_t pred_flag; +-+} MvField; ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) ++- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) ++- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ 
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } + + +- typedef struct HEVCDSPContext { +- void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, +- struct GetBitContext *gb, int pcm_bit_depth); +-+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, +-+ struct GetBitContext *gb, int pcm_bit_depth); +- +-- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); +-+ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +-+ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +-+ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +- +- void (*transform_skip)(int16_t *coeffs, int16_t log2_size); +- +-@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { +- +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +- int16_t *sao_offset_val, int sao_left_class, int width, int height); +-+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height); +- +- /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- int16_t *sao_offset_val, int sao_eo_class, int width, int height); +-+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++ return last_coeff_abs_level_remaining; ++ } + +- void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +- struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, +- uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); +-+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +-+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, +-+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) +++#if !USE_BY22 +++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb) ++ { ++- int i; ++- int ret = 0; +++ CABACContext * const c = &s->HEVClc->cc; +++ unsigned int i; +++ uint32_t ret = 0; + +- void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, +- int height, intptr_t mx, intptr_t my, int width); +-@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { +- void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, +- int32_t *tc, uint8_t *no_p, +- uint8_t *no_q); +-+#ifdef RPI +-+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +-+ unsigned int _stride, unsigned int beta, const int32_t tc[2], +-+ const uint8_t no_p[2], const uint8_t no_q[2], +-+ uint8_t * _pix_l); +-+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, +-+ unsigned int no_f); +-+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, +-+ uint8_t * src_l, +-+ unsigned int no_f); ++ for (i = 0; i < nb; i++) ++- ret = (ret << 1) | 
get_cabac_bypass(&s->HEVClc->cc); ++- return ret; +++ ret = (ret << 1) | get_cabac_bypass(c); + + +++ return ret << (32 - nb); +++} + +#endif + + +-+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs); +- } HEVCDSPContext; +- +- void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); +-diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +-index b840d17..32b9e47 100644 +---- a/libavcodec/hevcdsp_template.c +-+++ b/libavcodec/hevcdsp_template.c +-@@ -26,6 +26,9 @@ +- #include "bit_depth_template.c" +- #include "hevcdsp.h" +- +-+#ifdef RPI +-+#include "rpi_zc.h" +-+#endif +- +- static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, +- GetBitContext *gb, int pcm_bit_depth) +-@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +- } +- } +- +-+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, +-+ GetBitContext *gb, int pcm_bit_depth) +++#ifndef coeff_sign_flag_decode_bypass +++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb) + +{ +-+ int x, y; +-+ pixel *dst = (pixel *)_dst; +-+ +-+ stride /= sizeof(pixel); +++ CABACContext * const c = &s->HEVClc->cc; +++ uint32_t y; +++ y = get_cabac_by22_peek(c); +++ get_cabac_by22_flush(c, nb, y); +++ return y & ~(0xffffffffU >> nb); +++} +++#endif + + +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) +-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +-+ dst += stride; +-+ } + + +-+ dst = (pixel *)_dst + 1; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) +-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +-+ dst += stride; +++#ifndef get_cabac_greater1_bits +++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, +++ uint8_t * const state0) +++{ +++ unsigned int i; +++ unsigned int rv = 0; +++ for (i = 0; i != n; ++i) { +++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; +++ const unsigned int b = get_cabac(c, state0 + idx); +++ rv = (rv << 1) | b; + + } +-+} +++ return rv; ++ } +++#endif + + + + +- static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride, int size) +- { +-@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe +- } +- } +- +-+static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride, int size) +-+{ +-+ int x, y; +-+ pixel *dst = (pixel *)_dst; +++// N.B. levels returned are the values assuming coeff_abs_level_remaining +++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +++// this version of events. 
+++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels, +++ int * const pprev_subset_coded, int * const psum, +++ const unsigned int idx0_gt1, const unsigned int idx_gt2) +++{ +++ CABACContext * const c = &s->HEVClc->cc; +++ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1; +++ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2; +++ unsigned int rv; +++ unsigned int i; +++ const unsigned int n = FFMIN(n_end, 8); + + +-+ stride /= sizeof(pixel); +++ // Really this is i != n but the simple unconditional loop is cheaper +++ // and faster +++ for (i = 0; i != 8; ++i) +++ levels[i] = 1; + + +-+ for (y = 0; y < size; y++) { +-+ for (x = 0; x < size * 2; x += 2) { +-+ dst[x] = av_clip_pixel(dst[x] + *res); +-+ res++; +++ rv = get_cabac_greater1_bits(c, n, state0); +++ +++ *pprev_subset_coded = 0; +++ *psum = n; +++ +++ rv <<= (32 - n); +++ if (rv != 0) +++ { +++ *pprev_subset_coded = 1; +++ *psum = n + 1; +++ i = hevc_clz32(rv); +++ levels[i] = 2; +++ if (get_cabac(c, state_gt2) == 0) +++ { +++ // Unset first coded bit +++ rv &= ~(0x80000000U >> i); + + } +-+ dst += stride; + + } +-+} + + +- static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride) +- { +-@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, +- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +- } +- +-+// -- U -- (plaited) +++ if (n_end > 8) { +++ const unsigned int g8 = n_end - 8; +++ rv |= ((1 << g8) - 1) << (24 - g8); +++ for (i = 0; i != g8; ++i) { +++ levels[i + 8] = 0; +++ } +++ } + + +-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +-+{ +-+ FUNC(add_residual_uv)(_dst, res, stride, 4); +++ return rv; + +} + + +-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++// extended_precision_processing_flag must be false given we are +++// putting the result into a 16-bit array +++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) +++// scale_m is uint8_t +++// +++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) +++// or it can be 2 (if we have transquant_bypass) +++// shift is set to one less than we really want but would normally be +++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
+++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
+++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+++// to achieve it
+++
+++#ifndef trans_scale_sat
+++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 8);
+++ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+ +}
+++#endif
+ +
+-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++
+++#ifndef update_rice
+++static inline void update_rice(uint8_t * const stat_coeff,
+++ const unsigned int last_coeff_abs_level_remaining,
+++ const unsigned int c_rice_param)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 16);
+++ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+++ if (x >= 6)
+++ (*stat_coeff)++;
+++ else if (x == 0 && *stat_coeff > 0)
+++ (*stat_coeff)--;
+ +}
+++#endif
+ +
+-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++
+++// n must be > 0 on entry
+++#ifndef get_cabac_sig_coeff_flag_idxs
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++ unsigned int n,
+++ const uint8_t const * ctx_map,
+++ uint8_t * p)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 32);
+++ do {
+++ if (get_cabac(c, state0 + ctx_map[n]))
+++ *p++ = n;
+++ } while (--n != 0);
+++ return p;
+ +}
+++#endif
+ +
+-+// -- V -- (plaited)
+ +
+-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++ unsigned int n,
+++ const uint8_t const * ctx_map,
+++ uint8_t * const flag_idx)
+ +{
+-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 4);
+++ int rv;
+++
+++ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+++
+++ return rv;
+ +}
+ +
+-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x1, x2, x3,\
+++ x4, x5, x6, x7,\
+++ x8, x9, x10, x11,\
+++ x12, x13, x14, x15}
+++
+++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x4, x8, x12,\
+++ x1, x5, x9, x13,\
+++ x2, x6, x10, x14,\
+++ x3, x7, x11, x15}
+++
+++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x4, x1, x8,\
+++ x5, x2, x12, x9,\
+++ x6, x3, x13, x10,\
+++ x7, x14, x11, x15}
+++
+++
+++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+++ uint8_t * const significant_coeff_group_flag,
+++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+++ int * const pPrev_sig)
+ +{
+-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 8);
+++ while (--i >= 0) {
+++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
+++ const unsigned int x_cg = scan_x_cg[i];
+++
+++ // For the flag decode we only care about Z/NZ but
+++ // we use the full Right * 2 + Down when calculating
+++ // significant coeff flags so we obtain it here.
+++ // +++ // The group flag array is one longer than it needs to +++ // be so we don't need to check for y_cg limits +++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); +++ +++ if (i == 0 || +++ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) +++ { +++ gf_y[0] |= (1 << x_cg); +++ *pPrev_sig = prev_sig; +++ break; +++ } +++ } +++ +++ return i; + +} + + +-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++#ifdef RPI +++static void rpi_add_residual(HEVCContext * const s, +++ const unsigned int log2_trafo_size, const unsigned int c_idx, +++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) + +{ +-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 16); +++ const AVFrame * const frame = s->frame; +++ unsigned int stride = frame->linesize[c_idx]; +++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +++ const int is_sliced = av_rpi_is_sand_frame(frame); +++ uint8_t * dst = !is_sliced ? +++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(frame, x, y) : +++ av_rpi_sand_frame_pos_c(frame, x, y); +++ +++ if (s->enable_rpi) { +++ const unsigned int i = s->jb0->intra.n; +++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; +++ +++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && +++ pc->ta.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->ta.stride == stride); +++ +++ pc->type = RPI_PRED_ADD_RESIDUAL_C; +++ } +++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && +++ pc->dc.dst == dst) +++ { +++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->dc.stride == stride); +++ +++ // Rewrite as add residual - must rewrite all fields as different union member +++ pc->type = RPI_PRED_ADD_RESIDUAL_V; +++ pc->c_idx = c_idx; +++ pc->ta.buf = coeffs; +++ pc->ta.dst = dst; +++ pc->ta.stride = stride; +++ pc->ta.dc = dc; +++ } +++ else +++ { +++ HEVCPredCmd * const cmd = pc + 1; +++ s->jb0->intra.n = i + 1; +++ +++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); +++ cmd->size = log2_trafo_size; +++ cmd->c_idx = c_idx; +++ cmd->ta.buf = coeffs; +++ cmd->ta.dst = dst; +++ cmd->ta.stride = stride; +++ cmd->ta.dc = 0; +++ } +++ } +++ else if (!is_sliced || c_idx == 0) { +++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); +++ } +++#if RPI_HEVC_SAND +++ // * These should probably never happen +++ else if (c_idx == 1) { +++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); +++ } +++ else { +++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); +++ } +++#endif + +} + + +-+static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++ +++static void rpi_add_dc(HEVCContext * const s, +++ const unsigned int log2_trafo_size, const unsigned int c_idx, +++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) + +{ +-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); +++ const AVFrame * const frame = s->frame; +++ const unsigned int stride = frame->linesize[c_idx]; +++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +++ const int is_sliced = av_rpi_is_sand_frame(frame); +++ uint8_t * const dst = !is_sliced ? 
+++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(frame, x, y) : +++ av_rpi_sand_frame_pos_c(frame, x, y); +++ +++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); +++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); +++ +++ if (s->enable_rpi) { +++ const unsigned int i = s->jb0->intra.n; +++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; +++ +++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && +++ pc->ta.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->ta.stride == stride); +++ +++ pc->ta.dc = (int16_t)coeff; +++ } +++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && +++ pc->dc.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->dc.stride == stride && +++ (pc->dc.dc & ~0xffff) == 0); +++ +++ pc->dc.dc |= (coeff << 16); +++ } +++ else +++ { +++ HEVCPredCmd * const cmd = pc + 1; +++ s->jb0->intra.n = i + 1; +++ +++ cmd->type = RPI_PRED_ADD_DC + c_idx; +++ cmd->size = log2_trafo_size; +++ cmd->c_idx = c_idx; +++ cmd->dc.dst = dst; +++ cmd->dc.stride = stride; +++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; +++ } +++ } + +} + + +++ +++#endif + +- static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ int log2_trafo_size, enum ScanType scan_idx, ++ int c_idx) + { +-@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +- int x, y; +- pixel *dst = (pixel *)_dst; +- pixel *src = (pixel *)_src; +-- int16_t *sao_offset_val = sao->offset_val[c_idx]; +- int sao_eo_class = sao->eo_class[c_idx]; +- int init_x = 0, width = _width, height = _height; ++-#define GET_COORD(offset, n) \ ++- do { \ ++- x_c = (x_cg << 2) + scan_x_off[n]; \ ++- y_c = (y_cg << 2) + scan_y_off[n]; \ ++- } while (0) ++- HEVCLocalContext *lc = s->HEVClc; ++- int transform_skip_flag = 0; +++ HEVCLocalContext * const lc = s->HEVClc; +++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; + +-@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ int last_significant_coeff_x, last_significant_coeff_y; ++- int last_scan_pos; ++- int n_end; ++ int num_coeff = 0; ++- int greater1_ctx = 1; +++ int prev_subset_coded = 0; + +- if (sao_eo_class != SAO_EO_VERT) { +- if (borders[0]) { +-- int offset_val = sao_offset_val[0]; +- for (y = 0; y < height; y++) { +-- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +-+ dst[y * stride_dst] = src[y * stride_src]; +- } +- init_x = 1; +- } +- if (borders[2]) { +-- int offset_val = sao_offset_val[0]; +- int offset = width - 1; +- for (x = 0; x < height; x++) { +-- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +-+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; +- } +- width--; +- } +- } +- if (sao_eo_class != SAO_EO_HORIZ) { +- if (borders[1]) { +-- int offset_val = sao_offset_val[0]; +- for (x = init_x; x < width; x++) +-- dst[x] = av_clip_pixel(src[x] + offset_val); +-+ dst[x] = src[x]; +- } +- if (borders[3]) { +-- int offset_val = sao_offset_val[0]; +-- int y_stride_dst = stride_dst * (height - 1); +-- int y_stride_src = stride_src * (height - 1); +-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +-+ ptrdiff_t y_stride_src = stride_src * (height - 1); +- for (x = init_x; x < width; x++) 
+-- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +-+ dst[x + y_stride_dst] = src[x + y_stride_src]; +- height--; +- } +- } +-@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +- int x, y; +- pixel *dst = (pixel *)_dst; +- pixel *src = (pixel *)_src; +-- int16_t *sao_offset_val = sao->offset_val[c_idx]; +- int sao_eo_class = sao->eo_class[c_idx]; +- int init_x = 0, init_y = 0, width = _width, height = _height; ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; + +-@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off; +++ const uint8_t *scan_x_cg, *scan_y_cg; +++ const xy_off_t * scan_xy_off; + +- if (sao_eo_class != SAO_EO_VERT) { +- if (borders[0]) { +-- int offset_val = sao_offset_val[0]; +- for (y = 0; y < height; y++) { +-- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +-+ dst[y * stride_dst] = src[y * stride_src]; +- } +- init_x = 1; +- } +- if (borders[2]) { +-- int offset_val = sao_offset_val[0]; +- int offset = width - 1; +- for (x = 0; x < height; x++) { +-- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +-+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; +- } +- width--; +- } +- } +- if (sao_eo_class != SAO_EO_HORIZ) { +- if (borders[1]) { +-- int offset_val = sao_offset_val[0]; +- for (x = init_x; x < width; x++) +-- dst[x] = av_clip_pixel(src[x] + offset_val); +-+ dst[x] = src[x]; +- init_y = 1; +- } +- if (borders[3]) { +-- int offset_val = sao_offset_val[0]; +-- int y_stride_dst = stride_dst * (height - 1); +-- int y_stride_src = stride_src * (height - 1); +-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +-+ ptrdiff_t y_stride_src = stride_src * (height - 1); +- for (x = init_x; x < width; x++) +-- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +-+ dst[x + y_stride_dst] = src[x + y_stride_src]; +- height--; +- } +- } +-@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +- } +- } +++#ifndef RPI ++ ptrdiff_t stride = s->frame->linesize[c_idx]; ++ int hshift = s->ps.sps->hshift[c_idx]; ++ int vshift = s->ps.sps->vshift[c_idx]; ++- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ ((x0 >> hshift) << s->ps.sps->pixel_shift)]; ++- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++- uint8_t significant_coeff_group_flag[8][8] = {{0}}; +++#endif +++#ifdef RPI +++ int use_vpu; +++ int use_dc = 0; +++#endif +++ int16_t *coeffs; +++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; + ++ int trafo_size = 1 << log2_trafo_size; ++ int i; ++- int qp,shift,add,scale,scale_m; +++ int qp,shift,scale; ++ static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 }; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; ++ int pred_mode_intra = (c_idx == 0) ? 
lc->tu.intra_pred_mode : ++ lc->tu.intra_pred_mode_c; ++ ++- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); +++ int prev_sig = 0; +++ const int c_idx_nz = (c_idx != 0); + + +-+// --- Plaited chroma versions +-+ +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#else +-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height) +-+{ +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int offset_table_u[32] = { 0 }; +-+ int offset_table_v[32] = { 0 }; +-+ int k, y, x; +-+ int shift = BIT_DEPTH - 5; +-+ +-+ stride_dst /= sizeof(pixel); +-+ stride_src /= sizeof(pixel); +-+ width *= 2; +++ int may_hide_sign; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { ++- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +++ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; ++ static const uint8_t rem6[51 + 4 * 6 + 1] = { ++ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, ++ 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, ++@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ }; ++ int qp_y = lc->qp_y; ++ +++ may_hide_sign = s->ps.pps->sign_data_hiding_flag; + + +-+ for (k = 0; k < 4; k++) +-+ { +-+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; +-+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; +-+ } +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x += 2) ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { ++- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx); +++ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz); +++ if (transform_skip_flag) { +++ trans_skip_or_bypass = 1; +++ if (lc->cu.pred_mode == MODE_INTRA && +++ s->ps.sps->implicit_rdpcm_enabled_flag && +++ (pred_mode_intra == 10 || pred_mode_intra == 26)) { +++ may_hide_sign = 0; +++ } +++ } ++ } ++ ++ if (c_idx == 0) { ++@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ qp += s->ps.sps->qp_bd_offset; ++ } ++ ++- shift = s->ps.sps->bit_depth + log2_trafo_size - 5; ++- add = 1 << (shift-1); ++- scale = level_scale[rem6[qp]] << (div6[qp]); ++- scale_m = 16; // default when no custom scaling lists. ++- dc_scale = 16; +++ // Shift is set to one less than will actually occur as the scale +++ // and saturate step adds 1 and then shifts right again +++ shift = s->ps.sps->bit_depth + log2_trafo_size - 6; +++ scale = level_scale[rem6[qp]]; +++ if (div6[qp] >= shift) { +++ scale <<= (div6[qp] - shift); +++ shift = 0; +++ } else { +++ shift -= div6[qp]; +++ } ++ ++- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { ++ const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? 
++- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ int matrix_id = lc->cu.pred_mode != MODE_INTRA; ++ ++ matrix_id = 3 * matrix_id + c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; +++ dc_scale = scale_matrix[0]; ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } +++ else + + { +-+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); +-+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); +++ static const uint8_t sixteen_scale[64] = { +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16 +++ }; +++ scale_matrix = sixteen_scale; +++ dc_scale = 16; + + } +-+ dst += stride_dst; +-+ src += stride_src; +-+ } +-+} +-+#endif ++ } else { +++ static const uint8_t unit_scale[64] = { +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ }; +++ scale_matrix = unit_scale; ++ shift = 0; ++- add = 0; ++- scale = 0; ++- dc_scale = 0; +++ scale = 2; // We will shift right to kill this +++ dc_scale = 1; + + +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, +-+ int eo, int width, int height) { +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#else +++ may_hide_sign = 0; ++ } ++ + + +-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, +-+ int eo, int width, int height) { + + +-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +-+ static const int8_t pos[4][2][2] = { +-+ { { -1, 0 }, { 1, 0 } }, // horizontal +-+ { { 0, -1 }, { 0, 1 } }, // vertical +-+ { { -1, -1 }, { 1, 1 } }, // 45 degree +-+ { { 1, -1 }, { -1, 1 } }, // 135 degree +-+ }; +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int a_stride, b_stride; +-+ int x, y; +-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); +-+ stride_dst /= sizeof(pixel); +-+ width *= 2; + + +-+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +-+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x += 2) { +-+ int diff0u = CMP(src[x], src[x + a_stride]); +-+ int diff1u = CMP(src[x], src[x + b_stride]); +-+ int offset_valu = edge_idx[2 + diff0u + diff1u]; +-+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +-+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +-+ int offset_valv = edge_idx[2 + diff0v + diff1v]; +-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); +-+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); +-+ } +-+ src += stride_src; +-+ dst += stride_dst; +-+ } +-+} +-+#endif ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); +++ trans_skip_or_bypass) { +++ explicit_rdpcm_flag = 
explicit_rdpcm_flag_decode(s, c_idx_nz); ++ if (explicit_rdpcm_flag) { ++- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx); +++ may_hide_sign = 0; +++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz); ++ } ++ } ++ ++- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size, +++ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size, ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { ++@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++- scan_x_off = ff_hevc_diag_scan4x4_x; ++- scan_y_off = ff_hevc_diag_scan4x4_y; ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; ++- if (trafo_size == 4) { + + +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, +-+ int *borders, int _width, int _height, +-+ int c_idx, uint8_t *vert_edge, +-+ uint8_t *horiz_edge, uint8_t *diag_edge) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, +-+ int *borders, int _width, int _height, +-+ int c_idx, uint8_t *vert_edge, +-+ uint8_t *horiz_edge, uint8_t *diag_edge) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +++ switch (log2_trafo_size) { +++ case 2: ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; ++- } else if (trafo_size == 8) { +++ break; +++ case 3: ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; ++- } else if (trafo_size == 16) { +++ break; +++ case 4: ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_diag_scan4x4_y; ++- } else { // trafo_size == 32 +++ break; +++ case 5: +++ default: ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_diag_scan8x8_y; +++ break; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++- scan_x_off = horiz_scan4x4_x; ++- scan_y_off = horiz_scan4x4_y; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++- scan_x_off = horiz_scan4x4_y; ++- scan_y_off = horiz_scan4x4_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ ++- for (i = num_last_subset; i >= 0; i--) { ++- int n, m; ++- int x_cg, y_cg, x_c, y_c, pos; ++- int implicit_non_zero_coeff = 0; ++- int64_t trans_coeff_level; ++- int prev_sig = 0; ++- int offset = i << 4; ++- int rice_init = 0; ++- ++- uint8_t significant_coeff_flag_idx[16]; ++- uint8_t nb_significant_coeff_flag = 0; +++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++- x_cg = scan_x_cg[i]; ++- y_cg = scan_y_cg[i]; +++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++- if ((i < num_last_subset) && (i > 0)) { ++- int ctx_cg = 0; ++- if (x_cg < (1 << (log2_trafo_size - 2)) - 1) ++- ctx_cg += 
significant_coeff_group_flag[x_cg + 1][y_cg];
++- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++ {
+++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+++#ifdef RPI
+++ use_vpu = 0;
+++ if (s->enable_rpi) {
+++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processing
+++ use_dc = (num_coeff == 1) && !special &&
+++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
++
++- significant_coeff_group_flag[x_cg][y_cg] =
++- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
++- implicit_non_zero_coeff = 1;
++- } else {
++- significant_coeff_group_flag[x_cg][y_cg] =
++- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
++- (x_cg == 0 && y_cg == 0));
+++ if (use_dc) {
+++ // Just need a little empty space
+++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++ // No need to clear
+++ }
+++ else
+++ {
+++ use_vpu = !special && log2_trafo_size >= 4;
+++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+++#if HAVE_NEON
+++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+ +#else
+-+// Any old 2 byte 'normal' restore will work for these
+-+#define sao_edge_restore_c_0_8 sao_edge_restore_0_10
+-+#define sao_edge_restore_c_1_8 sao_edge_restore_1_10
+++ memset(coeffs, 0, ccount * sizeof(int16_t));
+ +#endif
+++ }
++ }
+++ else
+++#endif
+++ {
+++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++ memset(coeffs, 0, ccount * sizeof(int16_t));
+++ }
+++ }
++
++- last_scan_pos = num_coeff - offset - 1;
+++ i = num_last_subset;
+++ do {
+++ int implicit_non_zero_coeff = 0;
+++ int n_end;
+ +
+-+
+- #undef CMP
+++ uint8_t significant_coeff_flag_idx[16];
+++ unsigned int nb_significant_coeff_flag = 0;
+
+- ////////////////////////////////////////////////////////////////////////////////
+-@@ -1694,3 +1900,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
+- #undef TQ1
+- #undef TQ2
+- #undef TQ3
++ if (i == num_last_subset) {
+++ // First time through
+++ int last_scan_pos = num_coeff - (i << 4) - 1;
++ n_end = last_scan_pos - 1;
++ significant_coeff_flag_idx[0] = last_scan_pos;
++ nb_significant_coeff_flag = 1;
++ } else {
++ n_end = 15;
+++ implicit_non_zero_coeff = (i != 0);
++ }
++
++- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
++- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
++- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
++- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
++-
++- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
++- static const uint8_t ctx_idx_map[] = {
++- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
++- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
++- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
++- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
++- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+++ if (n_end >= 0) {
+++ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+++ };
+++ // N.B.
prev_sig = Right * 2 + Down +++ static const uint8_t ctx_idx_maps[3][4][16] = { +++ { +++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ }, +++ { +++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ }, +++ { +++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; ++- if (s->ps.sps->transform_skip_context_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16]; ++- if (c_idx == 0) { ++- scf_offset = 40; ++- } else { ++- scf_offset = 14 + 27; ++- } + + +-+#ifdef RPI +++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +++ ctx_idx_map_p = ctx_idx_maps[0][3]; +++ scf_offset = 40 + c_idx_nz; ++ } else { ++- if (c_idx != 0) +++ if (c_idx_nz != 0) ++ scf_offset = 27; + + +-+// line zero +-+#define P3 pix_l[0 * xstride] +-+#define P2 pix_l[1 * xstride] +-+#define P1 pix_l[2 * xstride] +-+#define P0 pix_l[3 * xstride] +-+#define Q0 pix_r[0 * xstride] +-+#define Q1 pix_r[1 * xstride] +-+#define Q2 pix_r[2 * xstride] +-+#define Q3 pix_r[3 * xstride] ++ if (log2_trafo_size == 2) { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0]; +++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; ++ } else { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4]; ++- if (c_idx == 0) { ++- if ((x_cg > 0 || y_cg > 0)) +++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; +++ if (!c_idx_nz) { +++ if (i != 0) ++ scf_offset += 3; + + +-+// line three. used only for deblocking decision +-+#define TP3 pix_l[0 * xstride + 3 * ystride] +-+#define TP2 pix_l[1 * xstride + 3 * ystride] +-+#define TP1 pix_l[2 * xstride + 3 * ystride] +-+#define TP0 pix_l[3 * xstride + 3 * ystride] +-+#define TQ0 pix_r[0 * xstride + 3 * ystride] +-+#define TQ1 pix_r[1 * xstride + 3 * ystride] +-+#define TQ2 pix_r[2 * xstride + 3 * ystride] +-+#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; ++ } else { ++@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } ++ } ++- for (n = n_end; n > 0; n--) { ++- x_c = scan_x_off[n]; ++- y_c = scan_y_off[n]; ++- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) { ++- significant_coeff_flag_idx[nb_significant_coeff_flag] = n; ++- nb_significant_coeff_flag++; + + +-+// This is identical to hevc_loop_filter_luma except that the P/Q +-+// components are on separate pointers +-+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +-+ unsigned int _stride, unsigned int beta, const int32_t _tc[2], +-+ const uint8_t _no_p[2], const uint8_t _no_q[2], +-+ uint8_t * _pix_l) +-+{ +-+ int d, j; +-+ pixel *pix_l = (pixel *)_pix_l; +-+ pixel *pix_r = (pixel *)_pix_r; +-+ const ptrdiff_t xstride = 1; +-+ const ptrdiff_t ystride = _stride / sizeof(pixel); +-+ +-+ beta <<= BIT_DEPTH - 8; +-+ +-+ for (j = 0; j < 2; j++) { +-+ const int dp0 = abs(P2 - 2 * P1 + P0); +-+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); +-+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); +-+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); +-+ const int d0 = dp0 + dq0; +-+ const int d3 = dp3 + dq3; +-+ const int tc = _tc[j] << (BIT_DEPTH - 8); +-+ const int no_p = _no_p[j]; +-+ const int no_q = _no_q[j]; +-+ +-+ if (d0 + d3 >= beta) { +-+ pix_l += 4 * ystride; +-+ pix_r += 4 * ystride; +-+ continue; +-+ } else { +-+ const int beta_3 = beta >> 3; +-+ const int beta_2 = beta >> 2; +-+ const int tc25 = ((tc * 5 + 1) >> 1); +-+ +-+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && +-+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && +-+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { +-+ // strong filtering +-+ const int tc2 = tc << 1; +-+ for (d = 0; d < 4; d++) { +-+ const int p3 = P3; +-+ const int p2 = P2; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ const int q2 = Q2; +-+ const int q3 = Q3; +-+ if (!no_p) { +-+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); +-+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); +-+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); +-+ } +-+ if (!no_q) { +-+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); +-+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); +-+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); +-+ } +-+ pix_l += ystride; +-+ pix_r += ystride; +-+ } +-+ } else { // normal filtering +-+ int nd_p = 1; +-+ int nd_q = 1; +-+ const int tc_2 = tc >> 1; +-+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) +-+ nd_p = 2; +-+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) +-+ nd_q = 2; +-+ +-+ for (d = 0; d < 4; d++) { +-+ const int p2 = P2; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ const int q2 = Q2; +-+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; +-+ if (abs(delta0) < 10 * tc) { +-+ delta0 = av_clip(delta0, -tc, tc); +-+ if (!no_p) +-+ P0 = av_clip_pixel(p0 + delta0); +-+ if (!no_q) +-+ Q0 = av_clip_pixel(q0 - delta0); +-+ if (!no_p && nd_p > 1) { +-+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); +-+ P1 = av_clip_pixel(p1 + deltap1); +-+ } +-+ if (!no_q && nd_q > 1) { +-+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); +-+ Q1 = av_clip_pixel(q1 + deltaq1); +-+ } +-+ } +-+ pix_l += ystride; +-+ 
pix_r += ystride; +-+ } +-+ } +-+ } +-+ } +-+} +-+ +-+#undef TP3 +-+#undef TP2 +-+#undef TP1 +-+#undef TP0 +-+#undef TQ0 +-+#undef TQ1 +-+#undef TQ2 +-+#undef TQ3 +-+ +-+#undef P3 +-+#undef P2 +-+#undef P1 +-+#undef P0 +-+#undef Q0 +-+#undef Q1 +-+#undef Q2 +-+#undef Q3 +-+ +-+#define P1 pix_l[0 * xstride] +-+#define P0 pix_l[1 * xstride] +-+#define Q0 pix_r[0 * xstride] +-+#define Q1 pix_r[1 * xstride] +-+ +-+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, +-+ ptrdiff_t _ystride, const int32_t *_tc, +-+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) +-+{ +-+ int d, j, no_p, no_q; +-+ pixel *pix_l = (pixel *)_pix_l; +-+ pixel *pix_r = (pixel *)_pix_r; +-+ ptrdiff_t xstride = _xstride / sizeof(pixel); +-+ ptrdiff_t ystride = _ystride / sizeof(pixel); +++ if (n_end > 0) { +++ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc, +++ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, +++ n_end, ctx_idx_map_p, +++ significant_coeff_flag_idx + nb_significant_coeff_flag); + + +-+ for (j = 0; j < 2; j++) { +-+ const int tc = _tc[j] << (BIT_DEPTH - 8); +-+ if (tc <= 0) { +-+ pix_l += 4 * ystride; +-+ pix_r += 4 * ystride; +-+ continue; +-+ } +-+ no_p = _no_p[j]; +-+ no_q = _no_q[j]; +++ nb_significant_coeff_flag += cnt; +++ if (cnt != 0) { ++ implicit_non_zero_coeff = 0; ++ } ++ } + + +-+ for (d = 0; d < 4; d++) { +-+ int delta0; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); +-+ if (!no_p) +-+ P0 = av_clip_pixel(p0 + delta0); +-+ if (!no_q) +-+ Q0 = av_clip_pixel(q0 - delta0); +-+ pix_l += ystride; +-+ pix_r += ystride; +-+ } +-+ } +-+} ++ if (implicit_non_zero_coeff == 0) { ++- if (s->ps.sps->transform_skip_context_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- if (c_idx == 0) { ++- scf_offset = 42; ++- } else { ++- scf_offset = 16 + 27; ++- } +++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +++ scf_offset = 42 + c_idx_nz; ++ } else { ++ if (i == 0) { ++- if (c_idx == 0) ++- scf_offset = 0; ++- else ++- scf_offset = 27; +++ scf_offset = c_idx_nz ? 27 : 0; ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } ++- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) { +++ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } ++ ++- n_end = nb_significant_coeff_flag; ++- +++ if (nb_significant_coeff_flag != 0) { +++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | +++ ((i != 0 && !c_idx_nz) ? 
2 : 0) |
+++ prev_subset_coded;
+++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+++ (gt1_idx_delta << 2);
+++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+++ gt1_idx_delta;
+ +
+-+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+-+ unsigned int no_f)
+-+{
+-+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+-+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+-+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
+-+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
+-+}
+++ const unsigned int x_cg = scan_x_cg[i];
+++ const unsigned int y_cg = scan_y_cg[i];
+++ int16_t * const blk_coeffs = coeffs +
+++ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+++ // This calculation is 'wrong' for log2_trafo_size == 2
+++ // but that doesn't matter as in this case x_cg & y_cg
+++ // are always 0 so result is correct (0) anyway
+++ const uint8_t * const blk_scale = scale_matrix +
+++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+ +
+-+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+-+ uint8_t * src_l,
+-+ unsigned int no_f)
+-+{
+-+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+-+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+-+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
+-+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
+-+}
+++ // * The following code block doesn't deal with these flags:
+++ // (nor did the one it replaces)
+++ //
+++ // cabac_bypass_alignment_enabled_flag
+++ // This should be easy but I can't find a test case
+++ // extended_precision_processing_flag
+++ // This can extend the required precision past 16bits
+++ // so is probably tricky - also no example found yet
+ +
+-+#undef P1
+-+#undef P0
+-+#undef Q0
+-+#undef Q1
+++#if USE_N_END_1
+++ if (nb_significant_coeff_flag == 1) {
+++ // There is a small gain to be had from special casing the single
+++ // transform coefficient case. The reduction in complexity
+++ // makes up for the code duplication.
+ + +++ int trans_coeff_level = 1; +++ int coeff_sign_flag; +++ int coded_val = 0; + + +-+#endif +++ // initialize first elem of coeff_bas_level_greater1_flag +++ prev_subset_coded = 0; + + +-diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +-index 02c1766..cea16ea 100644 +---- a/libavcodec/hevcpred.c +-+++ b/libavcodec/hevcpred.c +-@@ -24,6 +24,7 @@ +- +- #include "hevcpred.h" +- +-+#define PRED_C 0 +- #define BIT_DEPTH 8 +- #include "hevcpred_template.c" +- #undef BIT_DEPTH +-@@ -39,13 +40,37 @@ +- #define BIT_DEPTH 12 +- #include "hevcpred_template.c" +- #undef BIT_DEPTH +-+#undef PRED_C +-+ +-+#ifdef RPI +-+#define PRED_C 1 +-+#define BIT_DEPTH 8 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 9 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 10 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 12 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+#undef PRED_C +-+#endif +- +- void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) +- { +- #undef FUNC +- #define FUNC(a, depth) a ## _ ## depth +- +--#define HEVC_PRED(depth) \ +-+#undef FUNCC +-+#define FUNCC(a, depth) a ## _ ## depth ## _c +-+ +-+#define HEVC_PRED_Y(depth) \ +- hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ +- hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ +- hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +-@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) +- hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ +- hpc->pred_angular[3] = FUNC(pred_angular_3, depth); +++ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) { +++ trans_coeff_level = 2; +++ prev_subset_coded = 1; +++ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); +++ } + +-+#define HEVC_PRED_C(depth) \ +-+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ +-+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ +-+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ +-+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ +-+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ +-+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ +-+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ +-+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ +-+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ +-+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ +-+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ +-+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ +-+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); +-+ +-+#ifdef RPI +-+#define HEVC_PRED(depth) \ +-+ HEVC_PRED_Y(depth); \ +-+ HEVC_PRED_C(depth); +-+#else +-+#define HEVC_PRED(depth) \ +-+ HEVC_PRED_Y(depth); +-+#endif +-+ +- switch (bit_depth) { +- case 9: +- HEVC_PRED(9); +-diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +-index eb17663..00ba3f9 100644 +---- a/libavcodec/hevcpred.h +-+++ b/libavcodec/hevcpred.h +-@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { +- void (*pred_angular[4])(uint8_t *src, const uint8_t *top, +- const uint8_t *left, ptrdiff_t stride, +- int c_idx, int mode); +-+#ifdef RPI +-+ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); +-+ +-+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, +-+ const uint8_t *left, ptrdiff_t stride); +-+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, +-+ ptrdiff_t stride, int log2_size, int c_idx); +-+ void (*pred_angular_c[4])(uint8_t *src, const 
uint8_t *top, +-+ const uint8_t *left, ptrdiff_t stride, +-+ int c_idx, int mode); +-+#endif +- } HEVCPredContext; ++- if (n_end) { ++- int first_nz_pos_in_cg; ++- int last_nz_pos_in_cg; ++- int c_rice_param = 0; ++- int first_greater1_coeff_idx = -1; ++- uint8_t coeff_abs_level_greater1_flag[8]; ++- uint16_t coeff_sign_flag; ++- int sum_abs = 0; ++- int sign_hidden; ++- int sb_type; +++ // Probably not worth the overhead of starting by22 for just one value +++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + +- void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); +-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c +-index 6ae87cc..c14dddd 100644 +---- a/libavcodec/hevcpred_template.c +-+++ b/libavcodec/hevcpred_template.c +-@@ -20,13 +20,55 @@ +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +++ if (coded_val) +++ { +++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { +++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0); +++ } else { +++ uint8_t * const stat_coeff = +++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +++ const unsigned int c_rice_param = *stat_coeff >> 2; +++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); + +-+//#define DISABLE_INTRA +-+ +- #include "libavutil/pixdesc.h" ++- // initialize first elem of coeff_bas_level_greater1_flag ++- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; +++ trans_coeff_level = 3 + last_coeff_abs_level_remaining; +++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +++ } +++ } + +- #include "bit_depth_template.c" +- #include "hevcpred.h" ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { ++- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) ++- sb_type = 2 * (c_idx == 0 ? 1 : 0); ++- else ++- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; ++- c_rice_param = lc->stat_coeff[sb_type] / 4; ++- } +++ { +++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; +++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; +++ const unsigned int scale_m = blk_scale[xy_off->scale]; + +-+#ifdef RPI +-+#include "rpi_zc.h" ++- if (!(i == num_last_subset) && greater1_ctx == 0) ++- ctx_set++; ++- greater1_ctx = 1; ++- last_nz_pos_in_cg = significant_coeff_flag_idx[0]; ++- ++- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) { ++- int inc = (ctx_set << 2) + greater1_ctx; ++- coeff_abs_level_greater1_flag[m] = ++- coeff_abs_level_greater1_flag_decode(s, c_idx, inc); ++- if (coeff_abs_level_greater1_flag[m]) { ++- greater1_ctx = 0; ++- if (first_greater1_coeff_idx == -1) ++- first_greater1_coeff_idx = m; ++- } else if (greater1_ctx > 0 && greater1_ctx < 3) { ++- greater1_ctx++; +++ blk_coeffs[xy_off->coeff] = trans_scale_sat( +++ (trans_coeff_level ^ k) - k, // Apply sign +++ scale, +++ i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, +++ shift); ++ } ++ } ++- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1]; ++- ++- if (lc->cu.cu_transquant_bypass_flag || ++- (lc->cu.pred_mode == MODE_INTRA && ++- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag && ++- (pred_mode_intra == 10 || pred_mode_intra == 26 )) || ++- explicit_rdpcm_flag) ++- sign_hidden = 0; ++ else ++- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4); + +#endif +++ { +++ int sign_hidden = may_hide_sign; +++ int levels[16]; // Should be able to get away with int16_t but that fails some tests +++ uint32_t coeff_sign_flags; +++ uint32_t coded_vals = 0; +++ // Sum(abs(level[])) +++ // In fact we only need the bottom bit and in some future +++ // version that may be all we calculate +++ unsigned int sum_abs; + + +-+#define DUMP_PRED 0 +++ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels, +++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); + + +- #define POS(x, y) src[(x) + stride * (y)] +- +-+#if PRED_C +++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) +++ sign_hidden = 0; + + +-+typedef uint8_t (* c8_dst_ptr_t)[2]; +-+typedef const uint8_t (* c8_src_ptr_t)[2]; +++ // -- Start bypass block + + +-+#if BIT_DEPTH == 8 +-+#undef BIT_DEPTH +-+#define BIT_DEPTH 16 +-+#include "bit_depth_template.c" +-+#undef FUNC +-+#define FUNC(a) FUNC3(a, 8, _c) +-+#else +-+#undef FUNC +-+#define FUNC FUNCC +-+#endif +++ bypass_start(s); + + +-+#endif +++ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden); + + +-+#if DUMP_PRED +-+#ifndef DEBUG_ONCE +-+#define DEBUG_ONCE +-+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +-+{ +-+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { +-+ for (unsigned int x = 0; x != size; x++) { +-+ printf("%4d", data[x * 2]); +-+ } +-+ printf("\n"); +-+ } +-+ printf("\n"); +-+} +-+#endif +-+#endif +++ if (coded_vals != 0) +++ { +++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; +++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : +++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +++ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; +++ int * level = levels - 1; + + +- static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, +- int log2_size, int c_idx) +- { +-@@ -69,8 +111,11 @@ do { \ +- AV_WN4P(&ptr[i], a); \ +- else \ +- a = PIXEL_SPLAT_X4(ptr[i + 3]) +-- +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? 
&s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +-+#else +- HEVCLocalContext *lc = s->HEVClc; +-+#endif +- int i; +- int hshift = s->ps.sps->hshift[c_idx]; +- int vshift = s->ps.sps->vshift[c_idx]; +-@@ -79,15 +124,23 @@ do { \ +- int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; +- int size_in_luma_v = size << vshift; +- int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +-- int x = x0 >> hshift; +-- int y = y0 >> vshift; +-+ const int x = x0 >> hshift; +-+ const int y = y0 >> vshift; +- int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; +- int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; +- +- int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); +- +-- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +-+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +-+#if defined(RPI) +-+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? +-+ (pixel*)s->frame->data[c_idx] + x + y * stride : +-+ c_idx == 0 ? +-+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : +-+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); +-+#else +- pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +-+#endif +- +- int min_pu_width = s->ps.sps->min_pu_width; +- +-@@ -95,14 +148,20 @@ do { \ +- lc->tu.intra_pred_mode; +- pixel4 a; +- pixel left_array[2 * MAX_TB_SIZE + 1]; +-+#if !PRED_C +- pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; +-+#endif +- pixel top_array[2 * MAX_TB_SIZE + 1]; +-+#if !PRED_C +- pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; +-+#endif +++ do { +++ { +++ const unsigned int z = hevc_clz32(coded_vals) + 1; +++ level += z; +++ coded_vals <<= z; +++ } + +- pixel *left = left_array + 1; +- pixel *top = top_array + 1; +-+#if !PRED_C +- pixel *filtered_left = filtered_left_array + 1; +- pixel *filtered_top = filtered_top_array + 1; +-+#endif +- int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); +- int cand_left = lc->na.cand_left; +- int cand_up_left = lc->na.cand_up_left; +-@@ -114,6 +173,26 @@ do { \ +- int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - +- (x0 + size_in_luma_h)) >> hshift; ++- if (first_greater1_coeff_idx != -1) { ++- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set); ++- } ++- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) { ++- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag); ++- } else { ++- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1)); ++- } +++ { +++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); +++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; + +-+ pixel * src_l = src - 1; +-+ pixel * src_u = src - stride; +-+ pixel * src_ur = src_u + size; ++- for (m = 0; m < n_end; m++) { ++- n = significant_coeff_flag_idx[m]; ++- GET_COORD(offset, n); ++- if (m < 8) { ++- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m]; ++- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) { ++- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); ++- ++- trans_coeff_level += last_coeff_abs_level_remaining; ++- if (trans_coeff_level > (3 << c_rice_param)) ++- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? 
c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { ++- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; ++- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) ++- lc->stat_coeff[sb_type]++; ++- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) ++- if (lc->stat_coeff[sb_type] > 0) ++- lc->stat_coeff[sb_type]--; ++- rice_init = 1; +++ sum_abs += last_coeff_abs_level_remaining + 1; +++ *level = trans_coeff_level; + + +-+#ifdef DISABLE_INTRA +-+ return; +-+#endif +++ if (stat_coeff != NULL) +++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +++ stat_coeff = NULL; + + +-+#if defined(RPI) +-+ if (s->frame->format == AV_PIX_FMT_SAND128) { +-+ const AVFrame * const frame = s->frame; +-+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 +-+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; +-+ if ((x & mask) == 0) +-+ src_l -= stripe_adj; +-+ if (((x + size) & mask) == 0) +-+ src_ur += stripe_adj; +-+ } +++ if (trans_coeff_level > (3 << c_rice_param) && +++ (c_rice_param < 4 || rice_adaptation_enabled)) +++ ++c_rice_param; ++ } ++- } ++- } else { ++- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); ++- ++- trans_coeff_level = 1 + last_coeff_abs_level_remaining; ++- if (trans_coeff_level > (3 << c_rice_param)) ++- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { ++- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; ++- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) ++- lc->stat_coeff[sb_type]++; ++- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) ++- if (lc->stat_coeff[sb_type] > 0) ++- lc->stat_coeff[sb_type]--; ++- rice_init = 1; ++- } +++ } while (coded_vals != 0); ++ } ++- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) { ++- sum_abs += trans_coeff_level; ++- if (n == first_nz_pos_in_cg && (sum_abs&1)) ++- trans_coeff_level = -trans_coeff_level; +++ +++ // sign_hidden = 0 or 1 so we can combine the tests +++ if ((sign_hidden & sum_abs) != 0) { +++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; ++ } ++- if (coeff_sign_flag >> 15) ++- trans_coeff_level = -trans_coeff_level; ++- coeff_sign_flag <<= 1; ++- if(!lc->cu.cu_transquant_bypass_flag) { ++- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { ++- if(y_c || x_c || log2_trafo_size < 4) { ++- switch(log2_trafo_size) { ++- case 3: pos = (y_c << 3) + x_c; break; ++- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break; ++- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break; ++- default: pos = (y_c << 2) + x_c; break; ++- } ++- scale_m = scale_matrix[pos]; ++- } else { ++- scale_m = dc_scale; ++- } +++ +++ bypass_finish(s); +++ +++ // -- Finish bypass block +++ +++ // Scale loop +++ { +++ int m = nb_significant_coeff_flag - 1; +++ +++ // Deal with DC component (if any) first +++ if (i == 0 && significant_coeff_flag_idx[m] == 0) +++ { +++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +++ blk_coeffs[0] = trans_scale_sat( +++ (levels[m] ^ k) - k, scale, dc_scale, shift); +++ --m; ++ } ++- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift; ++- if(trans_coeff_level < 0) { ++- if((~trans_coeff_level) & 0xFffffffffff8000) ++- 
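/* [Illustrative sketch, not part of the patch.]  update_rice() above
 * presumably factors out the persistent-Rice statistic update that the
 * removed code did inline: stat_coeff/4 seeds the Rice parameter (see
 * "*stat_coeff >> 2" earlier), and the statistic drifts up or down
 * depending on how the first decoded remainder compares with it.  Setting
 * stat_coeff = NULL after the first call plays the role of the old
 * rice_init flag.  Sketch matching the removed rule:
 */
static inline void rice_stat_update(uint8_t *stat, int remaining)
{
    const int k = *stat >> 2;            /* c_rice_p_init in the old code */
    if (remaining >= (3 << k))
        ++*stat;
    else if (2 * remaining < (1 << k) && *stat > 0)
        --*stat;
}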
trans_coeff_level = -32768; ++- } else { ++- if(trans_coeff_level & 0xffffffffffff8000) ++- trans_coeff_level = 32767; +++ +++#if !USE_N_END_1 +++ // If N_END_1 set then m was at least 1 initially +++ if (m >= 0) + +#endif +++ { +++ do { +++ const xy_off_t * const xy_off = scan_xy_off + +++ significant_coeff_flag_idx[m]; +++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; + + +- if (s->ps.pps->constrained_intra_pred_flag == 1) { +- int size_in_luma_pu_v = PU(size_in_luma_v); +- int size_in_luma_pu_h = PU(size_in_luma_h); +-@@ -163,23 +242,24 @@ do { \ +- top[-1] = 128; +- } +- if (cand_up_left) { +-- left[-1] = POS(-1, -1); +-+ left[-1] = src_l[-stride]; +- top[-1] = left[-1]; +- } +- if (cand_up) +-- memcpy(top, src - stride, size * sizeof(pixel)); +-+ // Always good - even with sand +-+ memcpy(top, src_u, size * sizeof(pixel)); +- if (cand_up_right) { +-- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +-- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), +-+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); +-+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], +- size - top_right_size); +- } +- if (cand_left) +- for (i = 0; i < size; i++) +-- left[i] = POS(-1, i); +-+ left[i] = src_l[stride * i]; +- if (cand_bottom_left) { +- for (i = size; i < size + bottom_left_size; i++) +-- left[i] = POS(-1, i); +-- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), +-+ left[i] = src_l[stride * i]; +-+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], +- size - bottom_left_size); +- } +++ blk_coeffs[xy_off->coeff] = trans_scale_sat( +++ (levels[m] ^ k) - k, +++ scale, +++ blk_scale[xy_off->scale], +++ shift); +++ } while (--m >= 0); ++ } ++ } ++- coeffs[y_c * trafo_size + x_c] = trans_coeff_level; +++ ++ } ++ } ++- } +++ } while ((i = next_subset(s, i, c_idx_nz, +++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); + +-@@ -268,7 +348,11 @@ do { \ +- cand_up_left = 1; +- cand_left = 1; +- } else { // No samples available +-+#if PRED_C && BIT_DEPTH == 16 +-+ left[-1] = 0x8080; ++ if (lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { ++- if (transform_skip_flag) { +++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.idct_4x4_luma(coeffs); ++- } else { +++ } +++#ifdef RPI +++ else if (!use_vpu) + +#else +- left[-1] = (1 << (BIT_DEPTH - 1)); +++ else + +#endif +- EXTEND(top, left[-1], 2 * size); +- EXTEND(left, left[-1], 2 * size); +++ { ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) ++- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +++ { +++#ifdef RPI +++ if (use_dc) +++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); +++ else +++#endif +++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); +++ } ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++@@ -1510,7 +2279,14 @@ void 
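/* [Illustrative sketch, not part of the patch.]  trans_scale_sat (name from
 * the calls above) presumably fuses the removed 64-bit dequant multiply,
 * rounding add and int16 clamp into one helper; `add` was 1 << (shift - 1)
 * in the removed code.  At the call sites k is 0 or -1 (the sign bit for
 * scan position m, arithmetic-shifted down from bit 31 of coeff_sign_flags),
 * so (levels[m] ^ k) - k is a branchless level or -level.  Sketch under
 * those assumptions:
 */
static inline int16_t trans_scale_sat_sketch(int level, int scale,
                                             int scale_m, unsigned int shift)
{
    const int64_t v =
        (level * (int64_t)scale * scale_m + (1 << (shift - 1))) >> shift;
    return v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
}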
ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); + } +-@@ -287,6 +371,9 @@ do { \ +- top[-1] = left[-1]; +- +- // Filtering process +-+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to +-+ // worry about chroma smoothing for that case +-+#if !PRED_C +- if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { +- if (mode != INTRA_DC && size != 4){ +- int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +-@@ -342,13 +429,46 @@ do { \ +- mode); +- break; + } +-+#else +-+ switch (mode) { +-+ case INTRA_PLANAR: +-+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride); +-+ break; +-+ case INTRA_DC: +-+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride, log2_size, c_idx); +-+ break; +-+ default: +-+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride, c_idx, +-+ mode); +-+ break; +++#ifdef RPI +++ if (!use_dc) +++ { +++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); + + } +-+ +-+#if DUMP_PRED +-+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); +-+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); +-+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); +-+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); +-+#endif +++#else ++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); + +#endif + } + +-+#if !PRED_C || BIT_DEPTH == 16 +- #define INTRA_PRED(size) \ +- static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ +- { \ +- FUNC(intra_pred)(s, x0, y0, size, c_idx); \ +- } +-+#else +-+#define INTRA_PRED(size) \ +-+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ +-+{ \ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#endif ++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) ++diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c ++index 9fbcd1d8b8..df129e2e46 100644 ++--- a/libavcodec/hevc_filter.c +++++ b/libavcodec/hevc_filter.c ++@@ -22,6 +22,12 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + +- INTRA_PRED(2) +- INTRA_PRED(3) +-@@ -357,6 +477,7 @@ INTRA_PRED(5) +++//#define DISABLE_SAO +++//#define DISABLE_DEBLOCK +++//#define DISABLE_STRENGTHS +++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +++//#define DISABLE_DEBLOCK_NONREF +++ ++ #include "libavutil/common.h" ++ #include "libavutil/internal.h" + +- #undef INTRA_PRED ++@@ -31,6 +37,16 @@ + +-+#if !PRED_C +- static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, +- const uint8_t *_left, ptrdiff_t stride, +- int trafo_size) +-@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +- POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + +- (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); +- } ++ #include "bit_depth_template.c" ++ +++#ifdef RPI +++#include "rpi_qpu.h" +++#endif +++#if RPI_HEVC_SAND +++#include "rpi_zc.h" +++#include "libavutil/rpi_sand_fns.h" + +#else +-+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, +-+ const uint8_t * _left, ptrdiff_t stride, +-+ int trafo_size) +++#define RPI_ZC_SAND_8_IN_10_BUF 0 +++#endif +++ ++ #define LUMA 0 ++ #define CB 1 ++ #define 
CR 2 ++@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) ++ return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; ++ } ++ +++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) + +{ +-+ int x, y; +-+ int size = 1 << trafo_size; +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; +++#if RPI_HEVC_SAND +++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +++#else +++ return s->ps.sps->pixel_shift; +++#endif +++} + + +-+ for (y = 0; y < size; y++, src += stride) ++ static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, ++ intptr_t stride_dst, intptr_t stride_src) ++ { ++@@ -161,12 +186,21 @@ int i, j; ++ } ++ } ++ +++// "DSP" these? ++ static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) ++ { ++- if (pixel_shift) ++- *(uint16_t *)dst = *(uint16_t *)src; ++- else ++- *dst = *src; +++ switch (pixel_shift) + + { +-+ for (x = 0; x < size; x++) +-+ { +-+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + +-+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); +-+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + +-+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); +-+ } +++ case 2: +++ *(uint32_t *)dst = *(uint32_t *)src; +++ break; +++ case 1: +++ *(uint16_t *)dst = *(uint16_t *)src; +++ break; +++ default: +++ *dst = *src; +++ break; + + } +-+} +-+#endif ++ } + +-+#if !PRED_C || BIT_DEPTH == 16 +- #define PRED_PLANAR(size)\ +- static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ +- const uint8_t *left, ptrdiff_t stride) \ +- { \ +- FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++ static void copy_vert(uint8_t *dst, const uint8_t *src, ++@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, ++ int stride_dst, int stride_src) ++ { ++ int i; ++- if (pixel_shift == 0) { ++- for (i = 0; i < height; i++) { ++- *dst = *src; ++- dst += stride_dst; ++- src += stride_src; ++- } ++- } else { ++- for (i = 0; i < height; i++) { ++- *(uint16_t *)dst = *(uint16_t *)src; ++- dst += stride_dst; ++- src += stride_src; ++- } +++ switch (pixel_shift) +++ { +++ case 2: +++ for (i = 0; i < height; i++) { +++ *(uint32_t *)dst = *(uint32_t *)src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; +++ case 1: +++ for (i = 0; i < height; i++) { +++ *(uint16_t *)dst = *(uint16_t *)src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; +++ default: +++ for (i = 0; i < height; i++) { +++ *dst = *src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; ++ } + } +-+#else +-+#define PRED_PLANAR(size)\ +-+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ +-+ const uint8_t *left, ptrdiff_t stride) \ +-+{ \ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ +-+ abort(); \ +-+} +-+#endif + +- PRED_PLANAR(0) +- PRED_PLANAR(1) +-@@ -386,6 +540,7 @@ PRED_PLANAR(3) ++@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, ++ int stride_src, int x, int y, int width, int height, ++ int c_idx, int x_ctb, int y_ctb) ++ { ++- int sh = s->ps.sps->pixel_shift; +++ const unsigned int sh = pixel_shift(s, c_idx); ++ int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; ++ int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +- #undef PRED_PLANAR ++@@ 
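/* [Illustrative sketch, not part of the patch.]  The switch-based
 * copy_pixel/copy_vert above trade a generic inner memcpy for fixed-width
 * store loops (1, 2 or 4 bytes per row: 8-bit, 16-bit, and the new
 * interleaved-chroma case where pixel_shift() reports an extra bit for
 * sand frames).  The generic form they unroll, for comparison:
 */
#include <string.h>  /* memcpy */

static void copy_vert_generic(uint8_t *dst, const uint8_t *src,
                              int pixel_shift, int height,
                              ptrdiff_t stride_dst, ptrdiff_t stride_src)
{
    while (height-- > 0) {
        memcpy(dst, src, (size_t)1 << pixel_shift);  /* 1, 2 or 4 bytes */
        dst += stride_dst;
        src += stride_src;
    }
}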
-224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s, ++ int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); ++ int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); ++ int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); ++- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; +++ const unsigned int sh = pixel_shift(s, c_idx); +++ int len = (min_pu_size >> hshift) << sh; ++ for (y = y_min; y < y_max; y++) { ++ for (x = x_min; x < x_max; x++) { ++ if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { ++ int n; ++- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ for (n = 0; n < (min_pu_size >> vshift); n++) { ++ memcpy(src, dst, len); ++ src += stride_src; ++@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s, + +-+#if !PRED_C +- static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +- const uint8_t *_left, +- ptrdiff_t stride, int log2_size, int c_idx) +-@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +- POS(0, y) = (left[y] + 3 * dc + 2) >> 2; +- } +- } ++ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ { ++- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; +++#if SAO_FILTER_N == 5 +++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +++#elif SAO_FILTER_N == 6 +++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + +#else +-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +-+ const uint8_t *_left, +-+ ptrdiff_t stride, int log2_size, int c_idx) +-+{ +-+ unsigned int i, j; +-+ const unsigned int size = (1 << log2_size); +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; +-+ unsigned int dc0 = size; +-+ unsigned int dc1 = size; +++#error Confused by size of sao fn array +++#endif ++ HEVCLocalContext *lc = s->HEVClc; ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ uint8_t bottom_tile_edge = 0; +++#if RPI_HEVC_SAND +++ const int sliced = av_rpi_is_sand_frame(s->frame); +++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +++#else +++ const int plane_count = (s->ps.sps->chroma_format_idc ? 
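/* [Illustrative sketch, not part of the patch.]  sao_tab maps the CTB width
 * in 8-pel units (widths 8..64) to an entry in the SAO function tables; the
 * SAO_FILTER_N == 6 build gives 24-wide blocks their own specialised entry
 * (index 5) instead of falling back to the 32-wide one.  The lookup,
 * spelled out:
 */
static inline int sao_fn_index(int width)  /* width <= CTB size, as above */
{
    static const uint8_t sao_tab[8] = { 0, 1, 5, 2, 3, 3, 4, 4 }; /* N == 6 */
    return sao_tab[((width + 7) >> 3) - 1]; /* == (FFALIGN(width, 8) >> 3) - 1 */
}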
3 : 1); +++#endif ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ +++#ifdef DISABLE_SAO +++ return; +++#endif + + +-+ for (i = 0; i < size; i++) +-+ { +-+ dc0 += left[i][0] + top[i][0]; +-+ dc1 += left[i][1] + top[i][1]; +-+ } ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ } ++ } ++ ++- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { +++ for (c_idx = 0; c_idx < plane_count; c_idx++) { ++ int x0 = x >> s->ps.sps->hshift[c_idx]; ++ int y0 = y >> s->ps.sps->vshift[c_idx]; ++ int stride_src = s->frame->linesize[c_idx]; ++@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); ++ int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++- int stride_dst; +++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ +++#if RPI_HEVC_SAND +++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); +++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +++ uint8_t * const src = !sliced ? +++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); +++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : +++ !sliced ? src - (1 << sh) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); +++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : +++ !sliced ? src + (width << sh) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + +-+ dc0 >>= log2_size + 1; +-+ dc1 >>= log2_size + 1; + + +-+ for (i = 0; i < size; i++, src += stride) +-+ { +-+ for (j = 0; j < size; ++j) +-+ { +-+ src[j][0] = dc0; +-+ src[j][1] = dc1; +- +++ if (sliced && c_idx > 1) { +++ break; + + } +-+ } +-+} +-+#endif +-+ +-+#ifndef ANGLE_CONSTS +-+#define ANGLE_CONSTS +-+static const int intra_pred_angle[] = { +-+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +-+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +-+}; +-+static const int inv_angle[] = { +-+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +-+ -630, -910, -1638, -4096 +-+}; +++#else +++ const unsigned int sh = s->ps.sps->pixel_shift; +++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); +++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); + +#endif + + +-+#if !PRED_C +- static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- const uint8_t *_top, +- const uint8_t *_left, +-@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- const pixel *top = (const pixel *)_top; +- const pixel *left = (const pixel *)_left; +- +-- static const int intra_pred_angle[] = { +-- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +-- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +-- }; +-- static const int inv_angle[] = { +-- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +-- -630, -910, -1638, -4096 +-- }; +-- +- int angle = intra_pred_angle[mode - 2]; +- pixel ref_array[3 * MAX_TB_SIZE + 4]; +- pixel *ref_tmp = ref_array + size; +-@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- } +- } +- } +-+#else +-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +-+ const uint8_t *_top, +-+ const uint8_t *_left, +-+ ptrdiff_t stride, int c_idx, +-+ int mode, int size) +-+{ +-+ int x, y; +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ c8_src_ptr_t left = (c8_src_ptr_t)_left; +-+ +-+ const int angle = intra_pred_angle[mode - 2]; +-+ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; +-+ c8_dst_ptr_t ref_tmp = ref_array + size; +-+ c8_src_ptr_t ref; +-+ const int last = (size * angle) >> 5; +-+ +-+ if (mode >= 18) { +-+ ref = top - 1; +-+ if (angle < 0 && last < -1) { +-+ memcpy(ref_tmp, top - 1, (size + 1) * 2); +-+ for (x = last; x <= -1; x++) +-+ { +-+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +-+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +-+ } +-+ ref = (c8_src_ptr_t)ref_tmp; +-+ } +-+ +-+ for (y = 0; y < size; y++, src += stride) { +-+ const int idx = ((y + 1) * angle) >> 5; +-+ const int fact = ((y + 1) * angle) & 31; +-+ if (fact) { +-+ for (x = 0; x < size; ++x) { +-+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + +-+ fact * ref[x + idx + 2][0] + 16) >> 5; +-+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + +-+ fact * ref[x + idx + 2][1] + 16) >> 5; ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { ++- dst = lc->edge_emu_buffer; ++- stride_dst = 2*MAX_PB_SIZE; ++- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); ++- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++- sao->offset_val[c_idx], sao->band_position[c_idx], ++- width, height); ++- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++- x, y, width, height, c_idx); +++ dst = lc->edge_emu_buffer; +++ stride_dst = 2*MAX_PB_SIZE; +++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) +++ { +++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, +++ sao->offset_val[1], sao->band_position[1], +++ sao->offset_val[2], sao->band_position[2], +++ width, height); + + } +-+ } else { +-+ memcpy(src, ref + idx + 1, size * 2); +-+ } +-+ } +-+ } else { +-+ ref = left - 1; +-+ if (angle < 0 && last < -1) { +-+ memcpy(ref_tmp, left - 1, (size + 1) * 2); +-+ for (x = last; x <= -1; x++) +-+ { +-+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +-+ ref_tmp[x][1] = 
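/* [Illustrative sketch, not part of the patch.]  av_rpi_sand_frame_pos_y/_c
 * used above resolve (x, y) inside the "sand" column-striped layout: the
 * image is stored as vertical stripes of linesize[0] bytes, one whole
 * stripe after another, so rows are contiguous only within a stripe.
 * Sketch of the luma mapping, assuming the 8-bit SAND128 case with
 * stride2 = stripe height, as used by the av_rpi_sand_frame_stride2() call
 * a little further down this file:
 */
static inline uint8_t *sand_pos_y_sketch(const AVFrame *f,
                                         unsigned int x, unsigned int y)
{
    const unsigned int stride1 = f->linesize[0];               /* stripe width  */
    const unsigned int stride2 = av_rpi_sand_frame_stride2(f); /* stripe height */
    return f->data[0]
         + (size_t)(x / stride1) * stride1 * stride2  /* stripes left of x   */
         + (x % stride1)                              /* column within stripe */
         + (size_t)y * stride1;                       /* row within stripe    */
}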
top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +-+ } +-+ ref = (c8_src_ptr_t)ref_tmp; +-+ } +-+ +-+ for (x = 0; x < size; x++, src++) { +-+ const int idx = ((x + 1) * angle) >> 5; +-+ const int fact = ((x + 1) * angle) & 31; +-+ if (fact) { +-+ for (y = 0; y < size; y++) { +-+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + +-+ fact * ref[y + idx + 2][0] + 16) >> 5; +-+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + +-+ fact * ref[y + idx + 2][1] + 16) >> 5; +++ else +++#endif +++ { +++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +++ sao->offset_val[c_idx], sao->band_position[c_idx], +++ width, height); + + } +-+ } else { +-+ for (y = 0; y < size; y++) +++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +++ x, y, width, height, c_idx); ++ } else { ++- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++- sao->offset_val[c_idx], sao->band_position[c_idx], ++- width, height); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) + + { +-+ src[y * stride][0] = ref[y + idx + 1][0]; +-+ src[y * stride][1] = ref[y + idx + 1][1]; +++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); +++ +++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, +++ sao->offset_val[1], sao->band_position[1], +++ sao->offset_val[2], sao->band_position[2], +++ width, height); + + } +-+ } +-+ } +-+ } +-+} +++ else + +#endif +++ { +++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +++ sao->offset_val[c_idx], sao->band_position[c_idx], +++ width, height); +++ } ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ { ++ int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; ++ int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; ++- int left_edge = edges[0]; ++ int top_edge = edges[1]; ++- int right_edge = edges[2]; ++ int bottom_edge = edges[3]; ++- int sh = s->ps.sps->pixel_shift; ++- int left_pixels, right_pixels; + +- static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, +- const uint8_t *left, +-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +-index 099a8c5..bdff2d2 100644 +---- a/libavcodec/mmaldec.c +-+++ b/libavcodec/mmaldec.c +-@@ -24,6 +24,9 @@ +- * MMAL Video Decoder +- */ +- +-+#pragma GCC diagnostic push +-+// Many many redundant decls in the header files +-+#pragma GCC diagnostic ignored "-Wredundant-decls" +- #include +- #include +- #include +-@@ -31,6 +34,7 @@ +- #include +- #include +- #include +-+#pragma GCC diagnostic pop +- +- #include "avcodec.h" +- #include "internal.h" +-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c +-index 3adf28d..2f9195f 100644 +---- a/libavcodec/mpeg4videodec.c +-+++ b/libavcodec/mpeg4videodec.c +-@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +- +- if (ctx->divx_version >= 0) +- s->workaround_bugs |= FF_BUG_HPEL_CHROMA; +-+ +-+ if (ctx->num_sprite_warping_points > 1) +-+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; +- } +- +- if (s->workaround_bugs & FF_BUG_STD_QPEL) { +-@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +- s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, +- ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); +- +-+ avctx->workaround_bugs = s->workaround_bugs; +- if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && +- s->codec_id == AV_CODEC_ID_MPEG4 && +- avctx->idct_algo == FF_IDCT_AUTO) { +-diff --git a/libavcodec/raw.c b/libavcodec/raw.c +-index bfa2537..1bca89e 100644 +---- a/libavcodec/raw.c +-+++ b/libavcodec/raw.c +-@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +- { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, +- { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, ++ stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; ++ dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + +-+ /* RPI */ +-+#ifdef RPI +-+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, +-+#endif +-+ +- /* special */ +- { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ +- { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +-index d837056..81256b5 100644 +---- a/libavcodec/rawenc.c +-+++ b/libavcodec/rawenc.c +-@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS +- return 0; +- } ++ if (!top_edge) { ++- int left = 1 - left_edge; ++- int right = 1 - right_edge; ++- const uint8_t *src1[2]; ++ uint8_t *dst1; ++- int src_idx, pos; +++ int src_idx; +++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +-+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) +-+{ +-+ for (int y = 0; y != frame->height / 2; ++y) { +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; +-+ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; +-+ for (int i = 0; i < w; ++i) +-+ *dst++ = p[i * 2]; +-+ } +-+ } +-+ return dst; +-+} +-+ +-+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +-+ const AVFrame *frame) +-+{ +-+ int size = frame->width * frame->height * 3 / 2; +-+ uint8_t * dst; +-+ int ret; ++- dst1 = dst - stride_dst - (left << sh); ++- src1[0] = src - stride_src - (left << sh); ++- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); ++- pos = 0; ++- if (left) { +++ dst1 = dst - stride_dst; + + +-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +-+ return ret; +++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1, src1[src_idx], sh); ++- pos += (1 << sh); +++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); ++ } + + +-+ dst = pkt->data; ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); ++- if (right) { ++- pos += width << sh; +++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); + + +-+ // Luma is "easy" +-+ for (int y = 0; y != frame->height; ++y) { +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ const int w = FFMIN(frame->linesize[0], frame->width - x); +-+ memcpy(dst, +-+ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); +-+ dst += w; +-+ } +-+ } +-+ // Chroma is dull +-+ dst = cpy_sand_c(dst, frame, 0); +-+ dst = cpy_sand_c(dst, frame, 1); +++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +++ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r - stride_src, sh); ++ } ++ } ++ if (!bottom_edge) { ++- int left = 1 - left_edge; ++- int right = 1 - right_edge; ++- const uint8_t *src1[2]; ++- uint8_t *dst1; ++- int src_idx, pos; +++ uint8_t * const dst1 = dst + height * stride_dst; +++ int src_idx; +++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); +++ const unsigned int hoff = height * stride_src; ++ ++- dst1 = dst + height * stride_dst - (left << sh); ++- src1[0] = src + height * stride_src - (left << sh); ++- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); ++- pos = 0; ++- if (left) { +++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1, src1[src_idx], sh); ++- pos += (1 << sh); +++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); ++ } + + +-+ return 0; +-+} ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); ++- if (right) { ++- pos += width << sh; +++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); + + +- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- const AVFrame *frame, int *got_packet) +- { +-@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- if (ret < 0) +- return ret; +++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); ++ } ++ } ++- left_pixels = 0; ++- if (!left_edge) { +++ if (src_l != NULL) { ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++- left_pixels = 1; +++ copy_vert(dst - (1 << sh), +++ src_l, +++ sh, height, stride_dst, stride_src); ++ } ++ } ++- right_pixels = 0; ++- if (!right_edge) { +++ if (src_r != NULL) { ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++- right_pixels = 1; +++ copy_vert(dst + (width << sh), +++ src_r, +++ sh, height, stride_dst, stride_src); ++ } ++ } + +-+ if (frame->format == AV_PIX_FMT_SAND128) { +-+ ret = raw_sand_as_yuv420(avctx, pkt, frame); +-+ *got_packet = (ret == 0); +-+ return ret; ++- copy_CTB(dst - (left_pixels << sh), ++- src - (left_pixels << sh), ++- (width + left_pixels + right_pixels) << sh, +++ copy_CTB(dst, +++ src, +++ width << sh, ++ height, stride_dst, stride_src); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++- sao->eo_class[c_idx], width, height); ++- s->hevcdsp.sao_edge_restore[restore](src, dst, ++- stride_src, stride_dst, ++- sao, ++- edges, width, ++- height, c_idx, ++- vert_edge, ++- horiz_edge, ++- diag_edge); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) +++ { +++ // Class always the same for both U & V (which is just as well :-)) +++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, +++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], +++ width, height); +++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, +++ stride_src, 
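/* [Illustrative sketch, not part of the patch.]  The rewritten border logic
 * above keeps making the same choice: take the 1-pixel apron from
 * sao_pixel_buffer_h/v when the neighbouring CTB has already been
 * SAO-filtered (its frame pixels are no longer the pre-filter values),
 * otherwise straight from the frame; a NULL src_l/src_r now also encodes
 * "no left/right apron wanted".  The choice, factored out:
 */
static inline const uint8_t *apron_src(int neighbour_sao_applied,
                                       const uint8_t *saved_pre_sao,
                                       const uint8_t *frame_pixels)
{
    /* mirrors the repeated "src_idx ? src_spb ... : src ..." selections */
    return neighbour_sao_applied ? saved_pre_sao : frame_pixels;
}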
stride_dst, +++ sao, +++ edges, width, +++ height, c_idx, +++ vert_edge, +++ horiz_edge, +++ diag_edge); +++ } +++ else +++#endif +++ { +++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +++ sao->eo_class[c_idx], width, height); +++ s->hevcdsp.sao_edge_restore[restore](src, dst, +++ stride_src, stride_dst, +++ sao, +++ edges, width, +++ height, c_idx, +++ vert_edge, +++ horiz_edge, +++ diag_edge); +++ } +++ // ??? Does this actually work for chroma ??? ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ } ++ } ++ } +++ +++#if RPI_ZC_SAND_8_IN_10_BUF +++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && +++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) +++ { +++ const unsigned int stride1 = s->frame->linesize[0]; +++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); +++ const unsigned int xoff = (x >> 8) * stride2 * stride1; +++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); +++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; +++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; +++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; +++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; +++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); +++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; +++ +++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); +++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); +++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); + + } +++#endif ++ } ++ +++// Returns 2 or 0. ++ static int get_pcm(HEVCContext *s, int x, int y) ++ { ++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size; ++@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ uint8_t *src; ++ int x, y; ++ int chroma, beta; ++- int32_t c_tc[2], tc[2]; +++ int32_t c_tc[4], tc[2]; ++ uint8_t no_p[2] = { 0 }; ++ uint8_t no_q[2] = { 0 }; ++ ++@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ s->ps.sps->pcm.loop_filter_disable_flag) || ++ s->ps.pps->transquant_bypass_enable_flag; ++ +++#ifdef DISABLE_DEBLOCK_NONREF +++ if (!s->used_for_ref) +++ return; // Don't deblock non-reference frames +++#endif +++#ifdef DISABLE_DEBLOCK +++ return; +++#endif +++ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF) +++ return; ++ if (x0) { ++ left_tc_offset = s->deblock[ctb - 1].tc_offset; ++ left_beta_offset = s->deblock[ctb - 1].beta_offset; ++@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ ++ tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; ++ tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; ++- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x - 1, y); ++ no_p[1] = get_pcm(s, x - 1, y + 4); ++ no_q[0] = get_pcm(s, x, y); ++ no_q[1] = get_pcm(s, x, y + 4); ++- s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++- s->frame->linesize[LUMA], ++- beta, tc, no_p, no_q); ++- } else ++- s->hevcdsp.hevc_v_loop_filter_luma(src, ++- s->frame->linesize[LUMA], ++- beta, tc, no_p, no_q); +++ } +++#if RPI_HEVC_SAND +++ if (av_rpi_is_sand_frame(s->frame)) { + + +- if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) +- return ret; +- if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, +-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +-new file mode 100644 +-index 0000000..4309f1c +---- /dev/null +-+++ b/libavcodec/rpi_hevc_transform.h +-@@ -0,0 +1,3070 @@ +-+unsigned char rpi_hevc_transform [] = { +-+21, +-+106, +-+0, +-+144, +-+47, +-+1, +-+37, +-+106, +-+0, +-+144, +-+66, +-+1, +-+53, +-+106, +-+0, +-+144, +-+192, +-+4, +-+69, +-+106, +-+0, +-+144, +-+192, +-+4, +-+85, +-+106, +-+0, +-+144, +-+220, +-+5, +-+169, +-+3, +-+62, +-+64, +-+79, +-+64, +-+3, +-+232, +-+32, +-+0, +-+0, +-+0, +-+12, +-+248, +-+0, +-+136, +-+0, +-+0, +-+192, +-+248, +-+0, +-+0, +-+64, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+248, +-+0, +-+168, +-+0, +-+0, +-+192, +-+248, +-+0, +-+0, +-+0, +-+96, +-+3, +-+232, +-+32, +-+0, +-+0, +-+0, +-+7, +-+232, +-+0, +-+2, +-+0, +-+0, +-+8, +-+232, +-+0, +-+4, +-+0, +-+0, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+8, +-+4, +-+0, +-+4, +-+232, +-+64, +-+0, +-+0, +-+0, +-+5, +-+232, +-+0, +-+8, +-+0, +-+0, +-+128, +-+69, +-+113, +-+66, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+8, +-+4, +-+0, +-+128, +-+69, +-+113, +-+70, +-+128, +-+144, +-+40, +-+0, +-+4, +-+255, +-+48, +-+192, +-+128, +-+3, +-+32, +-+8, +-+16, +-+0, +-+76, +-+254, +-+48, +-+192, +-+9, +-+4, +-+32, +-+8, +-+0, +-+0, +-+4, +-+254, +-+0, +-+144, +-+128, +-+2, +-+0, +-+8, +-+2, +-+0, +-+128, +-+144, +-+23, +-+0, +-+4, +-+255, +-+48, +-+192, +-+128, +-+3, +-+32, +-+8, +-+20, +-+0, +-+76, +-+254, +-+48, +-+192, +-+4, +-+4, +-+32, +-+8, +-+0, +-+0, +-+140, +-+248, +-+44, +-+0, +-+0, +-+0, +-+32, +-+48, +-+4, +-+0, +-+128, +-+69, +-+113, +-+66, +-+242, +-+140, +-+211, +-+192, +-+34, +-+31, +-+41, +-+3, +-+70, +-+192, +-+80, +-+7, +-+164, +-+255, +-+36, +-+204, +-+96, +-+2, +-+0, +-+248, +-+62, +-+0, +-+3, +-+255, +-+55, +-+208, +-+120, +-+3, +-+224, +-+3, +-+190, +-+11, +-+16, +-+139, +-+246, +-+91, +-+0, +-+103, +-+90, +-+0, +-+70, +-+192, +-+80, +-+7, +-+164, +-+255, +-+36, +-+204, +-+224, +-+2, +-+0, +-+248, +-+62, +-+0, +-+3, +-+255, +-+55, +-+208, +-+120, +-+3, +-+224, +-+3, +-+190, +-+11, +-+16, +-+139, +-+246, +-+91, +-+0, +-+103, +-+90, +-+0, +-+225, +-+64, +-+242, +-+64, +-+3, +-+232, +-+128, +-+0, +-+0, +-+0, +-+7, +-+232, +-+0, +-+2, +-+0, +-+0, +-+57, +-+239, +-+224, +-+247, +-+255, +-+255, +-+72, +-+192, +-+95, +-+207, +-+88, +-+122, +-+88, +-+124, +-+137, +-+64, +-+26, +-+64, +-+4, +-+232, +-+64, +-+0, +-+0, +-+0, +-+149, +-+96, +-+161, +-+64, +-+152, +-+64, +-+128, +-+144, +-+35, +-+0, +-+72, +-+232, +-+0, +-+4, +-+0, +-+0, +-+65, +-+232, +-+32, +-+0, +-+0, +-+0, +-+128, +-+144, +-+27, +-+0, +-+4, +-+232, +-+0, +-+8, +-+0, +-+0, +-+69, +-+96, +-+145, +-+64, +-+168, +-+64, +-+128, +-+144, +-+19, +-+0, +-+72, +-+232, +-+0, +-+4, +-+0, +-+0, +-+65, +-+232, +-+32, +-+0, +-+0, +-+0, +-+128, +-+144, +-+11, +-+0, +-+74, +-+232, +-+0, +-+8, +-+0, +-+0, +-+242, 
+-+140, +-+221, +-+192, +-+57, +-+239, +-+32, +-+8, +-+0, +-+0, +-+41, +-+3, +-+239, +-+3, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+248, +-+4, +-+0, +-+12, +-+248, +-+0, +-+132, +-+64, +-+0, +-+192, +-+248, +-+4, +-+0, +-+0, +-+96, +-+255, +-+159, +-+154, +-+255, +-+0, +-+232, +-+0, +-+4, +-+0, +-+0, +-+255, +-+159, +-+165, +-+255, +-+4, +-+255, +-+48, +-+204, +-+16, +-+3, +-+224, +-+251, +-+62, +-+0, +-+4, +-+255, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+16, +-+0, +-+76, +-+254, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+20, +-+0, +-+128, +-+64, +-+6, +-+232, +-+64, +-+0, +-+0, +-+0, +-+140, +-+248, +-+47, +-+0, +-+0, +-+0, +-+224, +-+99, +-+0, +-+0, +-+32, +-+247, +-+240, +-+207, +-+16, +-+3, +-+32, +-+247, +-+176, +-+207, +-+17, +-+19, +-+32, +-+247, +-+112, +-+207, +-+18, +-+35, +-+32, +-+247, +-+48, +-+207, +-+19, +-+51, +-+32, +-+247, +-+240, +-+206, +-+20, +-+67, +-+32, +-+247, +-+176, +-+206, +-+21, +-+83, +-+32, +-+247, +-+112, +-+206, +-+22, +-+99, +-+32, +-+247, +-+48, +-+206, +-+23, +-+115, +-+32, +-+247, +-+240, +-+205, +-+24, +-+131, +-+32, +-+247, +-+176, +-+205, +-+25, +-+147, +-+32, +-+247, +-+112, +-+205, +-+26, +-+163, +-+32, +-+247, +-+48, +-+205, +-+27, +-+179, +-+32, +-+247, +-+240, +-+204, +-+28, +-+195, +-+32, +-+247, +-+176, +-+204, +-+29, +-+211, +-+32, +-+247, +-+112, +-+204, +-+30, +-+227, +-+32, +-+247, +-+48, +-+204, +-+31, +-+243, +-+4, +-+255, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+16, +-+0, +-+76, +-+254, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+20, +-+0, +-+0, +-+237, +-+32, +-+0, +-+0, +-+0, +-+140, +-+248, +-+47, +-+0, +-+0, +-+0, +-+224, +-+99, +-+0, +-+0, +-+111, +-+3, +-+4, +-+254, +-+0, +-+128, +-+0, +-+4, +-+0, +-+248, +-+0, +-+0, +-+2, +-+232, +-+32, +-+0, +-+0, +-+0, +-+140, +-+248, +-+32, +-+0, +-+0, +-+0, +-+224, +-+35, +-+0, +-+0, +-+64, +-+232, +-+0, +-+2, +-+0, +-+0, +-+193, +-+232, +-+0, +-+1, +-+0, +-+0, +-+1, +-+106, +-+116, +-+30, +-+90, +-+0, +-+169, +-+3, +-+73, +-+64, +-+52, +-+64, +-+45, +-+64, +-+2, +-+64, +-+10, +-+64, +-+64, +-+198, +-+1, +-+7, +-+8, +-+232, +-+63, +-+0, +-+0, +-+0, +-+6, +-+232, +-+253, +-+255, +-+255, +-+255, +-+0, +-+246, +-+0, +-+0, +-+0, +-+4, +-+215, +-+64, +-+3, +-+96, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+137, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+129, +-+0, +-+131, +-+102, +-+0, +-+158, +-+67, +-+0, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+108, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+100, +-+0, +-+131, +-+102, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+161, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+150, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+3, +-+99, 
+-+131, +-+71, +-+68, +-+232, +-+32, +-+0, +-+0, +-+0, +-+0, +-+99, +-+2, +-+99, +-+23, +-+102, +-+7, +-+106, +-+127, +-+156, +-+182, +-+255, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+112, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+101, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+25, +-+102, +-+9, +-+106, +-+2, +-+30, +-+41, +-+3, +-+26, +-+87, +-+162, +-+64, +-+64, +-+198, +-+1, +-+23, +-+127, +-+158, +-+103, +-+255, +-+239, +-+3, +-+0, +-+254, +-+0, +-+143, +-+92, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+143, +-+93, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+143, +-+94, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+95, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+208, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+209, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+142, +-+210, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+0, +-+142, +-+211, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+107, +-+0, +-+8, +-+255, +-+99, +-+23, +-+0, +-+212, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+23, +-+0, +-+228, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+227, +-+23, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+52, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+99, +-+52, +-+0, +-+164, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+52, +-+0, +-+148, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+239, +-+3, +-+0, +-+254, +-+0, +-+143, +-+12, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+143, +-+13, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+143, +-+14, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+15, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+16, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+17, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+142, +-+18, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+0, +-+142, +-+19, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+33, +-+0, +-+8, +-+255, +-+99, +-+3, +-+0, +-+212, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+3, +-+0, +-+228, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+227, +-+3, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+4, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+99, +-+4, +-+0, +-+164, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+4, +-+0, +-+148, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+32, +-+246, +-+192, +-+11, +-+1, +-+16, +-+32, +-+246, +-+2, +-+137, +-+47, +-+240, +-+40, +-+246, +-+2, +-+140, +-+47, +-+240, +-+128, +-+245, +-+99, +-+140, +-+5, +-+4, +-+0, +-+247, +-+99, +-+140, +-+1, +-+20, +-+88, +-+246, +-+99, +-+140, +-+1, +-+20, +-+0, +-+247, +-+35, +-+136, +-+62, +-+226, +-+32, +-+247, +-+35, +-+136, +-+32, +-+210, +-+0, +-+247, +-+34, +-+136, +-+63, +-+2, +-+208, +-+246, +-+34, +-+136, +-+0, +-+4, +-+0, +-+247, +-+99, +-+136, +-+58, +-+162, +-+32, +-+247, +-+99, +-+136, +-+33, +-+146, +-+0, +-+247, +-+98, +-+136, +-+59, +-+18, +-+208, +-+246, +-+98, +-+136, +-+0, +-+20, +-+0, +-+247, +-+162, +-+136, +-+33, +-+2, +-+88, +-+246, 
+-+98, +-+137, +-+2, +-+68, +-+88, +-+246, +-+162, +-+137, +-+3, +-+68, +-+208, +-+254, +-+227, +-+136, +-+60, +-+242, +-+192, +-+243, +-+188, +-+11, +-+208, +-+254, +-+227, +-+136, +-+56, +-+178, +-+192, +-+243, +-+188, +-+10, +-+32, +-+255, +-+226, +-+136, +-+38, +-+58, +-+192, +-+243, +-+60, +-+0, +-+208, +-+254, +-+227, +-+136, +-+59, +-+242, +-+192, +-+243, +-+60, +-+128, +-+32, +-+255, +-+226, +-+136, +-+49, +-+58, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+226, +-+136, +-+34, +-+34, +-+192, +-+243, +-+60, +-+128, +-+32, +-+255, +-+226, +-+136, +-+37, +-+58, +-+192, +-+243, +-+60, +-+128, +-+0, +-+254, +-+192, +-+136, +-+1, +-+4, +-+0, +-+240, +-+0, +-+160, +-+0, +-+255, +-+194, +-+8, +-+0, +-+52, +-+195, +-+243, +-+0, +-+128, +-+0, +-+255, +-+202, +-+40, +-+0, +-+52, +-+195, +-+243, +-+0, +-+128, +-+0, +-+254, +-+0, +-+240, +-+35, +-+10, +-+0, +-+240, +-+60, +-+0, +-+0, +-+254, +-+192, +-+136, +-+1, +-+4, +-+0, +-+240, +-+0, +-+160, +-+0, +-+255, +-+226, +-+140, +-+34, +-+34, +-+195, +-+243, +-+60, +-+0, +-+32, +-+255, +-+227, +-+140, +-+36, +-+58, +-+192, +-+243, +-+60, +-+0, +-+0, +-+254, +-+192, +-+136, +-+0, +-+4, +-+0, +-+240, +-+0, +-+160, +-+16, +-+246, +-+226, +-+136, +-+35, +-+50, +-+16, +-+246, +-+226, +-+136, +-+35, +-+50, +-+32, +-+246, +-+226, +-+136, +-+35, +-+50, +-+32, +-+254, +-+226, +-+136, +-+35, +-+58, +-+192, +-+243, +-+60, +-+0, +-+11, +-+96, +-+0, +-+254, +-+0, +-+240, +-+1, +-+4, +-+0, +-+240, +-+64, +-+115, +-+5, +-+106, +-+0, +-+144, +-+173, +-+1, +-+27, +-+96, +-+0, +-+254, +-+0, +-+240, +-+1, +-+4, +-+0, +-+240, +-+64, +-+147, +-+5, +-+106, +-+0, +-+144, +-+227, +-+0, +-+64, +-+246, +-+163, +-+140, +-+1, +-+4, +-+0, +-+246, +-+192, +-+175, +-+63, +-+2, +-+0, +-+246, +-+192, +-+174, +-+59, +-+2, +-+0, +-+246, +-+128, +-+175, +-+62, +-+2, +-+0, +-+246, +-+128, +-+174, +-+58, +-+2, +-+0, +-+246, +-+64, +-+175, +-+61, +-+2, +-+0, +-+246, +-+64, +-+174, +-+57, +-+2, +-+0, +-+255, +-+43, +-+240, +-+4, +-+212, +-+192, +-+243, +-+128, +-+11, +-+64, +-+254, +-+43, +-+240, +-+1, +-+228, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+244, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+180, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+164, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+191, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+235, +-+143, +-+52, +-+242, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+2, +-+212, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+191, +-+226, +-+192, +-+243, +-+188, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+180, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+2, +-+68, +-+32, +-+247, +-+35, +-+141, +-+190, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+171, +-+143, +-+52, +-+226, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+4, +-+180, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+191, +-+226, +-+192, +-+243, +-+188, +-+10, +-+128, +-+253, +-+43, +-+240, +-+3, +-+212, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+35, +-+141, +-+1, +-+196, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+189, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+107, +-+143, +-+52, +-+210, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+4, +-+148, +-+192, +-+243, +-+128, +-+11, +-+64, +-+254, 
[... byte-array dump elided: this stretch of the hunk replaces the patch's embedded VPU shader binary -- several thousand machine-generated "+-+N," lines with no hand-written content. The diff resumes mid-way through the hevc_filter.c deblocking changes. ...]
+++            // This copes properly with no_p/no_q
+++            s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
+++                                                 s->frame->linesize[LUMA],
+++                                                 beta, tc, no_p, no_q,
+++                                                 av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
+++        }
+++        else
+++#endif
+++        {
+++            src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++            if (pcmf) {
+++                // Standard DSP code is broken if no_p / no_q is set
+++                s->hevcdsp.hevc_v_loop_filter_luma_c(src,
+++                                                     s->frame->linesize[LUMA],
+++                                                     beta, tc, no_p, no_q);
+++            }
+++            else
+++#ifdef RPI_DEBLOCK_VPU
+++            if (s->enable_rpi_deblock) {
+++                uint8_t (*setup)[2][2][4];
+++                int num16 = (y>>4)*s->setup_width + (x>>4);
+++                int a = ((y>>3) & 1) << 1;
+++                int b = (x>>3) & 1;
+++                setup = s->dvq->y_setup_arm[num16];
+++                setup[0][b][0][a] = beta;
+++                setup[0][b][0][a + 1] = beta;
+++                setup[0][b][1][a] = tc[0];
+++                setup[0][b][1][a + 1] = tc[1];
+++            } else
+++#endif
+++            {
+++                s->hevcdsp.hevc_v_loop_filter_luma(src,
+++                                                   s->frame->linesize[LUMA],
+++                                                   beta, tc, no_p, no_q);
+++            }
+++        }
++         }
++     }
++ 
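Editorial note (not part of the patch): the RPI_DEBLOCK_VPU branches above and below do not filter pixels immediately; they bank the per-edge beta/tc parameters into a per-16x16-block setup record that the VideoCore VPU consumes later in rpi_deblock(). A minimal sketch of that indexing scheme, with hypothetical type and helper names (the real storage lives in the patch's dvq structures):

```c
#include <stdint.h>

/* Hypothetical mirror of the per-16x16 deblock parameter bank:
 * [0]=vertical / [1]=horizontal edges, then which 8-pixel half,
 * then 0=beta pair / 1=tc pair, then four byte lanes.            */
typedef uint8_t dvq_setup_t[2][2][2][4];

static void store_luma_v_edge(dvq_setup_t *y_setup_arm, unsigned int setup_width,
                              unsigned int x, unsigned int y,
                              uint8_t beta, const uint8_t tc[2])
{
    uint8_t (*setup)[2][2][4] = y_setup_arm[(y >> 4) * setup_width + (x >> 4)];
    const unsigned int a = ((y >> 3) & 1) << 1;  /* top or bottom 8 rows -> lane pair */
    const unsigned int b = (x >> 3) & 1;         /* left or right 8 columns           */

    setup[0][b][0][a]     = beta;   /* [0] selects the vertical-edge bank */
    setup[0][b][0][a + 1] = beta;
    setup[0][b][1][a]     = tc[0];
    setup[0][b][1][a + 1] = tc[1];
}
```

The horizontal-edge stores in the hunk below differ only in writing bank [1] and swapping the roles of x and y when deriving a and b.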
++@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++             beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)];
++             tc[0]   = bs0 ? TC_CALC(qp, bs0) : 0;
++             tc[1]   = bs1 ? TC_CALC(qp, bs1) : 0;
++-            src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
+++            src =
+++#if RPI_HEVC_SAND
+++                av_rpi_is_sand_frame(s->frame) ?
+++                    av_rpi_sand_frame_pos_y(s->frame, x, y) :
+++#endif
+++                &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)];
++             if (pcmf) {
++                 no_p[0] = get_pcm(s, x, y - 1);
++                 no_p[1] = get_pcm(s, x + 4, y - 1);
++@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                    s->frame->linesize[LUMA],
++                                                    beta, tc, no_p, no_q);
++             } else
+++#ifdef RPI_DEBLOCK_VPU
+++            if (s->enable_rpi_deblock) {
+++                uint8_t (*setup)[2][2][4];
+++                int num16 = (y>>4)*s->setup_width + (x>>4);
+++                int a = ((x>>3) & 1) << 1;
+++                int b = (y>>3) & 1;
+++                setup = s->dvq->y_setup_arm[num16];
+++                setup[1][b][0][a] = beta;
+++                setup[1][b][0][a + 1] = beta;
+++                setup[1][b][1][a] = tc[0];
+++                setup[1][b][1][a + 1] = tc[1];
+++            } else
+++#endif
++                 s->hevcdsp.hevc_h_loop_filter_luma(src,
++                                                    s->frame->linesize[LUMA],
++                                                    beta, tc, no_p, no_q);
++@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++         }
++     }
++ 
++     if (s->ps.sps->chroma_format_idc) {
+++#if RPI_HEVC_SAND
+++        if (av_rpi_is_sand_frame(s->frame)) {
+++            const int v = 2;
+++            const int h = 2;
+++
+++            // vertical filtering chroma
+++            for (y = y0; y < y_end; y += 8 * v) {
+++//                const int demi_y = y + 4 * v >= s->ps.sps->height;
+++                const int demi_y = 0;
+++                for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) {
+++                    const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2];
+++                    const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2];
+++
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1;
+++                        const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1;
+++                        unsigned int no_f = !demi_y ? 0 : 2 | 8;
+++
+++                        // tc_offset here should be set to cur_tc_offset I think
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x - 1, y) ? 1 : 0) |
+++                                (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) |
+++                                (get_pcm(s, x, y) ? 4 : 0) |
+++                                (get_pcm(s, x, y + 4 * v) ? 8 : 0);
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                          s->frame->linesize[1],
+++                                                          tc4,
+++                                                          av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
+++                                                          no_f);
+++                    }
+++                }
+++
+++                if (y == 0)
+++                    continue;
+++
+++                // horizontal filtering chroma
+++                tc_offset = x0 ? left_tc_offset : cur_tc_offset;
+++                x_end2 = x_end;
+++                if (x_end != s->ps.sps->width)
+++                    x_end2 = x_end - 8 * h;
+++
+++                for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) {
+++//                    const int demi_x = x + 4 * v >= s->ps.sps->width;
+++                    const int demi_x = 0;
+++
+++                    const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2];
+++                    const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2];
+++                    if ((bs0 == 2) || (bs1 == 2)) {
+++                        const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0;
+++                        const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0;
+++                        const uint32_t tc4 =
+++                            ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) |
+++                            ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8));
+++                        unsigned int no_f = !demi_x ? 0 : 2 | 8;
+++
+++                        if (tc4 == 0)
+++                            continue;
+++
+++                        if (pcmf) {
+++                            no_f =
+++                                (get_pcm(s, x, y - 1) ? 1 : 0) |
+++                                (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) |
+++                                (get_pcm(s, x, y) ? 4 : 0) |
+++                                (get_pcm(s, x + 4 * h, y) ? 8 : 0);
+++
+++                            if (no_f == 0xf)
+++                                continue;
+++                        }
+++
+++                        s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
+++                                                         s->frame->linesize[1],
+++                                                         tc4, no_f);
+++                    }
+++                }
+++            }
+++        }
+++        else
+++#endif
++         for (chroma = 1; chroma <= 2; chroma++) {
++             int h = 1 << s->ps.sps->hshift[chroma];
++             int v = 1 << s->ps.sps->vshift[chroma];
++@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                     c_tc[0]   = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0;
++                     c_tc[1]   = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0;
++-                    src       = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
+++                    src       =
+++#if RPI_HEVC_SAND
+++                        av_rpi_is_sand_frame(s->frame) ?
+++                            av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                        &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)];
++                     if (pcmf) {
++                         no_p[0] = get_pcm(s, x - 1, y);
++                         no_p[1] = get_pcm(s, x - 1, y + (4 * v));
++@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                              s->frame->linesize[chroma],
++                                                              c_tc, no_p, no_q);
++                     } else
+++#ifdef RPI_DEBLOCK_VPU
+++                    if (s->enable_rpi_deblock) {
+++                        uint8_t (*setup)[2][2][4];
+++                        int xc = x>>s->ps.sps->hshift[chroma];
+++                        int yc = y>>s->ps.sps->vshift[chroma];
+++                        int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                        int a = ((yc>>3) & 1) << 1;
+++                        int b = (xc>>3) & 1;
+++                        setup = s->dvq->uv_setup_arm[num16];
+++                        setup[0][b][0][a] = c_tc[0];
+++                        setup[0][b][0][a + 1] = c_tc[1];
+++                    } else
+++#endif
++                         s->hevcdsp.hevc_v_loop_filter_chroma(src,
++                                                              s->frame->linesize[chroma],
++                                                              c_tc, no_p, no_q);
+++
++                 }
++             }
++ 
++@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++ 
++                     c_tc[0]     = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset)     : 0;
++                     c_tc[1]     = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0;
++-                    src         = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
+++                    src         =
+++#if RPI_HEVC_SAND
+++                        av_rpi_is_sand_frame(s->frame) ?
+++                            av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) :
+++#endif
+++                        &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)];
++                     if (pcmf) {
++                         no_p[0] = get_pcm(s, x, y - 1);
++                         no_p[1] = get_pcm(s, x + (4 * h), y - 1);
++@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++                                                              s->frame->linesize[chroma],
++                                                              c_tc, no_p, no_q);
++                     } else
+++#ifdef RPI_DEBLOCK_VPU
+++                    if (s->enable_rpi_deblock) {
+++                        uint8_t (*setup)[2][2][4];
+++                        int xc = x>>s->ps.sps->hshift[chroma];
+++                        int yc = y>>s->ps.sps->vshift[chroma];
+++                        int num16 = (yc>>4)*s->uv_setup_width + (xc>>4);
+++                        int a = ((xc>>3) & 1) << 1;
+++                        int b = (yc>>3) & 1;
+++                        setup = s->dvq->uv_setup_arm[num16];
+++                        setup[1][b][0][a] = c_tc[0];
+++                        setup[1][b][0][a + 1] = c_tc[1];
+++                    } else
+++#endif
++                         s->hevcdsp.hevc_h_loop_filter_chroma(src,
++                                                              s->frame->linesize[chroma],
++                                                              c_tc, no_p, no_q);
++@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0)
++     }
++ }
++ 
++-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh,
++-                             RefPicList *neigh_refPicList)
++-{
++-    if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
++-        // same L0 and L1
++-        if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] &&
++-            s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] &&
++-            neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) {
++-            if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
++-                (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                 FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] &&
++-                   neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) {
++-            if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
++-                FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else {
++-            return 1;
++-        }
++-    } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++-        Mv A, B;
++-        int ref_A, ref_B;
++-
++-        if (curr->pred_flag & 1) {
++-            A     = curr->mv[0];
++-            ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]];
++-        } else {
++-            A     = curr->mv[1];
++-            ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]];
++-        }
++-
++-        if (neigh->pred_flag & 1) {
++-            B     = neigh->mv[0];
++-            ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]];
++-        } else {
++-            B     = neigh->mv[1];
++-            ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]];
++-        }
++-
++-        if (ref_A == ref_B) {
++-            if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4)
++-                return 1;
++-            else
++-                return 0;
++-        } else
++-            return 1;
++-    }
++-
++-    return 1;
++-}
++ 
++ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++                                            int log2_trafo_size)
++@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++     int log2_min_tu_size = s->ps.sps->log2_min_tb_size;
++     int min_pu_width     = s->ps.sps->min_pu_width;
++     int min_tu_width     = s->ps.sps->min_tb_width;
++-    int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width +
++-                           (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA;
++     int boundary_upper, boundary_left;
++-    int i, j, bs;
+++    int i, j;
+++    RefPicList *rpl = s->ref->refPicList;
+++    const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size);
+++    const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2);  // Dup
+++    const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup);  // Rep
+++    int y_pu = y0 >> log2_min_pu_size;
+++    int x_pu = x0 >> log2_min_pu_size;
+++    MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    int is_intra = curr->pred_flag == PF_INTRA;
+++    int inc = log2_min_pu_size == 2 ? 2 : 1;
+++    uint8_t *bs;
+++
+++#ifdef DISABLE_STRENGTHS
+++    return;
+++#endif
++ 
++     boundary_upper = y0 > 0 && !(y0 & 7);
++     if (boundary_upper &&
++@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++          (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_upper = 0;
++ 
+++    bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2];
+++
++     if (boundary_upper) {
++         RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ?
++                               ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) :
++-                              s->ref->refPicList;
++-        int yp_pu = (y0 - 1) >> log2_min_pu_size;
++-        int yq_pu =  y0      >> log2_min_pu_size;
++-        int yp_tu = (y0 - 1) >> log2_min_tu_size;
++-        int yq_tu =  y0      >> log2_min_tu_size;
+++                              rpl;
+++        MvField *top = curr - min_pu_width;
+++
+++        if (is_intra) {
+++            for (i = 0; i < (1 << log2_trafo_size); i += 4)
+++                bs[i >> 2] = 2;
+++
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width;
+++
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list,
+++                curr, top, bs);
++ 
++         for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-            int x_pu = (x0 + i) >> log2_min_pu_size;
++-            int x_tu = (x0 + i) >> log2_min_tu_size;
++-            MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-            MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-            uint8_t top_cbf_luma  = s->cbf_luma[yp_tu * min_tu_width + x_tu];
++-            uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu];
++-
++-            if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA)
++-                bs = 2;
++-            else if (curr_cbf_luma || top_cbf_luma)
++-                bs = 1;
++-            else
++-                bs = boundary_strength(s, curr, top, rpl_top);
++-            s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs;
+++            int i_pu = i >> log2_min_pu_size;
+++            int i_tu = i >> log2_min_tu_size;
+++
+++            if (top[i_pu].pred_flag == PF_INTRA)
+++                bs[i >> 2] = 2;
+++            else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu])
+++                bs[i >> 2] = 1;
++         }
+++        }
+++    }
+++
+++    if (!is_intra) {
+++        for (j = inc; j < trafo_in_min_pus; j += inc) {
+++            MvField *top;
+++
+++            curr += min_pu_width * inc;
+++            top = curr - min_pu_width;
+++            bs += s->bs_width * inc << log2_min_pu_size >> 2;
+++
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                min_pu_in_4pix, sizeof (MvField), 4 >> 2,
+++                rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                curr, top, bs);
+++        }
++     }
++ 
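Editorial note (not part of the patch): the rewrite above retires the per-4-pixel boundary_strength() calls in favour of one hevc_deblocking_boundary_strengths() sweep per PU row or column; edge direction is expressed purely through strides. A hedged sketch of the two call shapes, using simplified stub types (the real hook also receives the four ref-pic lists, omitted here):

```c
#include <stddef.h>
#include <stdint.h>

typedef struct MvFieldStub { int16_t mv[2][2]; int8_t ref_idx[2]; int8_t pred_flag; } MvFieldStub;

/* Simplified shape of the new DSP hook: score 'pus' edge segments,
 * duplicating each computed strength 'dup' times into bs.           */
typedef void (*bs_row_fn)(int pus, int dup, int in_inc, int out_inc,
                          const MvFieldStub *curr, const MvFieldStub *neigh,
                          uint8_t *bs);

static void score_tu_edges(bs_row_fn fn, const MvFieldStub *curr,
                           uint8_t *horizontal_bs, uint8_t *vertical_bs,
                           int min_pu_width, int bs_width, int pus, int dup)
{
    /* upper edge: neighbour is one PU row above; bs bytes are consecutive */
    fn(pus, dup, (int)sizeof(MvFieldStub), 1,
       curr, curr - min_pu_width, horizontal_bs);

    /* left edge: neighbour is the PU to the left; inputs step a whole PU
     * row at a time and bs bytes step one bs_width row at a time          */
    fn(pus, dup, (int)(min_pu_width * sizeof(MvFieldStub)), bs_width,
       curr, curr - 1, vertical_bs);
}
```

This matches the strides visible in the hunks: `sizeof (MvField)` / `4 >> 2` for horizontal edges and `min_pu_width * sizeof (MvField)` / `4 * s->bs_width >> 2` for vertical ones.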
++-    // bs for vertical TU boundaries
++     boundary_left = x0 > 0 && !(x0 & 7);
++     if (boundary_left &&
++         ((!s->sh.slice_loop_filter_across_slices_enabled_flag &&
++@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++           (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0)))
++         boundary_left = 0;
++ 
+++    curr = &tab_mvf[y_pu * min_pu_width + x_pu];
+++    bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2];
+++
++     if (boundary_left) {
++         RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ?
++                                ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) :
++-                               s->ref->refPicList;
++-        int xp_pu = (x0 - 1) >> log2_min_pu_size;
++-        int xq_pu =  x0      >> log2_min_pu_size;
++-        int xp_tu = (x0 - 1) >> log2_min_tu_size;
++-        int xq_tu =  x0      >> log2_min_tu_size;
++-
++-        for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-            int y_pu      = (y0 + i) >> log2_min_pu_size;
++-            int y_tu      = (y0 + i) >> log2_min_tu_size;
++-            MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-            MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
++-            uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu];
++-            uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu];
++-
++-            if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA)
++-                bs = 2;
++-            else if (curr_cbf_luma || left_cbf_luma)
++-                bs = 1;
++-            else
++-                bs = boundary_strength(s, curr, left, rpl_left);
++-            s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs;
++-        }
++-    }
++-
++-    if (log2_trafo_size > log2_min_pu_size && !is_intra) {
++-        RefPicList *rpl = s->ref->refPicList;
+++                               rpl;
+++        MvField *left = curr - 1;
++ 
++-        // bs for TU internal horizontal PU boundaries
++-        for (j = 8; j < (1 << log2_trafo_size); j += 8) {
++-            int yp_pu = (y0 + j - 1) >> log2_min_pu_size;
++-            int yq_pu = (y0 + j)     >> log2_min_pu_size;
+++        if (is_intra) {
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4)
+++                bs[j * s->bs_width >> 2] = 2;
++ 
++-            for (i = 0; i < (1 << log2_trafo_size); i += 4) {
++-                int x_pu = (x0 + i) >> log2_min_pu_size;
++-                MvField *top  = &tab_mvf[yp_pu * min_pu_width + x_pu];
++-                MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu];
++-
++-                bs = boundary_strength(s, curr, top, rpl);
++-                s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
+++        } else {
+++            int y_tu = y0 >> log2_min_tu_size;
+++            int x_tu = x0 >> log2_min_tu_size;
+++            uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu];
+++            uint8_t *left_cbf_luma = curr_cbf_luma - 1;
+++
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list,
+++                curr, left, bs);
+++
+++            for (j = 0; j < (1 << log2_trafo_size); j += 4) {
+++                int j_pu = j >> log2_min_pu_size;
+++                int j_tu = j >> log2_min_tu_size;
+++
+++                if (left[j_pu * min_pu_width].pred_flag == PF_INTRA)
+++                    bs[j * s->bs_width >> 2] = 2;
+++                else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width])
+++                    bs[j * s->bs_width >> 2] = 1;
++             }
++         }
+++    }
++ 
++-        // bs for TU internal vertical PU boundaries
++-        for (j = 0; j < (1 << log2_trafo_size); j += 4) {
++-            int y_pu = (y0 + j) >> log2_min_pu_size;
+++    if (!is_intra) {
+++        for (i = inc; i < trafo_in_min_pus; i += inc) {
+++            MvField *left;
++ 
++-            for (i = 8; i < (1 << log2_trafo_size); i += 8) {
++-                int xp_pu = (x0 + i - 1) >> log2_min_pu_size;
++-                int xq_pu = (x0 + i)     >> log2_min_pu_size;
++-                MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu];
++-                MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu];
+++            curr += inc;
+++            left = curr - 1;
+++            bs += inc << log2_min_pu_size >> 2;
++ 
++-                bs = boundary_strength(s, curr, left, rpl);
++-                s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs;
++-            }
+++            s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus,
+++                min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2,
+++                rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list,
+++                curr, left, bs);
++         }
++     }
++ }
++@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0,
++ #undef CB
++ #undef CR
++ 
+++#ifdef RPI_DEBLOCK_VPU
+++// ff_hevc_flush_buffer_lines
+++// flushes and invalidates all pixel rows in [start,end-1]
+++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma)
+++{
+++    rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++    rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++        0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma);
+++    rpi_cache_flush_finish(rfe);
+++}
+++#endif
+++
+++#if RPI_INTER
+++
+++// Flush some lines of a reference frames
+++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n)
+++{
+++    if (s->enable_rpi && s->used_for_ref) {
+++        const int d0 = ((int *)f->progress->data)[0];
+++        const unsigned int curr_y = d0 == -1 ? 0 : d0;  // At start of time progress is -1
+++
+++        if (curr_y < (unsigned int)s->ps.sps->height) {
+++            rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init();
+++            rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
+++                0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y,
+++                s->ps.sps->vshift[1], 1, 1);
+++            rpi_cache_flush_finish(rfe);
+++        }
+++    }
+++}
+++#endif
+++
+++#ifdef RPI_DEBLOCK_VPU
+++/* rpi_deblock deblocks an entire row of ctbs using the VPU */
+++static void rpi_deblock(HEVCContext *s, int y, int ctb_size)
+++{
+++    // Flush image, 4 lines above to bottom of ctb stripe
+++    ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1);
+++    // TODO flush buffer of beta/tc setup when it becomes cached
+++
+++    // Prepare three commands at once to avoid calling overhead
+++    s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y;
+++    s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0];
+++    s->dvq->vpu_cmds_arm[0][2] = s->setup_width;
+++    s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) );
+++    s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4;
+++    s->dvq->vpu_cmds_arm[0][5] = 2;
+++
+++    s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]);
+++    s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1];
+++    s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width;
+++    s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++    s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++    s->dvq->vpu_cmds_arm[1][5] = 3;
+++
+++    s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]);
+++    s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2];
+++    s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width;
+++    s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) );
+++    s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1];
+++    s->dvq->vpu_cmds_arm[2][5] = 4;
+++
+++    // Call VPU
+++    {
+++        const vpu_qpu_job_h vqj = vpu_qpu_job_new();
+++        vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5);  // 5 means to do all the commands
+++        vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id);
+++        vpu_qpu_job_finish(vqj);
+++    }
+++
+++    s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1);
+++    s->dvq = s->dvq_ents + s->dvq_n;
+++
+++    vpu_qpu_wait(&s->dvq->cmd_id);
+++}
+++
+++#endif
+++
++ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++ {
++     int x_end = x >= s->ps.sps->width - ctb_size;
+++#ifdef RPI_DEBLOCK_VPU
+++    int done_deblock = 0;
+++#endif
++     if (s->avctx->skip_loop_filter < AVDISCARD_ALL)
++         deblocking_filter_CTB(s, x, y);
+++#ifdef RPI_DEBLOCK_VPU
+++    if (s->enable_rpi_deblock && x_end)
+++    {
+++        int y_at_end = y >= s->ps.sps->height - ctb_size;
+++        int height = 64;  // Deblock in units 64 high to avoid too many VPU calls
+++        int y_start = y&~63;
+++        if (y_at_end) height = s->ps.sps->height - y_start;
+++        if ((((y+ctb_size)&63)==0) || y_at_end) {
+++            done_deblock = 1;
+++            rpi_deblock(s, y_start, height);
+++        }
+++    }
+++#endif
++     if (s->ps.sps->sao_enabled) {
++         int y_end = y >= s->ps.sps->height - ctb_size;
++         if (y && x)
++@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size)
++             sao_filter_CTB(s, x - ctb_size, y);
++         if (y && x_end) {
++             sao_filter_CTB(s, x, y - ctb_size);
++-            if (s->threads_type & FF_THREAD_FRAME )
++-                ff_thread_report_progress(&s->ref->tf, y, 0);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s,&s->ref->tf, y);
+++#endif
+++                ff_hevc_progress_signal_recon(s, y);
+++            }
++         }
++         if (x_end && y_end) {
++             sao_filter_CTB(s, x , y);
++-            if (s->threads_type & FF_THREAD_FRAME )
++-                ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0);
+++            if (s->threads_type == FF_THREAD_FRAME ) {
+++#if RPI_INTER
+++                rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size);
+++#endif
+++                ff_hevc_progress_signal_recon(s, y + ctb_size);
+++            }
++         }
++-    } else if (s->threads_type & FF_THREAD_FRAME && x_end)
++-        ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0);
+++    } else if (s->threads_type == FF_THREAD_FRAME && x_end) {
+++        //int newh = y + ctb_size - 4;
+++        //int currh = s->ref->tf.progress->data[0];
+++        //if (((y + ctb_size)&63)==0)
+++#ifdef RPI_DEBLOCK_VPU
+++        if (s->enable_rpi_deblock) {
+++            // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi
+++            if (done_deblock) {
+++                ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+++            }
+++        } else {
+++#if RPI_INTER
+++            rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+++#endif
+++            ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+++        }
+++#else
+++#if RPI_INTER
+++        rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4);
+++#endif
+++        ff_hevc_progress_signal_recon(s, y + ctb_size - 4);
+++#endif
+++    }
++ }
++ 
++ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size)
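Editorial note (not part of the patch): rpi_deblock() above drives the VPU with three 6-word commands, one per plane, then rotates through a small command queue. An illustrative view of one command as a struct -- field names here are hypothetical, the patch itself uses a bare int array:

```c
#include <stdint.h>

/* Illustrative layout of one 6-word VPU deblock command as built above;
 * the patch stores these as s->dvq->vpu_cmds_arm[i][0..5].              */
typedef struct vpu_deblock_cmd {
    uint32_t plane_vc_addr;  /* [0] VideoCore bus address of the first pixel row */
    uint32_t stride;         /* [1] plane line stride in bytes                   */
    uint32_t setup_width;    /* [2] beta/tc setup records per row of 16x16 blocks*/
    uint32_t setup_vc_addr;  /* [3] VideoCore address of the setup for this strip*/
    uint32_t blocks_high;    /* [4] strip height in 16-pixel units               */
    uint32_t op;             /* [5] 2 = luma, 3 = U, 4 = V (per the code above)  */
} vpu_deblock_cmd;
```

The dvq_n advance followed by vpu_qpu_wait() on the *next* queue slot keeps a small ring of in-flight jobs (RPI_DEBLOCK_VPU_Q_COUNT deep) rather than running fully synchronously.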
++diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c
++index 4a6dde0f67..8ee37ebfbc 100644
++--- a/libavcodec/hevc_mvs.c
+++++ b/libavcodec/hevc_mvs.c
++@@ -111,7 +111,7 @@ static av_always_inline int compare_mv_ref_idx(struct MvField A, struct MvField
++     return 0;
++ }
++ 
++-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
+++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb)
++ {
++     int tx, scale_factor;
++ 
++@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb)
++                                (scale_factor * src->y < 0)) >> 8);
++ }
++ 
++-static int check_mvset(Mv *mvLXCol, Mv *mvCol,
++-                       int colPic, int poc,
++-                       RefPicList *refPicList, int X, int refIdxLx,
++-                       RefPicList *refPicList_col, int listCol, int refidxCol)
+++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol,
+++                       const int colPic, const int poc,
+++                       const RefPicList * const refPicList, const int X, const int refIdxLx,
+++                       const RefPicList * const refPicList_col, const int listCol, const int refidxCol)
++ {
++     int cur_lt = refPicList[X].isLongTerm[refIdxLx];
++     int col_lt = refPicList_col[listCol].isLongTerm[refidxCol];
++@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol,
++                      refPicList_col, L ## l, temp_col.ref_idx[l])
++ 
++ // derive the motion vectors section 8.5.3.1.8
++-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
++-                                         int refIdxLx, Mv *mvLXCol, int X,
++-                                         int colPic, RefPicList *refPicList_col)
+++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col,
+++                                         const int refIdxLx, Mv * const mvLXCol, const int X,
+++                                         const int colPic, const RefPicList * const refPicList_col)
++ {
++-    RefPicList *refPicList = s->ref->refPicList;
+++    const RefPicList * const refPicList = s->ref->refPicList;
++ 
++     if (temp_col.pred_flag == PF_INTRA)
++         return 0;
++@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col,
++ /*
++  * 8.5.3.1.7 temporal luma motion vector prediction
++  */
++-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
++-                                       int nPbW, int nPbH, int refIdxLx,
++-                                       Mv *mvLXCol, int X)
+++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0,
+++                                       const int nPbW, const int nPbH, const int refIdxLx,
+++                                       Mv * const mvLXCol, const int X)
++ {
++     MvField *tab_mvf;
++     MvField temp_col;
++     int x, y, x_pu, y_pu;
++-    int min_pu_width = s->ps.sps->min_pu_width;
+++    const int min_pu_width = s->ps.sps->min_pu_width;
++     int availableFlagLXCol = 0;
++     int colPic;
++ 
++-    HEVCFrame *ref = s->ref->collocated_ref;
+++    HEVCFrame * const ref = s->ref->collocated_ref;
++ 
++-    if (!ref) {
+++    if (ref == NULL || ref->tab_mvf == NULL) {
++         memset(mvLXCol, 0, sizeof(*mvLXCol));
++         return 0;
++     }
++@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
++     x = x0 + nPbW;
++     y = y0 + nPbH;
++ 
++-    if (tab_mvf &&
++-        (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
+++    if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
++         y < s->ps.sps->height &&
++         x < s->ps.sps->width) {
++         x &= ~15;
++         y &= ~15;
++         if (s->threads_type == FF_THREAD_FRAME)
++-            ff_thread_await_progress(&ref->tf, y, 0);
+++            ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
++         x_pu = x >> s->ps.sps->log2_min_pu_size;
++         y_pu = y >> s->ps.sps->log2_min_pu_size;
++         temp_col = TAB_MVF(x_pu, y_pu);
++@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0,
++     }
++ 
++     // derive center collocated motion vector
++-    if (tab_mvf && !availableFlagLXCol) {
+++    if (!availableFlagLXCol) {
++         x = x0 + (nPbW >> 1);
++         y = y0 + (nPbH >> 1);
++         x &= ~15;
++         y &= ~15;
++         if (s->threads_type == FF_THREAD_FRAME)
++-            ff_thread_await_progress(&ref->tf, y, 0);
+++            ff_hevc_progress_wait_mv(s, s->jb0, ref, y);
++         x_pu = x >> s->ps.sps->log2_min_pu_size;
++         y_pu = y >> s->ps.sps->log2_min_pu_size;
++         temp_col = TAB_MVF(x_pu, y_pu);
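Editorial note (not part of the patch): mv_scale(), constified above, keeps its arithmetic unchanged -- the spec's POC-distance scaling of a motion vector by tb/td in Q8 fixed point. A self-contained sketch of that arithmetic, with the decoder's clips written out:

```c
#include <stdint.h>
#include <stdlib.h>

/* Standard HEVC temporal MV scaling (8.5.3): scale v by tb/td in Q8 fixed
 * point. td and tb are clipped POC deltas and td is never 0 at this point. */
static int16_t scale_component(int16_t v, int td, int tb)
{
    int tx = (0x4000 + abs(td / 2)) / td;
    int scale_factor = (tb * tx + 32) >> 6;
    if (scale_factor < -4096) scale_factor = -4096;
    if (scale_factor > 4095)  scale_factor = 4095;
    int r = (scale_factor * v + 127 + (scale_factor * v < 0)) >> 8;
    return (int16_t)(r < -32768 ? -32768 : r > 32767 ? 32767 : r);
}
/* Worked example: td = 2, tb = 1 gives tx = 8192 and scale_factor = 128,
 * i.e. the vector is halved (128/256 in Q8).                             */
```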
++diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c
++index c1b69a0199..455cdaea1c 100644
++--- a/libavcodec/hevc_ps.c
+++++ b/libavcodec/hevc_ps.c
++@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
++     switch (sps->bit_depth) {
++     case 8:
++         if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8;
+++#if RPI_HEVC_SAND
+++        // *** Horrid kludge s.t. we start out with sand format
+++        if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
+++#else
++         if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P;
+++#endif
++         if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P;
++         if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P;
++         break;
++@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps)
++         break;
++     case 10:
++         if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16;
+++#if RPI_HEVC_SAND
+++        // *** Horrid kludge s.t. we start out with sand format
+++        if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10;
+++#else
++         if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10;
+++#endif
++         if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10;
++         if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10;
++         break;
++@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++         skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
++         if (sps_extension_flag[0]) {
++             int extended_precision_processing_flag;
++-            int high_precision_offsets_enabled_flag;
++             int cabac_bypass_alignment_enabled_flag;
++ 
++             sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
++@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id,
++                        "extended_precision_processing_flag not yet implemented\n");
++ 
++             sps->intra_smoothing_disabled_flag       = get_bits1(gb);
++-            high_precision_offsets_enabled_flag = get_bits1(gb);
++-            if (high_precision_offsets_enabled_flag)
+++            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
+++            if (sps->high_precision_offsets_enabled_flag)
++                 av_log(avctx, AV_LOG_WARNING,
++-                       "high_precision_offsets_enabled_flag not yet implemented\n");
+++                       "high_precision_offsets_enabled_flag not fully implemented\n");
++ 
++             sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
++ 
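Editorial note (not part of the patch): the self-described "horrid kludge" above picks the GPU-friendly sand formats only when the coded size fits the accelerated path and falls back to planar YUV otherwise. The gate, isolated as a sketch with the thresholds the patch uses:

```c
/* Sketch of the format gate above: sand layouts are only chosen for sizes
 * the Pi's HEVC path handles; anything larger decodes to planar YUV.    */
enum fmt_pick { PICK_SAND, PICK_PLANAR };

static enum fmt_pick pick_hevc_fmt(int width, int height)
{
    return (width <= 2048 && height <= 1088) ? PICK_SAND : PICK_PLANAR;
}
```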
++diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
++index df52e401ad..992e994b1a 100644
++--- a/libavcodec/hevc_refs.c
+++++ b/libavcodec/hevc_refs.c
++@@ -23,7 +23,7 @@
++ 
++ #include "libavutil/avassert.h"
++ #include "libavutil/pixdesc.h"
++-
+++#include "libavutil/rpi_sand_fns.h"
++ #include "internal.h"
++ #include "thread.h"
++ #include "hevc.h"
++@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
++             HEVCFrame *frame = &s->DPB[min_idx];
++             AVFrame *dst = out;
++             AVFrame *src = frame->frame;
++-            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format);
+++            const int fmt = src->format;
+++            const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
++             int pixel_shift = !!(desc->comp[0].depth > 8);
++ 
++             ret = av_frame_ref(out, src);
++@@ -215,13 +216,31 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush)
++             ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++             if (ret < 0)
++                 return ret;
++-
++-            for (i = 0; i < 3; i++) {
++-                int hshift = (i > 0) ? desc->log2_chroma_w : 0;
++-                int vshift = (i > 0) ? desc->log2_chroma_h : 0;
++-                int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
++-                          (frame->window.top_offset >> vshift) * dst->linesize[i];
++-                dst->data[i] += off;
+++#ifdef RPI
+++            if (av_rpi_is_sand_format(fmt))
+++            {
+++                // Sand cannot be windowed by offset so add side data if we have an offset
+++                const HEVCWindow * const window = &frame->window;
+++                if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0)
+++                {
+++                    AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan));
+++                    AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
+++                    si->left_offset = window->left_offset;
+++                    si->top_offset = window->top_offset;
+++                    si->pic_width = s->ps.sps->width;
+++                    si->pic_height = s->ps.sps->height;
+++                }
+++            }
+++            else
+++#endif
+++            {
+++                for (i = 0; i < 3; i++) {
+++                    int hshift = (i > 0) ? desc->log2_chroma_w : 0;
+++                    int vshift = (i > 0) ? desc->log2_chroma_h : 0;
+++                    int off = ((frame->window.left_offset >> hshift) << pixel_shift) +
+++                              (frame->window.top_offset >> vshift) * dst->linesize[i];
+++                    dst->data[i] += off;
+++                }
++             }
++             av_log(s->avctx, AV_LOG_DEBUG,
++                    "Output frame with POC %d.\n", frame->poc);
++@@ -426,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc)
++     frame->sequence = s->seq_decode;
++     frame->flags    = 0;
++ 
++-    if (s->threads_type == FF_THREAD_FRAME)
++-        ff_thread_report_progress(&frame->tf, INT_MAX, 0);
+++    ff_hevc_progress_set_all_done(frame);
++ 
++     return frame;
++ }
++diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c
++index 9d773d960e..c9661c3ab1 100644
++--- a/libavcodec/hevcdsp.c
+++++ b/libavcodec/hevcdsp.c
++@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = {
++ #include "hevcdsp_template.c"
++ #undef BIT_DEPTH
++ 
+++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs)
+++{
+++    for (; pus > 0; pus--) {
+++        int strength, out;
+++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
+++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
+++        int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]];
+++        int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]];
+++
+++#if 1 // This more directly matches the original implementation
+++        if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
+++            // same L0 and L1
+++            if (curr_refL0 == neigh_refL0 &&
+++                curr_refL0 == curr_refL1 &&
+++                neigh_refL0 == neigh_refL1) {
+++                if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) &&
+++                    (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                     FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4))
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL0 == curr_refL0 &&
+++                       neigh_refL1 == curr_refL1) {
+++                if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else if (neigh_refL1 == curr_refL0 &&
+++                       neigh_refL0 == curr_refL1) {
+++                if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 ||
+++                    FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else {
+++                strength = 1;
+++            }
+++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
+++            Mv curr_mv0, neigh_mv0;
+++
+++            if (curr->pred_flag & 1) {
+++                curr_mv0 = curr->mv[0];
+++            } else {
+++                curr_mv0 = curr->mv[1];
+++                curr_refL0 = curr_refL1;
+++            }
+++
+++            if (neigh->pred_flag & 1) {
+++                neigh_mv0 = neigh->mv[0];
+++            } else {
+++                neigh_mv0 = neigh->mv[1];
+++                neigh_refL0 = neigh_refL1;
+++            }
+++
+++            if (curr_refL0 == neigh_refL0) {
+++                if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4)
+++                    strength = 1;
+++                else
+++                    strength = 0;
+++            } else
+++                strength = 1;
+++        } else
+++            strength = 1;
+++#else // This has exactly the same effect, but is more suitable for vectorisation
+++        Mv curr_mv[2];
+++        Mv neigh_mv[2];
+++        memcpy(curr_mv, curr->mv, sizeof curr_mv);
+++        memcpy(neigh_mv, neigh->mv, sizeof neigh_mv);
+++
+++        if (!(curr->pred_flag & 2)) {
+++            curr_mv[1] = curr_mv[0];
+++            curr_refL1 = curr_refL0;
+++        }
+++        if (!(neigh->pred_flag & 2)) {
+++            neigh_mv[1] = neigh_mv[0];
+++            neigh_refL1 = neigh_refL0;
+++        }
+++        if (!(curr->pred_flag & 1)) {
+++            curr_mv[0] = curr_mv[1];
+++            curr_refL0 = curr_refL1;
+++        }
+++        if (!(neigh->pred_flag & 1)) {
+++            neigh_mv[0] = neigh_mv[1];
+++            neigh_refL0 = neigh_refL1;
+++        }
+++
+++        strength = 1;
+++
+++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
+++                    (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) |
+++                    (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4);
+++
+++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
+++                    (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) |
+++                    (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4);
+++
+++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
+++#endif
+++
+++        curr += in_inc / sizeof (MvField);
+++        neigh += in_inc / sizeof (MvField);
+++
+++        for (out = dup; out > 0; out--)
+++        {
+++            *bs = strength;
+++            bs += out_inc;
+++        }
+++    }
+++}
+++
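Editorial note (not part of the patch): the #else branch above claims bit-exact equivalence with the branchy form while being vectorisable; the trick is duplicating the single valid MV/ref into both lanes so lane-wise comparisons cover uni- and bi-prediction alike, and then patching up the one case the compares cannot see -- mixed bi/uni prediction -- with the final OR. A small check of that bit trick (PF_L0 = 1, PF_L1 = 2, PF_BI = 3, as defined in ffmpeg's hevc.h; intra is handled before this point):

```c
#include <assert.h>

/* (pf + 1) has bit 2 set only for PF_BI (3), so the XOR >> 2 is 1 exactly
 * when one side is bi-predicted and the other is not.                    */
static int mixed_bi_uni(int curr_pf, int neigh_pf)
{
    return ((curr_pf + 1) ^ (neigh_pf + 1)) >> 2;
}

int main(void)
{
    for (int c = 1; c <= 3; c++)
        for (int n = 1; n <= 3; n++)
            assert(mixed_bi_uni(c, n) == ((c == 3) != (n == 3)));
    return 0;
}
```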
++ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++ {
++ #undef FUNC
++@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);    \
++     PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++ 
+++#if !RPI_HEVC_SAND
+++#define SLICED_LOOP_FILTERS(depth)
+++#define SLICED_ADD_RESIDUAL(depth)
+++#define SLICED_SAO(depth)
+++#else
+++#define SLICED_ADD_RESIDUAL(depth)\
+++    hevcdsp->add_residual_u[0]    = FUNC(add_residual4x4_u, depth);     \
+++    hevcdsp->add_residual_u[1]    = FUNC(add_residual8x8_u, depth);     \
+++    hevcdsp->add_residual_u[2]    = FUNC(add_residual16x16_u, depth);   \
+++    hevcdsp->add_residual_u[3]    = FUNC(add_residual32x32_u, depth);   \
+++    hevcdsp->add_residual_v[0]    = FUNC(add_residual4x4_v, depth);     \
+++    hevcdsp->add_residual_v[1]    = FUNC(add_residual8x8_v, depth);     \
+++    hevcdsp->add_residual_v[2]    = FUNC(add_residual16x16_v, depth);   \
+++    hevcdsp->add_residual_v[3]    = FUNC(add_residual32x32_v, depth);   \
+++    hevcdsp->add_residual_c[0]    = FUNC(add_residual4x4_c, depth);     \
+++    hevcdsp->add_residual_c[1]    = FUNC(add_residual8x8_c, depth);     \
+++    hevcdsp->add_residual_c[2]    = FUNC(add_residual16x16_c, depth);   \
+++    hevcdsp->add_residual_c[3]    = FUNC(add_residual32x32_c, depth);   \
+++    hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth);  \
+++    hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth);  \
+++    hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth);\
+++    hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth);\
+++    hevcdsp->put_pcm_c            = FUNC(put_pcm_c, depth)
+++#define SLICED_LOOP_FILTERS(depth)\
+++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth);\
+++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);   \
+++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
+++#define SLICED_SAO(depth)\
+++    for (i = 0; i != SAO_FILTER_N; ++i) {                               \
+++        hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
+++        hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
+++    }                                                                   \
+++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
+++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
+++
+++#endif
+++
++ #define HEVC_DSP(depth)                                                     \
++     hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
++-    hevcdsp->transform_add[0]       = FUNC(transform_add4x4, depth);        \
++-    hevcdsp->transform_add[1]       = FUNC(transform_add8x8, depth);        \
++-    hevcdsp->transform_add[2]       = FUNC(transform_add16x16, depth);      \
++-    hevcdsp->transform_add[3]       = FUNC(transform_add32x32, depth);      \
++-    hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+++    hevcdsp->transform_add[0]       = FUNC(add_residual4x4, depth);         \
+++    hevcdsp->transform_add[1]       = FUNC(add_residual8x8, depth);         \
+++    hevcdsp->transform_add[2]       = FUNC(add_residual16x16, depth);       \
+++    hevcdsp->transform_add[3]       = FUNC(add_residual32x32, depth);       \
+++    hevcdsp->add_residual_dc[0]     = FUNC(add_residual4x4_dc, depth);      \
+++    hevcdsp->add_residual_dc[1]     = FUNC(add_residual8x8_dc, depth);      \
+++    hevcdsp->add_residual_dc[2]     = FUNC(add_residual16x16_dc, depth);    \
+++    hevcdsp->add_residual_dc[3]     = FUNC(add_residual32x32_dc, depth);    \
+++    SLICED_ADD_RESIDUAL(depth);                                             \
++     hevcdsp->transform_rdpcm       = FUNC(transform_rdpcm, depth);          \
++-    hevcdsp->idct_4x4_luma          = FUNC(transform_4x4_luma, depth);      \
+++    hevcdsp->transform_skip         = FUNC(transform_skip, depth);          \
+++    hevcdsp->idct_4x4_luma          = FUNC(idct_4x4_luma, depth);           \
++     hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
++     hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
++     hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
++@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
++     hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
++                                                                             \
++-    hevcdsp->sao_band_filter[0] =                                           \
++-    hevcdsp->sao_band_filter[1] =                                           \
++-    hevcdsp->sao_band_filter[2] =                                           \
++-    hevcdsp->sao_band_filter[3] =                                           \
++-    hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth);             \
++-    hevcdsp->sao_edge_filter[0] =                                           \
++-    hevcdsp->sao_edge_filter[1] =                                           \
++-    hevcdsp->sao_edge_filter[2] =                                           \
++-    hevcdsp->sao_edge_filter[3] =                                           \
++-    hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth);             \
+++    for (i = 0; i != SAO_FILTER_N; ++i) {                                   \
+++        hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth);         \
+++        hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth);         \
+++    }                                                                       \
++     hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);         \
++     hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);         \
+++    SLICED_SAO(depth);                                                      \
++                                                                             \
++     QPEL_FUNCS(depth);                                                      \
++     QPEL_UNI_FUNCS(depth);                                                  \
++@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++     EPEL_UNI_FUNCS(depth);                                                  \
++     EPEL_BI_FUNCS(depth);                                                   \
++                                                                             \
+++    SLICED_LOOP_FILTERS(depth);                                             \
++     hevcdsp->hevc_h_loop_filter_luma   = FUNC(hevc_h_loop_filter_luma, depth);   \
++     hevcdsp->hevc_v_loop_filter_luma   = FUNC(hevc_v_loop_filter_luma, depth);   \
++     hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
++@@ -257,6 +409,8 @@ int i = 0;
++         break;
++     }
++ 
+++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
+++
++     if (ARCH_X86)
++         ff_hevc_dsp_init_x86(hevcdsp, bit_depth);
++     if (ARCH_ARM)
++diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h
++index 9f1f6dd59f..c4a1b0f09d 100644
++--- a/libavcodec/hevcdsp.h
+++++ b/libavcodec/hevcdsp.h
++@@ -25,6 +25,7 @@
++ #ifndef AVCODEC_HEVCDSP_H
++ #define AVCODEC_HEVCDSP_H
++ 
+++#include "rpi_opts.h"
++ #include "get_bits.h"
++ 
++ #define MAX_PB_SIZE 64
++@@ -42,11 +43,40 @@ typedef struct SAOParams {
++     uint8_t type_idx[3];    ///< sao_type_idx
++ } SAOParams;
++ 
+++typedef struct Mv {
+++    int16_t x;  ///< horizontal component of motion vector
+++    int16_t y;  ///< vertical component of motion vector
+++} Mv;
+++
+++typedef struct MvField {
+++    DECLARE_ALIGNED(4, Mv, mv)[2];
+++    int8_t ref_idx[2];
+++    int8_t pred_flag;
+++} MvField;
+++
+++#ifdef RPI
+++#define SAO_FILTER_N 6
+++#else
+++#define SAO_FILTER_N 5
+++#endif
+++
+++
++ typedef struct HEVCDSPContext {
++     void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                     struct GetBitContext *gb, int pcm_bit_depth);
++ 
++-    void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride);
+++    // add_residual was transform_add - import 3.3 names
+++    void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
+++#if RPI_HEVC_SAND
+++    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
+++    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
+++
+++    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
+++    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
+++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
+++                      struct GetBitContext *gb, int pcm_bit_depth);
+++#endif
++ 
++     void (*transform_skip)(int16_t *coeffs, int16_t log2_size);
++ 
++@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext {
++ 
++     void (*idct_dc[4])(int16_t *coeffs);
++ 
++-    void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
+++#if RPI_HEVC_SAND
+++    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                               const int16_t *sao_offset_val_u, int sao_left_class_u,
+++                               const int16_t *sao_offset_val_v, int sao_left_class_v,
+++                               int width, int height);
+++#endif
++ 
++     /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++-    void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+++    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                                int16_t *sao_offset_val, int sao_eo_class, int width, int height);
+++#if RPI_HEVC_SAND
+++    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
+++                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
+++#endif
++ 
++     void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                 struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                 uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+++#if RPI_HEVC_SAND
+++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
+++                                struct SAOParams *sao, int *borders, int _width, int _height, int c_idx,
+++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
+++#endif
++ 
++     void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                     int height, intptr_t mx, intptr_t my, int width);
++@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext {
++     void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                         int32_t *tc, uint8_t *no_p,
++                                         uint8_t *no_q);
+++#ifdef RPI
+++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
+++                                 unsigned int _stride, unsigned int beta, const int32_t tc[2],
+++                                 const uint8_t no_p[2], const uint8_t no_q[2],
+++                                 uint8_t * _pix_l);
+++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
+++                                 unsigned int no_f);
+++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+++                                 uint8_t * src_l,
+++                                 unsigned int no_f);
+++
+++#endif
+++
+++    void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc,
+++                                               int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1,
+++                                               MvField *curr, MvField *neigh, uint8_t *bs);
++ } HEVCDSPContext;
++ 
++ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth);
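Editorial note (not part of the patch): the new hevc_v_loop_filter_luma2/uv2 entry points above take a second pixel pointer for the left-hand side of a vertical edge. In the sand layouts the image is stored as narrow vertical stripes, so an edge can straddle two stripes and `pix - 4` is not generally reachable from the right-hand pointer; the caller computes both addresses. A hedged model of sand-style addressing (stripe width and the exact helper are assumptions; the real code is av_rpi_sand_frame_pos_* in libavutil/rpi_sand_fns.h):

```c
#include <stdint.h>

/* Hedged model: pixels live in vertical stripes stride1 bytes wide; stripe
 * n starts at base + n * stride2. Within a stripe, rows are stride1 apart. */
static uint8_t *sand_pos(uint8_t *base, unsigned int stride1, unsigned int stride2,
                         unsigned int x, unsigned int y)
{
    return base + (x / stride1) * stride2 + y * stride1 + (x % stride1);
}
/* x and x - 4 can land in different stripes, hence the separate _pix_l and
 * _pix_r arguments to hevc_v_loop_filter_luma2.                            */
```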
++diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c
++index 5bca02342d..122fbe8154 100644
++--- a/libavcodec/hevcdsp_template.c
+++++ b/libavcodec/hevcdsp_template.c
++@@ -26,6 +26,7 @@
++ #include "bit_depth_template.c"
++ #include "hevcdsp.h"
++ 
+++#include "rpi_shader_template.h"
++ 
++ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                           GetBitContext *gb, int pcm_bit_depth)
++@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height
++     }
++ }
++ 
++-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs,
++-                                                     ptrdiff_t stride, int size)
+++#if RPI_HEVC_SAND
+++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
+++                            GetBitContext *gb, int pcm_bit_depth)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+++        dst += stride;
+++    }
+++
+++    dst = (pixel *)_dst + 1;
+++    for (y = 0; y < height; y++) {
+++        for (x = 0; x < width; x++)
+++            dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
+++        dst += stride;
+++    }
+++}
+++#endif
+++
+++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs,
+++                                                ptrdiff_t stride, int size)
++ {
++     int x, y;
++     pixel *dst = (pixel *)_dst;
++@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe
++     }
++ }
++ 
++-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs,
++-                                   ptrdiff_t stride)
+++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size; x++) {
+++            dst[x] = av_clip_pixel(dst[x] + dc);
+++        }
+++        dst += stride;
+++    }
+++}
+++
+++
+++#if RPI_HEVC_SAND
+++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
+++                                                  ptrdiff_t stride, const int dc_v, int size)
++ {
++-    FUNC(transquant_bypass)(_dst, coeffs, stride, 4);
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x] = av_clip_pixel(dst[x] + *res);
+++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+++            res++;
+++        }
+++        dst += stride;
+++    }
++ }
++ 
++-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs,
+++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
+++                                                  ptrdiff_t stride, const int dc_u, int size)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x] = av_clip_pixel(dst[x] + dc_u);
+++            dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
+++            res++;
+++        }
+++        dst += stride;
+++    }
+++}
+++
+++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
+++                                                  ptrdiff_t stride, unsigned int size)
+++{
+++    unsigned int x, y;
+++    pixel *dst = (pixel *)_dst;
+++    const int16_t * ru = res;
+++    const int16_t * rv = res + size * size;
+++
+++//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
+++//    rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
+++//    rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
+++            dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
+++        }
+++        dst += stride;
+++    }
+++
+++//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
+++}
+++
+++
+++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
+++{
+++    int x, y;
+++    pixel *dst = (pixel *)_dst;
+++    const int dc_v = dc >> 16;
+++    const int dc_u = (dc << 16) >> 16;
+++
+++    stride /= sizeof(pixel);
+++
+++    for (y = 0; y < size; y++) {
+++        for (x = 0; x < size * 2; x += 2) {
+++            dst[x] = av_clip_pixel(dst[x] + dc_u);
+++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
+++        }
+++        dst += stride;
+++    }
+++}
+++
+++
+++#endif
+++
+++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs,
+++                                  ptrdiff_t stride)
+++{
+++    FUNC(add_residual)(_dst, coeffs, stride, 4);
+++}
+++
+++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs,
++                                    ptrdiff_t stride)
++ {
++-    FUNC(transquant_bypass)(_dst, coeffs, stride, 8);
+++    FUNC(add_residual)(_dst, coeffs, stride, 8);
++ }
++ 
++-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs,
+++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs,
++                                      ptrdiff_t stride)
++ {
++-    FUNC(transquant_bypass)(_dst, coeffs, stride, 16);
+++    FUNC(add_residual)(_dst, coeffs, stride, 16);
++ }
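Editorial note (not part of the patch): "plaited" here means U and V interleaved sample-by-sample in a single chroma plane, so a residual add touches every second element. add_residual_u/_v above apply a full residual to one component while adding only a DC correction to the other; a tiny worked sketch of the U case for a 2x2 block:

```c
#include <stdint.h>

/* Interleaved (plaited) chroma: each row holds U,V,U,V,...  Apply a 2x2 U
 * residual while nudging V by a constant dc_v, as add_residual_u does.   */
static void demo_add_residual_u_2x2(uint8_t dst[2][4], const int16_t res[4], int dc_v)
{
    for (int y = 0; y < 2; y++)
        for (int x = 0; x < 4; x += 2) {
            int u = dst[y][x]     + res[y * 2 + x / 2];
            int v = dst[y][x + 1] + dc_v;
            dst[y][x]     = (uint8_t)(u < 0 ? 0 : u > 255 ? 255 : u);
            dst[y][x + 1] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
        }
}
```

add_residual_c handles the case where both components carry a residual (ru and rv planes packed back to back), and add_residual_dc_c packs both DC values into one int32 -- V in the top 16 bits, U sign-extended from the bottom 16.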
++ 
++-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs,
+++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs,
++                                      ptrdiff_t stride)
++ {
++-    FUNC(transquant_bypass)(_dst, coeffs, stride, 32);
+++    FUNC(add_residual)(_dst, coeffs, stride, 32);
++ }
++ 
+++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+++{
+++    FUNC(add_residual_dc)(_dst, stride, dc, 4);
+++}
+++
+++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+++{
+++    FUNC(add_residual_dc)(_dst, stride, dc, 8);
+++}
+++
+++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+++{
+++    FUNC(add_residual_dc)(_dst, stride, dc, 16);
+++}
+++
+++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
+++{
+++    FUNC(add_residual_dc)(_dst, stride, dc, 32);
+++}
+++
+++#if RPI_HEVC_SAND
+++// -- U -- (plaited)
+++
+++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride, int dc_u)
+++{
+++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
+++}
+++
+++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride, int dc_u)
+++{
+++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
+++}
+++
+++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride, int dc_u)
+++{
+++    FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
+++}
+++
+++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride, int dc_u)
+++{
+++    // Should never occur for 420, which is all that sand supports
+++    av_assert0(0);
+++}
+++
+++// -- V -- (plaited)
+++
+++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride, int dc_v)
+++{
+++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
+++}
+++
+++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride, int dc_v)
+++{
+++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
+++}
+++
+++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride, int dc_v)
+++{
+++    FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
+++}
+++
+++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride, int dc_v)
+++{
+++    // Should never occur for 420, which is all that sand supports
+++    av_assert0(0);
+++}
+++
+++// -- C -- (plaited - both U & V)
+++
+++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride)
+++{
+++    FUNC(add_residual_c)(_dst, res, stride, 4);
+++}
+++
+++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
+++                                    ptrdiff_t stride)
+++{
+++    FUNC(add_residual_c)(_dst, res, stride, 8);
+++}
+++
+++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride)
+++{
+++    FUNC(add_residual_c)(_dst, res, stride, 16);
+++}
+++
+++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
+++                                      ptrdiff_t stride)
+++{
+++    // Should never occur for 420, which is all that sand supports
+++    av_assert0(0);
+++}
+++
+++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+++{
+++    FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
+++}
+++
+++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+++{
+++    FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
+++}
+++
+++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+++{
+++    FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
+++}
+++
+++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
+++{
+++    // Should never occur for 420, which is all that sand supports
+++    av_assert0(0);
+++}
+++
+++#endif
+++
++ 
++ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++ {
++@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size)
++         assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);      \
++     } while (0)
++ 
++-static void FUNC(transform_4x4_luma)(int16_t *coeffs)
+++static void FUNC(idct_4x4_luma)(int16_t *coeffs)
++ {
++     int i;
++     int shift    = 7;
++@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride
++     }
++ }
++ 
+++
+++#if BIT_DEPTH == 10
+++#if RPI_HEVC_SAND
+++// We need a 32 bit variation for the _c restores so hijack bit depth 10
+++#undef pixel
+++#undef BIT_DEPTH
+++#define pixel uint32_t
+++#define BIT_DEPTH 32
+++#endif
+++// All 16 bit variations are the same
+++#define sao_edge_restore_0_10 sao_edge_restore_0_9
+++#define sao_edge_restore_1_10 sao_edge_restore_1_9
+++#define sao_edge_restore_0_11 sao_edge_restore_0_9
+++#define sao_edge_restore_1_11 sao_edge_restore_1_9
+++#define sao_edge_restore_0_12 sao_edge_restore_0_9
+++#define sao_edge_restore_1_12 sao_edge_restore_1_9
+++#define sao_edge_restore_0_13 sao_edge_restore_0_9
+++#define sao_edge_restore_1_13 sao_edge_restore_1_9
+++#define sao_edge_restore_0_14 sao_edge_restore_0_9
+++#define sao_edge_restore_1_14 sao_edge_restore_1_9
+++#define sao_edge_restore_0_15 sao_edge_restore_0_9
+++#define sao_edge_restore_1_15 sao_edge_restore_1_9
+++#define sao_edge_restore_0_16 sao_edge_restore_0_9
+++#define sao_edge_restore_1_16 sao_edge_restore_1_9
+++#endif
y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++@@ -417,7 +688,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++- int16_t *sao_offset_val = sao->offset_val[c_idx]; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++- int offset_val = sao_offset_val[0]; ++ for (y = 0; y < height; y++) { ++- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++- int offset_val = sao_offset_val[0]; ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++- int offset_val = sao_offset_val[0]; ++ for (x = init_x; x < width; x++) ++- dst[x] = av_clip_pixel(src[x] + offset_val); +++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++- int offset_val = sao_offset_val[0]; ++- int y_stride_dst = stride_dst * (height - 1); ++- int y_stride_src = stride_src * (height - 1); +++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ++ } ++ } +++#endif +++#if BIT_DEPTH == 32 +++#undef BIT_DEPTH +++#undef pixel +++#define BIT_DEPTH 10 +++#define pixel uint16_t +++#endif +++ +++// --- Plaited chroma versions +++ +++#if RPI_HEVC_SAND +++ +++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height) +++{ +++ pixel *dst = (pixel *)_dst; +++ pixel *src = (pixel *)_src; +++ int offset_table_u[32] = { 0 }; +++ int offset_table_v[32] = { 0 }; +++ int k, y, x; +++ int shift = BIT_DEPTH - 5; +++ +++ stride_dst /= sizeof(pixel); +++ stride_src /= sizeof(pixel); +++ width *= 2; +++ +++ for (k = 0; k < 4; k++) +++ { +++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; +++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; +++ } +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x += 2) +++ { +++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); +++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); +++ // *** & 31 shouldn't be wanted but just now we generate broken input that +++ // crashes us in 10-bit world +++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); +++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); +++ } +++ dst += stride_dst; +++ src += stride_src; +++ } +++} +++ +++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t 
*sao_offset_val_u, const int16_t *sao_offset_val_v, +++ int eo, int width, int height) { +++ +++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +++ static const int8_t pos[4][2][2] = { +++ { { -1, 0 }, { 1, 0 } }, // horizontal +++ { { 0, -1 }, { 0, 1 } }, // vertical +++ { { -1, -1 }, { 1, 1 } }, // 45 degree +++ { { 1, -1 }, { -1, 1 } }, // 135 degree +++ }; +++ pixel *dst = (pixel *)_dst; +++ pixel *src = (pixel *)_src; +++ int a_stride, b_stride; +++ int x, y; +++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); +++ +++ stride_dst /= sizeof(pixel); +++ width *= 2; +++ +++ av_assert0(width <= 64); +++ +++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x += 2) { +++ int diff0u = CMP(src[x], src[x + a_stride]); +++ int diff1u = CMP(src[x], src[x + b_stride]); +++ int offset_valu = edge_idx[2 + diff0u + diff1u]; +++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +++ int offset_valv = edge_idx[2 + diff0v + diff1v]; +++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); +++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); +++ } +++ src += stride_src; +++ dst += stride_dst; +++ } +++} +++ +++// Do once +++#if BIT_DEPTH == 8 +++// Any old 2 byte 'normal' restore will work for these +++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 +++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 +++// We need 32 bit for 9 bit+ +++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +++#endif +++ +++#endif // RPI_HEVC_SAND +++ ++ ++ #undef CMP ++ ++@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ #undef TQ1 ++ #undef TQ2 ++ #undef TQ3 +++ +++#if RPI_HEVC_SAND +++ +++// line zero +++#define P3 pix_l[0 * xstride] +++#define P2 pix_l[1 * xstride] +++#define P1 pix_l[2 * xstride] +++#define P0 pix_l[3 * xstride] +++#define Q0 pix_r[0 * xstride] +++#define Q1 pix_r[1 * xstride] +++#define Q2 pix_r[2 * xstride] +++#define Q3 pix_r[3 * xstride] +++ +++// line three. 
used only for deblocking decision +++#define TP3 pix_l[0 * xstride + 3 * ystride] +++#define TP2 pix_l[1 * xstride + 3 * ystride] +++#define TP1 pix_l[2 * xstride + 3 * ystride] +++#define TP0 pix_l[3 * xstride + 3 * ystride] +++#define TQ0 pix_r[0 * xstride + 3 * ystride] +++#define TQ1 pix_r[1 * xstride + 3 * ystride] +++#define TQ2 pix_r[2 * xstride + 3 * ystride] +++#define TQ3 pix_r[3 * xstride + 3 * ystride] +++ +++// This is identical to hevc_loop_filter_luma except that the P/Q +++// components are on separate pointers +++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], +++ const uint8_t _no_p[2], const uint8_t _no_q[2], +++ uint8_t * _pix_l) +++{ +++ int d, j; +++ pixel *pix_l = (pixel *)_pix_l; +++ pixel *pix_r = (pixel *)_pix_r; +++ const ptrdiff_t xstride = 1; +++ const ptrdiff_t ystride = _stride / sizeof(pixel); +++ +++ beta <<= BIT_DEPTH - 8; +++ +++ for (j = 0; j < 2; j++) { +++ const int dp0 = abs(P2 - 2 * P1 + P0); +++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); +++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); +++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); +++ const int d0 = dp0 + dq0; +++ const int d3 = dp3 + dq3; +++ const int tc = _tc[j] << (BIT_DEPTH - 8); +++ const int no_p = _no_p[j]; +++ const int no_q = _no_q[j]; +++ +++ if (d0 + d3 >= beta) { +++ pix_l += 4 * ystride; +++ pix_r += 4 * ystride; +++ continue; +++ } else { +++ const int beta_3 = beta >> 3; +++ const int beta_2 = beta >> 2; +++ const int tc25 = ((tc * 5 + 1) >> 1); +++ +++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && +++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && +++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { +++ // strong filtering +++ const int tc2 = tc << 1; +++ for (d = 0; d < 4; d++) { +++ const int p3 = P3; +++ const int p2 = P2; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ const int q2 = Q2; +++ const int q3 = Q3; +++ if (!no_p) { +++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); +++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); +++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); +++ } +++ if (!no_q) { +++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); +++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); +++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); +++ } +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } else { // normal filtering +++ int nd_p = 1; +++ int nd_q = 1; +++ const int tc_2 = tc >> 1; +++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) +++ nd_p = 2; +++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) +++ nd_q = 2; +++ +++ for (d = 0; d < 4; d++) { +++ const int p2 = P2; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ const int q2 = Q2; +++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; +++ if (abs(delta0) < 10 * tc) { +++ delta0 = av_clip(delta0, -tc, tc); +++ if (!no_p) +++ P0 = av_clip_pixel(p0 + delta0); +++ if (!no_q) +++ Q0 = av_clip_pixel(q0 - delta0); +++ if (!no_p && nd_p > 1) { +++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); +++ P1 = av_clip_pixel(p1 + deltap1); +++ } +++ if (!no_q && nd_q > 1) { +++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); +++ Q1 = av_clip_pixel(q1 + deltaq1); +++ } +++ 
} +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } +++ } +++ } +++} +++ +++#undef TP3 +++#undef TP2 +++#undef TP1 +++#undef TP0 +++#undef TQ0 +++#undef TQ1 +++#undef TQ2 +++#undef TQ3 +++ +++#undef P3 +++#undef P2 +++#undef P1 +++#undef P0 +++#undef Q0 +++#undef Q1 +++#undef Q2 +++#undef Q3 +++ +++#define P1 pix_l[0 * xstride] +++#define P0 pix_l[1 * xstride] +++#define Q0 pix_r[0 * xstride] +++#define Q1 pix_r[1 * xstride] +++ +++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, +++ ptrdiff_t _ystride, const int32_t *_tc, +++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) +++{ +++ int d, j, no_p, no_q; +++ pixel *pix_l = (pixel *)_pix_l; +++ pixel *pix_r = (pixel *)_pix_r; +++ ptrdiff_t xstride = _xstride / sizeof(pixel); +++ ptrdiff_t ystride = _ystride / sizeof(pixel); +++ +++ for (j = 0; j < 2; j++) { +++ const int tc = _tc[j] << (BIT_DEPTH - 8); +++ if (tc <= 0) { +++ pix_l += 4 * ystride; +++ pix_r += 4 * ystride; +++ continue; +++ } +++ no_p = _no_p[j]; +++ no_q = _no_q[j]; +++ +++ for (d = 0; d < 4; d++) { +++ int delta0; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); +++ if (!no_p) +++ P0 = av_clip_pixel(p0 + delta0); +++ if (!no_q) +++ Q0 = av_clip_pixel(q0 - delta0); +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } +++} +++ +++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, +++ unsigned int no_f) +++{ +++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; +++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; +++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; +++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); +++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); +++} +++ +++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, +++ uint8_t * src_l, +++ unsigned int no_f) +++{ +++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; +++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; +++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; +++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); +++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); +++} +++ +++#undef P1 +++#undef P0 +++#undef Q0 +++#undef Q1 +++ +++ +++#endif +++ ++diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c ++index 02c1766059..cea16eade4 100644 ++--- a/libavcodec/hevcpred.c +++++ b/libavcodec/hevcpred.c ++@@ -24,6 +24,7 @@ ++ ++ #include "hevcpred.h" ++ +++#define PRED_C 0 ++ #define BIT_DEPTH 8 ++ #include "hevcpred_template.c" ++ #undef BIT_DEPTH ++@@ -39,13 +40,37 @@ ++ #define BIT_DEPTH 12 ++ #include "hevcpred_template.c" ++ #undef BIT_DEPTH +++#undef PRED_C +++ +++#ifdef RPI +++#define PRED_C 1 +++#define BIT_DEPTH 8 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 9 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 10 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 12 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++#undef PRED_C +++#endif ++ ++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) ++ { ++ #undef FUNC ++ #define FUNC(a, depth) a ## _ ## depth ++ ++-#define HEVC_PRED(depth) \ +++#undef FUNCC +++#define FUNCC(a, depth) a ## _ ## 
depth ## _c +++ +++#define HEVC_PRED_Y(depth) \ ++ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ ++ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ ++ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ ++@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); ++ +++#define HEVC_PRED_C(depth) \ +++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ +++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ +++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ +++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ +++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ +++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ +++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ +++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ +++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ +++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ +++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ +++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ +++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); +++ +++#ifdef RPI +++#define HEVC_PRED(depth) \ +++ HEVC_PRED_Y(depth); \ +++ HEVC_PRED_C(depth); +++#else +++#define HEVC_PRED(depth) \ +++ HEVC_PRED_Y(depth); +++#endif +++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h ++index eb17663683..00ba3f94c0 100644 ++--- a/libavcodec/hevcpred.h +++++ b/libavcodec/hevcpred.h ++@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); +++#ifdef RPI +++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); +++ +++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, +++ const uint8_t *left, ptrdiff_t stride); +++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, +++ ptrdiff_t stride, int log2_size, int c_idx); +++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, +++ const uint8_t *left, ptrdiff_t stride, +++ int c_idx, int mode); +++#endif ++ } HEVCPredContext; ++ ++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); ++diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c ++index 6fe33546b1..2f9f5f2798 100644 ++--- a/libavcodec/hevcpred_template.c +++++ b/libavcodec/hevcpred_template.c ++@@ -20,13 +20,110 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ +++//#define DISABLE_INTRA +++ ++ #include "libavutil/pixdesc.h" ++ ++ #include "bit_depth_template.c" ++ #include "hevcpred.h" ++ +++#ifdef RPI +++#include "libavutil/rpi_sand_fns.h" +++#endif +++ +++#define DUMP_PRED 0 +++ ++ #define POS(x, y) src[(x) + stride * (y)] ++ +++// REPEAT_INCLUDE defined at EOF +++#if defined(RPI) && !defined(INCLUDED_ONCE) +++typedef uint8_t (* c8_dst_ptr_t)[2]; +++typedef const uint8_t (* c8_src_ptr_t)[2]; +++typedef uint16_t (* c16_dst_ptr_t)[2]; +++typedef const uint16_t (* c16_src_ptr_t)[2]; +++ +++// *** On ARM make these NEON registers +++typedef struct pixel4_16 { +++ uint16_t x[4]; +++} pixel4_16; +++typedef struct pixel4_32 { +++ uint32_t x[4]; +++} pixel4_32; +++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) +++{ +++ pixel4_16 t = {{x, x, x, x}}; +++ return t; +++} +++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) +++{ +++ pixel4_32 t = {{x, x, x, 
x}}; +++ return t; +++} +++#endif +++ +++#if PRED_C +++// For chroma we double pixel size so we copy pairs +++#undef pixel +++#undef pixel2 +++#undef pixel4 +++#undef dctcoef +++#undef INIT_CLIP +++#undef no_rnd_avg_pixel4 +++#undef rnd_avg_pixel4 +++#undef AV_RN2P +++#undef AV_RN4P +++#undef AV_RN4PA +++#undef AV_WN2P +++#undef AV_WN4P +++#undef AV_WN4PA +++#undef CLIP +++#undef FUNC +++#undef FUNCC +++#undef av_clip_pixel +++#undef PIXEL_SPLAT_X4 +++ +++#if BIT_DEPTH == 8 +++#define pixel uint16_t +++#define pixel4 pixel4_16 +++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 +++#define cpel uint8_t +++#define c_src_ptr_t c8_src_ptr_t +++#define c_dst_ptr_t c8_dst_ptr_t +++#else +++#define pixel uint32_t +++#define pixel4 pixel4_32 +++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 +++#define cpel uint16_t +++#define c_src_ptr_t c16_src_ptr_t +++#define c_dst_ptr_t c16_dst_ptr_t +++#endif +++#define AV_RN4P(p) (*(pixel4*)(p)) +++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) +++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +++#endif +++ +++ +++// Get PW prior to horrid PRED_C trickery +++#if BIT_DEPTH == 8 +++#define PW 1 +++#else +++#define PW 2 +++#endif +++ +++ +++#if DUMP_PRED && !defined(INCLUDED_ONCE) +++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +++{ +++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { +++ for (unsigned int x = 0; x != size; x++) { +++ printf("%4d", data[x * 2]); +++ } +++ printf("\n"); +++ } +++ printf("\n"); +++} +++#endif +++ ++ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, ++ int log2_size, int c_idx) ++ { ++@@ -69,8 +166,11 @@ do { \ ++ AV_WN4P(&ptr[i], a); \ ++ else \ ++ a = PIXEL_SPLAT_X4(ptr[i + 3]) ++- +++#ifdef RPI +++ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +++#else ++ HEVCLocalContext *lc = s->HEVClc; +++#endif ++ int i; ++ int hshift = s->ps.sps->hshift[c_idx]; ++ int vshift = s->ps.sps->vshift[c_idx]; ++@@ -79,15 +179,23 @@ do { \ ++ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; ++ int size_in_luma_v = size << vshift; ++ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; ++- int x = x0 >> hshift; ++- int y = y0 >> vshift; +++ const int x = x0 >> hshift; +++ const int y = y0 >> vshift; ++ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; ++ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; ++ ++ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); ++ ++- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +++#if defined(RPI) +++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? +++ (pixel*)s->frame->data[c_idx] + x + y * stride : +++ c_idx == 0 ? 
+++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : +++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +++#else ++ pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +++#endif ++ ++ int min_pu_width = s->ps.sps->min_pu_width; ++ ++@@ -95,14 +203,20 @@ do { \ ++ lc->tu.intra_pred_mode; ++ pixel4 a; ++ pixel left_array[2 * MAX_TB_SIZE + 1]; +++#if !PRED_C ++ pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; +++#endif ++ pixel top_array[2 * MAX_TB_SIZE + 1]; +++#if !PRED_C ++ pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; +++#endif ++ ++ pixel *left = left_array + 1; ++ pixel *top = top_array + 1; +++#if !PRED_C ++ pixel *filtered_left = filtered_left_array + 1; ++ pixel *filtered_top = filtered_top_array + 1; +++#endif ++ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); ++ int cand_left = lc->na.cand_left; ++ int cand_up_left = lc->na.cand_up_left; ++@@ -114,6 +228,27 @@ do { \ ++ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - ++ (x0 + size_in_luma_h)) >> hshift; ++ +++ pixel * src_l = src - 1; +++ pixel * src_u = src - stride; +++ pixel * src_ur = src_u + size; +++ +++#ifdef DISABLE_INTRA +++ return; +++#endif +++ +++#if defined(RPI) +++ if (av_rpi_is_sand_frame(s->frame)) { +++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs +++ const AVFrame * const frame = s->frame; +++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 +++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; +++ if ((x & mask) == 0) +++ src_l -= stripe_adj; +++ if (((x + size) & mask) == 0) +++ src_ur += stripe_adj; +++ } +++#endif +++ ++ if (s->ps.pps->constrained_intra_pred_flag == 1) { ++ int size_in_luma_pu_v = PU(size_in_luma_v); ++ int size_in_luma_pu_h = PU(size_in_luma_h); ++@@ -163,23 +298,24 @@ do { \ ++ top[-1] = 128; ++ } ++ if (cand_up_left) { ++- left[-1] = POS(-1, -1); +++ left[-1] = src_l[-stride]; ++ top[-1] = left[-1]; ++ } ++ if (cand_up) ++- memcpy(top, src - stride, size * sizeof(pixel)); +++ // Always good - even with sand +++ memcpy(top, src_u, size * sizeof(pixel)); ++ if (cand_up_right) { ++- memcpy(top + size, src - stride + size, size * sizeof(pixel)); ++- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), +++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); +++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], ++ size - top_right_size); ++ } ++ if (cand_left) ++ for (i = 0; i < size; i++) ++- left[i] = POS(-1, i); +++ left[i] = src_l[stride * i]; ++ if (cand_bottom_left) { ++ for (i = size; i < size + bottom_left_size; i++) ++- left[i] = POS(-1, i); ++- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), +++ left[i] = src_l[stride * i]; +++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], ++ size - bottom_left_size); ++ } ++ ++@@ -268,7 +404,11 @@ do { \ ++ cand_up_left = 1; ++ cand_left = 1; ++ } else { // No samples available +++#if PRED_C +++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +++#else ++ left[-1] = (1 << (BIT_DEPTH - 1)); +++#endif ++ EXTEND(top, left[-1], 2 * size); ++ EXTEND(left, left[-1], 2 * size); ++ } ++@@ -287,6 +427,9 @@ do { \ ++ top[-1] = left[-1]; ++ ++ // Filtering process +++ // Sand can only apply to chroma_format_idc == 1 so we don't need to +++ // worry about chroma smoothing for that case +++#if !PRED_C ++ if 
(!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { ++ if (mode != INTRA_DC && size != 4){ ++ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; ++@@ -342,6 +485,30 @@ do { \ ++ mode); ++ break; ++ } +++#else +++ switch (mode) { +++ case INTRA_PLANAR: +++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride); +++ break; +++ case INTRA_DC: +++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride, log2_size, c_idx); +++ break; +++ default: +++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride, c_idx, +++ mode); +++ break; +++ } +++ +++#if DUMP_PRED +++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); +++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); +++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); +++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); +++#endif +++#endif ++ } ++ ++ #define INTRA_PRED(size) \ ++@@ -357,6 +524,7 @@ INTRA_PRED(5) ++ ++ #undef INTRA_PRED ++ +++#if !PRED_C ++ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++ } +++#else +++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, +++ const uint8_t * _left, ptrdiff_t stride, +++ int trafo_size) +++{ +++ int x, y; +++ int size = 1 << trafo_size; +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ const c_src_ptr_t top = (c_src_ptr_t)_top; +++ const c_src_ptr_t left = (c_src_ptr_t)_left; +++ +++ for (y = 0; y < size; y++, src += stride) +++ { +++ for (x = 0; x < size; x++) +++ { +++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + +++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); +++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + +++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); +++ } +++ } +++} +++#endif ++ ++ #define PRED_PLANAR(size)\ ++ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++@@ -386,6 +577,7 @@ PRED_PLANAR(3) ++ ++ #undef PRED_PLANAR ++ +++#if !PRED_C ++ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; ++ } ++ } +++#else +++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +++ const uint8_t *_left, +++ ptrdiff_t stride, int log2_size, int c_idx) +++{ +++ unsigned int i, j; +++ const unsigned int size = (1 << log2_size); +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ const c_src_ptr_t top = (c_src_ptr_t)_top; +++ const c_src_ptr_t left = (c_src_ptr_t)_left; +++ unsigned int dc0 = size; +++ unsigned int dc1 = size; +++ +++ for (i = 0; i < size; i++) +++ { +++ dc0 += left[i][0] + top[i][0]; +++ dc1 += left[i][1] + top[i][1]; +++ } +++ +++ dc0 >>= log2_size + 1; +++ dc1 >>= log2_size + 1; +++ +++ for (i = 0; i < size; i++, src += stride) +++ { +++ for (j = 0; j < size; ++j) +++ { +++ src[j][0] = dc0; +++ src[j][1] = dc1; ++ +++ } +++ } +++} +++#endif +++ +++#ifndef ANGLE_CONSTS +++#define ANGLE_CONSTS +++static const 
int intra_pred_angle[] = { +++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +++}; +++static const int inv_angle[] = { +++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +++ -630, -910, -1638, -4096 + +}; +++#endif +++ +++#if !PRED_C ++ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ ++- static const int intra_pred_angle[] = { ++- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++- }; ++- static const int inv_angle[] = { ++- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++- -630, -910, -1638, -4096 ++- }; ++- ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ } ++ } ++ } +++#else +++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +++ const uint8_t *_top, +++ const uint8_t *_left, +++ ptrdiff_t stride, int c_idx, +++ int mode, int size) +++{ +++ int x, y; +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ c_src_ptr_t top = (c_src_ptr_t)_top; +++ c_src_ptr_t left = (c_src_ptr_t)_left; +++ +++ const int angle = intra_pred_angle[mode - 2]; +++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; +++ c_dst_ptr_t ref_tmp = ref_array + size; +++ c_src_ptr_t ref; +++ const int last = (size * angle) >> 5; +++ +++ if (mode >= 18) { +++ ref = top - 1; +++ if (angle < 0 && last < -1) { +++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); +++ for (x = last; x <= -1; x++) +++ { +++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +++ } +++ ref = (c_src_ptr_t)ref_tmp; +++ } +++ +++ for (y = 0; y < size; y++, src += stride) { +++ const int idx = ((y + 1) * angle) >> 5; +++ const int fact = ((y + 1) * angle) & 31; +++ if (fact) { +++ for (x = 0; x < size; ++x) { +++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + +++ fact * ref[x + idx + 2][0] + 16) >> 5; +++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + +++ fact * ref[x + idx + 2][1] + 16) >> 5; +++ } +++ } else { +++ memcpy(src, ref + idx + 1, size * 2 * PW); +++ } +++ } +++ } else { +++ ref = left - 1; +++ if (angle < 0 && last < -1) { +++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); +++ for (x = last; x <= -1; x++) +++ { +++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +++ } +++ ref = (c_src_ptr_t)ref_tmp; +++ } +++ +++ for (x = 0; x < size; x++, src++) { +++ const int idx = ((x + 1) * angle) >> 5; +++ const int fact = ((x + 1) * angle) & 31; +++ if (fact) { +++ for (y = 0; y < size; y++) { +++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + +++ fact * ref[y + idx + 2][0] + 16) >> 5; +++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + +++ fact * ref[y + idx + 2][1] + 16) >> 5; +++ } +++ } else { +++ for (y = 0; y < size; y++) +++ { +++ src[y * stride][0] = ref[y + idx + 1][0]; +++ src[y * stride][1] = ref[y + idx + 1][1]; +++ } +++ } +++ } +++ } +++} +++#endif ++ ++ static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t 
*top, ++ const uint8_t *left, ++@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); ++ } ++ +++#undef cpel +++#undef c_src_ptr_t +++#undef c_dst_ptr_t +++ ++ #undef EXTEND_LEFT_CIP ++ #undef EXTEND_RIGHT_CIP ++ #undef EXTEND_UP_CIP ++@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ #undef EXTEND ++ #undef MIN_TB_ADDR_ZS ++ #undef POS +++#undef PW +++ +++#ifndef INCLUDED_ONCE +++#define INCLUDED_ONCE +++#endif +++ ++diff --git a/libavcodec/raw.c b/libavcodec/raw.c ++index d36b68bfae..b526dc393d 100644 ++--- a/libavcodec/raw.c +++++ b/libavcodec/raw.c ++@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { ++ { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, ++ { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, ++ +++ /* RPI */ +++#ifdef RPI +++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, +++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +++#endif +++ ++ /* special */ ++ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ ++ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ ++diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c ++index d83705645c..4c746786ff 100644 ++--- a/libavcodec/rawenc.c +++++ b/libavcodec/rawenc.c ++@@ -31,6 +31,8 @@ ++ #include "libavutil/intreadwrite.h" ++ #include "libavutil/imgutils.h" ++ #include "libavutil/internal.h" +++#include "libavutil/avassert.h" +++#include "libavutil/rpi_sand_fns.h" ++ ++ static av_cold int raw_encode_init(AVCodecContext *avctx) ++ { ++@@ -47,6 +49,73 @@ FF_ENABLE_DEPRECATION_WARNINGS ++ return 0; ++ } ++ +++#ifdef RPI +++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +++ const AVFrame *frame) +++{ +++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); +++ int size; +++ int width = frame->width; +++ int height = frame->height; +++ int x0 = 0; +++ int y0 = 0; +++ uint8_t * dst; +++ int ret; +++ +++ if (sd != NULL) { +++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; +++ +++ x0 = si->left_offset; +++ y0 = si->top_offset; +++ } +++ +++ size = width * height * 3 / 2; +++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +++ return ret; +++ +++ dst = pkt->data; +++ +++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); +++ dst += width * height; +++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, +++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); +++ return 0; +++} +++ +++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +++ const AVFrame *frame) +++{ +++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); +++ int size; +++ int width = frame->width; +++ int height = frame->height; +++ int x0 = 0; +++ int y0 = 0; +++ uint8_t * dst; +++ int ret; +++ +++ if (sd != NULL) { +++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; +++ +++ x0 = si->left_offset; +++ y0 = si->top_offset; +++ } +++ +++ size = width * height * 3; +++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +++ return ret; +++ +++ dst = pkt->data; +++ +++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); +++ dst += width * height * 2; +++ 
av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, +++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); +++ return 0; +++} +++#endif +++ +++ ++ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame, int *got_packet) ++ { ++@@ -56,6 +125,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, ++ if (ret < 0) ++ return ret; ++ +++#ifdef RPI +++ if (av_rpi_is_sand_frame(frame)) { +++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); +++ *got_packet = (ret == 0); +++ return ret; +++ } +++#endif +++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) ++ return ret; ++ if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, + diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s + new file mode 100644 +-index 0000000..5543093 ++index 0000000000..391f761df9 + --- /dev/null + +++ b/libavcodec/rpi_hevc_transform.s +-@@ -0,0 +1,917 @@ ++@@ -0,0 +1,923 @@ + +# ****************************************************************************** + +# Argon Design Ltd. + +# (c) Copyright 2015 Argon Design Ltd. All rights reserved. +@@ -13502,7 +14994,7 @@ index 0000000..5543093 + +# ****************************************************************************** + + + +# HEVC VPU Transform +-+# +++# fe + +# Transform matrix can be thought of as + +# output row vector = input row vector * transMatrix2 + +# +@@ -13575,6 +15067,12 @@ index 0000000..5543093 + +# num32: number of 32x32 transforms + +# command 0 for transform, 1 for memclear16(int16_t *dst,num16) + +# +++ +++.equ TRANS_SHIFT, 20 - BIT_DEPTH +++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) +++.equ TRANS_ASL2, 16 - TRANS_SHIFT +++ +++ + +hevc_trans_16x16: + + cmp r5,1 + + beq memclear16 +@@ -13604,7 +15102,7 @@ index 0000000..5543093 + + mov r8,64*16 # Value used to swap from current to next VRF location + + vldh HX(0++,0)+r0,(r1 += r3) REP 16 + + mov r4,64 # Constant used for rounding first pass +-+ mov r5,1<<11 # Constant used for rounding second pass +++ mov r5,TRANS_RND2 # Constant used for rounding second pass + + + + # At start of block r0,r1 point to the current block (that has already been loaded) + +block_loop: +@@ -13625,7 +15123,7 @@ index 0000000..5543093 + + bl col_trans_16 + + vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate + + #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. +-+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) +++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? 
(Probably because it ends with ls which is interpreted as a condition flag) + + + + # Save results - note there has been a transposition during the processing so we save columns + + vsth VX(0,32++)+r0, (r1 += r3) REP 16 +@@ -13708,8 +15206,8 @@ index 0000000..5543093 + + bl trans32 + + + + # ROW TRANSFORM +-+ mov r4, 1<<11 # Constant used for rounding second pass +-+ mov r5, 4 # left shift used for rounding second pass +++ mov r4, TRANS_RND2 # Constant used for rounding second pass +++ mov r5, TRANS_ASL2 # left shift used for rounding second pass + + + + mov r1,r9 # Input temporary storage + + mov r8,r10 # Output Coefficient buffer +@@ -14410,9 +15908,6161 @@ index 0000000..5543093 + + bgt loop_cmds + + + + pop r6-r7, pc ++diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h ++new file mode 100644 ++index 0000000000..b0e9902d82 ++--- /dev/null +++++ b/libavcodec/rpi_hevc_transform10.h ++@@ -0,0 +1,3070 @@ +++static const unsigned char rpi_hevc_transform10 [] = { +++21, +++106, +++0, +++144, +++47, +++1, +++37, +++106, +++0, +++144, +++66, +++1, +++53, +++106, +++0, +++144, +++192, +++4, +++69, +++106, +++0, +++144, +++192, +++4, +++85, +++106, +++0, +++144, +++220, +++5, +++169, +++3, +++62, +++64, +++79, +++64, +++3, +++232, +++32, +++0, +++0, +++0, +++12, +++248, +++0, +++136, +++0, +++0, +++192, +++248, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++12, +++248, +++0, +++168, +++0, +++0, +++192, +++248, +++0, +++0, +++0, +++96, +++3, +++232, +++32, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++8, +++232, +++0, +++4, +++0, +++0, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++4, +++232, +++64, +++0, +++0, +++0, +++5, +++232, +++0, +++2, +++0, +++0, +++128, +++69, +++113, +++66, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++128, +++69, +++113, +++70, +++128, +++144, +++40, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++16, +++0, +++76, +++254, +++48, +++192, +++9, +++4, +++32, +++8, +++0, +++0, +++4, +++254, +++0, +++144, +++128, +++2, +++0, +++8, +++2, +++0, +++128, +++144, +++23, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++20, +++0, +++76, +++254, +++48, +++192, +++6, +++4, +++32, +++8, +++0, +++0, +++140, +++248, +++44, +++0, +++0, +++0, +++32, +++48, +++4, +++0, +++128, +++69, +++113, +++66, +++242, +++140, +++211, +++192, +++34, +++31, +++41, +++3, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++96, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++224, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++225, +++64, +++242, +++64, +++3, +++232, +++128, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++57, +++239, +++224, +++247, +++255, +++255, +++72, +++192, +++95, +++207, +++88, +++122, +++88, +++124, +++137, +++64, +++26, +++64, +++4, +++232, +++64, +++0, +++0, +++0, +++149, +++96, +++161, +++64, +++152, +++64, +++128, +++144, +++35, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++27, +++0, +++4, +++232, +++0, +++2, +++0, +++0, +++101, +++96, +++145, +++64, +++168, +++64, +++128, +++144, +++19, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, 
+++144, +++11, +++0, +++74, +++232, +++0, +++8, +++0, +++0, +++242, +++140, +++221, +++192, +++57, +++239, +++32, +++8, +++0, +++0, +++41, +++3, +++239, +++3, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++248, +++4, +++0, +++12, +++248, +++0, +++132, +++64, +++0, +++192, +++248, +++4, +++0, +++0, +++96, +++255, +++159, +++154, +++255, +++0, +++232, +++0, +++4, +++0, +++0, +++255, +++159, +++165, +++255, +++4, +++255, +++48, +++204, +++16, +++3, +++224, +++251, +++62, +++0, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++128, +++64, +++6, +++232, +++64, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++32, +++247, +++240, +++207, +++16, +++3, +++32, +++247, +++176, +++207, +++17, +++19, +++32, +++247, +++112, +++207, +++18, +++35, +++32, +++247, +++48, +++207, +++19, +++51, +++32, +++247, +++240, +++206, +++20, +++67, +++32, +++247, +++176, +++206, +++21, +++83, +++32, +++247, +++112, +++206, +++22, +++99, +++32, +++247, +++48, +++206, +++23, +++115, +++32, +++247, +++240, +++205, +++24, +++131, +++32, +++247, +++176, +++205, +++25, +++147, +++32, +++247, +++112, +++205, +++26, +++163, +++32, +++247, +++48, +++205, +++27, +++179, +++32, +++247, +++240, +++204, +++28, +++195, +++32, +++247, +++176, +++204, +++29, +++211, +++32, +++247, +++112, +++204, +++30, +++227, +++32, +++247, +++48, +++204, +++31, +++243, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++0, +++237, +++32, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++111, +++3, +++4, +++254, +++0, +++128, +++0, +++4, +++0, +++248, +++0, +++0, +++2, +++232, +++32, +++0, +++0, +++0, +++140, +++248, +++32, +++0, +++0, +++0, +++224, +++35, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++193, +++232, +++0, +++1, +++0, +++0, +++1, +++106, +++116, +++30, +++90, +++0, +++169, +++3, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++137, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++129, +++0, +++131, +++102, +++0, +++158, +++67, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++108, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++100, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++161, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++150, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, 
+++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++182, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++112, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++101, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++103, +++255, +++239, +++3, +++0, +++254, +++0, +++143, +++92, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++93, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++210, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++211, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++107, +++0, +++8, +++255, +++99, +++23, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++23, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++52, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++52, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++0, +++143, +++12, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++13, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++18, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++19, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++33, +++0, +++8, +++255, +++99, +++3, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++3, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++4, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++4, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++137, +++47, +++240, +++40, +++246, +++2, +++140, +++47, +++240, +++128, +++245, +++99, +++140, +++5, +++4, +++0, +++247, +++99, +++140, +++1, +++20, +++88, +++246, +++99, +++140, +++1, +++20, +++0, +++247, +++35, +++136, +++62, +++226, +++32, +++247, +++35, +++136, +++32, +++210, +++0, +++247, +++34, +++136, +++63, +++2, +++208, +++246, +++34, +++136, +++0, +++4, +++0, +++247, +++99, +++136, +++58, +++162, +++32, +++247, +++99, +++136, +++33, +++146, +++0, +++247, +++98, +++136, +++59, +++18, +++208, +++246, +++98, +++136, +++0, 
+++20, +++0, +++247, +++162, +++136, +++33, +++2, +++88, +++246, +++98, +++137, +++2, +++68, +++88, +++246, +++162, +++137, +++3, +++68, +++208, +++254, +++227, +++136, +++60, +++242, +++192, +++243, +++188, +++11, +++208, +++254, +++227, +++136, +++56, +++178, +++192, +++243, +++188, +++10, +++32, +++255, +++226, +++136, +++38, +++58, +++192, +++243, +++60, +++0, +++208, +++254, +++227, +++136, +++59, +++242, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++49, +++58, +++192, +++243, +++60, +++128, +++0, +++255, +++226, +++136, +++34, +++34, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++37, +++58, +++192, +++243, +++60, +++128, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++194, +++8, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++255, +++202, +++40, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++254, +++0, +++240, +++35, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++226, +++140, +++34, +++34, +++195, +++243, +++60, +++0, +++32, +++255, +++227, +++140, +++36, +++58, +++192, +++243, +++60, +++0, +++0, +++254, +++192, +++136, +++0, +++4, +++0, +++240, +++0, +++160, +++16, +++246, +++226, +++136, +++35, +++50, +++16, +++246, +++226, +++136, +++35, +++50, +++32, +++246, +++226, +++136, +++35, +++50, +++32, +++254, +++226, +++136, +++35, +++58, +++192, +++243, +++60, +++0, +++11, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++115, +++5, +++106, +++0, +++144, +++173, +++1, +++27, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++227, +++0, +++64, +++246, +++163, +++140, +++1, +++4, +++0, +++246, +++192, +++175, +++63, +++2, +++0, +++246, +++192, +++174, +++59, +++2, +++0, +++246, +++128, +++175, +++62, +++2, +++0, +++246, +++128, +++174, +++58, +++2, +++0, +++246, +++64, +++175, +++61, +++2, +++0, +++246, +++64, +++174, +++57, +++2, +++0, +++255, +++43, +++240, +++4, +++212, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++228, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++191, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++143, +++52, +++242, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++212, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++180, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++190, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++143, +++52, +++226, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++180, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++212, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++196, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++189, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++143, +++52, +++210, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, 
+++4, +++148, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++164, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++228, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++187, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++142, +++52, +++178, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++148, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++244, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++186, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++142, +++52, +++162, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++244, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++148, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++132, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++185, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++142, +++52, +++146, +++192, +++243, +++60, +++128, +++64, +++255, +++98, +++141, +++0, +++52, +++192, +++243, +++0, +++0, +++0, +++254, +++0, +++240, +++53, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++177, +++0, +++88, +++246, +++163, +++140, +++1, +++4, +++128, +++245, +++99, +++141, +++10, +++4, +++88, +++246, +++162, +++138, +++1, +++68, +++0, +++247, +++162, +++138, +++36, +++162, +++88, +++254, +++162, +++138, +++3, +++164, +++192, +++243, +++128, +++11, +++0, +++255, +++226, +++137, +++32, +++2, +++195, +++243, +++60, +++0, +++32, +++247, +++226, +++137, +++42, +++114, +++0, +++255, +++34, +++138, +++33, +++18, +++195, +++243, +++60, +++0, +++32, +++247, +++34, +++138, +++42, +++130, +++16, +++246, +++98, +++138, +++40, +++114, +++16, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++226, +++137, +++41, +++146, +++40, +++246, +++34, +++138, +++41, +++146, +++32, +++247, +++163, +++141, +++63, +++178, +++32, +++247, +++227, +++141, +++62, +++162, +++0, +++254, +++0, +++240, +++8, +++4, +++0, +++240, +++128, +++11, +++128, +++253, +++35, +++240, +++9, +++100, +++192, +++243, +++128, +++10, +++128, +++253, +++163, +++141, +++128, +++115, +++192, +++243, +++152, +++10, +++88, +++246, +++163, +++141, +++4, +++100, +++208, +++246, +++35, +++139, +++0, +++100, +++32, +++255, +++34, +++139, +++53, +++202, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++139, +++0, +++4, +++0, +++240, +++0, +++160, +++240, +++246, +++163, +++141, +++48, +++98, +++0, +++247, +++99, +++139, +++63, +++210, +++0, +++247, +++98, +++139, +++1, +++212, +++88, +++254, +++98, +++139, +++1, +++212, +++192, +++243, +++128, +++11, +++32, +++255, +++99, +++139, +++62, +++98, +++192, +++243, +++188, +++10, +++88, +++246, +++98, +++139, +++1, +++212, +++240, +++246, +++98, +++139, +++50, +++210, +++0, +++247, +++163, +++128, +++59, +++146, +++0, +++247, +++160, +++128, +++1, +++36, +++88, +++254, +++160, +++128, +++1, +++36, +++192, +++243, +++128, +++11, +++0, 
+++247, +++163, +++128, +++58, +++98, +++64, +++255, +++35, +++240, +++0, +++100, +++192, +++243, +++128, +++10, +++64, +++255, +++163, +++128, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++160, +++128, +++1, +++36, +++240, +++246, +++160, +++128, +++50, +++34, +++8, +++255, +++227, +++143, +++54, +++242, +++192, +++243, +++60, +++128, +++40, +++255, +++227, +++142, +++54, +++178, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++39, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++143, +++45, +++226, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++44, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++40, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++142, +++2, +++162, +++192, +++243, +++60, +++128, +++90, +++0, +++169, +++3, +++14, +++96, +++4, +++31, +++169, +++3, +++30, +++96, +++1, +++31, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++143, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++135, +++0, +++131, +++102, +++0, +++158, +++71, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++112, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++104, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++123, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++112, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++178, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++72, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++61, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++95, +++255, +++239, +++3, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, 
+++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++47, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++13, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++140, +++47, +++240, +++32, +++247, +++35, +++141, +++63, +++178, +++64, +++254, +++35, +++141, +++2, +++68, +++192, +++243, +++128, +++11, +++32, +++255, +++35, +++240, +++58, +++226, +++192, +++243, +++188, +++10, +++0, +++254, +++0, +++141, +++4, +++4, +++0, +++240, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++240, +++246, +++35, +++141, +++48, +++66, +++0, +++247, +++227, +++143, +++52, +++242, +++32, +++247, +++227, +++142, +++52, +++178, +++90, +++0, +++161, +++3, +++6, +++64, +++23, +++64, +++96, +++8, +++70, +++98, +++97, +++8, +++70, +++98, +++98, +++8, +++70, +++98, +++99, +++8, +++70, +++98, +++100, +++8, +++70, +++98, +++101, +++8, +++70, +++98, +++255, +++159, +++8, +++250, +++23, +++102, +++7, +++106, +++112, +++30, +++33, +++3, +++}; ++diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h ++new file mode 100644 ++index 0000000000..2901b6568d ++--- /dev/null +++++ b/libavcodec/rpi_hevc_transform8.h ++@@ -0,0 +1,3070 @@ +++static const unsigned char rpi_hevc_transform8 [] = { +++21, +++106, +++0, +++144, +++47, +++1, +++37, +++106, +++0, +++144, +++66, +++1, +++53, +++106, +++0, +++144, +++192, +++4, +++69, +++106, +++0, +++144, +++192, +++4, +++85, +++106, +++0, +++144, +++220, +++5, +++169, +++3, +++62, +++64, +++79, +++64, +++3, +++232, +++32, +++0, +++0, +++0, +++12, +++248, +++0, +++136, +++0, +++0, +++192, +++248, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++12, +++248, +++0, +++168, +++0, +++0, +++192, +++248, +++0, +++0, +++0, +++96, +++3, +++232, +++32, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++8, +++232, +++0, +++4, +++0, +++0, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++4, +++232, +++64, +++0, +++0, +++0, +++5, +++232, +++0, +++8, +++0, +++0, +++128, +++69, +++113, +++66, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++128, +++69, +++113, +++70, +++128, +++144, +++40, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++16, +++0, +++76, +++254, +++48, +++192, +++9, +++4, +++32, +++8, +++0, +++0, +++4, +++254, +++0, +++144, +++128, +++2, +++0, +++8, +++2, +++0, +++128, +++144, +++23, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++20, +++0, +++76, +++254, +++48, +++192, +++4, +++4, +++32, +++8, +++0, +++0, +++140, +++248, +++44, +++0, +++0, +++0, +++32, +++48, +++4, +++0, +++128, +++69, +++113, +++66, +++242, +++140, +++211, +++192, +++34, +++31, +++41, +++3, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++96, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, 
+++90, +++0, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++224, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++225, +++64, +++242, +++64, +++3, +++232, +++128, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++57, +++239, +++224, +++247, +++255, +++255, +++72, +++192, +++95, +++207, +++88, +++122, +++88, +++124, +++137, +++64, +++26, +++64, +++4, +++232, +++64, +++0, +++0, +++0, +++149, +++96, +++161, +++64, +++152, +++64, +++128, +++144, +++35, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++27, +++0, +++4, +++232, +++0, +++8, +++0, +++0, +++69, +++96, +++145, +++64, +++168, +++64, +++128, +++144, +++19, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++11, +++0, +++74, +++232, +++0, +++8, +++0, +++0, +++242, +++140, +++221, +++192, +++57, +++239, +++32, +++8, +++0, +++0, +++41, +++3, +++239, +++3, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++248, +++4, +++0, +++12, +++248, +++0, +++132, +++64, +++0, +++192, +++248, +++4, +++0, +++0, +++96, +++255, +++159, +++154, +++255, +++0, +++232, +++0, +++4, +++0, +++0, +++255, +++159, +++165, +++255, +++4, +++255, +++48, +++204, +++16, +++3, +++224, +++251, +++62, +++0, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++128, +++64, +++6, +++232, +++64, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++32, +++247, +++240, +++207, +++16, +++3, +++32, +++247, +++176, +++207, +++17, +++19, +++32, +++247, +++112, +++207, +++18, +++35, +++32, +++247, +++48, +++207, +++19, +++51, +++32, +++247, +++240, +++206, +++20, +++67, +++32, +++247, +++176, +++206, +++21, +++83, +++32, +++247, +++112, +++206, +++22, +++99, +++32, +++247, +++48, +++206, +++23, +++115, +++32, +++247, +++240, +++205, +++24, +++131, +++32, +++247, +++176, +++205, +++25, +++147, +++32, +++247, +++112, +++205, +++26, +++163, +++32, +++247, +++48, +++205, +++27, +++179, +++32, +++247, +++240, +++204, +++28, +++195, +++32, +++247, +++176, +++204, +++29, +++211, +++32, +++247, +++112, +++204, +++30, +++227, +++32, +++247, +++48, +++204, +++31, +++243, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++0, +++237, +++32, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++111, +++3, +++4, +++254, +++0, +++128, +++0, +++4, +++0, +++248, +++0, +++0, +++2, +++232, +++32, +++0, +++0, +++0, +++140, +++248, +++32, +++0, +++0, +++0, +++224, +++35, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++193, +++232, +++0, +++1, +++0, +++0, +++1, +++106, +++116, +++30, +++90, +++0, +++169, +++3, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++137, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++129, 
+++0, +++131, +++102, +++0, +++158, +++67, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++108, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++100, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++161, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++150, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++182, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++112, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++101, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++103, +++255, +++239, +++3, +++0, +++254, +++0, +++143, +++92, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++93, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++210, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++211, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++107, +++0, +++8, +++255, +++99, +++23, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++23, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++52, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++52, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++0, +++143, +++12, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++13, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++18, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++19, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++33, +++0, +++8, +++255, +++99, +++3, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++3, +++0, +++228, +++192, 
+++51, +++0, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++4, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++4, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++137, +++47, +++240, +++40, +++246, +++2, +++140, +++47, +++240, +++128, +++245, +++99, +++140, +++5, +++4, +++0, +++247, +++99, +++140, +++1, +++20, +++88, +++246, +++99, +++140, +++1, +++20, +++0, +++247, +++35, +++136, +++62, +++226, +++32, +++247, +++35, +++136, +++32, +++210, +++0, +++247, +++34, +++136, +++63, +++2, +++208, +++246, +++34, +++136, +++0, +++4, +++0, +++247, +++99, +++136, +++58, +++162, +++32, +++247, +++99, +++136, +++33, +++146, +++0, +++247, +++98, +++136, +++59, +++18, +++208, +++246, +++98, +++136, +++0, +++20, +++0, +++247, +++162, +++136, +++33, +++2, +++88, +++246, +++98, +++137, +++2, +++68, +++88, +++246, +++162, +++137, +++3, +++68, +++208, +++254, +++227, +++136, +++60, +++242, +++192, +++243, +++188, +++11, +++208, +++254, +++227, +++136, +++56, +++178, +++192, +++243, +++188, +++10, +++32, +++255, +++226, +++136, +++38, +++58, +++192, +++243, +++60, +++0, +++208, +++254, +++227, +++136, +++59, +++242, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++49, +++58, +++192, +++243, +++60, +++128, +++0, +++255, +++226, +++136, +++34, +++34, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++37, +++58, +++192, +++243, +++60, +++128, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++194, +++8, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++255, +++202, +++40, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++254, +++0, +++240, +++35, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++226, +++140, +++34, +++34, +++195, +++243, +++60, +++0, +++32, +++255, +++227, +++140, +++36, +++58, +++192, +++243, +++60, +++0, +++0, +++254, +++192, +++136, +++0, +++4, +++0, +++240, +++0, +++160, +++16, +++246, +++226, +++136, +++35, +++50, +++16, +++246, +++226, +++136, +++35, +++50, +++32, +++246, +++226, +++136, +++35, +++50, +++32, +++254, +++226, +++136, +++35, +++58, +++192, +++243, +++60, +++0, +++11, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++115, +++5, +++106, +++0, +++144, +++173, +++1, +++27, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++227, +++0, +++64, +++246, +++163, +++140, +++1, +++4, +++0, +++246, +++192, +++175, +++63, +++2, +++0, +++246, +++192, +++174, +++59, +++2, +++0, +++246, +++128, +++175, +++62, +++2, +++0, +++246, +++128, +++174, +++58, +++2, +++0, +++246, +++64, +++175, +++61, +++2, +++0, +++246, +++64, +++174, +++57, +++2, +++0, +++255, +++43, +++240, +++4, +++212, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++228, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++191, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++143, +++52, +++242, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++212, +++192, +++243, 
+++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++180, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++190, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++143, +++52, +++226, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++180, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++212, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++196, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++189, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++143, +++52, +++210, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++148, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++164, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++228, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++187, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++142, +++52, +++178, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++148, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++244, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++186, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++142, +++52, +++162, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++244, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++148, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++132, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++185, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++142, +++52, +++146, +++192, +++243, +++60, +++128, +++64, +++255, +++98, +++141, +++0, +++52, +++192, +++243, +++0, +++0, +++0, +++254, +++0, +++240, +++53, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++177, +++0, +++88, +++246, +++163, +++140, +++1, +++4, +++128, +++245, +++99, +++141, +++10, +++4, +++88, +++246, +++162, +++138, +++1, +++68, +++0, +++247, +++162, +++138, +++36, +++162, +++88, +++254, +++162, +++138, +++3, +++164, +++192, +++243, +++128, +++11, +++0, +++255, +++226, +++137, +++32, +++2, +++195, +++243, +++60, +++0, +++32, +++247, +++226, +++137, +++42, +++114, +++0, +++255, +++34, +++138, +++33, +++18, +++195, +++243, +++60, +++0, +++32, +++247, +++34, +++138, +++42, +++130, +++16, +++246, +++98, +++138, +++40, +++114, +++16, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++226, +++137, +++41, +++146, +++40, +++246, +++34, +++138, +++41, +++146, +++32, +++247, +++163, +++141, +++63, +++178, +++32, +++247, +++227, +++141, +++62, +++162, +++0, +++254, +++0, +++240, +++8, +++4, +++0, +++240, +++128, +++11, +++128, +++253, 
+++35, +++240, +++9, +++100, +++192, +++243, +++128, +++10, +++128, +++253, +++163, +++141, +++128, +++115, +++192, +++243, +++152, +++10, +++88, +++246, +++163, +++141, +++4, +++100, +++208, +++246, +++35, +++139, +++0, +++100, +++32, +++255, +++34, +++139, +++53, +++202, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++139, +++0, +++4, +++0, +++240, +++0, +++160, +++240, +++246, +++163, +++141, +++48, +++98, +++0, +++247, +++99, +++139, +++63, +++210, +++0, +++247, +++98, +++139, +++1, +++212, +++88, +++254, +++98, +++139, +++1, +++212, +++192, +++243, +++128, +++11, +++32, +++255, +++99, +++139, +++62, +++98, +++192, +++243, +++188, +++10, +++88, +++246, +++98, +++139, +++1, +++212, +++240, +++246, +++98, +++139, +++50, +++210, +++0, +++247, +++163, +++128, +++59, +++146, +++0, +++247, +++160, +++128, +++1, +++36, +++88, +++254, +++160, +++128, +++1, +++36, +++192, +++243, +++128, +++11, +++0, +++247, +++163, +++128, +++58, +++98, +++64, +++255, +++35, +++240, +++0, +++100, +++192, +++243, +++128, +++10, +++64, +++255, +++163, +++128, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++160, +++128, +++1, +++36, +++240, +++246, +++160, +++128, +++50, +++34, +++8, +++255, +++227, +++143, +++54, +++242, +++192, +++243, +++60, +++128, +++40, +++255, +++227, +++142, +++54, +++178, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++39, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++143, +++45, +++226, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++44, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++40, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++142, +++2, +++162, +++192, +++243, +++60, +++128, +++90, +++0, +++169, +++3, +++14, +++96, +++4, +++31, +++169, +++3, +++30, +++96, +++1, +++31, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++143, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++135, +++0, +++131, +++102, +++0, +++158, +++71, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++112, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++104, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++123, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++112, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, 
+++102, +++7, +++106, +++127, +++156, +++178, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++72, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++61, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++95, +++255, +++239, +++3, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++47, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++13, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++140, +++47, +++240, +++32, +++247, +++35, +++141, +++63, +++178, +++64, +++254, +++35, +++141, +++2, +++68, +++192, +++243, +++128, +++11, +++32, +++255, +++35, +++240, +++58, +++226, +++192, +++243, +++188, +++10, +++0, +++254, +++0, +++141, +++4, +++4, +++0, +++240, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++240, +++246, +++35, +++141, +++48, +++66, +++0, +++247, +++227, +++143, +++52, +++242, +++32, +++247, +++227, +++142, +++52, +++178, +++90, +++0, +++161, +++3, +++6, +++64, +++23, +++64, +++96, +++8, +++70, +++98, +++97, +++8, +++70, +++98, +++98, +++8, +++70, +++98, +++99, +++8, +++70, +++98, +++100, +++8, +++70, +++98, +++101, +++8, +++70, +++98, +++255, +++159, +++8, +++250, +++23, +++102, +++7, +++106, +++112, +++30, +++33, +++3, +++}; + diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c + new file mode 100644 +-index 0000000..0255f5d ++index 0000000000..0255f5dd44 + --- /dev/null + +++ b/libavcodec/rpi_mailbox.c + @@ -0,0 +1,149 @@ +@@ -14567,7 +22217,7 @@ index 0000000..0255f5d + + + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h + new file mode 100644 +-index 0000000..b316878 ++index 0000000000..b3168788d2 + --- /dev/null + +++ b/libavcodec/rpi_mailbox.h + @@ -0,0 +1,58 @@ +@@ -14617,24 +22267,76 @@ index 0000000..b316878 + + uint8_t _dummy[3]; /* pad struct to 64 bytes */ + +} VC_IMAGE_T; + + +-+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; +++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; +++ +++ +++extern int mbox_open(void); +++extern void mbox_close(int file_desc); +++ +++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); +++ +++int 
mbox_get_image_params(int fd, VC_IMAGE_T * img); +++ +++#endif ++diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h ++new file mode 100644 ++index 0000000000..e6127749ea ++--- /dev/null +++++ b/libavcodec/rpi_opts.h ++@@ -0,0 +1,46 @@ +++#ifndef AVCODEC_RPI_OPTS_H +++#define AVCODEC_RPI_OPTS_H +++ +++// define RPI to split the CABAC/prediction/transform into separate stages +++#ifndef RPI +++ +++ #define RPI_INTER 0 +++ #define RPI_TSTATS 0 +++ #define RPI_HEVC_SAND 0 +++ +++#else +++ #include "config.h" +++ +++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU +++ +++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames +++ // This has no effect unless RPI_WORKER is defined +++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as +++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one +++ // free for the foreground to fill in. +++ #define RPI_MAX_JOBS 2 +++ +++ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs +++ // As it stands there is something mildly broken in VPU deblock - looks mostly OK +++ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) +++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +++// #define RPI_DEBLOCK_VPU +++ +++ #define RPI_VPU_DEBLOCK_CACHED 1 + + +++ #if HAVE_NEON +++ #define RPI_HEVC_SAND 1 +++ #else +++ // Sand bust on Pi1 currently - reasons unknown +++ #define RPI_HEVC_SAND 0 +++ #endif + + + + +-+extern int mbox_open(void); +-+extern void mbox_close(int file_desc); + + +-+extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +-+extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); +++ #define RPI_QPU_EMU_Y 0 +++ #define RPI_QPU_EMU_C 0 + + +-+int mbox_get_image_params(int fd, VC_IMAGE_T * img); +++ #define RPI_TSTATS 0 +++#endif + + + +#endif +++ + diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c + new file mode 100644 +-index 0000000..7c0eedd ++index 0000000000..e872b855b7 + --- /dev/null + +++ b/libavcodec/rpi_qpu.c +-@@ -0,0 +1,902 @@ ++@@ -0,0 +1,935 @@ + +#ifdef RPI + +#include + +#include +@@ -14653,8 +22355,9 @@ + +#include "rpi_mailbox.h" + +#include "rpi_qpu.h" + +#include "rpi_shader.h" +-+#include "rpi_hevc_transform.h" +-+#include "rpi_zc.h" +++#include "rpi_hevc_transform8.h" +++#include "rpi_hevc_transform10.h" +++#include "libavutil/rpi_sand_fns.h" + + + +#pragma GCC diagnostic push + +// Many many redundant decls in the header files +@@ -14678,26 +22381,13 @@ + +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling + +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + + +-+// On Pi2 there is no way to access the VPU L2 cache +-+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) +-+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly +-+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
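The RPI_MAX_JOBS comment in rpi_opts.h above amounts to a small producer/consumer ring: there are RPI_MAX_JOBS worker parameter sets, one of which must always stay free for the foreground to fill, so the effective background depth is RPI_MAX_JOBS - 1. A minimal sketch of that hand-off, assuming plain pthreads; job_t, fill, drain and the rest are illustrative names, not code from the patch:

    /* build: cc -pthread ring.c */
    #include <pthread.h>
    #include <stdio.h>

    #define MAX_JOBS 2              /* as in the patch: usable depth is MAX_JOBS - 1 */

    typedef struct {
        int params[4];              /* stand-in for a worker parameter set */
        int queued;                 /* 1 once handed to the worker */
    } job_t;

    static job_t jobs[MAX_JOBS];
    static int fill, drain, done;
    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

    static void *worker(void *arg) {
        (void)arg;
        pthread_mutex_lock(&mtx);
        for (;;) {
            while (!jobs[drain].queued && !done)
                pthread_cond_wait(&cv, &mtx);
            if (!jobs[drain].queued && done)
                break;                               /* all queued work drained */
            pthread_mutex_unlock(&mtx);
            printf("worker: job %d\n", jobs[drain].params[0]); /* pretend to decode */
            pthread_mutex_lock(&mtx);
            jobs[drain].queued = 0;                  /* slot becomes fillable again */
            drain = (drain + 1) % MAX_JOBS;
            pthread_cond_broadcast(&cv);
        }
        pthread_mutex_unlock(&mtx);
        return NULL;
    }

    int main(void) {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        for (int i = 0; i < 8; i++) {
            pthread_mutex_lock(&mtx);
            while (jobs[fill].queued)                /* wait for a free parameter set */
                pthread_cond_wait(&cv, &mtx);
            jobs[fill].params[0] = i;                /* foreground fills the free set */
            jobs[fill].queued = 1;
            fill = (fill + 1) % MAX_JOBS;
            pthread_cond_broadcast(&cv);
            pthread_mutex_unlock(&mtx);
        }
        pthread_mutex_lock(&mtx);
        done = 1;
        pthread_cond_broadcast(&cv);
        pthread_mutex_unlock(&mtx);
        pthread_join(t, NULL);
        return 0;
    }

With MAX_JOBS == 2 the foreground can prepare the next parameter set while the worker drains the previous one, which is the catch-up-during-slow-frames behaviour the comment describes.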
+-+#define GPU_MEM_FLG 0x4 +-+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) +-+#define GPU_MEM_MAP 0x0 +-+ + +#define vcos_verify_ge0(x) ((x)>=0) + + +-+/*static const unsigned code[] = +-+{ +-+ #include "rpi_shader.hex" +-+};*/ +-+ + +// Size in 32bit words +-+#define QPU_CODE_SIZE 2048 +++#define QPU_CODE_SIZE 4098 + +#define VPU_CODE_SIZE 2048 + + +-+const short rpi_transMatrix2even[32][16] = { // Even rows first +++static const short rpi_transMatrix2even[32][16] = { // Even rows first + +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, + +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, + +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, +@@ -14737,7 +22427,8 @@ index 0000000..7c0eedd + +struct GPU + +{ + + unsigned int qpu_code[QPU_CODE_SIZE]; +-+ unsigned int vpu_code[VPU_CODE_SIZE]; +++ unsigned int vpu_code8[VPU_CODE_SIZE]; +++ unsigned int vpu_code10[VPU_CODE_SIZE]; + + short transMatrix2even[16*16*2]; + +}; + + +@@ -14749,8 +22440,9 @@ index 0000000..7c0eedd + +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + + + +struct rpi_cache_flush_env_s { +-+ unsigned int n; +-+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; +++// unsigned int n; +++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; +++ struct vcsm_user_clean_invalid2_s v; + +}; + + + +#define WAIT_COUNT_MAX 16 +@@ -14774,7 +22466,6 @@ index 0000000..7c0eedd + +typedef struct vq_wait_s + +{ + + sem_t sem; +-+ unsigned int cost; + + struct vq_wait_s * next; + +} vq_wait_t; + + +@@ -14793,7 +22484,7 @@ index 0000000..7c0eedd + + int open_count; + + int init_count; + + int mb; +-+ unsigned int current_load; +++ int vpu_i_cache_flushed; + + GPU_MEM_PTR_T code_gm_ptr; + + vq_wait_pool_t wait_pool; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT +@@ -14866,8 +22557,8 @@ index 0000000..7c0eedd + + + +// GPU_MEM_PTR_T alloc fns + +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { +-+ p->numbytes = numbytes; +-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); +++ p->numbytes = (numbytes + 255) & ~255; // Round up +++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); +@@ -14878,12 +22569,14 @@ index 0000000..7c0eedd + + av_assert0(p->arm); + + p->vc = mbox_mem_lock(mb, p->vc_handle); + + av_assert0(p->vc); +++// printf("***** %s, %d\n", __func__, numbytes); +++ + + return 0; + +} + + + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + + p->numbytes = numbytes; +-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); +++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + + av_assert0(p->vcsm_handle); + + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + + av_assert0(p->vc_handle); +@@ -14891,6 +22584,7 @@ index 0000000..7c0eedd + + av_assert0(p->arm); + + p->vc = mbox_mem_lock(mb, p->vc_handle); + + av_assert0(p->vc); +++// printf("***** %s, %d\n", __func__, numbytes); + + return 0; + +} + + +@@ -14899,6 +22593,7 @@ index 
0000000..7c0eedd + + vcsm_unlock_ptr(p->arm); + + vcsm_free(p->vcsm_handle); + + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again +++// printf("***** %s\n", __func__); + +} + + + + +@@ -14955,9 +22650,14 @@ index 0000000..7c0eedd + + } + + // And the VPU code + + { +-+ int num_bytes = sizeof(rpi_hevc_transform); +++ int num_bytes = sizeof(rpi_hevc_transform8); +++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); +++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); +++ } +++ { +++ int num_bytes = sizeof(rpi_hevc_transform10); + + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); +-+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); +++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + + } + + // And the transform coefficients + + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); +@@ -15048,10 +22748,18 @@ index 0000000..7c0eedd + + gpu_unlock_unref(ge); + +} + + +-+unsigned int vpu_get_fn(void) { +++unsigned int vpu_get_fn(const unsigned int bit_depth) { + + // Make sure that the gpu is initialized + + av_assert0(gpu != NULL); +-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); +++ switch (bit_depth){ +++ case 8: +++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); +++ case 10: +++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); +++ default: +++ av_assert0(0); +++ } +++ return 0; + +} + + + +unsigned int vpu_get_constants(void) { +@@ -15081,95 +22789,75 @@ index 0000000..7c0eedd + +// + +// Cache flush functions + + +++#define CACHE_EL_MAX 16 + + + +rpi_cache_flush_env_t * rpi_cache_flush_init() + +{ +-+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); +-+ if (rfe == NULL) +-+ return NULL; +++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + +++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); +++ if (rfe == NULL) +++ return NULL; + + +-+ rfe->n = 0; +-+ return rfe; +++ rfe->v.op_count = 0; +++ return rfe; + +} + + + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) + +{ +-+ if (rfe != NULL) +-+ free(rfe); +++ if (rfe != NULL) +++ free(rfe); + +} + + + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) + +{ +-+ int rc = 0; +-+ unsigned int na; +-+ unsigned int nr; +-+ +-+ // Clear any reamaining ents in the final block +-+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) +-+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); +++ int rc = 0; + + +-+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) +-+ { +-+ if (vcsm_clean_invalid(rfe->a + na) != 0) +-+ rc = -1; +-+ } +++ if (vcsm_clean_invalid2(&rfe->v) != 0) +++ rc = -1; + + +-+ free(rfe); +++ free(rfe); + + +-+ if (rc == 0) +-+ return 0; +++ if (rc == 0) +++ return 0; + + +-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); +-+ return rc; +++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); +++ return rc; + +} + + +-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, +++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) + +{ +-+ // Deal with empty pointer trivially +-+ if (gm == NULL || gm->numbytes == 0) +-+ 
return; +-+ +-+ { +-+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); +-+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + + +-+ av_assert0(rfe->n < CFE_ENT_COUNT); +++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + + +-+ a->s[n].cmd = mode; +-+ a->s[n].handle = gm->vcsm_handle; +-+ a->s[n].addr = (unsigned int)gm->arm; +-+ a->s[n].size = gm->numbytes; +-+ ++rfe->n; +-+ } +++ b->invalidate_mode = mode; +++ b->block_count = blocks; +++ b->start_address = gm->arm + offset0; +++ b->block_size = block_size; +++ b->inter_block_stride = block_stride; + +} + + + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + + const unsigned int offset, const unsigned int size) + +{ +-+ // Deal with empty pointer trivially +-+ if (gm == NULL || size == 0) +-+ return; +-+ +-+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); +-+ +-+ av_assert0(offset <= gm->numbytes); +-+ av_assert0(size <= gm->numbytes); +-+ av_assert0(offset + size <= gm->numbytes); +++ // Deal with empty pointer trivially +++ if (gm == NULL || size == 0) +++ return; + + +-+ { +-+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); +-+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; +++ av_assert0(offset <= gm->numbytes); +++ av_assert0(size <= gm->numbytes); +++ av_assert0(offset + size <= gm->numbytes); + + +-+ av_assert0(rfe->n < CFE_ENT_COUNT); +++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +++} + + +-+ a->s[n].cmd = mode; +-+ a->s[n].handle = gm->vcsm_handle; +-+ a->s[n].addr = (unsigned int)gm->arm + offset; +-+ a->s[n].size = size; +-+ ++rfe->n; +-+ } +++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +++{ +++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); + +} + + +++ + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) + +{ + +#if !RPI_ONE_BUF +@@ -15186,21 +22874,27 @@ index 0000000..7c0eedd + + } + +} + + +-+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, +-+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) +++// Flush an area of a frame +++// Width, height, x0, y0 in luma pels +++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, +++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, +++ const unsigned int uv_shift, const int do_luma, const int do_chroma) + +{ +-+ const unsigned int y_offset = frame->linesize[0] * start_line; +-+ const unsigned int y_size = frame->linesize[0] * n; +++ const unsigned int y_offset = frame->linesize[0] * y0; +++ const unsigned int y_size = frame->linesize[0] * height; + + // Round UV up/down to get everything + + const unsigned int uv_rnd = (1U << uv_shift) >> 1; +-+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); +-+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; +++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); +++ const unsigned int uv_size = frame->linesize[1] * ((y0 + 
height + uv_rnd) >> uv_shift) - uv_offset; + + +++#if 0 +++ // *** frame->height is cropped height so not good + + // As all unsigned they will also reject -ve + + // Test individually as well as added to reject overflow +-+ av_assert0(start_line <= (unsigned int)frame->height); +++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + + av_assert0(n <= (unsigned int)frame->height); + + av_assert0(start_line + n <= (unsigned int)frame->height); +++#endif + + + + if (!gpu_is_buf1(frame)) + + { +@@ -15212,7 +22906,7 @@ index 0000000..7c0eedd + + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + + } + + } +-+ else if (!rpi_sliced_frame(frame)) +++ else if (!av_rpi_is_sand_frame(frame)) + + { + + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + + if (do_luma) { +@@ -15225,16 +22919,30 @@ index 0000000..7c0eedd + + } + + else + + { +-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); +-+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ if (do_luma) { +-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size); +-+ } +-+ if (do_chroma) { +-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, +-+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size); +-+ } +++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); +++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); +++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); +++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); +++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C +++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); +++ +++ if (do_chroma) +++ { +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; +++ b->invalidate_mode = mode; +++ b->block_count = block_count; +++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); +++ b->block_size = uv_size; +++ b->inter_block_stride = stride1 * stride2; +++ } +++ if (do_luma) +++ { +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; +++ b->invalidate_mode = mode; +++ b->block_count = block_count; +++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); +++ b->block_size = y_size; +++ b->inter_block_stride = stride1 * stride2; + + } + + } + +} +@@ -15275,13 +22983,11 @@ index 0000000..7c0eedd + + + + + +// If sem_init actually takes time then maybe we want a pool... 
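The flush helper above exploits the sand frame layout: pels live in vertical columns stride1 bytes wide, each column occupying stride1 * stride2 contiguous bytes, so a rectangle turns into block_count strided blocks with inter_block_stride = stride1 * stride2, and the patch notes the column count is the same for Y and C. A rough sketch of the offset arithmetic, assuming 8-bit samples (xshl == 0); sand_pos is a reconstruction of what the av_rpi_sand_frame_pos_y call would return, not code from the patch:

    #include <assert.h>
    #include <stdio.h>

    /* Sand layout (reconstructed): the image is split into vertical columns
     * stride1 bytes wide; each column is stored contiguously, one after
     * another, occupying stride1 * stride2 bytes. */
    static unsigned sand_pos(unsigned x, unsigned y,
                             unsigned stride1, unsigned stride2)
    {
        unsigned col = x / stride1;                 /* which column x falls in */
        unsigned off_in_col = x % stride1 + y * stride1;
        return col * stride1 * stride2 + off_in_col;
    }

    int main(void)
    {
        const unsigned stride1 = 128, stride2 = 16; /* illustrative values only */
        /* Flushing x0=250, width=20 touches columns 1..2, matching the
         * block_count arithmetic in rpi_cache_flush_add_frame_block: */
        unsigned x0 = 250, width = 20;
        unsigned xleft = x0 & ~(stride1 - 1);       /* 128: round to column edge */
        unsigned blocks = (x0 + width - xleft + stride1 - 1) / stride1;
        assert(blocks == 2);
        printf("start=%u blocks=%u inter_block_stride=%u\n",
               sand_pos(xleft, 0, stride1, stride2), blocks, stride1 * stride2);
        return 0;
    }

One flush block per touched column keeps the vcsm_clean_invalid2 op list short (hence the CACHE_EL_MAX assert) while still covering exactly the bytes the rectangle spans.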
+-+static vq_wait_t * vq_wait_new(const unsigned int cost) +++static vq_wait_t * vq_wait_new(void) + +{ + + gpu_env_t * const ge = gpu_lock_ref(); + + vq_wait_t * const wait = ge->wait_pool.head; + + ge->wait_pool.head = wait->next; +-+ ge->current_load += cost; +-+ wait->cost = cost; + + wait->next = NULL; + + + +#if RPI_TRACE_TIME_VPU_QPU_WAIT +@@ -15337,17 +23043,13 @@ index 0000000..7c0eedd + + + +static void vq_wait_post(vq_wait_t * const wait) + +{ +-+#if !RPI_TRACE_TIME_VPU_QPU_WAIT +-+ if (wait->cost != 0) +-+#endif +++#if RPI_TRACE_TIME_VPU_QPU_WAIT + + { + + gpu_env_t *const ge = gpu_lock(); +-+ ge->current_load -= wait->cost; +-+#if RPI_TRACE_TIME_VPU_QPU_WAIT + + tto_end(&ge->ttw.active, ns_time()); +-+#endif + + gpu_unlock(); + + } +++#endif + + + + sem_post(&wait->sem); + +} +@@ -15363,7 +23065,6 @@ index 0000000..7c0eedd + +{ + + unsigned int n; + + unsigned int mask; +-+ unsigned int cost; + + struct gpu_job_s j[VPU_QPU_JOB_MAX]; + +}; + + +@@ -15396,23 +23097,26 @@ index 0000000..7c0eedd + + vqj->mask |= VPU_QPU_MASK_VPU; + + + + j->command = EXECUTE_VPU; +-+ j->u.v.q[0] = vpu_code; +++ // The bottom two bits of the execute address contain no-flush flags +++ // b0 will flush the VPU I-cache if unset so we nearly always want that set +++ // as we never reload code +++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; + + j->u.v.q[1] = r0; + + j->u.v.q[2] = r1; + + j->u.v.q[3] = r2; + + j->u.v.q[4] = r3; + + j->u.v.q[5] = r4; + + j->u.v.q[6] = r5; +++ gpu->vpu_i_cache_flushed = 1; + + } + +} + + + +// flags are QPU_FLAGS_xxx +-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail) +++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) + +{ + + if (n != 0) { + + struct gpu_job_s *const j = new_job(vqj); + + vqj->mask |= VPU_QPU_MASK_QPU; +-+ vqj->cost += cost; + + + + j->command = EXECUTE_QPU; + + j->u.q.jobs = n; +@@ -15442,7 +23146,7 @@ index 0000000..7c0eedd + + } + + + + // We are going to want a sync object +-+ wait = vq_wait_new(vqj->cost); +++ wait = vq_wait_new(); + + + + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync + + // If we only posted one thing or only QPU jobs +@@ -15464,7 +23168,6 @@ index 0000000..7c0eedd + + j->callback.cookie = wait; + + } + + +-+ vqj->cost = 0; + + vqj->mask = 0; + + *wait_h = wait; + +} +@@ -15483,11 +23186,6 @@ index 0000000..7c0eedd + + return rv; + +} + + +-+unsigned int vpu_qpu_current_load(void) +-+{ +-+ return gpu_ptr()->current_load; +-+} +-+ + +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) + +{ + + if (wait_h != NULL) +@@ -15536,13 +23234,50 @@ index 0000000..7c0eedd + + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code); + +} + + +++ +++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) +++{ +++ // Dummy values we can catch with emulation +++ qf->y_pxx = ~1U; +++ qf->y_bxx = ~2U; +++ qf->y_p00 = ~3U; +++ qf->y_b00 = ~4U; +++ qf->c_pxx = ~5U; +++ qf->c_bxx = ~6U; +++ +++ switch (bit_depth) { +++ case 8: +++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); +++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); +++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); +++ qf->y_p00 = qpu_fn(mc_filter_y_p00); +++ qf->y_b00 = qpu_fn(mc_filter_y_b00); +++ qf->c_pxx = qpu_fn(mc_filter_c_p); +++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); +++ qf->c_bxx = qpu_fn(mc_filter_c_b); +++ break; +++ case 10: +++ qf->c_pxx = qpu_fn(mc_filter_c10_p); +++ 
qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); +++ qf->c_bxx = qpu_fn(mc_filter_c10_b); +++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); +++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); +++ qf->y_p00 = qpu_fn(mc_filter_y10_p00); +++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); +++ break; +++ default: +++ return -1; +++ } +++ return 0; +++} +++ + +#endif // RPI + diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h + new file mode 100644 +-index 0000000..a95f7d9 ++index 0000000000..485a08f8ba + --- /dev/null + +++ b/libavcodec/rpi_qpu.h +-@@ -0,0 +1,200 @@ ++@@ -0,0 +1,206 @@ + +#ifndef RPI_QPU_H + +#define RPI_QPU_H + + +@@ -15687,21 +23422,35 @@ index 0000000..a95f7d9 + +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + + const unsigned int offset, const unsigned int size); +++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, +++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); +-+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, +-+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); +++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, +++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, +++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + + + +// init, add, finish for one gm ptr + +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + + + + +// QPU specific functions +++ +++typedef struct HEVCRpiQpu { +++ uint32_t c_pxx; +++ uint32_t c_pxx_l1; +++ uint32_t c_bxx; +++ uint32_t y_pxx; +++ uint32_t y_bxx; +++ uint32_t y_p00; +++ uint32_t y_b00; +++} HEVCRpiQpu; +++ +++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); +++ + +uint32_t qpu_fn(const int * const mc_fn); + + +-+#define QPU_N_GRP_UV 4 +-+#define QPU_N_UV 8 +-+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU +-+#define QPU_N_Y 12 +++#define QPU_N_GRP 4 +++#define QPU_N_MAX 12 + + + +#define QPU_MAIL_EL_VALS 2 + + +@@ -15717,27 +23466,19 @@ index 0000000..a95f7d9 + +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); + +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); +-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); +++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); + +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); + +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); + +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + + +-+ +-+extern unsigned int vpu_get_fn(void); +++extern unsigned int vpu_get_fn(const unsigned int bit_depth); + +extern unsigned int 
vpu_get_constants(void); + + + +// Waits for previous post_codee to complete and Will null out *wait_h after use + +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); +-+unsigned int vpu_qpu_current_load(void); + +int vpu_qpu_init(void); + +void vpu_qpu_term(void); + + +-+// Simple test of shader code +-+extern int rpi_test_shader(void); +-+ +-+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); +-+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); +-+ + +extern int gpu_get_mailbox(void); + +void gpu_ref(void); + +void gpu_unref(void); +@@ -15745,10 +23486,10 @@ index 0000000..a95f7d9 + +#endif + diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c + new file mode 100644 +-index 0000000..0898ecd ++index 0000000000..2c6541a8fb + --- /dev/null + +++ b/libavcodec/rpi_shader.c +-@@ -0,0 +1,670 @@ ++@@ -0,0 +1,1570 @@ + +#include "rpi_shader.h" + + + +#ifdef _MSC_VER +@@ -15772,648 +23513,1548 @@ index 0000000..0898ecd + +__attribute__((aligned(8))) + +#endif + +unsigned int rpi_shader[] = { +-+// ::mc_setup_c +-+/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif +-+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif +-+/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif +-+/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 +-+/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +-+/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 +-+/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 +-+/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 +-+/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 +-+/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +-+/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +-+/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +-+/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +-+/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +-+/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 +-+/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 +-+/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 +-+/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 +-+/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b +-+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +-+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 +-+/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 +-+/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 +-+/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 +-+/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch +++// ::mc_setup_c_q0 +++// ::mc_start +++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_c_qn +++/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 +++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif +++/* 
[0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 +++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift +++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch +++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num +++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num +++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x +++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch + +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 + +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y +++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 + +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 +-+/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 +-+/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 +-+/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif +-+/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif +-+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +-+/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +-+/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 +-+/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +-+/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +-+/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +-+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +-+/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +-+/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +-+/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +-+/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +-+/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif +-+/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif +-+/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a +-+/* 
[0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b +-+/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num +-+/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif +-+/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif +-+/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif +-+/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 +-+/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 +-+/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 +-+/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch +-+/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +-+/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 +-+/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 +-+/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif +-+/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif +-+/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif +-+/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 +-+// ::mc_filter_uv +-+/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 +-+/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next +-+/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif +-+/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 +-+/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 +-+/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a +-+/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +-+/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif +-+/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif +-+/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif +-+/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a +-+/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b +-+/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b +-+/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c +-+/* [0x00000308] */ 0x950c0ff6, 
0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d +-+/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +-+/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 +-+// :uvloop +-+/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 +-+/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop +-+/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 +-+/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 +-+/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 +-+/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +-+/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 +-+/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 +-+/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 +-+/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 +-+/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop +-+/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 +-+/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +-+/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 +-+/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +-+// 
::mc_filter_uv_b0 +-+/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 +-+/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next +-+/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif +-+/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 +-+/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 +-+/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a +-+/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +-+/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif +-+/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 +-+/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a +-+/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +-+/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c +-+/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 +-+/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d +-+/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif +-+/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif +-+// :uvloop_b0 +-+/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 +-+/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* 
[0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 +-+/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 +-+/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 +-+/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 +-+/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 +-+/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 +-+/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 +-+/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 +-+/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 +-+/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 +-+/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin +-+/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif +-+/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif +-+/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 +-+/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 +-+/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 +-+/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 +-+/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 +-+/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 +-+/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 +-+/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 +-+/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 +-+/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin +-+/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 +-+/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 +-+/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 +-+// :uv_b0_post12 +-+/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 +-+/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 +-+/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 +-+/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 +-+/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 +-+/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 +-+// :uv_b0_post_fin +-+/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov 
rb_xshift2, rb_xshift2_next +-+/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif +-+/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 +-+/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a +-+/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif +-+/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a +-+/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 +-+/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b +-+/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c +-+/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d +-+/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 +-+/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +-+// :uvloop_b +-+/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 +-+/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next +-+/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 +-+/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next +-+/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b +-+/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 +-+/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 +-+/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 +-+/* 
[0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 +-+/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a +-+/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 +-+/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 +-+/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 +-+/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 +-+/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 +-+/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 +-+/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b +-+/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 +-+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 +-+/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 +-+/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +-+// ::mc_interrupt_exit8c +-+/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 +-+/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_exit +-+// ::mc_exit_c +-+/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 +-+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) +-+/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop +-+/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_interrupt_exit12 +-+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 +-+/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009c8] */ 
0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_exit1 +-+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +-+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_setup +-+/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif +-+/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif +-+/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif +-+/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif +-+/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif +-+/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +-+/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 +-+/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +-+/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +-+/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +-+/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch +-+/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num +-+/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 +-+/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 +-+/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +-+/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +-+/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +-+/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b +-+/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 +-+/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 +-+/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 +-+/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 +-+/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +-+/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +-+/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 +-+/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b +-+/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 +-+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 +-+/* 
[0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 +-+/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 +-+/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 +-+/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 +-+/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 +-+/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +-+/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +-+/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +-+/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +-+/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +-+/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +-+/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +-+/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +-+/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +-+/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +-+/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +-+/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +-+/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +-+/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +-+/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 +-+/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 +-+/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 +-+/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch +-+/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base +-+/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 +-+/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 +-+/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 +-+// :per_block_setup +-+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +-+/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num +-+/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next +-+/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next +-+/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 +-+/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 +-+/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +-+/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +-+/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 +-+/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b +-+/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif +-+/* [0x00000cf0] 
*/ 0x009e7000, 0x100009e7, // nop +-+/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 +-+/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +-+/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +-+/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 +-+/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b +-+/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif +-+/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width +-+/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height +-+/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 +-+/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 +-+/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width +-+/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 +-+/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif +-+/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif +-+/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 +-+/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a +-+/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +-+/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d +-+/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c +-+/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d +-+/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c +-+/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 +-+/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d +-+/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c +-+/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 +-+/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d +-+/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c +-+/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +-+/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d +-+/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c +-+/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 +-+/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d +-+/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c +-+/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +-+/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d +-+/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c +-+/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 +-+/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d +-+/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c +-+/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif +-+/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b +-+/* 
[0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c +-+/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 +-+/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif +-+/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 +-+/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 +-+/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d +-+// ::mc_filter +-+/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 +-+// :yloop +-+/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +-+/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +-+/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 +-+/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +-+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 +-+/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+/* 
[0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 +-+/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 +-+/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop +-+/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 +-+/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +-+/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 +-+/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a +-+/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b +-+/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait +-+/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 +-+/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 +-+/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop +-+/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 +-+/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 +-+/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 +-+/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 +-+/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 +-+/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 +-+/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup +-+/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +-+/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 +-+/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 +-+/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 +-+/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 +-+/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch +-+/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 +-+/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop +-+/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop +-+// ::mc_filter_b +-+// :yloopb +-+/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // 
mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +-+/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +-+/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 +-+/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +-+/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00001188] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 +-+/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 +-+/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 +-+/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb +-+/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 +-+/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +-+/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 +-+/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a +-+/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b +-+/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, 
r0 ; mul24 r0, rb11, ra2.8d +-+/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 +-+/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 +-+/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 +-+/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait +-+/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb +-+/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 +-+/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 +-+/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 +-+/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 +-+/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 +-+/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 +-+/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup +-+/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +-+/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 +-+/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 +-+/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 +-+/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 +-+/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch +-+/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 +-+/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb +-+/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop +++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif +++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif +++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x00000160] */ 
0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a +++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 +++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 +++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y +++// :1 +++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +++// ::mc_filter_c_p +++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 +++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif +++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, 
v_dma_h_shift ; mov rb8, ra3.8a +++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y +++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d +++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif +++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 +++// :1 +++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 +++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 +++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 +++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 +++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 +++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 +++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000003f8] */ 
0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_c_p_l1
+++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
+++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+++// :1
+++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8
+++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b
+++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_c_b
+++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+++/* [0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+++/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+++// :1
+++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1
+++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_sync_q0
+++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q1
+++/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q2
+++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q3
+++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_sync_q4
+++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q5
+++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q6
+++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q7
+++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_sync_q8
+++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q9
+++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q10
+++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_sync_q11
+++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
+++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_exit_c_qn
+++// ::mc_exit_y_qn
+++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+++// :1
+++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop
+++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_exit_c_q0
+++// ::mc_exit_y_q0
+++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1
+++// :1
+++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
+++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
+++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
+++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
+++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
+++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
+++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop
+++// ::mc_setup_y_q0
+++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_setup_y_qn
+++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
+++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif
+++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif
+++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask
+++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif
+++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1
+++/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
+++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch
+++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
+++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
+++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
+++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2
+++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0
+++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
+++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
+++// :1
+++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth
+++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
+++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
+++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
+++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
+++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
+++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
+++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
+++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
+++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
+++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
+++// :per_block_setup_8
+++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
+++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
+++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
+++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
+++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
+++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
+++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
+++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
+++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
+++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
+++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
+++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
+++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height
+++/* [0x00000f08] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7
+++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift
+++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift
+++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif
+++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
+++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255
+++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
+++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
+++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
+++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
+++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
+++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
+++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
+++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
+++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
+++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
+++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
+++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
+++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
+++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
+++/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3
+++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
+++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3
+++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
+++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3
+++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif
+++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
+++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d
+++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3
+++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15
+++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif
+++// ::mc_filter_y_pxx
+++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1
+++// :1
+++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00001068] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x00001190] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height
+++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
+++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_y_bxx
+++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+++/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+++// :1
+++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8
+++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9
+++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
+++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
+++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
+++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
+++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
+++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
+++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
+++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
+++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
+++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
+++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
+++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
+++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10
+++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a
+++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b
+++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
+++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7
+++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
+++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
+++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
+++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
+++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
+++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
+++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
+++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0
+++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0
+++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
+++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3
+++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00001408] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b
+++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_y_p00
+++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next
+++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
+++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
+++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
+++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif
+++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
+++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif
+++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
+++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
+++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
+++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
+++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
+++/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif
+++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif
+++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base
+++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3
+++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif
+++// :1
+++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
+++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
+++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
+++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
+++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height
+++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b
+++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_y_b00
+++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
+++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
+++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2
+++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
+++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7
+++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0
+++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0
+++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0
+++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0
+++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
+++// :1
+++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
+++/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
+++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
+++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
+++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
+++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
+++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
+++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
+++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask
+++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
+++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
+++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1
+++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height
+++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15
+++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b
+++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_setup_c10_q0
+++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
+++// ::mc_setup_c10_qn
+++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1
+++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
+++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30
+++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif
+++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
+++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift
+++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
+++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100
+++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask
+++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
+++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif
+++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
+++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
+++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch
+++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
+++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
+++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
+++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
+++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
+++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
+++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
+++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
+++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
+++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif
+++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
+++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
+++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
+++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
+++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
+++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
+++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
+++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
+++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
+++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
+++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif
+++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif
+++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
+++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
+++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
+++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x
+++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
+++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1
+++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
+++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0
+++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y
+++// :1
+++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
+++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
+++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
+++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
+++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
+++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
+++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
+++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
+++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2
+++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
+++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
+++/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
+++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
+++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
+++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
+++// ::mc_filter_c10_p
+++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+++/* [0x00001970] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+++// :1
+++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
+++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
+++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x00001ab0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_c10_p_l1
+++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
+++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
+++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
+++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
+++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
+++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
+++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
+++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
+++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif
+++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a
+++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b
+++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
+++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
+++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y
+++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d
+++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif
+++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1
+++// :1
+++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1
+++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
+++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
+++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
+++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
+++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
+++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2
+++/* [0x00001be8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch
+++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask
+++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1
+++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6
+++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10
+++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8
+++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8
+++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9
+++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
+++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
+++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0
+++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
+++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
+++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3
+++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
+++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
+++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest
+++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
+++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
+++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b
+++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1
+++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2
+++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init
+++// ::mc_filter_c10_b
+++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
+++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
+++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
+++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
+++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
+++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
+++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
+++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
+++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
+++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
+++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
+++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif
+++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height
+++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif
+++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
+++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
+++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
+++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif
+++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift
+++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
+++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
+++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a
+++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b
+++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
+++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c
+++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
+++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif
+++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0
+++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d
+++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15
+++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif
+++// :1
+++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0
+++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next
+++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
+++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
+++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
+++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
+++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5
+++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0
+++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1
+++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1
+++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8)
+++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6
+++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
+++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7
+++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
+++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
+++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
+++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask
+++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0
+++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0
+++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0
+++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
+++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
+++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1
+++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b
+++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10
+++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a
+++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3
+++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
+++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d
+++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8
+++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9
+++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11
+++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256
+++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256
+++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0
+++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1
+++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height
+++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256
+++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
+++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3
+++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
+++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
+++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0
+++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b +++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_sync10_q0 +++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q1 +++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q2 +++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q3 +++/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync10_q4 +++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q5 +++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +++// 
::mc_sync10_q6 +++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q7 +++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync10_q8 +++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) +++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q9 +++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q10 +++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q11 +++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c10_q0 +++// ::mc_exit_y10_q0 +++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 +++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c10_qn +++// ::mc_exit_y10_qn +++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // 
brr.anynz -, r:1b +++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop +++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_setup_y10_q0 +++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_y10_qn +++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif +++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif +++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif +++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif +++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 +++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift +++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch +++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num +++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 +++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a +++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, 
ra1.16a +++// :1 +++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth +++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 +++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +++// :per_block_setup_10 +++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif +++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif +++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 +++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a +++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif +++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif +++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init +++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add 
r0, r0, r1 ; mul24 r1, ra_width, v_x_mul +++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 +++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 +++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift +++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift +++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif +++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif +++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 +++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d +++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c +++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 +++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d +++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c +++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 +++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d +++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c +++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 +++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d +++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c +++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 +++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 +++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 +++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 +++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif +++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 +++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 +++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 +++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif +++// ::mc_filter_y10_pxx +++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 +++// :1 +++/* [0x00002720] */ 0xcd511bee, 
0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, 
rb4 +++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height +++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_p00 +++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next +++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif +++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif +++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init +++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift +++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif +++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif +++/* [0x00002998] */ 0x0c9db1c0, 
0x100216a7, // add rb_dma0, r0, rb_dma0_base +++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 +++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif +++// :1 +++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b +++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_bxx +++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++// :1 +++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, 
ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off +++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 +++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 +++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height +++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // 
shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b +++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_b00 +++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 +++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 +++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 +++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 +++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 +++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +++// :1 +++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 +++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002d50] */ 
0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b +++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init + +// ::mc_end + +}; + +#ifdef __HIGHC__ +@@ -16421,35 +25062,79 @@ index 0000000..0898ecd + +#endif + diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h + new file mode 100644 +-index 0000000..d17b9fd ++index 0000000000..82bf380eb4 + --- /dev/null + +++ b/libavcodec/rpi_shader.h +-@@ -0,0 +1,19 @@ ++@@ -0,0 +1,63 @@ + +#ifndef rpi_shader_H + +#define rpi_shader_H + + + +extern unsigned int rpi_shader[]; + + +-+#define mc_setup_c (rpi_shader + 0) +-+#define mc_filter_uv (rpi_shader + 152) +-+#define mc_filter_uv_b0 (rpi_shader + 280) +-+#define mc_interrupt_exit8c (rpi_shader + 554) +-+#define mc_exit (rpi_shader + 582) +-+#define mc_exit_c (rpi_shader + 582) +-+#define mc_interrupt_exit12 (rpi_shader + 598) +-+#define mc_exit1 (rpi_shader + 634) +-+#define mc_setup (rpi_shader + 650) +-+#define mc_filter (rpi_shader + 942) +-+#define mc_filter_b (rpi_shader + 1094) +-+#define mc_end (rpi_shader + 1246) +++#define mc_setup_c_q0 (rpi_shader + 0) +++#define mc_start (rpi_shader + 0) +++#define mc_setup_c_qn (rpi_shader + 2) +++#define mc_filter_c_p (rpi_shader + 142) +++#define mc_filter_c_p_l1 (rpi_shader + 272) +++#define mc_filter_c_b (rpi_shader + 402) +++#define mc_sync_q0 (rpi_shader + 590) +++#define mc_sync_q1 (rpi_shader + 608) +++#define mc_sync_q2 (rpi_shader + 620) +++#define mc_sync_q3 (rpi_shader + 632) +++#define mc_sync_q4 (rpi_shader + 644) +++#define mc_sync_q5 (rpi_shader + 662) +++#define mc_sync_q6 (rpi_shader + 674) +++#define mc_sync_q7 (rpi_shader + 686) +++#define mc_sync_q8 (rpi_shader + 698) +++#define mc_sync_q9 (rpi_shader + 716) +++#define mc_sync_q10 (rpi_shader + 728) +++#define mc_sync_q11 (rpi_shader + 740) +++#define mc_exit_c_qn (rpi_shader + 752) +++#define mc_exit_y_qn (rpi_shader + 752) +++#define mc_exit_c_q0 (rpi_shader + 770) +++#define mc_exit_y_q0 (rpi_shader + 770) +++#define mc_setup_y_q0 (rpi_shader + 790) +++#define mc_setup_y_qn (rpi_shader + 792) +++#define mc_filter_y_pxx (rpi_shader + 1032) +++#define mc_filter_y_bxx (rpi_shader + 1162) +++#define mc_filter_y_p00 (rpi_shader + 1292) +++#define mc_filter_y_b00 (rpi_shader + 1382) +++#define mc_setup_c10_q0 (rpi_shader + 1462) +++#define mc_setup_c10_qn (rpi_shader + 1464) +++#define mc_filter_c10_p (rpi_shader + 1600) +++#define mc_filter_c10_p_l1 (rpi_shader + 1728) +++#define mc_filter_c10_b (rpi_shader + 1856) +++#define mc_sync10_q0 (rpi_shader + 2042) +++#define mc_sync10_q1 (rpi_shader + 2060) +++#define mc_sync10_q2 (rpi_shader + 2072) +++#define mc_sync10_q3 (rpi_shader + 2084) 
+++#define mc_sync10_q4 (rpi_shader + 2096) +++#define mc_sync10_q5 (rpi_shader + 2114) +++#define mc_sync10_q6 (rpi_shader + 2126) +++#define mc_sync10_q7 (rpi_shader + 2138) +++#define mc_sync10_q8 (rpi_shader + 2150) +++#define mc_sync10_q9 (rpi_shader + 2168) +++#define mc_sync10_q10 (rpi_shader + 2180) +++#define mc_sync10_q11 (rpi_shader + 2192) +++#define mc_exit_c10_q0 (rpi_shader + 2204) +++#define mc_exit_y10_q0 (rpi_shader + 2204) +++#define mc_exit_c10_qn (rpi_shader + 2224) +++#define mc_exit_y10_qn (rpi_shader + 2224) +++#define mc_setup_y10_q0 (rpi_shader + 2242) +++#define mc_setup_y10_qn (rpi_shader + 2244) +++#define mc_filter_y10_pxx (rpi_shader + 2494) +++#define mc_filter_y10_p00 (rpi_shader + 2624) +++#define mc_filter_y10_bxx (rpi_shader + 2716) +++#define mc_filter_y10_b00 (rpi_shader + 2846) +++#define mc_end (rpi_shader + 2926) + + + +#endif + diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm + new file mode 100644 +-index 0000000..aa3fe47 ++index 0000000000..ba6cc13a95 + --- /dev/null + +++ b/libavcodec/rpi_shader.qasm +-@@ -0,0 +1,1259 @@ ++@@ -0,0 +1,1741 @@ + + + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress + +# the warning that we are using rotation & ra/rb registers. r0..3 can be +@@ -16457,102 +25142,197 @@ index 0000000..aa3fe47 + +# local 4. As it happens this is what is wanted here as we do not want the + +# constants from the other half of the calc. + + +-+# register allocation +++# PREREAD is the number of requests that we have sitting in the TMU request +++# queue. + +# +-+# ra0...ra7 eight horizontal filter coefficients +++# There are 8 slots available in the TMU request Q for tm0s requests, but +++# only 4 output FIFO entries and overflow is bad (corruption or crash) +++# (If threaded then only 2 out FIFO entries, but we aren't.) +++# In s/w we are effectively limited to the min vertical read which is >= 4 +++# so output FIFO is the limit. + +# +-+# rb0 rx_shift2 +-+# rb1 rb_y2_next +-+# +-+# rb4...rb7 +-+# +-+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent) +-+# +-+# (ra15 isn't clamped to zero - this happens during the +-+# copy to ra14, and during its use in the vertical filter) +-+# +-+# rb8...rb11 eight vertical filter coefficients +++# However in the current world there seems to be no benefit (and a small +++# overhead) in setting this bigger than 2.
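The PREREAD comment above describes a classic software-pipelined prefetch: a fixed number of TMU requests is kept in flight, and each loop iteration pops one result and issues one new request, so the output FIFO depth never exceeds the preload count. A minimal C model of that flow, assuming hypothetical issue_read()/await_read() helpers in place of the shader's t0s write and ldtmu0 (they are not functions in this patch):

    #include <stdio.h>

    #define PREREAD 4                     /* matches the .set just below */

    /* Illustrative stand-ins for "write address to t0s" (queue a texture
     * fetch) and "ldtmu0" (pop the oldest result from the TMU output FIFO).
     * The FIFO never holds more than PREREAD outstanding requests. */
    static int fifo[PREREAD];
    static unsigned head, tail;
    static void issue_read(int addr) { fifo[tail++ % PREREAD] = addr * 2; /* fake fetch */ }
    static int await_read(void) { return fifo[head++ % PREREAD]; }

    int main(void)
    {
        enum { N_ROWS = 16 };
        int i;

        for (i = 0; i < PREREAD; i++)     /* setup code primes the queue */
            issue_read(i);
        for (i = 0; i < N_ROWS; i++) {    /* steady state: pop one, push one */
            int pel = await_read();
            if (i + PREREAD < N_ROWS)     /* rb_i_tmu plays this guard role above */
                issue_read(i + PREREAD);
            printf("row %2d -> %d\n", i, pel);
        }
        return 0;
    }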
+ + +-+# ra4 y: Fiter, UV: part -of b0 -> b stash +++.set PREREAD, 4 + + +-+# rb12 offset to add before shift (round + weighting offsets) +-+# rb13 shift: denom + 6 + 9 +-+# rb14 L0 weight (U on left, V on right) +-+# rb15 -- free -- +-+# +-+# ra16 width:height +-+# ra17 ra_y:ra_xshift +-+# ra18 L1 weight (Y) +-+# ra19 ra_y_next:ra_xshift_next +-+# +-+# rb16 pitch +-+# rb17 height + 1 +-+# rb18 max(height,16) + 3 +-+# rb19 frame_base2_next +-+# +-+# ra20 1 +-+# ra21 ra_y2_next:ra_y2 (luma); free (chroma) +-+# ra22 ra_k256 256 +-+# ra23 0 +-+# +-+# rb20 -- free -- +-+# rb21 -- free -- +-+# rb22 rb_k255 255 +-+# rb23 dest (Y) +-+# +-+# rb24 vdw_setup_1(dst_pitch) +-+# rb25 frame width-1 +-+# rb26 height<<23 + width<<16 + vdw_setup_0 +-+# rb27 vdw_setup_0 (depends on QPU number) +-+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM +-+# rb29 vdw_setup_1(dst_pitch-width) +-+# rb30 frame height-1 +-+# rb31 used as temp to count loop iterations +-+# +-+# ra24 src frame base +-+# ra25 src frame base 2 +-+# ra26 next ra24 +-+# ra27 next ra25 +-+# ra28 -- free -- +-+# ra29 -- free -- +++# Block heights - 8 & 16 are the only numbers we currently support +++ +++.set C_BLK_HEIGHT_8, 16 +++.set C_BLK_HEIGHT_16, 8 +++.set Y_BLK_HEIGHT_8, 16 +++.set Y_BLK_HEIGHT_16, 8 +++ +++# QPU counts - depend on block size +++# If we have a 2-byte format & block_size > 8 then can only afford +++# 8 QPUs +++# These numbers must match the numbers in rpi_shader_cmd.h +++ +++.set N_QPU_8, 12 +++.set N_QPU_16, 12 +++ +++# register allocation + +# +-+# Use an even numbered register as a link register to avoid corrupting flags +-+# ra30 next kernel address +-+# ra31 chroma-B height+3; free otherwise + + +-+.set rb_max_x, rb25 +-+.set rb_max_y, rb30 +-+.set rb_pitch, rb16 +++# ra0-3 +++# Used as temp and may be loop filter coeffs (split into .8s) +++# or temp in loop. Check usage on an individual basis. +++ +++# ra4-7 +++# C: L0 H filter out FIFO +++# otherwise -- free -- +++ +++# ra8-11 +++# temp in some places - check usage +++# Y: (with rb8-11) horiz out FIFO +++ +++# ra12-15 +++# -- free -- +++ +++# uniform: width:height + +.set ra_width_height, ra16 + +.set ra_width, ra16.16b + +.set ra_height, ra16.16a +-+.set ra_y2, ra21.16a +-+.set ra_y2_next, ra21.16b + + +-+.set rb_base2_next, rb19 +++# y:y2 same layout as y_y2_next so we can update both together +++.set ra_y_y2, ra17 +++.set ra_y2, ra17.16a +++.set ra_y, ra17.16b + + +-+.set rb_dest, rb23 +++# uniform: L1 weight (U on left, V on right) +++# Only used in Y B +++.set ra_wt_off_mul_l1, ra18 +++.set ra_wt_off_l1, ra18.16b +++.set ra_wt_mul_l1, ra18.16a +++ +++# y_next:y2_next same layout as y_y2 so we can update both together +++.set ra_y_y2_next, ra19 +++.set ra_y_next, ra19.16b +++.set ra_y2_next, ra19.16a +++ +++# Setup: consts - subdivide a single register +++.set ra_kff100100, ra20 +++.set ra_k256, ra20.16a +++.set ra_k0, ra20.8a +++.set ra_k1, ra20.8b +++.set ra_k16, ra20.8c +++.set ra_k255, ra20.8d +++ +++# Loop: xshifts +++.set ra_xshift, ra21.16a +++.set ra_xshift_next, ra21.16b +++ +++# Loop var: L0 weight (U on left, V on right) +++# _off_ is not used in loop as we want to modify it before use +++.set ra_wt_off_mul_l0, ra22 +++.set ra_wt_mul_l0, ra22.16a +++.set ra_wt_off_l0, ra22.16b +++ +++# Max pel value (for 8 bit we can get away with sat ops but not 9+) +++# * Could merge with rb_pmask. 
For 10 bit, logically pmask needs 0xff in the +++# 2nd byte but as the source should never be > 3 there, 0x3ff should do +++.set ra_blk_height_pmax, ra23 +++.set ra_pmax, ra23.16a +++.set ra_blk_height, ra23.8c +++# -- free -- ra23.8d +++ +++# Loop: src frame base (L0) + +.set ra_base, ra24 + + +-+.set ra_base_next, ra26 +-+.set ra_xshift, ra17.16a + + +++# Loop: src frame base (L1) + +.set ra_base2, ra25 + + +-+# Note ra_xy & ra_xy_next should have same structure! +-+.set ra_xshift_next, ra19.16a +++# Loop: next src frame base (L0) +++.set ra_base_next, ra26 +++ +++# -- free -- ra27 +++# -- free -- ra28 +++# -- free -- ra29 +++ +++# Use an even numbered register as a link register to avoid corrupting flags +++.set ra_link, ra30 +++ +++# -- free -- ra31 +++ + +.set rb_xshift2, rb0 + +.set rb_xshift2_next, rb1 + + +-+.set ra_y_next, ra19.16b +-+.set ra_y, ra17.16b +++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 +++.set rb_elem_x, rb2 +++ +++# El Flags +++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n +++.set rb_ef, rb3 +++ +++# rb4-7 +++# C-B: L1 H filter out FIFO +++# Y: (with ra2.8x) Y vertical filter coeffs + + +-+.set ra_k1, ra20 +++# rb8-11 +++# C: Vertical filter coeffs +++# Y: (with ra8-11) horiz out FIFO +++ +++# Loop var: offset to add before shift (round + weighting offsets) +++# Exact value varies by loop +++.set rb_wt_off, rb12 +++ +++# Setup: denom + 6 + 9 +++.set rb_wt_den_p15, rb13 +++ +++# -- free -- rb14 +++# -- free -- rb15 +++ +++# Line pitch (128 for sand128) +++.set rb_pitch, rb16 +++ +++# Loop count - 2 (set up TMU for next xfer) +++.set rb_i_tmu, rb17 +++ +++# Loop count for min(height, 16) +++# Y will reset & loop again if height > 16 +++.set rb_lcount, rb18 +++ +++# frame_base2_next +++.set rb_base2_next, rb19 +++ +++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give +++# offset to the slice + +.set rb_xpitch, rb20 +-+.set rb_k255, rb22 +-+.set ra_k256, ra22 +-+.set ra_k0, ra23 + + +-+.set ra_link, ra30 +++# -- free -- rb21 +++ +++# Setup: 0xff (8-bit) / 0xffff (9+ bit) +++.set rb_pmask, rb22 +++ +++# Loop: destination address +++.set rb_dest, rb23 +++ +++# vdw_setup_1(dst_pitch) +++.set rb_dma1_base, rb24 +++ +++# Setup: pic width - 1 +++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. +++.set rb_max_x, rb25 +++ +++# Loop: height<<23 + width<<16 + vdw_setup_0 +++.set rb_dma0, rb26 +++ +++# vdw_setup_0 (depends on QPU number) +++.set rb_dma0_base, rb27 +++ +++# Setup: vw_setup value to reset VPM write pointer +++.set rb_vpm_init, rb28 +++ +++# Loop: vdw_setup_1(dst_pitch-width) = stride +++.set rb_dma1, rb29 +++ +++# Setup: pic_height - 1 +++.set rb_max_y, rb30 +++ +++# -- free -- rb31 +++ +++ +++ + + + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
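The negative shift constants set up next rely on exactly that wrap-around: the QPU shifter only inspects the bottom 5 bits of the count, so shifting by -16 is shifting by 16. A quick C check of the rule the comment states; qpu_shl() is an illustrative name of ours, and the -9/-2 cases assume i_shift23/i_shift30 are defined by the same convention:

    #include <assert.h>
    #include <stdint.h>

    /* QPU-style shift: only the bottom 5 bits of the count are used,
     * so a negative count wraps (e.g. -16 & 31 == 16). */
    static uint32_t qpu_shl(uint32_t v, int count)
    {
        return v << (count & 31);
    }

    int main(void)
    {
        assert(qpu_shl(0x1234, -16) == (uint32_t)0x1234 << 16); /* i_shift16 */
        assert(qpu_shl(0x1234, -9) == (uint32_t)0x1234 << 23);  /* i_shift23 */
        assert(qpu_shl(0x1234, -2) == (uint32_t)0x1234 << 30);  /* i_shift30 */
        return 0;
    }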
+ +.set i_shift16, -16 +@@ -16564,8 +25344,10 @@ index 0000000..aa3fe47 + +# Macros that express this - obviously these can't be overlapped + +# so are probably unsuitable for loop code + + +-+.macro m_calc_dma_regs, r_vpm, r_dma +++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + + mov r2, qpu_num +++.if v_bit_depth <= 8 +++ # 8 bit version + + asr r1, r2, 2 + + shl r1, r1, 6 + + and r0, r2, 3 +@@ -16576,811 +25358,983 @@ index 0000000..aa3fe47 + + + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + + shl r0, r0, 5 +-+ add r_dma, r0, r1 # DMA out +-+.endm + + +-+# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 +-+.macro m_calc_dma_regs_c, r_vpm, r_dma +-+ mov r2, qpu_num +++.else +++ # 16 bit version +++ # Limited to 8 QPUs if blk height > 8 + + asr r1, r2, 1 +++.if v_blk_height <= 8 +++ shl r1, r1, 4 +++.else + + shl r1, r1, 5 +++.endif + + and r0, r2, 1 + + or r0, r0, r1 + + +-+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit +-+ add r_vpm, r0, r1 # VPM 8bit storage +++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR +++ add r_vpm, r0, r1 + + + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) +-+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later +++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + + shl r0, r0, 6 +++.endif + + add r_dma, r0, r1 # DMA out + +.endm + + + + +++.macro m_setup_q0 +++ srel -, 12 +++.endm +++ +++# Code start label +++::mc_start +++ + +################################################################################ + +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) +-+::mc_setup_c +-+ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) +++ +++.macro m_setup_c, v_bit_depth +++ +++# Cannot use mul24 on x as x might be -ve, so must use shift +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 +++.set v_pmask, 0xff +++.set v_blk_height, C_BLK_HEIGHT_8 +++.else +++.set v_x_shift, 2 +++.set v_pmask, 0xffff +++.set v_blk_height, C_BLK_HEIGHT_16 +++.endif +++ +++ mov tmurs, 1 # No swap TMUs + + + +# Load first request location +-+ mov ra0, unif # next_x_y +++ mov ra0, unif # next_x_y +++ +++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++ shl rb_ef, r0, i_shift30 + + + + mov ra_base, unif # Store frame c base + + + +# Read image dimensions +-+ sub rb_max_x, unif, 1 # pic c width +-+ sub rb_max_y, unif, 1 # pic c height +++ sub r0, unif, 1 # pic c width +++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes +++ sub rb_max_y, unif, 1 # pic c height + + + +# load constants +-+ mov ra_k1, 1 +-+ mov ra_k256, 256 +-+ mov rb_k255, 255 +-+ mov ra_k0, 0 +-+ +-+# touch registers to keep simulator happy +++ mov ra_kff100100, 0xff100100 +++ mov rb_pmask, v_pmask +++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + + +-+ # ra/b4..7: B0 -> B stash registers +-+ mov ra4, 0 ; mov rb4, 0 +-+ mov ra5, 0 ; mov rb5, 0 +-+ mov ra6, 0 ; mov rb6, 0 +-+ mov ra7, 0 ; mov rb7, 0 +-+ +-+ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base +-+ +-+# ; ra12..15: vertical scroll registers + +# get source pitch +-+ mov rb_xpitch, unif ; mov ra12, 0 # stride2 +-+ mov rb_pitch, unif ; mov ra13, 0 # stride1 +-+ mov r0, elem_num ; mov ra14, 0 +-+# get destination vdw setup +-+ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # 
vdw_setup_1 +++ mov rb_xpitch, unif # stride2 +++ mov rb_pitch, unif # stride1 +++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly +++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 +++ +++ and r0, 1, elem_num +++ nop ; mul24 r0, r0, 5 +++.if v_bit_depth <= 8 +++ add rb_elem_x, r0, elem_num +++.else +++ add r0, r0, elem_num +++ add rb_elem_x, r0, r0 +++.endif + + + +# Compute base address for first and second access + +# ra_base ends up with t0s base + +# ra_base2 ends up with t1s base + + +-+ add r0, r0, ra0.16b # Add elem no to x to get X for this slice +++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] +++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + + min r0, r0, rb_max_x + + + +# Get shift +-+ and r1, r0, 1 +-+ shl ra_xshift_next, r1, 4 +-+ +-+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs +-+ +-+ and r0, r0, -2 +-+ add r0, r0, r0 ; v8subs r1, r1, r1 +-+ sub r1, r1, rb_pitch +++# Shift will always calculate as 0 for 9+ bit +++# Ideally we can optimize the shift out of the code in these cases but for now +++# it is tidier to leave it in +++.if v_bit_depth <= 8 +++ shl ra_xshift_next, r0, 3 +++.else +++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +++.endif +++ +++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to +++ +++.if v_bit_depth <= 8 +++ and r0, r0, -4 +++.endif +++ sub r1, ra_k0, rb_pitch + + and r1, r0, r1 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra_y +++ add r0, r0, r1 + + add ra_base, ra_base, r0 + + +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+# submit texture requests for first line +-+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t0s, ra_base, r0 +-+ +-+# submit texture requests for 2nd line +-+ +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t0s, ra_base, r0 +-+ +-+ add rb13, 9, unif # denominator +-+ mov -, unif # Unused +++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + + + +# Compute part of VPM to use for DMA output +-+ m_calc_dma_regs_c rb28, rb27 +++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
+++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + +-+# ----------------- + +# And again for L1, but only worrying about frame2 stuff + + +-+ mov ra_link, unif # Next fn +-+ + +# Load first request location +-+ mov ra0, unif # next_x_y +++ mov ra0, unif # next_x_y + + +-+ mov ra_base2, unif # Store frame c base +++ mov ra_base2, unif # [ra0 delay] Store frame c base + + + +# Compute base address for first and second access + +# ra_base ends up with t0s base + +# ra_base2 ends up with t1s base + + +-+ mov ra_y2, ra0.16a # Store y +-+ mov r0, ra0.16b # Load x +-+ add r0, r0, elem_num # Add QPU slice +-+ max r0, r0, 0 ; mov -, unif # Unused 0 +-+ min r0, r0, rb_max_x ; mov -, unif # Unused 1 +++ shl r0, ra0.16b, v_x_shift +++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset +++ max r0, r0, 0 +++ min r0, r0, rb_max_x + + +-+# Get shift +-+ and r1, r0, 1 ; mov -, unif # Unused 2 +-+ shl rb_xshift2_next, r1, 4 +++# Get shift (already zero if 9+ bit so ignore) +++.if v_bit_depth <= 8 +++ shl rb_xshift2_next, r0, 3 +++.endif + + + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + + +-+ and r0, r0, -2 +-+ add r0, r0, r0 ; v8subs r1, r1, r1 +-+ sub r1, r1, rb_pitch +++.if v_bit_depth <= 8 +++ and r0, r0, -4 +++.endif +++ sub r1, ra_k0, rb_pitch + + and r1, r0, r1 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra_y2 +++ add r0, r0, r1 ; mov r2, ra_y2 + + add ra_base2, ra_base2, r0 + + +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+# submit texture requests for first line +-+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 +++# Do preloads +++# r0 = ra_y, r2 = ra_y2 +++ mov r3, PREREAD ; mov r0, ra_y + + +-+# submit texture requests for 2nd line +++:1 +++ sub.setf r3, r3, 1 +++ max r1, r0, 0 +++ min r1, r1, rb_max_y +++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t0s, ra_base, r1 ; mov ra_y, r0 + + +-+ max r0, r1, 0 ; mov -, unif # Unused 4 +++ max r1, r2, 0 +++ brr.anynz -, r:1b +++ min r1, r1, rb_max_y +++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t1s, ra_base2, r1 ; mov ra_y2, r2 +++# >>> .anynz 1b + + +++ mov ra_link, unif # link +++# touch registers to keep simulator happy +++ # ra/b4..7: B0 -> B stash registers +++ mov ra4, 0 ; mov rb4, 0 + + bra -, ra_link +-+ +-+ min r0, r0, rb_max_y ; mov -, unif # Unused 5 +-+ add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t1s, ra_base2, r0 +-+ +++ mov ra5, 0 ; mov rb5, 0 +++ mov ra6, 0 ; mov rb6, 0 +++ mov ra7, 0 ; mov rb7, 0 + +# >>> ra_link +-+ +-+ +-+.macro setf_nz_if_v +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +.endm + + +++::mc_setup_c_q0 +++ m_setup_q0 +++::mc_setup_c_qn +++ m_setup_c 8 + + + +################################################################################ + + +-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) +++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + + + +# At this point we have already issued two pairs of texture requests for the current block + +# ra_x, ra_x16_base point to the current coordinates for this block +-+::mc_filter_uv +-+ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y + + +-+# per-channel shifts were calculated on the *previous* invocation +++.macro m_filter_c_p, v_tmu, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 
+++.set v_x_mul, 2 +++.set v_v_shift, 8 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 2 +++.set v_x_mul, 4 +++.set v_v_shift, i_shift16 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++.if v_tmu == 0 +++.set vrx_xshift, rb_xshift2 # b side more convenient +++.set vrx_xshift_next, ra_xshift_next +++.set vra_y_next, ra_y_next +++.set vrx_base_next, ra_base_next +++.set vra_y, ra_y +++.set vra_base, ra_base +++.set vr_txs, t0s +++.else +++.set vrx_xshift, ra_xshift # a side more convenient +++.set vrx_xshift_next, rb_xshift2_next +++.set vra_y_next, ra_y2_next +++.set vrx_base_next, rb_base2_next +++.set vra_y, ra_y2 +++.set vra_base, ra_base2 +++.set vr_txs, t1s +++.endif + + +++# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height +++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 +++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height +++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs +++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + + +-+ shl ra_xshift_next, r0, 4 +-+ +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) +++.if v_bit_depth <= 8 +++ shl vrx_xshift_next, r0, 3 +++ and r0, r0, -4 +++.endif +++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calculated!
+ + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height +-+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs +++ add vrx_base_next, r3, r0 ; mov r1, ra_height + + + +# set up VPM write +-+ +-+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs +-+ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight +-+ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight +++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight +++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + + + +# ; unpack filter coefficients + + +-+ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area +-+ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight +++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a +++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) +++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + + +-+ shl r1, r1, rb13 ; mov rb10, ra3.8c +-+ mov r3, 0 ; mov rb11, ra3.8d # Loop count +++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + + +-+ asr rb12, r1, 1 +-+ shl rb14, ra1.16a, 1 # b14 = weight*2 +++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + + +-+# rb14 - weight L0 * 2 +-+# rb13 = weight denom + 6 + 9 +-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) +++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link +++ sub ra3, rb_wt_den_p15, ra_k1 + + +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +++# r5 = 0 (loop counter) +++# ra9 = alias for rb_max_y +++# ra_wt_mul_l0 = weight L0 +++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] +++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) +++ +++# We want (r0r1) +++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... +++# We fetch (after shift) +++# C0 : C3 : C1 : C4 : C2 : C5 : ...
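The register notes above fold the whole weighting step into two constants: a multiplier (ra_wt_mul_l0) and a combined rounding-plus-offset term, rb_wt_off = (offset * 2 + 1) << (ra3 - 1). A minimal C sketch of why that fold is exact for the usual uni-directional weighting form follows; x, w, o and s are illustrative stand-ins for the filtered sample, weight, offset and right-shift, not names from the shader:

#include <assert.h>

/* Illustrative sketch only - not part of the patch.  Folding the offset into
 * the rounding constant, as rb_wt_off does above, is exact because
 *   (2*o + 1) << (s - 1) == (o << s) + (1 << (s - 1)),
 * so the offset rides through the final shift untouched. */
static int wp_folded(int x, int w, int o, int s)
{
    return (x * w + (2 * o + 1) * (1 << (s - 1))) >> s;
}

static int wp_spec(int x, int w, int o, int s)
{
    /* Spec-style form: round, shift, then add the offset. */
    return ((x * w + (1 << (s - 1))) >> s) + o;
}

int main(void)
{
    for (int x = 0; x < (1 << 14); x++)   /* 14-bit filter output range */
        assert(wp_folded(x, 3, 7, 6) == wp_spec(x, 3, 7, 6));
    return 0;
}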
+ + +-+# r3 = 0 +-+:uvloop +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests + + +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+ +-+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+ min r2, r2, rb_max_y +-+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+ setf_nz_if_v +++.if v_tmu == 0 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment +++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +++.else +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment +++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +++.endif +++ +++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++ min r3, r3, ra9 ; mov.ifnc r0, r2 +++ +++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + + + +# apply horizontal filter + +# The filter coeffs for the two halves of this are the same (unlike in the + +# Y case) so it doesn't matter which ra0 we get them from +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+ mov ra14, ra15 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop +-+ +-+# apply vertical filter and write to VPM +-+ +-+ sub r1, r1, r0 ; mul24 r0, ra14, rb10 +-+ add r1, r1, r0 ; mul24 r0, ra15, rb11 +++# Also as the two halves are locked together we don't need to separate the 1st +++# r0 mul or the last r1 mul as they are valid for all QPUs +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++ +++# V filter = -ra4 * rb8 + ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) +++# Have to duplicate the block as we need to move the brr - code is more common than it +++# looks at first glance +++.if v_bit_depth <= 8 +++ brr.anyn -, r:1b +++ add r2, r2, r3 ; mov ra5, ra6 +++ mov ra6, ra7 ; mul24 r1, ra7, rb10 +++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +++.else +++ add r2, r2, r3 ; mov ra5, ra6 +++ brr.anyn -, r:1b +++ mov ra6, ra7 ; mul24 r1, ra7, rb10 +++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 +++ asr ra7, r2, v_bit_depth - 8 +++.endif +++# >>> .anyn 1b +++ +++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 #
[ra7 delay] +++ add r1, r1, r0 ; mul24 r0, ra7, rb11 + + sub r1, r1, r0 +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + + asr r1, r1, 14 +-+ nop ; mul24 r1, r1, rb14 +-+ shl r1, r1, 8 +++ nop ; mul24 r1, r1, ra_wt_mul_l0 +++ shl r1, r1, 8 ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, ra3 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> .anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm + + +-+ add r1, r1, rb12 +-+ asr ra1.8as, r1, rb13 +-+ nop ; mov r1, r1 << 8 +-+ brr.anyn -, r:uvloop +-+ asr ra1.8bs, r1, rb13 +-+ mov -, vw_wait +-+ mov vpm, ra1 +++# At 10 bits +++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) +++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 +++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) +++# (P) +++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) +++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) +++# ... should be OK +++# +++# (B) +++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) +++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) +++# So signed overflow if we sign extend here :-( +++# +++# In practice this doesn't happen (we need a maximal offset and a very unlucky +++# filter). +++# +++# This could be fixed by offsetting the filters s.t. 
they are unsigned until +++# weight mul and then removing the offset with the weighting offset (I think +++# this should work) or splitting the rounding & offsetting + + +-+# >>> +++::mc_filter_c_p +++ m_filter_c_p 0, 8 + + +-+# DMA out for U & stash for V +-+ bra -, ra_link +-+ mov vw_setup, rb26 +-+ mov vw_setup, rb29 +-+ mov vw_addr, unif # u_dst_addr +-+# >>> +++::mc_filter_c_p_l1 +++ m_filter_c_p 1, 8 + + + +################################################################################ + + +-+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) +++# mc_filter_c_b + + + +# At this point we have already issued two pairs of texture requests for the current block + +# ra_x, ra_x16_base point to the current coordinates for this block +-+::mc_filter_uv_b0 +-+ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b +++ +++.macro m_filter_c_b, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 +++.set v_v_shift, 8 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 2 +++.set v_v_shift, i_shift16 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++.set v_x_mul, (1 << v_x_shift) + + + +# per-channel shifts were calculated on the *previous* invocation + + + +# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height +++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 +++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height +++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + + +-+ shl ra_xshift_next, r0, 4 +++.if v_bit_depth <= 8 +++ shl ra_xshift_next, r0, 3 +++.endif + + +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) +++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs +++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height +-+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height +++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + + + +# set up VPM write + + +-+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs +-+ add rb17, r1, 1 +-+ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area +-+ +-+# ; unpack filter coefficients +-+ +-+ add r0, r0, r2 ; mov rb9, ra3.8b +-+ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 +-+ +-+ mov r3, 0 ; mov rb11, ra3.8d # Loop count +-+ +-+ mov rb14, 
unif # U weight +-+ mov.ifnz rb14, unif # V weight +-+ +-+# rb14 unused in b0 but will hang around till the second pass +-+ +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+# r3 = 0 +-+:uvloop_b0 +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+ +-+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+ min r2, r2, rb_max_y +-+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop_b0 +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 +-+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop_b0 +-+ +-+# apply vertical filter and write to B-FIFO +-+ +-+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes +-+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap +-+ sub r1, r1, r0 ; mov ra7, rb6 +-+ +-+# FIFO goes: +-+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b +-+# This arrangement optimizes the inner loop FIFOs at the expense of making the +-+# bulk shift between loops quite a bit nastier +-+# a8 used as temp +-+ +-+ sub.setf -, r3, ra31 +-+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad +-+ brr.anyn -, r:uvloop_b0 +-+ mov ra5, rb4 ; mov rb4, ra4 +-+ mov ra4, rb5 ; mov rb5, ra6 +-+ mov ra6, rb7 ; mov rb7, ra8 +-+# >>> +-+ +-+# 1st half done all results now in the a/b4..7 fifo +-+ +-+# Need to bulk rotate FIFO for heights other than 16 +-+# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with +-+# we are allowed 3/4 cb_size w/h :-( +-+ +-+# Destination uniforms discarded +-+# At the end drop through to _b - we will always do b after b0 +-+ +-+ sub.setf -, 15, r3 # 12 + 3 of preroll +-+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) +-+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr +-+ mov r0, i_shift16 ; mov ra_link, unif +-+ mov r1, 0x10000 +-+# >>> +-+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially +-+# If h != 16 && h != 12 then h <= 8 so +-+# shift 8 with discard (.16b = .16a on all regs) +-+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 +-+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 +-+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 +-+# >>> +-+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 +-+ +-+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N +-+# Shift 4 +-+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 +-+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 +-+ # If we shifted by 4 here then the max length remaining is 4 +-+ # so that is it +-+ +-+ brr -, r:uv_b0_post_fin +-+# Shift 2 +-+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 +-+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 +-+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 +-+ # 6 / 2 so need 6 outputs +-+# >>> +-+ +-+:uv_b0_post12 +-+# this one is annoying as we need to swap halves of things that don't +-+# really want to be swapped +-+ +-+# b7a, a6a, b5a, a4a +-+# b4a, a5a, b6a, a7a +-+# b7b, a6b, b5b, a4b +-+# b4b, a5b, b6b, a7b +-+ +-+ mov r2, ra6 ; mov r3, rb7 +-+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 +-+ mov ra5, r2 ; mov rb4, r3 +-+ +-+ mov r2, ra4 ; mov r3, rb5 +-+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 +-+ mov ra7, r2 ; mov rb6, r3 +-+ +-+:uv_b0_post_fin +-+ +-+##### L1 B processing +-+ +-+# per-channel shifts were calculated on the *previous* invocation +++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight +++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + + +-+# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 +++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base +++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++# L1 - uniform layout could possibly be optimized + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov -, unif # ; width_height +++ shl r0, ra3.16b, 
v_x_shift # r0=x*2 +++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs +++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight +++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs +++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + + +-+ shl rb_xshift2_next, r0, 4 +++.if v_bit_depth <= 8 +++ shl rb_xshift2_next, r0, 3 +++.endif + + +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y2_next, ra2.16a +-+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs +++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight +++ and r1, r0, r1 ; mov rb10, ra3.8c + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs +++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + + add rb_base2_next, r3, r0 + + +-+ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight +-+ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight +-+ +-+ nop ; mov rb11, ra3.8d +-+ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 +-+ asr rb12, r1, 1 +-+ +-+# ra1.16a used directly in the loop +-+ +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+# r3 = 0 +-+ +-+:uvloop_b +++ mov ra9, rb_max_y ; mov rb11, ra3.8d +++ shl r1, ra_wt_off_l1, rb_wt_den_p15 +++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link +++ +++# r5 loop counter +++# ra0 H coeffs L0 +++# ra1 H coeffs L1 +++# ra2 V coeffs L0 +++# ra3 temp +++# ra4-7 L0 H FIFO +++# rb4-7 L1 H FIFO +++# rb8-rb11 V coeffs L1 +++# ra9 rb_max_y alias +++ +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment +++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next +++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +++ add ra_y, 1, ra_y ; mov r3, ra_y +++ +++ max r3, r3, ra_k0 ; mov r0, r1 << 15 +++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++ +++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++# L0 H-filter +++# H FIFO scrolls are spread all over this loop +++ mov rb4, rb5 ; mov ra4, ra5 # ? 
Just moves +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 +++.if v_bit_depth <= 8 +++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++.else +++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++ asr ra3, r2, (v_bit_depth - 8) +++.endif +++ +++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 +++ shr r1, r2, v_v_shift ; mov r3, ra_y2 +++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 +++ +++ max r3, r3, ra_k0 ; mov r0, r1 << 15 +++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++ +++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++# L1 H-filter +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 +++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 +++# V filters - start in branch delay slots of H +++# Final asr not needed for 8-bit but we can't (currently) save a whole instruction +++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b +++ brr.anyn -, r:1b +++ mov ra6, ra7 ; mul24 r3, ra7, rb10 +++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a +++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 +++# >>> .anyn 1b +++ +++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] +++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d +++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 +++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 +++ add r1, r1, r0 ; mul24 r0, ra7, rb11 +++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 +++ +++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 +++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 +++ +++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) +++ add r1, r1, r2 ; mov r3, ra_blk_height +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend +++ +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> .anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + + +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm + + +-+ max r2, r3, ra_k0 ; mov.ifz ra_base2,
rb_base2_next +-+ min r2, r2, rb_max_y +-+ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop_b +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+ mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop_b +-+ +-+# apply vertical filter and write to VPM +-+ +-+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) +-+ add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 +-+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 +-+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 +-+ +-+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a +-+ add r1, r1, r0 ; mov rb4, ra4 +-+ +-+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend +-+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) +-+ +-+ sub.setf -, r3, ra31 ; mov ra6, rb7 +-+ asr ra3.8as, r1, rb13 +-+ nop ; mov r1, r1 << 8 +-+ brr.anyn -, r:uvloop_b +-+ asr ra3.8bs, r1, rb13 +-+ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov +-+ mov vpm, ra3 +-+# >>> +++::mc_filter_c_b +++ m_filter_c_b 8 + + +-+# DMA out +++################################################################################ +++# Exit code used by both Luma & Chroma so place between them to avoid I-cache +++# conflicts +++ +++.macro m_exit_drain +++.if PREREAD == 2 +++# Special case 2 as loop is wasteful +++ nop ; nop ; ldtmu0 +++ nop ; nop ; ldtmu1 +++ nop ; nop ; ldtmu0 +++ mov -, vw_wait ; nop ; ldtmu1 +++.else +++ mov.setf r3, PREREAD - 1 +++:1 +++ brr.anynz -, r:1b +++ nop ; nop ; ldtmu0 +++ nop ; nop ; ldtmu1 +++ sub.setf r3, r3, 1 +++ # >>> +++ mov -, vw_wait +++.endif +++.endm + + +++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) +++# All qpus start at the beginning and after that (group - 1) must have finished +++# before (group) can start +++# +++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain +++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - +++# lockup otherwise) +++# +++# There is some, currently ill defined, potential lockup if we have the VDM active +++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? +++# +++# The code stalled when I had many waiters on a single sem so we have a +++# "ripple" of srels to restart. Unsure why, may have been bug, but this works +++# and we currently have both the memory & sems to support it. 
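As a rough illustration of the handshake those comments describe (and which m_sync_q builds just below), here is a pthreads sketch using POSIX counting semaphores as stand-ins for the QPU sacq/srel instructions; the sem numbering mirrors the macro (0-11 per QPU, 12 upwards per quad) and the initial post on sem 12 plays the role of m_setup_q0's srel, but every C name here is invented for illustration:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Sketch only - not part of the patch.  12 "QPUs" in 3 quads;
 * sems 0..11 are per-QPU, 12..14 per-quad. */
#define N_QPU   12
#define N_QUADS (N_QPU / 4)

static sem_t sems[16];

static void *sync_q(void *arg)
{
    int q = (int)(long)arg;
    int sync = q - q % 4;              /* quad leader's rendezvous sem */
    int out = q + 1;

    if (q % 4 == 0) {                  /* quad leader */
        int quad_in = 12 + q / 4;
        int quad_out = 12 + (q / 4 + 1) % N_QUADS;
        sem_wait(&sems[sync]);         /* three sacqs: wait for quad-mates */
        sem_wait(&sems[sync]);
        sem_wait(&sems[sync]);
        sem_wait(&sems[quad_in]);      /* wait for the previous quad */
        sem_post(&sems[out]);          /* start the restart ripple */
        sem_post(&sems[quad_out]);     /* pass the baton to the next quad */
    } else {
        sem_post(&sems[sync]);         /* tell the leader we have arrived */
        sem_wait(&sems[q]);            /* wait to be restarted */
        if (out % 4 != 0)
            sem_post(&sems[out]);      /* ripple the restart onwards */
    }
    return NULL;
}

int main(void)
{
    pthread_t t[N_QPU];
    for (int i = 0; i < 16; i++)
        sem_init(&sems[i], 0, 0);
    sem_post(&sems[12]);               /* what m_setup_q0's srel 12 does */
    for (int i = 0; i < N_QPU; i++)
        pthread_create(&t[i], NULL, sync_q, (void *)(long)i);
    for (int i = 0; i < N_QPU; i++)
        pthread_join(&t[i], NULL);
    puts("all quads synced");
    return 0;
}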
+++.macro m_sync_q, n_qpu, n_quads +++# Do not generate code for qpu >= quads * 4 - fns should never be called +++.if n_qpu < n_quads * 4 +++ mov ra_link, unif # Can only branch to an a reg (not r0) +++ mov -, vw_wait # [ra_link delay] +++ +++.set n_sem_sync, n_qpu - (n_qpu % 4) +++.set n_sem_in, n_qpu +++.set n_sem_out, n_qpu + 1 +++ +++.if n_qpu % 4 == 0 +++ +++.set n_sem_quad_in, 12 + n_qpu / 4 +++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) +++ +++ sacq -, n_sem_sync +++ sacq -, n_sem_sync +++ sacq -, n_sem_sync + + bra -, ra_link +-+ mov vw_setup, rb26 +-+ mov vw_setup, rb29 +-+ mov vw_addr, unif # c_dst_addr +++ sacq -, n_sem_quad_in +++ srel -, n_sem_out +++ srel -, n_sem_quad_out + + +++.else +++ bra -, ra_link +++ srel -, n_sem_sync +++ sacq -, n_sem_in +++.if n_sem_out % 4 != 0 +++ srel -, n_sem_out +++.else +++ nop +++.endif +++.endif +++.endif +++.endm + + +-+################################################################################ +++.set v_quads8, N_QPU_8 / 4 +++ +++::mc_sync_q0 +++ m_sync_q 0, v_quads8 +++::mc_sync_q1 +++ m_sync_q 1, v_quads8 +++::mc_sync_q2 +++ m_sync_q 2, v_quads8 +++::mc_sync_q3 +++ m_sync_q 3, v_quads8 +++::mc_sync_q4 +++ m_sync_q 4, v_quads8 +++::mc_sync_q5 +++ m_sync_q 5, v_quads8 +++::mc_sync_q6 +++ m_sync_q 6, v_quads8 +++::mc_sync_q7 +++ m_sync_q 7, v_quads8 +++::mc_sync_q8 +++ m_sync_q 8, v_quads8 +++::mc_sync_q9 +++ m_sync_q 9, v_quads8 +++::mc_sync_q10 +++ m_sync_q 10, v_quads8 +++::mc_sync_q11 +++ m_sync_q 11, v_quads8 + + + +# mc_exit() +-+ +-+::mc_interrupt_exit8c +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu1 +-+ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW +-+ +-+ mov -,sacq(0) # 1 +-+ mov -,sacq(0) # 2 +-+ mov -,sacq(0) # 3 +-+ mov -,sacq(0) # 4 +-+ mov -,sacq(0) # 5 +-+ mov -,sacq(0) # 6 +-+ mov -,sacq(0) # 7 +-+# mov -,sacq(0) # 8 +-+# mov -,sacq(0) # 9 +-+# mov -,sacq(0) # 10 +-+# mov -,sacq(0) # 11 +-+ +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +-+ + +# Chroma & Luma the same now +-+::mc_exit_c +-+::mc_exit +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + + +-+ mov -,srel(0) +++.macro m_exit_qn +++ m_exit_drain +++ nop ; nop ; thrend +++ nop +++ nop +++# >>> thrend <<< +++.endm +++ +++::mc_exit_c_qn +++::mc_exit_y_qn +++ m_exit_qn + + +-+ nop ; nop ; thrend +-+ nop ; nop # delay slot 1 +-+ nop ; nop # delay slot 2 + + + + + +# mc_interrupt_exit12() +-+::mc_interrupt_exit12 +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW +-+ +-+ mov -,sacq(0) # 1 +-+ mov -,sacq(0) # 2 +-+ mov -,sacq(0) # 3 +-+ mov -,sacq(0) # 4 +-+ mov -,sacq(0) # 5 +-+ mov -,sacq(0) # 6 +-+ mov -,sacq(0) # 7 +-+ mov -,sacq(0) # 8 +-+ mov -,sacq(0) # 9 +-+ mov -,sacq(0) # 10 +-+ mov -,sacq(0) # 11 +-+ +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +-+ +-+ +-+::mc_exit1 +-+ mov -, vw_wait # wait on the VDW +-+ +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ ldtmu1 +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +++ +++.macro m_exit_q0 +++ m_exit_drain +++ sacq -, 12 +++ nop ; nop ; thrend +++ mov interrupt, 1 +++ nop +++# >>> thrend <<< +++.endm +++ +++::mc_exit_c_q0 +++::mc_exit_y_q0 +++ m_exit_q0 + + + +# LUMA CODE + + + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
+ +# For P frames we make the second x,y coordinates offset by +8 + + +++ + +################################################################################ +-+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) +-+::mc_setup +++# mc_setup +++# +++# typedef struct qpu_mc_pred_y_s_s { +++# qpu_mc_src_t next_src1; +++# qpu_mc_src_t next_src2; +++# uint16_t pic_h; +++# uint16_t pic_w; +++# uint32_t stride2; +++# uint32_t stride1; +++# uint32_t wdenom; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_s_t; +++ +++.macro m_setup_y, v_bit_depth +++ +++# Cannot use mul24 on x as x might be -ve, so must use shift +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_pmask, 0xff +++.set v_blk_height, Y_BLK_HEIGHT_8 +++.else +++.set v_x_shift, 1 +++.set v_pmask, 0xffff +++.set v_blk_height, Y_BLK_HEIGHT_16 +++.endif +++ +++ + + # Need to save these because we need to know the frame dimensions before computing texture coordinates +-+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x +-+ mov ra9, unif # ref_y_base +-+ mov ra10, unif # y2_x2 +-+ mov ra11, unif # ref_y2_base +++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y +++ mov ra9, unif # ref_y_base +++ mov ra1, unif # x2_y2 +++ mov ra11, unif # ref_y2_base +++ +++# load constants +++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++ shl rb_ef, r0, i_shift30 +++ +++ +++ mov ra_kff100100, 0xff100100 +++ mov rb_pmask, v_pmask +++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++ +++# Compute part of VPM to use + + + +# Read image dimensions +-+ mov ra3, unif # width_height +-+ mov rb_xpitch, unif # stride2 +++ mov ra3, unif # width_height +++ mov rb_xpitch, unif # stride2 +++.if v_x_shift == 0 + + sub rb_max_x, ra3.16b, 1 +++.else +++ sub r0, ra3.16b, 1 +++ shl rb_max_x, r0, v_x_shift +++.endif + + sub rb_max_y, ra3.16a, 1 +-+ mov rb_pitch, unif # stride1 +++ mov rb_pitch, unif # stride1 + + + +# get destination pitch + + mov r1, vdw_setup_1(0) +-+ or rb24, r1, rb_pitch +++ or rb_dma1_base, r1, rb_pitch + + + +# Compute base address for first and second access + + mov r3, elem_num +-+ add r0, ra8.16a, r3 # Load x + elem_num +++ add r0, ra0.16b, r3 # Load x + elem_num +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif + + max r0, r0, 0 + + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts + + +-+ +-+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs +++# X is byte offset - we can only load words - mask + + + + and r0, r0, -4 ; v8subs r2, r2, r2 + + sub r2, r2, rb_pitch + + and r1, r0, r2 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +++ add r0, r0, r1 # Add stripe offsets + + add ra_base, ra9, r0 + + +-+ mov r1, ra8.16b # Load y +-+ add ra_y, r1, 1 # Set for next +-+ max r1, r1, 0 +-+ min r1, r1, rb_max_y +-+ +-+# submit texture requests for first line +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t0s, ra_base, r1 +-+ +-+ + + # r3 still contains elem_num +-+ add r0, ra10.16a, r3 # Load x +++ add r0, ra1.16b, r3 # Load x +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif + + max r0, r0, 0 + + min r0, r0, rb_max_x +-+ shl rb_xshift2_next, r0, 3 # Compute shifts +++ shl rb_xshift2_next, r0, 3 # Compute shifts + + + + # r2 still contains mask + + and r0, r0, -4 + + and r1, r0, r2 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +++ add r0, r0, r1 # Add stripe offsets + + add ra_base2, ra11, r0 + + +-+ mov r1, ra10.16b # Load y +-+ add ra_y2, r1, 1 # 
Set for next +-+ max r1, r1, 0 +++# Do preloads +++ nop ; mov r0, ra0.16a # ; r0 = y +++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 +++ +++:1 +++ sub.setf r3, r3, 1 +++ max r1, r0, 0 +++ min r1, r1, rb_max_y +++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t0s, ra_base, r1 ; mov ra_y, r0 +++ +++ max r1, r2, 0 +++ brr.anynz -, r:1b + + min r1, r1, rb_max_y +++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t1s, ra_base2, r1 ; mov ra_y2, r2 +++# >>> .anynz 1b + + +-+# submit texture requests for first line +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t1s, ra_base2, r1 +++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + + +-+# load constants +++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + +-+ mov ra_k1, 1 +-+ mov ra_k256, 256 +-+ mov rb_k255, 255 +-+ mov ra_k0, 0 +++ mov ra_link, unif # Next fn + + + +# touch vertical context to keep simulator happy +-+ + + mov ra8, 0 ; mov rb8, 0 +++ bra -, ra_link + + mov ra9, 0 ; mov rb9, 0 + + mov ra10, 0 ; mov rb10, 0 + + mov ra11, 0 ; mov rb11, 0 +++# >>> ra_link +++.endm + + +-+# Compute part of VPM to use +-+ m_calc_dma_regs rb28, rb27 +-+ +-+# Weighted prediction denom +-+ add rb13, unif, 9 # unif = weight denom + 6 +-+ +-+# submit texture requests for second line +-+ max r1, ra_y, 0 +-+ min r1, r1, rb_max_y +-+ add ra_y, ra_y, 1 +-+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; +-+ add t0s, r1, ra_base +-+ +-+ max r1, ra_y2, 0 +-+ min r1, r1, rb_max_y +-+ add ra_y2, ra_y2, 1 +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t1s, r1, ra_base2 +-+ +-+# FALL THROUGHT TO PER-BLOCK SETUP +++::mc_setup_y_q0 +++ m_setup_q0 +++::mc_setup_y_qn +++ m_setup_y 8 + + +++################################################################################ +++# + +# Start of per-block setup code + +# P and B blocks share the same setup code to save on Icache space +-+:per_block_setup +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ mov ra_link, unif +-+#### We do all the setup even if we are about to exit - reading junk from unif.... +-+ +-+ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? + + +-+# per-channel shifts were calculated on the *previous* invocation +-+ mov ra_xshift, ra_xshift_next +-+ mov rb_xshift2, rb_xshift2_next +++# luma_setup_delay3 done in delay slots of branch that got us here + + + +# get base addresses and per-channel shifts for *next* invocation +++# per-channel shifts were calculated on the *previous* invocation + + +-+ add r0, ra1.16a, r3 # Load x +-+ max r0, r0, 0 +-+ min r0, r0, rb_max_x +++# 1st 3 instructions of per_block-setup in branch delay +++# +++# typedef struct qpu_mc_pred_y_p_s { +++# qpu_mc_src_t next_src1; +++# qpu_mc_src_t next_src2; +++# uint16_t h; +++# uint16_t w; +++# uint32_t mymx21; +++# uint32_t wo1; +++# uint32_t wo2; +++# uint32_t dst_addr; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_p_t; +++# + + +-+ shl ra_xshift_next, r0, 3 # Compute shifts +-+ and r0, r0, -4 ; v8subs r2, r2, r2 +-+ sub r2, r2, rb_pitch +-+ and r1, r0, r2 +-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +-+ add ra_base_next, unif, r0 # Base1 +-+ mov ra_y_next, ra1.16b # Load y +-+ mov ra1, unif # x2_y2 +-+ nop # ra1 delay +++.macro m_luma_setup, v_bit_depth +++# Hack - QASM may well have label pasting but I have no idea how...
+++.if v_bit_depth == 8 +++ brr ra_link, r:per_block_setup_8 +++.elif v_bit_depth == 10 +++ brr ra_link, r:per_block_setup_10 +++.endif +++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? +++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 +++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++.endm + + +-+ add r0, ra1.16a, r3 # Load x2 +-+ max r0, r0, 0 +++.macro m_per_block_setup, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_x_mul, 1 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 1 +++.set v_x_mul, 2 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + + min r0, r0, rb_max_x + + +-+ shl rb_xshift2_next, r0, 3 # Compute shifts +++ shl ra_xshift_next, r0, 3 # Compute shifts + + and r0, r0, -4 +-+ and r1, r0, r2 +++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base +++ and r1, r0, r2 ; mov ra_y_next, ra0.16a + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +-+ add rb_base2_next, unif, r0 # Base1 +-+ mov ra_y2_next, ra1.16b # Load y +-+ mov ra_width_height, unif # width_height +-+ +-+# set up VPM write +-+ mov vw_setup, rb28 # [ra1 delay] +-+ +-+# get width,height of block (unif load above) +-+ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width) +-+ add rb17, ra_height, 5 ; mov r0, ra_height +-+ mov r1, 16 +-+ min r0, r0, r1 +-+ add rb18, r0, 7 +-+ shl r0, r0, 7 +-+ add r0, r0, ra_width # Combine width and height of destination area +-+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets +++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y +++ add ra_base_next, ra_base_next, r0 # [ra1 delay] +++ +++ add r0, ra1.16b, r3 # Load x2 +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a +++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base +++ shl rb_xshift2_next, r0, 3 # Compute shifts +++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height +++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write +++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes +++ add rb_base2_next, rb_base2_next, r0 +++ +++# get width,height of block (unif load above), r1 = width * pel_size +++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) +++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height +++ add rb_lcount, r0, 7 +++ shl r0, r0, v_dma_h_shift +++ add r0, r0, r1 # Combine width and height of destination area +++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + + + +# get filter coefficients and discard unused B frame values +-+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight +-+ mov r2, 0x01040400 # [ra5 delay] +-+ shl ra8, r0, 3 ; mov rb14, ra5.16a +++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight +++ shl ra8, r0, 3 ; mov r3, ra_k255 + + + +# Pack the 1st 4 filter coefs for H & V tightly +++# Coeffs are all 
abs values here as that means mul24 works (no sign extend from .8) + + +-+ mov r1,0x00010100 # -ve +++ mov r1,0x00010100 # -ve [ra8 delay] + + ror ra2.8a, r1, ra8.8d + + ror ra0.8a, r1, ra8.8c + + +-+ ror ra2.8b, r2, ra8.8d +-+ ror ra0.8b, r2, ra8.8c +++ mov r1, 0x01040400 +++ ror ra2.8b, r1, ra8.8d +++ ror ra0.8b, r1, ra8.8c + + + + mov r1,0x050b0a00 # -ve + + ror ra2.8c, r1, ra8.8d +@@ -17390,49 +26344,44 @@ index 0000000..aa3fe47 + + ror ra2.8d, r1, ra8.8d + + ror ra0.8d, r1, ra8.8c + + +-+# In the 2nd vertical half we use b registers due to +-+# using a-side fifo regs. The easiest way to achieve this to pack it +-+# and then unpack! +++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + + + mov r1,0x3a281100 +-+ ror ra3.8a, r1, ra8.8d +-+ ror ra1.8a, r1, ra8.8c +++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + + + mov r1,0x0a0b0500 # -ve +-+ ror ra3.8b, r1, ra8.8d +-+ ror ra1.8b, r1, ra8.8c +++ ror r0, r1, ra8.8d +++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + + + mov r1,0x04040100 +-+ ror ra3.8c, r1, ra8.8d +-+ ror ra1.8c, r1, ra8.8c +++ ror r0, r1, ra8.8d +++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 +++ +++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + + + mov r1,0x01010000 # -ve +-+ ror ra3.8d, r1, ra8.8d +-+ ror ra1.8d, r1, ra8.8c +++ ror r0, r1, ra8.8d + + +-+# Extract weighted prediction information in parallel +-+# We are annoyingly A src limited here +++ bra -, ra_link +++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 + + +-+ mov rb4, ra3.8a ; mov ra18, unif +-+ mov rb5, ra3.8b +-+ mov rb6, ra3.8c +-+ mov.ifnz ra5, ra18 +++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc +++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use +++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +++# >>> branch ra_link + + +-+ mov rb_dest, unif # Destination address +++# r5 = 0 +++# ra_wt_mul_l1 = weight L1 +++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) +++# rb_wt_den_p15 = weight denom + 6 + 9 +++# rb_wt_mul_l0 = weight L0 +++.endm + + +-+ bra -, ra_link +++:per_block_setup_8 +++ m_per_block_setup 8 + + +-+ shl r0, ra5.16b, rb13 # Offset calc +-+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use +-+ mov r3, 0 ; mov rb7, ra3.8d +-+# >>> branch ra_link +-+# +-+# r3 = 0 +-+# ra18.16a = weight L1 +-+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +-+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) +-+# rb13 = weight denom + 6 + 9 +-+# rb14 = weight L0 + + + + + +################################################################################ +@@ -17440,381 +26389,1225 @@ index 0000000..aa3fe47 + +# In a P block, y2_x2 should be y_x+8 + +# At this point we have already issued two pairs of texture requests for the current block + + +-+::mc_filter +-+# ra5.16a = weight << 16; We want weight * 2 in rb14 +++.macro m_filter_y_pxx, v_bit_depth +++ m_luma_setup v_bit_depth + + +-+ shl rb14, ra5.16a, 1 +++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + + +-+# r3 = 0 +++# r5 = 0 (loop count) + + +-+:yloop +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests + + +-+# If we knew there was no clipping then this code would get simpler. 
+-+# Perhaps we could add on the pitch and clip using larger values? +-+ + +# N.B. Whilst y == y2 as far as this loop is concerned we will start + +# the grab for the next block before we finish with this block and that + +# might be B where y != y2 so we must do full processing on both y and y2 + + +-+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + + + max r2, ra_y, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+ max r2, ra_y2, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y ; mov ra7, ra8 +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + + +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + + + +# apply horizontal filter +-+ nop ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ +-+ sub.setf -, r3, 8 ; mov r1, ra8 +-+ mov ra8, ra9 ; mov rb8, rb9 +-+ brr.anyn -, r:yloop +-+ mov ra9, ra10 ; mov rb9, rb10 +-+ mov ra10, ra11 ; mov rb10, rb11 +-+ mov ra11, r0 ; mov rb11, r1 +-+ # >>> .anyn yloop +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++ 
add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++ +++ sub.setf -, r5, 8 ; mov ra9, ra10 +++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++ brr.anyn -, r:1b +++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++ mov ra10, ra11 ; mov rb10, rb11 +++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++ # >>> .anyn 1b + + + + # apply vertical filter and write to VPM +-+ +-+ nop ; mul24 r0, rb8, ra2.8a +-+ nop ; mul24 r1, rb9, ra2.8b +-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+ add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+ add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+ add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+ sub r1, r1, r0 ; mov -, vw_wait +++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++ add r1, r1, r0 ; mul24 r0, ra8, rb4 +++ add r1, r1, r0 ; mul24 r0, ra9, rb5 +++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++ add r1, r1, r0 ; mul24 r0, ra11, rb7 +++ sub r1, r1, r0 + +# At this point r1 is a 22-bit signed quantity: 8 (original sample), + +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) + +# The top 8 bits have rubbish in them as mul24 is unsigned + +# The low 6 bits need discard before weighting +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + + asr r1, r1, 14 +-+ nop ; mul24 r1, r1, rb14 +-+ add r1, r1, rb12 +-+ +-+ shl r1, r1, 8 +-+ brr.anyn -, r:yloop +-+ asr r1, r1, rb13 +-+# We have a saturating pack unit - I can't help feeling it should be useful here +-+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 +-+ max vpm, r1, 0 # Delay 3 +++ nop ; mul24 r1, r1, ra_wt_mul_l0 +++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop +++ +++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++ + +# >>> branch.anyn yloop + + +-+# If looping again the we consumed 16 height last loop +-+ # rb29 (stride) remains constant +-+ # rb17 remains const (based on total height) +-+ # recalc rb26, rb18 based on new segment height +-+ # N.B. 
r3 is loop counter still +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_pxx +++ m_filter_y_pxx 8 +++ +++ +++################################################################################ +++ +++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +++# In a P block, only the first half of coefficients contain used information. +++# At this point we have already issued two pairs of texture requests for the current block +++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +++# Or possibly by taking advantage of symmetry? +++ +++.macro m_filter_y_bxx, v_bit_depth +++ m_luma_setup v_bit_depth +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++ +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y ; mov ra7, ra8 +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++ +++# apply horizontal filter +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++ +++ sub.setf -, r5, 8 ; mov ra9, ra10 +++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++ brr.anyn -, r:1b +++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++ mov ra10, ra11 ; mov rb10, rb11 +++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++ # >>> .anyn 1b +++ +++ # apply vertical filter and write to 
VPM +++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++ add r1, r1, r0 ; mul24 r0, ra8, rb4 +++ add r1, r1, r0 ; mul24 r0, ra9, rb5 +++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++ add r1, r1, r0 ; mul24 r0, ra11, rb7 +++ sub r1, r1, r0 ; mov r2, rb_wt_off +++# As with P-pred r1 is a 22-bit signed quantity in 32-bits +++# Top 8 bits are bad - low 6 bits should be discarded +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++ +++ asr r1, r1, 14 +++ nop ; mul24 r0, r1, ra_wt_mul_l0 +++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 +++ +++ add r1, r1, r0 ; mov r3, ra_blk_height +++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_bxx +++ m_filter_y_bxx 8 +++ +++################################################################################ +++# +++# typedef struct qpu_mc_pred_y_p00_s { +++# qpu_mc_src_t next_src1; +++# uint16_t h; +++# uint16_t w; +++# uint32_t wo1; +++# uint32_t dst_addr; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_p00_t; +++ +++.macro m_filter_y_p00, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_x_mul, 1 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 1 +++.set v_x_mul, 2 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++ mov ra0, unif ; mov r3, elem_num # y_x +++ mov ra_xshift, ra_xshift_next # [ra0 delay] +++ add r0, ra0.16b, r3 +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ +++ max r0, r0, 0 +++ min r0, r0, rb_max_x +++ +++ shl ra_xshift_next, r0, 3 # Compute shifts +++ and r0, r0, -4 ; v8subs r2, r2, r2 +++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base +++ and r1, r0, r2 ; mov ra_y_next, ra0.16a +++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height +++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write +++ +++# get width,height of block (unif load above) +++# Compute vdw_setup1(dst_pitch-width) +++ shl r1, ra_width, v_x_shift +++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination 
area ; weight_offset +++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr +++ add rb_dma0, r0, rb_dma0_base +++ +++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 +++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use +++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++ +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_p00 +++ m_filter_y_p00 8 +++ +++################################################################################ +++ +++.macro m_filter_y_b00, v_bit_depth +++# luma setup does a fair bit more than we need calculating filter coeffs +++# that we will never use but it saves I-cache to use it (also simple!) 
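Both the p00 kernel above and the b00 kernel that follows skip the 8-tap filter entirely and reduce to plain HEVC weighted prediction. As a reading aid, here is a minimal C sketch of the unidirectional per-pixel epilogue the p00 loop appears to implement. It assumes rb_wt_den_p15 holds denom + 15 and that the offset field arrives packed as offset*2+1 (per the "wo[offset] = offset*2+1" comments), so the asr by 1 in the setup folds the rounding bit into rb_wt_off. The function name and parameter packing are illustrative, not part of the patch, and this is the intended arithmetic rather than a bit-exact model of the 24-bit unsigned multiplier.

    #include <stdint.h>

    static inline unsigned int wpred_p00(unsigned int pel, int wt_mul,
                                         int off2p1,          /* offset*2 + 1 (assumed packing) */
                                         unsigned int denom,
                                         unsigned int bit_depth)
    {
        /* mul24 r1, r0, ra_wt_mul_l0 ; shl r1, r1, 23 - v_bit_depth */
        int64_t acc = ((int64_t)pel * wt_mul) << (23 - bit_depth);

        /* setup: shl r0, ra_wt_off_l0, rb_wt_den_p15 ; asr rb_wt_off, r0, 1 */
        acc += ((int64_t)off2p1 << (denom + 15)) >> 1;

        /* asr r1, r1, rb_wt_den_p15 ; then clamp to [0, ra_pmax] */
        const int32_t v = (int32_t)(acc >> (denom + 15));
        const int32_t pmax = (1 << bit_depth) - 1;
        return v < 0 ? 0 : v > pmax ? (unsigned int)pmax : (unsigned int)v;
    }

For bit_depth == 8 this collapses to the standard clip3(0, 255, ((pel*w + (1 << (denom - 1))) >> denom) + offset), which is a useful sanity check on the shift constants. The b00 path below does the same but sums two weighted samples first and shifts by 22 - bit_depth, one bit less, so the pair of predictions is averaged.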
+++ m_luma_setup v_bit_depth +++ +++# Fix up vals that were expecting a filter (somewhat icky) +++ mov r0, 7 +++ sub rb_i_tmu, rb_i_tmu, r0 +++ sub rb_lcount, rb_lcount, r0 +++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 +++ shl rb_wt_off, rb_wt_off, r0 +++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++ +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +++ add r1, r0, r1 +++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + +-+ mov r1, 16 +-+ sub r0, ra_height, r1 +-+ mov ra_height, r0 +-+ max.setf r0, r0, 0 # Done if Z now +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + + + +# DMA out +-+ brr.anyz -, r:per_block_setup +-+ mov vw_setup, rb26 # VDW setup 0 Delay 1 +-+ mov vw_setup, rb29 # Stride Delay 2 +-+ mov vw_addr, rb_dest # start the VDW Delay 3 +-+# >>> .anyz per_block_setup +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_b00 +++ m_filter_y_b00 8 +++ +++################################################################################ +++################################################################################ +++# 10 BIT +++ +++::mc_setup_c10_q0 +++ m_setup_q0 +++::mc_setup_c10_qn +++ m_setup_c 10 +++ +++::mc_filter_c10_p +++ m_filter_c_p 0, 10 +++ +++::mc_filter_c10_p_l1 +++ m_filter_c_p 1, 10 +++ +++ +++::mc_filter_c10_b +++ m_filter_c_b 10 +++ +++# Even if these fns are the same as for other bit depths we want our own copy +++# to keep the code we are using in a single lump to avoid (direct map) cache +++# thrashing +++.set v_quads10, N_QPU_16 / 4 +++ +++::mc_sync10_q0 +++ m_sync_q 0, v_quads10 +++::mc_sync10_q1 +++ m_sync_q 1, v_quads10 +++::mc_sync10_q2 +++ m_sync_q 2, v_quads10 +++::mc_sync10_q3 +++ m_sync_q 3, v_quads10 +++::mc_sync10_q4 +++ m_sync_q 4, v_quads10 +++::mc_sync10_q5 +++ m_sync_q 5, v_quads10 +++::mc_sync10_q6 +++ m_sync_q 6, v_quads10 +++::mc_sync10_q7 +++ m_sync_q 7, v_quads10 +++::mc_sync10_q8 +++ m_sync_q 8, v_quads10 
+++::mc_sync10_q9 +++ m_sync_q 9, v_quads10 +++::mc_sync10_q10 +++ m_sync_q 10, v_quads10 +++::mc_sync10_q11 +++ m_sync_q 11, v_quads10 +++ +++::mc_exit_y10_q0 +++::mc_exit_c10_q0 +++ m_exit_q0 +++ +++::mc_exit_y10_qn +++::mc_exit_c10_qn +++ m_exit_qn +++ +++::mc_setup_y10_q0 +++ m_setup_q0 +++::mc_setup_y10_qn +++ m_setup_y 10 +++ +++:per_block_setup_10 +++ m_per_block_setup 10 +++ +++::mc_filter_y10_pxx +++ m_filter_y_pxx 10 +++ +++::mc_filter_y10_p00 +++ m_filter_y_p00 10 +++ +++::mc_filter_y10_bxx +++ m_filter_y_bxx 10 +++ +++::mc_filter_y10_b00 +++ m_filter_y_b00 10 +++ +++ +++ +++::mc_end +++# Do not add code here because mc_end must appear after all other code. ++diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h ++new file mode 100644 ++index 0000000000..9f8983da52 ++--- /dev/null +++++ b/libavcodec/rpi_shader_cmd.h ++@@ -0,0 +1,128 @@ +++#ifndef RPI_SHADER_CMD_H +++#define RPI_SHADER_CMD_H +++ +++#pragma pack(push, 4) +++ +++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y +++// If mixed then we are just confused and get a lot of warnings.... +++typedef const uint8_t * qpu_mc_src_addr_t; +++typedef uint8_t * qpu_mc_dst_addr_t; +++#else +++typedef uint32_t qpu_mc_src_addr_t; +++typedef uint32_t qpu_mc_dst_addr_t; +++#endif +++ +++typedef struct qpu_mc_src_s +++{ +++ int16_t y; +++ int16_t x; +++ qpu_mc_src_addr_t base; +++} qpu_mc_src_t; +++ +++ +++typedef struct qpu_mc_pred_c_p_s { +++ qpu_mc_src_t next_src; +++ uint16_t h; +++ uint16_t w; +++ uint32_t coeffs_x; +++ uint32_t coeffs_y; +++ uint32_t wo_u; +++ uint32_t wo_v; +++ qpu_mc_dst_addr_t dst_addr_c; +++ uint32_t next_fn; +++} qpu_mc_pred_c_p_t; +++ +++typedef struct qpu_mc_pred_c_b_s { +++ qpu_mc_src_t next_src1; +++ uint16_t h; +++ uint16_t w; +++ uint32_t coeffs_x1; +++ uint32_t coeffs_y1; +++ uint32_t weight_u1; +++ uint32_t weight_v1; +++ qpu_mc_src_t next_src2; +++ uint32_t coeffs_x2; +++ uint32_t coeffs_y2; +++ uint32_t wo_u2; +++ uint32_t wo_v2; +++ qpu_mc_dst_addr_t dst_addr_c; +++ uint32_t next_fn; +++} qpu_mc_pred_c_b_t; +++ +++typedef struct qpu_mc_pred_c_s_s { +++ qpu_mc_src_t next_src1; +++ uint32_t pic_cw; // C Width (== Y width / 2) +++ uint32_t pic_ch; // C Height (== Y Height / 2) +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++ qpu_mc_src_t next_src2; +++ uint32_t next_fn; +++} qpu_mc_pred_c_s_t; +++ +++typedef struct qpu_mc_pred_c_s { +++ union { +++ qpu_mc_pred_c_p_t p; +++ qpu_mc_pred_c_b_t b; +++ qpu_mc_pred_c_s_t s; +++ }; +++} qpu_mc_pred_c_t; +++ +++ +++typedef struct qpu_mc_pred_y_p_s { +++ qpu_mc_src_t next_src1; +++ qpu_mc_src_t next_src2; +++ uint16_t h; +++ uint16_t w; +++ uint32_t mymx21; +++ uint32_t wo1; +++ uint32_t wo2; +++ qpu_mc_dst_addr_t dst_addr; +++ uint32_t next_fn; +++} qpu_mc_pred_y_p_t; +++ +++typedef struct qpu_mc_pred_y_p00_s { +++ qpu_mc_src_t next_src1; +++ uint16_t h; +++ uint16_t w; +++ uint32_t wo1; +++ qpu_mc_dst_addr_t dst_addr; +++ uint32_t next_fn; +++} qpu_mc_pred_y_p00_t; +++ +++typedef struct qpu_mc_pred_y_s_s { +++ qpu_mc_src_t next_src1; +++ qpu_mc_src_t next_src2; +++ uint16_t pic_h; +++ uint16_t pic_w; +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++ uint32_t next_fn; +++} qpu_mc_pred_y_s_t; + + +-+ min r0, r0, r1 +-+ add rb18, rb18, r0 +-+ sub r0, r0, r1 +-+ shl r0, r0, i_shift23 +-+ add rb26, rb26, r0 +++// Only a useful structure in that it allows us to return something other than a void * +++typedef struct qpu_mc_pred_y_s { +++ union { +++ qpu_mc_pred_y_p_t p; +++ qpu_mc_pred_y_p00_t p00; +++ qpu_mc_pred_y_s_t s; 
+++ }; +++} qpu_mc_pred_y_t; +++ +++typedef union qpu_mc_pred_cmd_u { +++ qpu_mc_pred_y_t y; +++ qpu_mc_pred_c_t c; +++ uint32_t data[1]; +++} qpu_mc_pred_cmd_t; +++ +++#define QPU_MC_PRED_N_Y8 12 +++#define QPU_MC_PRED_N_C8 12 +++ +++#define QPU_MC_PRED_N_Y10 12 +++#define QPU_MC_PRED_N_C10 12 +++ +++#pragma pack(pop) +++ +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c ++new file mode 100644 ++index 0000000000..1925ab7a79 ++--- /dev/null +++++ b/libavcodec/rpi_shader_template.c ++@@ -0,0 +1,65 @@ +++#ifdef RPI +++ +++#include "hevc.h" +++#include "libavutil/rpi_sand_fns.h" +++#include "rpi_shader_cmd.h" +++#include "rpi_shader_template.h" +++ +++typedef struct shader_track_s +++{ +++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; +++ const struct qpu_mc_src_s *last_l0; +++ const struct qpu_mc_src_s *last_l1; +++ uint32_t width; // pic_width * PW +++ uint32_t height; +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++} shader_track_t; +++ +++static int wtoidx(const unsigned int w) +++{ +++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; +++ return pel_weight[w]; +++} +++ +++static const int fctom(uint32_t x) +++{ +++ int rv; +++ // As it happens we can take the 2nd filter term & divide it by 8 +++ // (dropping fractions) to get the fractional move +++ rv = 8 - ((x >> 11) & 0xf); +++ av_assert2(rv >= 0 && rv <= 7); +++ return rv; +++} +++ +++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) +++{ +++ return (x << shl) >> shr; +++} +++ +++static inline int woff_p(HEVCContext *const s, int32_t x) +++{ +++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); +++} +++ +++static inline int woff_b(HEVCContext *const s, int32_t x) +++{ +++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); +++} +++ +++static inline int wweight(int32_t x) +++{ +++ return ext(x, 16, 16); +++} +++ +++ +++#define PW 1 +++#include "rpi_shader_template_fn.h" +++ +++#undef PW +++#define PW 2 +++#include "rpi_shader_template_fn.h" +++ +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h ++new file mode 100644 ++index 0000000000..ecf5b8185a ++--- /dev/null +++++ b/libavcodec/rpi_shader_template.h ++@@ -0,0 +1,24 @@ +++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H +++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H +++ +++#ifdef RPI +++struct HEVCContext; +++struct HEVCRpiInterPredEnv; +++ +++void rpi_shader_c8(struct HEVCContext *const s, +++ const struct HEVCRpiInterPredEnv *const ipe_y, +++ const struct HEVCRpiInterPredEnv *const ipe_c); + + +-+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 +-+ add rb_dest, rb_dest, r0 +++void rpi_shader_c16(struct HEVCContext *const s, +++ const struct HEVCRpiInterPredEnv *const ipe_y, +++ const struct HEVCRpiInterPredEnv *const ipe_c); +++ +++void rpi_sand_dump8(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); +++ +++void rpi_sand_dump16(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); +++ +++#endif +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h ++new file mode 100644 ++index 0000000000..b5ac2ceed6 ++--- /dev/null +++++ b/libavcodec/rpi_shader_template_fn.h ++@@ -0,0 +1,477 @@ +++#define STRCAT(x,y) x##y +++ +++#if PW == 1 +++#define pixel uint8_t 
+++#define FUNC(f) STRCAT(f, 8) +++#elif PW == 2 +++#define pixel uint16_t +++#define FUNC(f) STRCAT(f, 16) +++#else +++#error Unexpected PW +++#endif +++ +++#define PATCH_STRIDE (16 * PW) +++ +++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +++{ +++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { +++ const pixel s = *(const pixel *)src; +++ pixel * d = (pixel *)dst; +++ for (unsigned int j = 0; j < w; j += PW) { +++ *d++ = s; +++ } +++ } +++} + + +-+ mov vw_setup, rb28 # Reset our VDM write pointer +++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +++{ +++ for (unsigned int i = 0; i != h; ++i, dst += stride) { +++ memcpy(dst, src, w); +++ } +++} + + +-+ brr -, r:yloop +-+ nop +-+ nop +-+ nop +-+# >>> +++static void FUNC(get_patch_y)(const shader_track_t * const st, +++ uint8_t * dst, const unsigned int dst_stride, +++ const qpu_mc_src_t *src, +++ unsigned int _w, unsigned int _h) +++{ +++ int x = src->x * PW; +++ int y = src->y; +++ int w = _w * PW; +++ int h = _h; +++ int dl = 0; +++ int dr = 0; +++ int dt = 0; +++ int db = 0; +++ +++ if (x < 0) { +++ if (-x >= w) +++ x = PW - w; +++ dl = -x; +++ w += x; +++ x = 0; +++ } +++ if (x + w > st->width) { +++ if (x >= st->width) +++ x = st->width - PW; +++ dr = (x + w) - st->width; +++ w = st->width - x; +++ } + + +++ // Y +++ if (y < 0) { +++ if (-y >= h) +++ y = 1 - h; +++ dt = -y; +++ h += y; +++ y = 0; +++ } +++ if (y + h > st->height) { +++ if (y >= st->height) +++ y = st->height - 1; +++ db = (y + h) - st->height; +++ h = st->height - y; +++ } + + +++ dst += dl + dt * dst_stride; +++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); +++ +++ // Edge dup +++ if (dl != 0) +++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); +++ if (dr != 0) +++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); +++ w += dl + dr; +++ dst -= dl; +++ +++ if (dt != 0) +++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); +++ if (db != 0) +++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); +++} + + + + + + +-+################################################################################ +++static void FUNC(get_patch_c)(const shader_track_t * const st, +++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, +++ const qpu_mc_src_t *src, +++ unsigned int _w, unsigned int _h) +++{ +++ int x = src->x * PW; +++ int y = src->y; +++ int w = _w * PW; +++ int h = _h; +++ int dl = 0; +++ int dr = 0; +++ int dt = 0; +++ int db = 0; +++ const int width = st->width; +++ const int height = st->height; +++ +++ if (x < 0) { +++ if (-x >= w) +++ x = PW - w; +++ dl = -x; +++ w += x; +++ x = 0; +++ } +++ if (x + w > width) { +++ if (x >= width) +++ x = width - PW; +++ dr = (x + w) - width; +++ w = width - x; +++ } + + +-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +-+# In a P block, only the first half of coefficients contain used information. +-+# At this point we have already issued two pairs of texture requests for the current block +-+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) +-+# Can fill in the coefficients so only +-+# Can also assume default weighted prediction for B frames. 
+-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +-+# Or possibly by taking advantage of symmetry? +-+# From 19->7 32bits per command. +++ // Y +++ if (y < 0) { +++ if (-y >= h) +++ y = 1 - h; +++ dt = -y; +++ h += y; +++ y = 0; +++ } +++ if (y + h > height) { +++ if (y >= height) +++ y = height - 1; +++ db = (y + h) - height; +++ h = height - y; +++ } + + +-+::mc_filter_b +-+ # r0 = weightL0 << 16, we want it in rb14 +-+# asr rb14, r0, i_shift16 +++ dst_u += dl + dt * dst_stride; +++ dst_v += dl + dt * dst_stride; +++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); + + +-+:yloopb +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +++ // Edge dup +++ if (dl != 0) +++ { +++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); +++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); +++ } +++ if (dr != 0) +++ { +++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); +++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); +++ } +++ w += dl + dr; +++ dst_u -= dl; +++ dst_v -= dl; + + +-+# If we knew there was no clipping then this code would get simpler. +-+# Perhaps we could add on the pitch and clip using larger values? +++ if (dt != 0) +++ { +++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); +++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); +++ } +++ if (db != 0) +++ { +++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); +++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); +++ } +++} + + +-+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +++// w, y, w, h in pixels +++// stride1, stride2 in bytes +++void FUNC(rpi_sand_dump)(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) +++{ +++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; + + +-+ max r2, ra_y, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); + + +-+ max r2, ra_y2, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +++ if (is_c) { +++ x *= 2; +++ w *= 2; +++ } + + +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +++ for (int i = y; i != y + h; ++i) { +++ for (int j = x; j != x + w; ++j) { +++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; +++ char sep = is_c && (j & 1) == 0 ? 
':' : ' '; +++#if PW == 1 +++ if (j < 0 || i < 0) +++ printf("..%c", sep); +++ else +++ printf("%02x%c", *(const pixel*)p, sep); +++#else +++ if (j < 0 || i < 0) +++ printf("...%c", sep); +++ else +++ printf("%03x%c", *(const pixel*)p, sep); +++#endif +++ } +++ printf("\n"); +++ } +++} + + +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + + +-+# apply horizontal filter +-+ nop ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ +-+ sub.setf -, r3, 8 ; mov r1, ra8 +-+ mov ra8, ra9 ; mov rb8, rb9 +-+ brr.anyn -, r:yloopb +-+ mov ra9, ra10 ; mov rb9, rb10 +-+ mov ra10, ra11 ; mov rb10, rb11 +-+ mov ra11, r0 ; mov rb11, r1 +-+ # >>> .anyn yloopb +++void FUNC(rpi_shader_c)(HEVCContext *const s, +++ const HEVCRpiInterPredEnv *const ipe_y, +++ const HEVCRpiInterPredEnv *const ipe_c) +++{ +++ for (int c_idx = 0; c_idx < 2; ++c_idx) +++ { +++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; +++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; +++ unsigned int exit_n = 0; + + +-+ # apply vertical filter and write to VPM +-+ nop ; mul24 r0, rb8, ra2.8a +-+ nop ; mul24 r1, rb9, ra2.8b +-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+ add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+ add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+ add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+ sub r1, r1, r0 ; mov r2, rb12 +-+# As with P-pred r1 is a 22-bit signed quantity in 32-bits +-+# Top 8 bits are bad - low 6 bits should be discarded +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +++ if (ipe == NULL || !ipe->used) { +++ continue; +++ } + + +-+ asr r1, r1, 14 +-+ nop ; mul24 r0, r1, rb14 +-+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 +++ do { +++ for (unsigned int i = 0; i != ipe->n; ++i) { +++ const HEVCRpiInterPredQ * const q = ipe->q + i; +++ shader_track_t * const st = tracka + i; +++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; +++ +++ for (;;) { +++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; +++ +++ if (link == q->code_setup) { +++ if (c_idx == 0) { +++ // Luma +++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; +++ +++ st->height = c->pic_h; +++ st->width = c->pic_w * PW; +++ st->stride1 = c->stride1; +++ st->stride2 = c->stride2; +++ st->wdenom = c->wdenom; +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else { +++ // Chroma +++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; +++ +++ st->height = c->pic_ch; +++ st->width = c->pic_cw * PW; +++ st->stride1 = c->stride1; +++ st->stride2 = c->stride2; +++ st->wdenom = c->wdenom; +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ } +++ else if (link == s->qpu.y_pxx) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ const int w1 = FFMIN(c->w, 8); +++ const int w2 = c->w - w1; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); +++ if (w2 > 0) { +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h + 7); +++ } + + +-+ add r1, r1, r0 ; mov -, vw_wait +-+ shl r1, r1, 8 +++ // wo[offset] = offset*2+1 +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); +++ if (w2 > 0) { +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( +++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); +++ } +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_bxx) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h + 7); +++ +++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( +++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); +++ +++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, +++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), +++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_p00) { +++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; + + +-+ brr.anyn -, r:yloopb +-+ asr r1, r1, rb13 # Delay 1 +-+ min r1, r1, rb_k255 # Delay 2 +-+ max vpm, r1, 0 # 
Delay 3 +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + + +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); + + +-+# If looping again the we consumed 16 height last loop +-+ # rb29 (stride) remains constant +-+ # rb17 remains const (based on total height) +-+ # recalc rb26, rb18 based on new segment height +-+ # N.B. r3 is loop counter still +++ // wo[offset] = offset*2+1 +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); + + +-+ mov r1, 16 +-+ sub r0, ra_height, r1 +-+ mov ra_height, r0 +-+ max.setf r0, r0, 0 # Done if Z now +++ st->last_l0 = &c->next_src1; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_b00) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ av_assert0(c->w <= 16 && c->h <= 64); +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h); +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h); +++ +++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( +++ patch_y3, patch_y1, PATCH_STRIDE, +++ c->h, 0, 0, c->w); +++ +++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, +++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), +++ 0, woff_b(s, c->wo2), 0, 0, c->w); +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_pxx) { +++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; +++ const int mx = fctom(c->coeffs_x); +++ const int my = fctom(c->coeffs_y); + + +-+# DMA out +-+ brr.anyz -, r:per_block_setup +-+ mov vw_setup, rb26 # VDW setup 0 Delay 1 +-+ mov vw_setup, rb29 # Stride Delay 2 +-+ mov vw_addr, rb_dest # start the VDW Delay 3 +-+# >>> .anyz per_block_setup +++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; + + +-+ min r0, r0, r1 +-+ add rb18, rb18, r0 +-+ sub r0, r0, r1 +-+ shl r0, r0, i_shift23 +-+ add rb26, rb26, r0 +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); + + +-+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 +-+ add rb_dest, rb_dest, r0 +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + +-+ mov vw_setup, rb28 # Reset our VDM write pointer +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + +-+ brr -, r:yloopb +-+ nop +-+ nop +-+ nop +++ st->last_l0 = &c->next_src; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_pxx_l1) { +++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; +++ 
const int mx = fctom(c->coeffs_x); +++ const int my = fctom(c->coeffs_y); + + +-+################################################################################ +++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; + + +-+::mc_end +-+# Do not add code here because mc_end must appear after all other code. +-diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h +-new file mode 100644 +-index 0000000..27cbb59 +---- /dev/null +-+++ b/libavcodec/rpi_shader_cmd.h +-@@ -0,0 +1,88 @@ +-+#ifndef RPI_SHADER_CMD_H +-+#define RPI_SHADER_CMD_H +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); + + +-+#pragma pack(push, 4) +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + +-+typedef struct qpu_mc_pred_c_s { +-+ uint32_t next_fn; +-+ int16_t next_src_y; +-+ int16_t next_src_x; +-+ uint32_t next_src_base_c; +-+ union { +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t wo_u; +-+ uint32_t wo_v; +-+ uint32_t dst_addr_c; +-+ } p; +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t weight_u; +-+ uint32_t weight_v; +-+ uint32_t dummy0; +-+ } b0; +-+ struct { +-+ uint32_t dummy0; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t wo_u; +-+ uint32_t wo_v; +-+ uint32_t dst_addr_c; +-+ } b1; +-+ struct { +-+ uint32_t pic_cw; // C Width (== Y width / 2) +-+ uint32_t pic_ch; // C Height (== Y Height / 2) +-+ uint32_t stride2; +-+ uint32_t stride1; +-+ uint32_t wdenom; +-+ uint32_t dummy0; +-+ } s0; +-+ struct { +-+ uint32_t dummy0; +-+ uint32_t dummy1; +-+ uint32_t dummy2; +-+ uint32_t dummy3; +-+ uint32_t dummy4; +-+ uint32_t dummy5; +-+ } s1; +-+ }; +-+} qpu_mc_pred_c_t; +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + +-+typedef struct qpu_mc_pred_y_s { +-+ int16_t next_src1_x; +-+ int16_t next_src1_y; +-+ uint32_t next_src1_base; +-+ int16_t next_src2_x; +-+ int16_t next_src2_y; +-+ uint32_t next_src2_base; +-+ union { +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t mymx21; +-+ uint32_t wo1; +-+ uint32_t wo2; +-+ uint32_t dst_addr; +-+ } p; +-+ struct { +-+ uint16_t pic_h; +-+ uint16_t pic_w; +-+ uint32_t stride2; +-+ uint32_t stride1; +-+ uint32_t wdenom; +-+ uint32_t dummy0; +-+ } s; +-+ }; +-+ uint32_t next_fn; +-+} qpu_mc_pred_y_t; +++ st->last_l1 = &c->next_src; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_bxx) { +++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; +++ const int mx1 = fctom(c->coeffs_x1); +++ const int my1 = fctom(c->coeffs_y1); +++ const int mx2 = fctom(c->coeffs_x2); +++ const int my2 = fctom(c->coeffs_y2); +++ +++ uint8_t patch_u1[PATCH_STRIDE * 72]; +++ uint8_t patch_v1[PATCH_STRIDE * 72]; +++ uint8_t patch_u2[PATCH_STRIDE * 72]; +++ uint8_t patch_v2[PATCH_STRIDE * 72]; +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; +++ 
uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; +++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); +++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); +++ +++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( +++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, mx1, my1, c->w); +++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( +++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, mx1, my1, c->w); +++ +++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( +++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, +++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), +++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); +++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( +++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, +++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), +++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); +++ +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); +++ +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == q->code_sync) { +++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); +++ break; +++ } +++ else if (link == q->code_exit) { +++ // We expect exit to occur without other sync +++ av_assert0(i == exit_n); +++ ++exit_n; +++ break; +++ } +++ else { +++ av_assert0(0); +++ } +++ } + + +-+#pragma pack(pop) +++ st->qpu_mc_curr = cmd; +++ } +++ } while (exit_n == 0); +++ } +++} + + +-+#endif +++#undef FUNC +++#undef pixel + + + diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c + new file mode 100644 +-index 0000000..b061fe0 ++index 0000000000..b502de0a2c + --- /dev/null + +++ b/libavcodec/rpi_zc.c +-@@ -0,0 +1,581 @@ ++@@ -0,0 +1,745 @@ + +#include "config.h" + +#ifdef RPI +++#include "libavcodec/avcodec.h" + +#include "rpi_qpu.h" + +#include "rpi_mailbox.h" + +#include "rpi_zc.h" + +#include "libavutil/avassert.h" +++#include "libavutil/rpi_sand_fns.h" + +#include + + + +#include "libavutil/buffer_internal.h" +@@ -17841,21 +27634,11 @@ index 0000000..b061fe0 + + struct ZcPool * pool; + +} ZcPoolEnt; + + +-+#if 1 +-+//#define ALLOC_PAD 0x1000 +-+#define ALLOC_PAD 0 +-+#define ALLOC_ROUND 0x1000 +-+//#define ALLOC_N_OFFSET 0x100 +-+#define ALLOC_N_OFFSET 0 +-+#define STRIDE_ROUND 0x80 +-+#define STRIDE_OR 0x80 +-+#else + +#define ALLOC_PAD 0 + +#define ALLOC_ROUND 0x1000 + +#define ALLOC_N_OFFSET 0 +-+#define STRIDE_ROUND 32 +++#define STRIDE_ROUND 64 + +#define STRIDE_OR 0 +-+#endif + + + +#define DEBUG_ZAP0_BUFFERS 0 + + +@@ -18032,13 +27815,22 @@ index 0000000..b061fe0 + + { + + case AV_PIX_FMT_YUV420P: + + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; +-+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + + geo.stride_c = geo.stride_y / 2; +-+ // geo.height_y = (video_height + 15) & ~15; + + geo.height_y = (video_height + 32 + 31) & ~31; + + geo.height_c = geo.height_y / 2; + + geo.planes_c = 2; + + geo.stripes = 1; +++ geo.bytes_per_pel = 1; +++ break; +++ +++ case AV_PIX_FMT_YUV420P10: +++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; +++ geo.stride_c = geo.stride_y / 2; +++ geo.height_y = (video_height + 32 + 31) & ~31; +++ geo.height_c = 
geo.height_y / 2; +++ geo.planes_c = 2; +++ geo.stripes = 1; +++ geo.bytes_per_pel = 2; + + break; + + + + case AV_PIX_FMT_SAND128: +@@ -18073,6 +27865,7 @@ index 0000000..b061fe0 + + geo.height_c = img.pitch / stripe_w - geo.height_y; + + geo.planes_c = 1; + + geo.stripes = (video_width + stripe_w - 1) / stripe_w; +++ geo.bytes_per_pel = 1; + + + + pthread_mutex_unlock(&sand_lock); + + +@@ -18081,6 +27874,45 @@ index 0000000..b061fe0 + + break; + + } + + +++ case AV_PIX_FMT_SAND64_16: +++ case AV_PIX_FMT_SAND64_10: +++ { +++ const unsigned int stripe_w = 128; // bytes +++ +++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; +++ static VC_IMAGE_T img = {0}; +++ +++ // Given the overhead of calling the mailbox keep a stashed +++ // copy as we will almost certainly just want the same numbers again +++ // but that means we need a lock +++ pthread_mutex_lock(&sand_lock); +++ +++ if (img.width != video_width || img.height != video_height) +++ { +++ VC_IMAGE_T new_img = { +++ .type = VC_IMAGE_YUV_UV_16, +++ .width = video_width, +++ .height = video_height +++ }; +++ +++ gpu_ref(); +++ mbox_get_image_params(gpu_get_mailbox(), &new_img); +++ gpu_unref(); +++ img = new_img; +++ } +++ +++ geo.stride_y = stripe_w; +++ geo.stride_c = stripe_w; +++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; +++ geo.height_c = img.pitch / stripe_w - geo.height_y; +++ geo.planes_c = 1; +++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; +++ geo.bytes_per_pel = 2; +++ +++ pthread_mutex_unlock(&sand_lock); +++ break; +++ } +++ + + default: + + memset(&geo, 0, sizeof(geo)); + + break; +@@ -18153,8 +27985,12 @@ index 0000000..b061fe0 + + frame->linesize[0] = geo.stride_y; + + frame->linesize[1] = geo.stride_c; + + frame->linesize[2] = geo.stride_c; +++ // abuse: linesize[3] = "stripe stride" +++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). 
+++ // In a general case this makes the calculation an xor and multiply rather +++ // than a divide and multiply + + if (geo.stripes > 1) +-+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride +++ frame->linesize[3] = geo.height_y + geo.height_c; + + + + frame->data[0] = buf->data; + + frame->data[1] = frame->data[0] + size_y; +@@ -18164,6 +28000,11 @@ index 0000000..b061fe0 + + frame->extended_data = frame->data; + + // Leave extended buf alone + + +++#if RPI_ZC_SAND_8_IN_10_BUF != 0 +++ // *** If we intend to use this for real we will want a 2nd buffer pool +++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge +++#endif +++ + + return 0; + +} + + +@@ -18182,7 +28023,7 @@ index 0000000..b061fe0 + + rv = avcodec_default_get_buffer2(s, frame, flags); + + } + + else if (frame->format == AV_PIX_FMT_YUV420P || +-+ frame->format == AV_PIX_FMT_SAND128) +++ av_rpi_is_sand_frame(frame)) + + { + + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + + } +@@ -18212,6 +28053,7 @@ index 0000000..b061fe0 + + unsigned int i; + + uint8_t * psrc, * pdest; + + +++ dest->format = src->format; + + dest->width = src->width; + + dest->height = src->height; + + +@@ -18243,29 +28085,142 @@ index 0000000..b061fe0 + +} + + + + +++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, +++ const AVFrame * const src) +++{ +++ AVFrame dest_frame; +++ AVFrame * const dest = &dest_frame; +++ unsigned int i; +++ uint8_t * psrc, * psrc2, * pdest; +++ +++ memset(dest, 0, sizeof(*dest)); +++ dest->format = AV_PIX_FMT_SAND128; +++ dest->width = src->width; +++ dest->height = src->height; +++ +++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) +++ { +++ return NULL; +++ } +++ +++ // Y +++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; +++ i != dest->height; +++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) +++ { +++ uint16_t * s = (uint16_t*)psrc; +++ uint8_t * d = pdest; +++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) +++ { +++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); +++ for (unsigned int j = 0; j != n; ++j) +++ *d++ = (uint8_t)(*s++ >> 2); +++ d += (dest->linesize[3] - 1) * dest->linesize[0]; +++ } +++ } +++ +++ // C +++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; +++ i != dest->height / 2; +++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) +++ { +++ const uint16_t * su = (uint16_t*)psrc; +++ const uint16_t * sv = (uint16_t*)psrc2; +++ uint8_t * d = pdest; +++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) +++ { +++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; +++ for (unsigned int j = 0; j != n; ++j) +++ { +++ *d++ = (uint8_t)(*su++ >> 2); +++ *d++ = (uint8_t)(*sv++ >> 2); +++ } +++ d += (dest->linesize[3] - 1) * dest->linesize[1]; +++ } +++ } +++ +++ return dest->buf[0]; +++} +++ +++ +++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, +++ const AVFrame * const src, const unsigned int src_bits) +++{ +++ AVFrame dest_frame = { +++ .format = AV_PIX_FMT_SAND128, +++ .width = src->width, +++ .height = src->height +++ }; +++ AVFrame * const dest = &dest_frame; +++ const unsigned int shr = src_bits - 8; +++ +++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) +++ { +++ return NULL; +++ } +++ +++ // Y +++ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], 
av_rpi_sand_frame_stride2(dest), +++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest), +++ src->width, src->height, shr); +++ // C +++ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest), +++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest), +++ src->width, src->height / 2, shr); +++ +++ return dest->buf[0]; +++} +++ +++ +++ + +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, +-+ const AVFrame * const frame, const int maycopy) +++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) + +{ + + assert(s != NULL); + + + + if (frame->format != AV_PIX_FMT_YUV420P && +-+ frame->format != AV_PIX_FMT_SAND128) +++ frame->format != AV_PIX_FMT_YUV420P10 && +++ !av_rpi_is_sand_frame(frame)) + + { + + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); + + return NULL; + + } + + +-+ if (frame->buf[1] != NULL) +++ if (frame->buf[1] != NULL || frame->format != expected_format) + + { +-+ av_assert0(frame->format == AV_PIX_FMT_YUV420P); +++#if RPI_ZC_SAND_8_IN_10_BUF +++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) +++ { +++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); +++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); +++ } +++#endif +++ + + if (maycopy) + + { +-+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); +-+ return zc_copy(s, frame); +++ if (frame->buf[1] != NULL) +++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); +++ else +++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); +++ +++ switch (frame->format) +++ { +++ case AV_PIX_FMT_YUV420P10: +++ return zc_420p10_to_sand128(s, frame); +++ +++ case AV_PIX_FMT_SAND64_10: +++ return zc_sand64_16_to_sand128(s, frame, 10); +++ +++ default: +++ return zc_copy(s, frame); +++ } + + } + + else + + { +-+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__); +++ if (frame->buf[1] != NULL) +++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); +++ else +++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); + + return NULL; + + } + + } +@@ -18392,10 +28347,10 @@ index 0000000..b061fe0 + + + diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h + new file mode 100644 +-index 0000000..f4aeb78 ++index 0000000000..26fb3be999 + --- /dev/null + +++ b/libavcodec/rpi_zc.h +-@@ -0,0 +1,137 @@ ++@@ -0,0 +1,105 @@ + +#ifndef LIBAVCODEC_RPI_ZC_H + +#define LIBAVCODEC_RPI_ZC_H + + +@@ -18406,23 +28361,33 @@ index 0000000..f4aeb78 + +// bit of memory for the frame when can then be reference counted until + +// display has finished with it. 
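For orientation when reading the geometry fields below: a sand frame stores the image as fixed-width vertical stripes, and a sample is located with a mask, an xor and a multiply rather than a divide. This is exactly what the rpi_sliced_frame_off_y() inline removed further down in this header did (its replacement now lives in libavutil/rpi_sand_fns.h); a self-contained sketch, where stride1 is the stripe width in bytes (a power of two, frame->linesize[0]) and stride2 is the stripe stride in rows (the linesize[3] "abuse" noted in rpi_zc.c):

    static inline unsigned int sand_off_y(unsigned int x, unsigned int y,
                                          unsigned int stride1, /* stripe width, bytes */
                                          unsigned int stride2) /* stripe stride, rows */
    {
        const unsigned int x1 = x & (stride1 - 1); /* offset within this stripe       */
        const unsigned int x2 = x ^ x1;            /* x rounded down to stripe start  */
        return x1 + stride1 * y + stride2 * x2;    /* xor + multiply, no divide       */
    }

    /* usage sketch:
     * sample address = frame->data[0]
     *                + sand_off_y(x, y, frame->linesize[0], frame->linesize[3]) */

Chroma works the same way with x doubled for the interleaved U/V pairs, as the removed rpi_sliced_frame_off_c() showed.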
+ + +-+#include "libavutil/frame.h" +-+#include "libavcodec/avcodec.h" +++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame +++// 0 disables +++// *** This option still in development +++// Only works if SAO active +++// Allocates buffers that are twice the required size +++#define RPI_ZC_SAND_8_IN_10_BUF 0 +++ +++struct AVBufferRef; +++struct AVFrame; +++struct AVCodecContext; +++enum AVPixelFormat; + + + +// "Opaque" pointer to whatever we are using as a buffer reference +-+typedef AVBufferRef * AVRpiZcRefPtr; +++typedef struct AVBufferRef * AVRpiZcRefPtr; + + + +struct AVZcEnv; + +typedef struct AVZcEnv * AVZcEnvPtr; + + + +typedef struct AVRpiZcFrameGeometry + +{ +-+ unsigned int stride_y; +-+ unsigned int height_y; +-+ unsigned int stride_c; +-+ unsigned int height_c; +-+ unsigned int planes_c; +-+ unsigned int stripes; +++ unsigned int stride_y; // Luma stride (bytes) +++ unsigned int height_y; // Luma height (lines) +++ unsigned int stride_c; // Chroma stride (bytes) +++ unsigned int height_c; // Chroma stride (lines) +++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) +++ unsigned int stripes; // Number of stripes (sand) +++ unsigned int bytes_per_pel; + +} AVRpiZcFrameGeometry; + + + + +@@ -18448,7 +28413,7 @@ index 0000000..f4aeb78 + +// the data, then allocate a new buffer and copy the data into it + +// Otherwise return NULL + +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, +-+ const AVFrame * const frame, const int maycopy); +++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + + + +// Get the vc_handle from the frame ref + +// Returns -1 if ref doesn't look valid +@@ -18469,72 +28434,30 @@ index 0000000..f4aeb78 + +// Allocate an environment for the buffer pool used by the ZC code + +// This should be put in avctx->get_buffer_context so it can be found by + +// av_rpi_zc_get_buffer2 when it is called from ffmpeg +-+AVZcEnvPtr av_rpi_zc_env_alloc(void); +-+ +-+// Allocate the environment used by the ZC code +-+void av_rpi_zc_env_free(AVZcEnvPtr); +-+ +-+// Test to see if the context is using zc (checks get_buffer2) +-+int av_rpi_zc_in_use(const struct AVCodecContext * const s); +-+ +-+// Init ZC into a context +-+// There is nothing magic in this fn - it just packages setting +-+// get_buffer2 & get_buffer_context +-+int av_rpi_zc_init(struct AVCodecContext * const s); +-+ +-+// Free ZC from a context +-+// There is nothing magic in this fn - it just packages unsetting +-+// get_buffer2 & get_buffer_context +-+void av_rpi_zc_uninit(struct AVCodecContext * const s); +-+ +-+ +-+ +-+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) +-+{ +-+ return frame->linesize[3]; +-+} +-+ +-+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ const unsigned int stride1 = frame->linesize[0]; +-+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); +-+ const unsigned int x1 = x & (stride1 - 1); +-+ const unsigned int x2 = x ^ x1; +-+ +-+ return x1 + stride1 * y + stride2 * x2; +-+} +-+ +-+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) +-+{ +-+ const unsigned int stride1 = frame->linesize[0]; +-+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); +-+ const unsigned int x = x_c * 2; +-+ const unsigned int x1 = x & (stride1 - 1); +-+ const unsigned int x2 = x ^ x1; +++AVZcEnvPtr 
av_rpi_zc_env_alloc(void); + + +-+ return x1 + stride1 * y_c + stride2 * x2; +-+} +++// Allocate the environment used by the ZC code +++void av_rpi_zc_env_free(AVZcEnvPtr); + + +-+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); +-+} +++// Test to see if the context is using zc (checks get_buffer2) +++int av_rpi_zc_in_use(const struct AVCodecContext * const s); + + +-+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); +-+} +++// Init ZC into a context +++// There is nothing magic in this fn - it just packages setting +++// get_buffer2 & get_buffer_context +++int av_rpi_zc_init(struct AVCodecContext * const s); +++ +++// Free ZC from a context +++// There is nothing magic in this fn - it just packages unsetting +++// get_buffer2 & get_buffer_context +++void av_rpi_zc_uninit(struct AVCodecContext * const s); + + +-+static inline int rpi_sliced_frame(const AVFrame * const frame) +-+{ +-+ return frame->format == AV_PIX_FMT_SAND128; +-+} + + + + + +#endif + + + diff --git a/libavcodec/utils.c b/libavcodec/utils.c +-index f7adb52..3b398a3 100644 ++index c4af9cbb17..c1b806e51b 100644 + --- a/libavcodec/utils.c + +++ b/libavcodec/utils.c + @@ -26,6 +26,12 @@ +@@ -18550,7 +28473,15 @@ index f7adb52..3b398a3 100644 + #include "libavutil/atomic.h" + #include "libavutil/attributes.h" + #include "libavutil/avassert.h" +-@@ -64,6 +70,10 @@ ++@@ -39,6 +45,7 @@ ++ #include "libavutil/mathematics.h" ++ #include "libavutil/mem_internal.h" ++ #include "libavutil/pixdesc.h" +++#include "libavutil/rpi_sand_fns.h" ++ #include "libavutil/imgutils.h" ++ #include "libavutil/samplefmt.h" ++ #include "libavutil/dict.h" ++@@ -64,6 +71,10 @@ + #include "libavutil/ffversion.h" + const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; + +@@ -18561,7 +28492,7 @@ index f7adb52..3b398a3 100644 + #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS + static int default_lockmgr_cb(void **arg, enum AVLockOp op) + { +-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, ++@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, + return ret; + } + +@@ -18609,7 +28540,7 @@ index f7adb52..3b398a3 100644 + static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) + { + FramePool *pool = avctx->internal->pool; +-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) ++@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) + av_buffer_pool_uninit(&pool->pools[i]); + pool->linesize[i] = linesize[i]; + if (size[i]) { +@@ -18624,20 +28555,20 @@ index f7adb52..3b398a3 100644 + pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, + CONFIG_MEMORY_POISONING ? 
+ NULL : +-@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags ++@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; + + +#ifdef RPI + + // This is going to end badly if we let it continue +-+ av_assert0(frame->format != AV_PIX_FMT_SAND128); +++ av_assert0(!av_rpi_is_sand_frame(frame)); + +#endif + + + if ((ret = update_frame_pool(avctx, frame)) < 0) + return ret; + + diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +-index 21f8d9e..71ce7b9 100644 ++index 21f8d9e00d..71ce7b9186 100644 + --- a/libavfilter/avfilter.c + +++ b/libavfilter/avfilter.c + @@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) +@@ -18649,7 +28580,7 @@ index 21f8d9e..71ce7b9 100644 + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || + diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c +-index b31d233..2767306 100644 ++index 6767b65ec8..f270190d57 100644 + --- a/libavformat/mpegts.c + +++ b/libavformat/mpegts.c + @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { +@@ -18662,10 +28593,10 @@ index b31d233..2767306 100644 + { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, + { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, + diff --git a/libavformat/utils.c b/libavformat/utils.c +-index 6f343f2..83f26d5 100644 ++index 5a35953d24..d36fdc3199 100644 + --- a/libavformat/utils.c + +++ b/libavformat/utils.c +-@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in ++@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in + int default_stream_index = av_find_default_stream_index(s); + if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { + for (i = 0; i < s->nb_streams; i++) { +@@ -18674,8 +28605,84 @@ index 6f343f2..83f26d5 100644 + continue; + s->streams[i]->pts_wrap_reference = pts_wrap_reference; + s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; ++diff --git a/libavutil/Makefile b/libavutil/Makefile ++index 1e061763a2..cbc9bc145b 100644 ++--- a/libavutil/Makefile +++++ b/libavutil/Makefile ++@@ -59,6 +59,8 @@ HEADERS = adler32.h \ ++ rational.h \ ++ replaygain.h \ ++ ripemd.h \ +++ rpi_sand_fns.h \ +++ rpi_sand_fn_pw.h \ ++ samplefmt.h \ ++ sha.h \ ++ sha512.h \ ++@@ -136,6 +138,7 @@ OBJS = adler32.o \ ++ reverse.o \ ++ rc4.o \ ++ ripemd.o \ +++ rpi_sand_fns.o \ ++ samplefmt.o \ ++ sha.o \ ++ sha512.o \ ++diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile ++index 5da44b0542..b74b7c4e2f 100644 ++--- a/libavutil/arm/Makefile +++++ b/libavutil/arm/Makefile ++@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ ++ ++ NEON-OBJS += arm/float_dsp_init_neon.o \ ++ arm/float_dsp_neon.o \ +++ arm/rpi_sand_neon.o \ ++diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S ++new file mode 100644 ++index 0000000000..dbffdaefa4 ++--- /dev/null +++++ b/libavutil/arm/rpi_sand_neon.S ++@@ -0,0 +1,40 @@ +++#include "libavutil/arm/asm.S" +++ +++@ void rpi_sand128b_stripe_to_8_10( +++@ uint8_t * dest, [r0] +++@ const uint8_t * src1, [r1] +++@ const uint8_t * src2, [r2] +++@ unsigned int lines); [r3] +++ +++.macro stripe2_to_8, bit_depth +++ vpush {q4-q7} +++1: +++ vldm r1!, {q0-q7} +++ subs r3, #1 +++ vldm r2!, {q8-q15} +++ vqrshrn.u16 d0, q0, #\bit_depth - 8 +++ vqrshrn.u16 d1, q1, #\bit_depth - 8 +++ vqrshrn.u16 d2, q2, #\bit_depth - 8 +++ vqrshrn.u16 d3, q3, 
#\bit_depth - 8 +++ vqrshrn.u16 d4, q4, #\bit_depth - 8 +++ vqrshrn.u16 d5, q5, #\bit_depth - 8 +++ vqrshrn.u16 d6, q6, #\bit_depth - 8 +++ vqrshrn.u16 d7, q7, #\bit_depth - 8 +++ vqrshrn.u16 d8, q8, #\bit_depth - 8 +++ vqrshrn.u16 d9, q9, #\bit_depth - 8 +++ vqrshrn.u16 d10, q10, #\bit_depth - 8 +++ vqrshrn.u16 d11, q11, #\bit_depth - 8 +++ vqrshrn.u16 d12, q12, #\bit_depth - 8 +++ vqrshrn.u16 d13, q13, #\bit_depth - 8 +++ vqrshrn.u16 d14, q14, #\bit_depth - 8 +++ vqrshrn.u16 d15, q15, #\bit_depth - 8 +++ vstm r0!, {q0-q7} +++ bne 1b +++ vpop {q4-q7} +++ bx lr +++.endm +++ +++function rpi_sand128b_stripe_to_8_10, export=1 +++ stripe2_to_8 10 +++endfunc +++ + diff --git a/libavutil/buffer.c b/libavutil/buffer.c +-index 694e116..203ca7b 100644 ++index 694e116a3c..203ca7b3a8 100644 + --- a/libavutil/buffer.c + +++ b/libavutil/buffer.c + @@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) +@@ -18689,7 +28696,7 @@ index 694e116..203ca7b 100644 + + return buf->opaque; + +} + diff --git a/libavutil/buffer.h b/libavutil/buffer.h +-index 0c0ce12..82e0bc3 100644 ++index 0c0ce12cf2..82e0bc3058 100644 + --- a/libavutil/buffer.h + +++ b/libavutil/buffer.h + @@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); +@@ -18702,11 +28709,51 @@ index 0c0ce12..82e0bc3 100644 + /** + * @} + */ ++diff --git a/libavutil/frame.h b/libavutil/frame.h ++index 2b5c3320c3..990347e484 100644 ++--- a/libavutil/frame.h +++++ b/libavutil/frame.h ++@@ -120,7 +120,20 @@ enum AVFrameSideDataType { ++ * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer. ++ * This is set on the first frame of a GOP that has a temporal reference of 0. ++ */ ++- AV_FRAME_DATA_GOP_TIMECODE +++ AV_FRAME_DATA_GOP_TIMECODE, +++ +++ /** +++ * The data represents the AVSphericalMapping structure defined in +++ * libavutil/spherical.h. +++ */ +++ AV_FRAME_DATA_SPHERICAL, +++ +++ /** +++ * Extra data required to deal with a cropped Sand frame +++ * AVFrame holds the cropped size, but we cannot simply offset the start +++ * address to get the picture as we can for planar formats +++ */ +++ AV_FRAME_DATA_SAND_INFO, ++ }; ++ ++ enum AVActiveFormatDescription { ++@@ -133,6 +146,13 @@ enum AVActiveFormatDescription { ++ AV_AFD_SP_4_3 = 15, ++ }; ++ +++typedef struct AVFrameDataSandInfo +++{ +++ unsigned int left_offset; +++ unsigned int top_offset; +++ unsigned int pic_width; +++ unsigned int pic_height; +++} AVFrameDataSandInfo; ++ ++ /** ++ * Structure to hold side data for an AVFrame. 
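The AV_FRAME_DATA_SAND_INFO side data added to frame.h above exists because a cropped sand frame cannot be described by offsetting the AVFrame data pointers the way planar formats allow; the crop offsets have to travel with the frame. A minimal sketch of how a consumer might read it, assuming the patched libavutil headers are in use; the helper name and the fall-back to the frame's own dimensions are illustrative assumptions, not part of the patch:

#include "libavutil/frame.h"

// Recover the visible rectangle of a decoded frame. For sand frames the
// crop cannot be applied by adjusting data[] pointers, so it is carried
// in AV_FRAME_DATA_SAND_INFO side data instead.
static void frame_visible_rect(const AVFrame *frame,
                               unsigned int *x, unsigned int *y,
                               unsigned int *w, unsigned int *h)
{
    const AVFrameSideData *sd =
        av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO);
    if (sd != NULL && sd->size >= (int)sizeof(AVFrameDataSandInfo)) {
        const AVFrameDataSandInfo *si = (const AVFrameDataSandInfo *)sd->data;
        *x = si->left_offset;
        *y = si->top_offset;
        *w = si->pic_width;
        *h = si->pic_height;
    } else {
        // Planar formats (and uncropped sand frames) carry no side data;
        // the AVFrame dimensions are already the visible ones.
        *x = 0;
        *y = 0;
        *w = frame->width;
        *h = frame->height;
    }
}

The same lookup is harmless for planar frames, which simply never carry this side data.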
+ diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +-index 0dffa4d..5644176 100644 ++index 0dffa4dbdb..17134b4f38 100644 + --- a/libavutil/pixdesc.c + +++ b/libavutil/pixdesc.c +-@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { ++@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, +@@ -18721,35 +28768,486 @@ index 0dffa4d..5644176 100644 + + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + + }, + + .flags = 0, +-+ } +++ }, +++ [AV_PIX_FMT_SAND64_10] = { +++ .name = "sand64_10", +++ .nb_components = 3, +++ .log2_chroma_w = 1, +++ .log2_chroma_h = 1, +++ .comp = { +++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ +++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ +++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ +++ }, +++ .flags = 0, +++ }, + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS + diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +-index 0ed01c4..4705e80 100644 ++index 0ed01c4844..2155b78704 100644 + --- a/libavutil/pixfmt.h + +++ b/libavutil/pixfmt.h +-@@ -303,7 +303,10 @@ enum AVPixelFormat { ++@@ -303,7 +303,22 @@ enum AVPixelFormat { + AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian + AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian + + - AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions +++ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec +++ +++ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian +++ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian +++ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian +++ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian +++ +++ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian +++ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian +++ + +// RPI - not on ifdef so can be got at by calling progs +-+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + + + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + + #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A ++diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h ++new file mode 100644 ++index 0000000000..52d52a2a83 ++--- /dev/null +++++ b/libavutil/rpi_sand_fn_pw.h ++@@ -0,0 +1,182 @@ +++// * Included twice from rpi_sand_fn with different PW +++ +++#define STRCAT(x,y) x##y +++ +++#if PW == 1 +++#define pixel uint8_t +++#define FUNC(f) STRCAT(f, 8) +++#elif PW == 2 +++#define pixel uint16_t +++#define FUNC(f) STRCAT(f, 16) +++#else +++#error Unexpected PW +++#endif +++ +++// Fetches a single patch - offscreen fixup not done here +++// w <= stride1 +++// unclipped +++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, +++ const uint8_t * src, +++ unsigned int stride1, unsigned int stride2, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned 
int x = _x; +++ const unsigned int w = _w; +++ const unsigned int mask = stride1 - 1; +++ +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All in one sand stripe +++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { +++ memcpy(dst, p, w); +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const uint8_t * p = p2; +++ uint8_t * d = dst; +++ memcpy(d, p1, w1); +++ d += w1; +++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { +++ memcpy(d, p, stride1); +++ } +++ memcpy(d, p, w3); +++ } +++ } +++} +++ +++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) +++ +++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, +++ uint8_t * dst_v, const unsigned int dst_stride_v, +++ const uint8_t * src, +++ unsigned int stride1, unsigned int stride2, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned int x = _x * 2; +++ const unsigned int w = _w * 2; +++ const unsigned int mask = stride1 - 1; +++ +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All in one sand stripe +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { +++ pixel * du = (pixel *)dst_u; +++ pixel * dv = (pixel *)dst_v; +++ const pixel * p = (const pixel *)p1; +++ for (unsigned int k = 0; k < w; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const unsigned int sstride_p = (sstride - stride1) / PW; +++ +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const pixel * p = (const pixel *)p1; +++ pixel * du = (pixel *)dst_u; +++ pixel * dv = (pixel *)dst_v; +++ for (unsigned int k = 0; k < w1; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { +++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ for (unsigned int k = 0; k < w3; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ } +++} +++ +++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, +++ unsigned int stride1, unsigned int stride2, +++ const uint8_t * src_u, const unsigned int src_stride_u, +++ const uint8_t * src_v, const unsigned int src_stride_v, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned int x = _x * 2; +++ const unsigned int w = _w * 2; +++ const unsigned int mask = stride1 - 1; +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All 
in one sand stripe +++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { +++ const pixel * su = (const pixel *)src_u; +++ const pixel * sv = (const pixel *)src_v; +++ pixel * p = (pixel *)p1; +++ for (unsigned int k = 0; k < w; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const unsigned int sstride_p = (sstride - stride1) / PW; +++ +++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const pixel * su = (const pixel *)src_u; +++ const pixel * sv = (const pixel *)src_v; +++ pixel * p = (pixel *)p1; +++ for (unsigned int k = 0; k < w1; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { +++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ for (unsigned int k = 0; k < w3; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ } +++} +++ +++ +++#undef pixel +++#undef STRCAT +++#undef FUNC +++ ++diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c ++new file mode 100644 ++index 0000000000..ec4cfadf8a ++--- /dev/null +++++ b/libavutil/rpi_sand_fns.c ++@@ -0,0 +1,99 @@ +++#include "config.h" +++#ifdef RPI +++#include +++#include +++#include "rpi_sand_fns.h" +++#include "avassert.h" +++ +++#define PW 1 +++#include "rpi_sand_fn_pw.h" +++#undef PW +++ +++#define PW 2 +++#include "rpi_sand_fn_pw.h" +++#undef PW +++ +++#if HAVE_NEON +++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); +++#endif +++ +++#if 1 +++// Simple round +++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +++{ +++ const unsigned int rnd = (1 << shr) >> 1; +++ const uint16_t * src = (const uint16_t *)_src; +++ +++ for (; n != 0; --n) { +++ *dst++ = (*src++ + rnd) >> shr; +++ } +++} +++#else +++// Dithered variation +++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +++{ +++ unsigned int rnd = (1 << shr) >> 1; +++ const unsigned int mask = ((1 << shr) - 1); +++ const uint16_t * src = (const uint16_t *)_src; +++ +++ for (; n != 0; --n) { +++ rnd = *src++ + (rnd & mask); +++ *dst++ = rnd >> shr; +++ } +++} +++#endif +++ +++// w/h in pixels +++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, +++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, +++ unsigned int w, unsigned int h, const unsigned int shr) +++{ +++ const unsigned int n = dst_stride1 / 2; +++ unsigned int j; +++ +++ // This is true for our current layouts +++ av_assert0(dst_stride1 == src_stride1); +++ +++ // As we have the same stride1 for src & dest and src is wider than dest +++ // then if we loop on src we can always write contiguously to dest +++ // We make no effort to copy an exact width - round up to nearest src stripe +++ // as we will always have storage 
in dest for that
+++
+++#if HAVE_NEON
+++ if (shr == 3 && src_stride1 == 128) {
+++ for (j = 0; j + n < w; j += dst_stride1) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+++
+++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+++ }
+++ }
+++ else
+++#endif
+++ {
+++ for (j = 0; j + n < w; j += dst_stride1) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+++
+++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+++ cpy16_to_8(d, s1, n, shr);
+++ cpy16_to_8(d + n, s2, n, shr);
+++ }
+++ }
+++ }
+++
+++ // Fix up a trailing dest half stripe
+++ if (j < w) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++
+++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+++ cpy16_to_8(d, s1, n, shr);
+++ }
+++ }
+++}
+++
+++#endif // RPI
+++
++diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
++new file mode 100644
++index 0000000000..aa880d0f63
++--- /dev/null
+++++ b/libavutil/rpi_sand_fns.h
++@@ -0,0 +1,129 @@
+++#ifndef AVUTIL_RPI_SAND_FNS
+++#define AVUTIL_RPI_SAND_FNS
+++#ifdef RPI
+++
+++#include "libavutil/frame.h"
+++
+++// For all these fns _x & _w are measured as coord * PW
+++// For the C fns coords are in chroma pels (so luma / 2)
+++// Strides are in bytes
+++
+++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+++ uint8_t * dst_v, const unsigned int dst_stride_v,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+++ uint8_t * dst_v, const unsigned int dst_stride_v,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+++ unsigned int stride1, unsigned int stride2,
+++ const uint8_t * src_u, const unsigned int src_stride_u,
+++ const uint8_t * src_v, const unsigned int src_stride_v,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+++ unsigned int stride1, unsigned int stride2,
+++ const uint8_t * src_u, const unsigned int src_stride_u,
+++ const uint8_t * src_v, const unsigned int src_stride_v,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++// w/h in pixels
+++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+++ unsigned int w, unsigned int h, const unsigned int shr);
+++
+++
+++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+++{
+++ // * We could replace this with a fixed 128 which would allow the compiler
+++ // to optimize a whole lot better
+++ return frame->linesize[0];
+++}
+++
+++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+++{
+++ return frame->linesize[3];
+++}
+++
+++
+++static inline int av_rpi_is_sand_format(const int format)
+++{
+++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
+++}
+++
+++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+++{
+++ return av_rpi_is_sand_format(frame->format);
+++}
+++
+++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+++{
+++ return (frame->format == AV_PIX_FMT_SAND128);
+++}
+++
+++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+++{
+++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+++}
+++
+++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+++{
+++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+++}
+++
+++// If x is measured in bytes (not pixels) then this works for sand64_16 as
+++// well as sand128 - but in the general case we work that out
+++
+++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+++{
+++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+++ const unsigned int x1 = x & (stride1 - 1);
+++ const unsigned int x2 = x ^ x1;
+++
+++ return x1 + stride1 * y + stride2 * x2;
+++}
+++
+++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+++{
+++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+++ const unsigned int x1 = x & (stride1 - 1);
+++ const unsigned int x2 = x ^ x1;
+++
+++ return x1 + stride1 * y_c + stride2 * x2;
+++}
+++
+++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+++}
+++
+++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+++}
+++
+++#endif
+++#endif
+++
+ diff --git a/libswscale/input.c b/libswscale/input.c
+-index 14ab5ab..e61b67a 100644
++index 14ab5abb3a..7a827c71e3 100644
+ --- a/libswscale/input.c
+ +++ b/libswscale/input.c
+-@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
++@@ -719,6 +719,13 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
+ }
+ }
+
+-+
+ +static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ + const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ + int width, uint32_t *unused)
+@@ -18760,112 +29258,418 @@ index 14ab5ab..e61b67a 100644
+ #define input_pixel(pos) (isBE(origin) ? 
AV_RB16(pos) : AV_RL16(pos)) + + static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, +-@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) ++@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) + case AV_PIX_FMT_P010BE: + c->chrToYV12 = p010BEToUV_c; + break; + + case AV_PIX_FMT_SAND128: +-+ c->chrToYV12 = sand128ToUV_c; +++ case AV_PIX_FMT_SAND64_10: +++ c->chrToYV12 = sand128ToUV_c; // NIF + + break; + } + if (c->chrSrcHSubSample) { + switch (srcFormat) { + diff --git a/libswscale/utils.c b/libswscale/utils.c +-index 576d8f0..d7206cc 100644 ++index 576d8f0d5a..fd88a5e51e 100644 + --- a/libswscale/utils.c + +++ b/libswscale/utils.c +-@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { ++@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { + [AV_PIX_FMT_AYUV64LE] = { 1, 1}, + [AV_PIX_FMT_P010LE] = { 1, 0 }, + [AV_PIX_FMT_P010BE] = { 1, 0 }, + +#ifdef RPI + + [AV_PIX_FMT_SAND128] = { 1, 0 }, +++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, + +#endif + }; + + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) +-diff --git a/pi-util/conf.sh b/pi-util/conf.sh +-new file mode 100755 +-index 0000000..8b596a2 ++diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt ++new file mode 100644 ++index 0000000000..b1e99a6a89 + --- /dev/null +-+++ b/pi-util/conf.sh +-@@ -0,0 +1,33 @@ +-+echo "Configure for Pi2/3" +++++ b/pi-util/BUILD.txt ++@@ -0,0 +1,25 @@ +++Building Pi FFmpeg +++================== + + +-+RPI_BUILDROOT=`pwd`/build +-+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot +-+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +-+RPI_OPT_VC=$RPI_ROOTFS/opt/vc +-+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +-+#RPI_DEFS="-D__VCCOREVER__=0x04000000" +-+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" +-+#RPI_KEEPS="-save-temps=obj" +-+RPI_KEEPS="" +++Configuration: +++============= + + +-+./configure --enable-cross-compile\ +-+ --arch=armv6t2\ +-+ --cpu=cortex-a7\ +-+ --target-os=linux\ +-+ --disable-stripping\ +-+ --disable-thumb\ +-+ --enable-mmal\ +-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ +-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++pi-util/conf_pi2.sh + + +-+# --enable-extra-warnings\ +-+# --arch=armv71\ +-+# --enable-shared\ +++contains suitable options to build the code for Pi2/3. 
It expects to find
+++git clones of
+ +
+-+# gcc option for getting asm listing
+-+# -Wa,-ahls
+-diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh
+-new file mode 100644
+-index 0000000..160e149
+---- /dev/null
+-+++ b/pi-util/conf1.sh
+-@@ -0,0 +1,34 @@
+-+echo "Configure for Pi1"
+++https://github.com/raspberrypi/tools
+++https://github.com/raspberrypi/firmware
+ +
+-+RPI_BUILDROOT=`pwd`/build
+-+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot
+-+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
+-+RPI_OPT_VC=$RPI_ROOTFS/opt/vc
+-+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
+-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1"
+-+#RPI_DEFS="-D__VCCOREVER__=0x04000000"
+-+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib"
+-+#RPI_KEEPS="-save-temps=obj"
+-+RPI_KEEPS=""
+++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
+++lot of history you don't want.
+ +
+-+./configure --enable-cross-compile\
+-+ --cpu=arm1176jzf-s\
+-+ --arch=armv\
+-+ --disable-neon\
+-+ --target-os=linux\
+-+ --disable-stripping\
+-+ --enable-mmal\
+-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\
+-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\
+-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\
+-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
+-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
+++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
+++rebuilt. Otherwise the prebuilt .c & .h files will be used.
+++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild.
+ +
+++pi-util/conf_pi1.sh should configure for Pi1. Beware that as of this time
+++H265 QPU acceleration is broken on Pi1 and so it is disabled. 
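One more note before the conformance lists: the stripe arithmetic implemented by the new libavutil/rpi_sand_fns.h helpers earlier in this patch is compact enough to misread. For a luma byte coordinate x, x1 = x & (stride1 - 1) is the offset inside a stripe, x2 = x ^ x1 is the byte column where that stripe starts, and consecutive stripes are stride1 * stride2 bytes apart, so the total offset is x1 + stride1 * y + stride2 * x2. A standalone restatement with illustrative geometry (the stride values below are assumptions for the example, not anything mandated by the patch):

#include <assert.h>

// Same arithmetic as av_rpi_sand_frame_off_y, with the geometry passed in.
// stride1: stripe width in bytes (128 for SAND128); stride2: lines per stripe.
static unsigned int sand_off_y(unsigned int stride1, unsigned int stride2,
                               unsigned int x, unsigned int y)
{
    const unsigned int x1 = x & (stride1 - 1); // offset within the stripe
    const unsigned int x2 = x ^ x1;            // x rounded down to the stripe start
    return x1 + stride1 * y + stride2 * x2;    // stride2 * x2 steps over whole stripes
}

int main(void)
{
    // Assumed geometry: 128-byte stripes, 256 lines per stripe column.
    // Byte (x=300, y=2) sits 44 bytes into the third stripe:
    // 44 + 128 * 2 + 256 * 256 = 65836.
    assert(sand_off_y(128, 256, 300, 2) == 65836);
    return 0;
}

av_rpi_sand_frame_off_c follows the same pattern with x shifted one place further left, because the U and V samples are interleaved in the chroma stripes.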
+ + +-+# --enable-extra-warnings\ +-+# --arch=armv71\ +-+# --enable-shared\ + + +-+# gcc option for getting asm listing +-+# -Wa,-ahls ++diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv ++new file mode 100644 ++index 0000000000..f05b7753f7 ++--- /dev/null +++++ b/pi-util/conf_h265.2016.csv ++@@ -0,0 +1,193 @@ +++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 +++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 +++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 +++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 +++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 +++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 
+++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 +++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 +++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 +++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 +++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 +++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 +++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 
+++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 +++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 +++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 +++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 +++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 +++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 +++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 +++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth +++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
+++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 +++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt +++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt +++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt +++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt +++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt +++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt +++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 +++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
+++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 +++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 +++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 +++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 +++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 +++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 +++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 +++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 +++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 +++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 +++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 +++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt +++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt +++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 +++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 +++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 +++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 +++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 +++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 +++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 +++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 +++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 +++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 ++diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv ++new file mode 100644 ++index 0000000000..6082641271 ++--- /dev/null +++++ b/pi-util/conf_h265.2016_HEVC_v1.csv ++@@ -0,0 +1,147 @@ +++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 +++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 
+++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 +++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 +++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 +++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 +++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 +++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 +++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 +++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 +++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 
+++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 +++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 +++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 +++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 +++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 +++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 +++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 +++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 
+++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 +++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 +++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth +++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? +++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 +++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 + diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv + new file mode 100644 +-index 0000000..fc14f2a ++index 0000000000..fc14f2a3c2 + --- /dev/null + +++ b/pi-util/conf_h265.csv + @@ -0,0 +1,144 @@ +@@ -19013,14 +29817,88 @@ index 0000000..fc14f2a + +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 + +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 + +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh ++new file mode 100755 ++index 0000000000..ec25b81c31 ++--- /dev/null +++++ b/pi-util/conf_pi1.sh ++@@ -0,0 +1,31 @@ +++echo "Configure for Pi1" +++ +++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +++RPI_OPT_VC=`pwd`/../firmware/opt/vc +++ +++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" +++#RPI_KEEPS="-save-temps=obj" +++RPI_KEEPS="" +++ +++./configure --enable-cross-compile\ +++ --cpu=arm1176jzf-s\ +++ --arch=arm\ +++ --disable-neon\ +++ --target-os=linux\ +++ --disable-stripping\ +++ --enable-mmal\ +++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +++ 
--extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ +++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++ +++ +++# --enable-extra-warnings\ +++# --arch=armv71\ +++# --enable-shared\ +++ +++# gcc option for getting asm listing +++# -Wa,-ahls ++diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh ++new file mode 100755 ++index 0000000000..f8e5e75375 ++--- /dev/null +++++ b/pi-util/conf_pi2.sh ++@@ -0,0 +1,30 @@ +++echo "Configure for Pi2/3" +++ +++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +++RPI_OPT_VC=`pwd`/../firmware/opt/vc +++ +++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" +++#RPI_KEEPS="-save-temps=obj" +++RPI_KEEPS="" +++ +++./configure --enable-cross-compile\ +++ --arch=armv6t2\ +++ --cpu=cortex-a7\ +++ --target-os=linux\ +++ --disable-stripping\ +++ --disable-thumb\ +++ --enable-mmal\ +++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ +++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++ +++# --enable-extra-warnings\ +++# --arch=armv71\ +++# --enable-shared\ +++ +++# gcc option for getting asm listing +++# -Wa,-ahls + diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +-new file mode 100644 +-index 0000000..c896bc6 ++new file mode 100755 ++index 0000000000..70f7be22bb + --- /dev/null + +++ b/pi-util/ffconf.py +-@@ -0,0 +1,154 @@ ++@@ -0,0 +1,174 @@ + +#!/usr/bin/env python + + +++import string + +import os + +import subprocess + +import re +@@ -19029,12 +29907,20 @@ index 0000000..c896bc6 + +import csv + +from stat import * + + +-+conf_root = "/opt/conform/h265" + +ffmpeg_exec = "./ffmpeg" + + +-+def testone(fileroot, name, es_file, md5_file): +++def testone(fileroot, srcname, es_file, md5_file): + + tmp_root = "/tmp" + + +++ names = srcname.split('/') +++ while len(names) > 1: +++ tmp_root = os.path.join(tmp_root, names[0]) +++ del names[0] +++ name = names[0] +++ +++ if not os.path.exists(tmp_root): +++ os.makedirs(tmp_root) +++ + + dec_file = os.path.join(tmp_root, name + ".dec.md5") + + try: + + os.remove(dec_file) +@@ -19079,10 +29965,10 @@ index 0000000..c896bc6 + + + +def scandir(root): + + aconf = [] +-+ ents = os.listdir(conf_root) +++ ents = os.listdir(root) + + ents.sort(key=str.lower) + + for name in ents: +-+ test_path = os.path.join(conf_root, name) +++ test_path = os.path.join(root, name) + + if S_ISDIR(os.stat(test_path).st_mode): + + files = os.listdir(test_path) + + es_file = "?" 
+@@ -19093,7 +29979,7 @@ index 0000000..c896bc6 + + pass + + elif ext == ".bit" or ext == ".bin": + + es_file = f +-+ elif ext == ".md5": +++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + + if md5_file == "?": + + md5_file = f + + elif base[-3:] == "yuv": +@@ -19105,13 +29991,15 @@ index 0000000..c896bc6 + + if not tests: + + return True + + for t in tests: +-+ if name[0:len(t)] == t: +++ if name[0:len(t)] == t or name.find("/" + t) != -1: + + return True +-+ return False +++ return False + + +-+def doconf(csva, tests): +-+ failures = [] +++def doconf(csva, tests, test_root): +++ unx_failures = [] + + unx_success = [] +++ failures = 0 +++ successes = 0 + + for a in csva: + + exp_test = int(a[0]) + + if (exp_test and runtest(a[1], tests)): +@@ -19119,17 +30007,25 @@ index 0000000..c896bc6 + + print "==== ", name, + + sys.stdout.flush() + + +-+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) +++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) +++ if (rv == 0): +++ successes += 1 +++ else: +++ failures += 1 +++ + + if (rv == 0): + + if exp_test == 2: + + print ": * OK *" + + unx_success.append(name) + + else: + + print ": ok" +-+ elif exp_test > 1 and rv == 1: +++ elif exp_test == 2 and rv == 1: + + print ": fail" +++ elif exp_test == 3 and rv == 2: +++ # Call an expected "crash" an abort +++ print ": abort" + + else: +-+ failures.append(name) +++ unx_failures.append(name) + + if rv == 1: + + print ": * FAIL *" + + elif (rv == 2) : +@@ -19139,11 +30035,11 @@ index 0000000..c896bc6 + + else : + + print ": * BANG *" + + +-+ if failures or unx_success: +-+ print "Unexpected Failures:", failures +++ if unx_failures or unx_success: +++ print "Unexpected Failures:", unx_failures + + print "Unexpected Success: ", unx_success + + else: +-+ print "All tests normal" +++ print "All tests normal:", successes, "ok,", failures, "failed" + + + + + +class ConfCSVDialect(csv.Dialect): +@@ -19159,2638 +30055,194 @@ index 0000000..c896bc6 + + + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + + argp.add_argument("tests", nargs='*') +++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") +-+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") +++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + + args = argp.parse_args() + + + + if args.csvgen: +-+ csv.writer(sys.stdout).writerows(scandir(conf_root)) +++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + + exit(0) + + + + with open(args.csv, 'rt') as csvfile: + + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + + + +-+ doconf(csva, args.tests) +++ doconf(csva, args.tests, args.test_root) + + +-diff --git a/pi-util/qasm.py b/pi-util/qasm.py +-new file mode 100644 +-index 0000000..1eacc04 ++diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py ++new file mode 100755 ++index 0000000000..27cc453963 + --- /dev/null +-+++ b/pi-util/qasm.py +-@@ -0,0 +1,2502 @@ +-+#!/usr/bin/env python +++++ b/pi-util/ffperf.py ++@@ -0,0 +1,124 @@ +++#!/usr/bin/env python3 + + +-+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment +-+# add r0, r0, 1 # implicit mul nop +-+# nop # explicit add nop, implicit mul nop +-+# bkpt # implicit add/mul nop +-+# mov r0, 0x1234 # hex immediate +-+# mov r0, 20 * 40 # expressions... 
+-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits +-+# mov r0, a:label # put address of label in r0 +-+# :label +-+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address +-+# :1 +-+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address +-+# :1 # multiple definitions of numeric labels (differentiated using f/b) +-+# .set my_val, 3 # introduce alias for 3 +-+# .set my_reg, r0 # and for r0 +-+# mov my_reg, my_val # then use them +-+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 +-+# .macro my_add, a, b, c # a, b, c act as if .set on entry +-+# .set my_val, 10 +-+# add a, b, c +-+# mov r0, my_val # 10 +-+# .endm # forget all .sets since .macro (including arg .sets) +-+# mov r0, my_val # 3 +-+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) +-+ +-+import math +-+import optparse +++import time +++import string + +import os +-+import random +++import tempfile +++import subprocess + +import re +-+import struct +++import argparse + +import sys +-+import time +++import csv +++from stat import * + + +-+############################################################################### +-+# constants +-+############################################################################### +-+ +-+# ops +-+###### +-+ +-+# negatives are internal qasm ops +-+ +-+AOP_MOV = -3 # two operands +-+AOP_BRA = -2 # two operands +-+AOP_BRR = -1 # two operands +-+AOP_NOP = 0x00 # no operands +-+AOP_FADD = 0x01 +-+AOP_FSUB = 0x02 +-+AOP_FMIN = 0x03 +-+AOP_FMAX = 0x04 +-+AOP_FMINABS = 0x05 +-+AOP_FMAXABS = 0x06 +-+AOP_FTOI = 0x07 # two operands +-+AOP_ITOF = 0x08 # two operands +-+AOP_ADD = 0x0c +-+AOP_SUB = 0x0d +-+AOP_SHR = 0x0e +-+AOP_ASR = 0x0f +-+AOP_ROR = 0x10 +-+AOP_SHL = 0x11 +-+AOP_MIN = 0x12 +-+AOP_MAX = 0x13 +-+AOP_AND = 0x14 +-+AOP_OR = 0x15 +-+AOP_XOR = 0x16 +-+AOP_NOT = 0x17 # two operands +-+AOP_CLZ = 0x18 # two operands +-+AOP_V8ADDS = 0x1e +-+AOP_V8SUBS = 0x1f +-+ +-+MOP_MOV = -1 # two operands +-+MOP_NOP = 0x0 # no operands +-+MOP_FMUL = 0x1 +-+MOP_MUL24 = 0x2 +-+MOP_V8MULD = 0x3 +-+MOP_V8MIN = 0x4 +-+MOP_V8MAX = 0x5 +-+MOP_V8ADDS = 0x6 +-+MOP_V8SUBS = 0x7 +-+ +-+# ldi modes +-+############ +-+ +-+LDI_32 = 0 +-+LDI_EL_SIGNED = 1 +-+LDI_EL_UNSIGNED = 3 +-+LDI_SEMA = 4 +-+ +-+# conds +-+######## +-+ +-+COND_NEVER = 0 +-+COND_ALWAYS = 1 +-+COND_IFZ = 2 +-+COND_IFNZ = 3 +-+COND_IFN = 4 +-+COND_IFNN = 5 +-+COND_IFC = 6 +-+COND_IFNC = 7 +-+ +-+BCOND_ALLZ = 0 +-+BCOND_ALLNZ = 1 +-+BCOND_ANYZ = 2 +-+BCOND_ANYNZ = 3 +-+BCOND_ALLN = 4 +-+BCOND_ALLNN = 5 +-+BCOND_ANYN = 6 +-+BCOND_ANYNN = 7 +-+BCOND_ALLC = 8 +-+BCOND_ALLNC = 9 +-+BCOND_ANYC = 10 +-+BCOND_ANYNC = 11 +-+BCOND_ALWAYS = 15 +-+ +-+# packing/unpacking +-+#################### +-+ +-+# regfile a pack modes +-+PACK_A_NOP = 0 +-+PACK_A_16A = 1 +-+PACK_A_16B = 2 +-+PACK_A_8888 = 3 +-+PACK_A_8A = 4 +-+PACK_A_8B = 5 +-+PACK_A_8C = 6 +-+PACK_A_8D = 7 +-+PACK_A_32S = 8 +-+PACK_A_16AS = 9 +-+PACK_A_16BS = 10 +-+PACK_A_8888S = 11 +-+PACK_A_8AS = 12 +-+PACK_A_8BS = 13 +-+PACK_A_8CS = 14 +-+PACK_A_8DS = 15 +-+ +-+# mul unit pack modes +-+PACK_MUL_NOP = 0 +-+PACK_MUL_8888 = 3 +-+PACK_MUL_8A = 4 +-+PACK_MUL_8B = 5 +-+PACK_MUL_8C = 6 +-+PACK_MUL_8D = 7 +-+ +-+# regfile a unpack modes +-+UNPACK_A_NOP = 0 +-+UNPACK_A_16A = 1 +-+UNPACK_A_16B = 2 +-+UNPACK_A_8R = 3 +-+UNPACK_A_8A = 4 +-+UNPACK_A_8B = 5 +-+UNPACK_A_8C = 6 +-+UNPACK_A_8D = 7 +-+ +-+# r4 unpack modes +-+UNPACK_R4_NOP = 0 +-+UNPACK_R4_16A = 1 +-+UNPACK_R4_16B = 2 +-+UNPACK_R4_8R = 3 +-+UNPACK_R4_8A = 4 +-+UNPACK_R4_8B = 5 
+-+UNPACK_R4_8C = 6
+-+UNPACK_R4_8D = 7
+-+
+-+PACK_TYPE_INT = 0
+-+PACK_TYPE_FLOAT = 1
+-+PACK_TYPE_EITHER = -1
+-+
+-+PACK_MODE_A = 0 # regfile a
+-+PACK_MODE_M = 1 # mul unit
+-+PACK_MODE_EITHER = -1
+-+
+-+UNPACK_LOC_A = 0 # regfile a
+-+UNPACK_LOC_R4 = 1 # r4
+-+UNPACK_LOC_AB = 2 # either regfile a or regfile b
+-+UNPACK_LOC_OTHER = 3 # somewhere else
+-+
+-+# args
+-+#######
+-+
+-+# loc_t, ie internal
+-+MUX_AC = 0
+-+MUX_ANY = 1
+-+MUX_A = 2
+-+MUX_B = 3
+-+RW_EITHER = 0
+-+RW_READ = 1
+-+RW_WRITE = 2
+-+
+-+RADDR_NOP = 39
+-+
+-+# negatives are for internal use
+-+RMUX_SEMA = -6
+-+RMUX_LABEL = -5
+-+RMUX_IMMV = -4
+-+RMUX_IMM = -3
+-+RMUX_AC = -2
+-+RMUX_ANY = -1
+-+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5
+-+RMUX_A = 6
+-+RMUX_B = 7
+-+
+-+WADDR_R0 = 32 # followed by R1, R2, R3
+-+WADDR_NOP = 39
+-+
+-+WMUX_ANY = 0
+-+WMUX_A = 1
+-+WMUX_B = 2
+-+
+-+# signals
+-+##########
+-+
+-+SIG_BKPT = 0
+-+SIG_NORMAL = 1
+-+SIG_THRSW = 2
+-+SIG_THREND = 3
+-+SIG_SBWAIT = 4
+-+SIG_SBDONE = 5
+-+SIG_INT = 6 # on a0
+-+SIG_LTHRSW = 6 # on b0
+-+SIG_LOADCV = 7
+-+SIG_LOADC = 8
+-+SIG_LDCEND = 9
+-+SIG_LDTMU0 = 10
+-+SIG_LDTMU1 = 11
+-+SIG_ROTATE = 12 # on a0
+-+SIG_LOADAM = 12 # on b0
+-+SIG_SMALLIMMED = 13
+-+SIG_IMMED = 14
+-+SIG_BRANCH = 15
+-+
+-+# multi-line assembler constructs
+-+##################################
+-+
+-+CONSTRUCT_MACRO = 0x1
+-+CONSTRUCT_IF = 0x2
+-+CONSTRUCT_ELSE = 0x4
+-+CONSTRUCT_REP = 0x8
+-+
+-+###############################################################################
+-+# helpers
+-+###############################################################################
+-+
+-+def asm_error(message, location = None):
+-+    if location is None:
+-+        location = current_location
+-+    if location == '':
+-+        sys.stderr.write('qasm ERROR: %s\n' % message)
+-+    else:
+-+        sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message))
+-+    sys.exit(-1)
+-+
+-+def asm_warning(message, location = None):
+-+    if disable_warnings or (nwarn_level != 0):
+-+        return
+-+    if location is None:
+-+        location = current_location
+-+    if location == '':
+-+        sys.stderr.write('qasm WARNING: %s\n' % message)
+-+    else:
+-+        sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message))
+-+    if warnings_are_errors:
+-+        asm_error('warnings are errors!', location)
+-+
+-+# smart_split('') = []
+-+# smart_split('a') = ['a']
+-+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6']
+-+def smart_split(s, delim = ',', count = 0):
+-+    if len(s) == 0:
+-+        return []
+-+    parts = []
+-+    depth = 0
+-+    i = 0
+-+    for j in xrange(len(s)):
+-+        if s[j] in '([{':
+-+            depth += 1
+-+        elif s[j] in ')]}':
+-+            depth -= 1
+-+        elif (s[j] == delim) and (depth == 0):
+-+            parts.append(s[i:j])
+-+            i = j + 1
+-+            if len(parts) == count:
+-+                break
+-+    if depth != 0:
+-+        asm_error('bracket nesting fail')
+-+    parts.append(s[i:])
+-+    return parts
+-+
+-+def is_int(x):
+-+    return isinstance(x, int) or isinstance(x, long)
+-+
+-+###############################################################################
+-+# "parsing" stuff
+-+###############################################################################
+-+
+-+re_macro = re.compile('\\.macro\\s+(?P<name>\\w+)(?P<params>(\\s*,\\s*\\w+)*)$')
+-+re_if = re.compile('\\.if((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+-+re_elif = re.compile('\\.elif((?P<set>n?set)\\s+(?P<name>\\w+)|\\s(?P<condition>.+))$')
+-+re_rep = re.compile('\\.rep\\s+(?P<name>\\w+)\\s*,(?P<count>.+)$')
+-+re_include = re.compile('\\.include\\s(?P<filename>.+)$')
+-+re_set = re.compile('\\.set\\s+(?P<name>\\w+)\\s*,(?P<val>.+)$')
+-+re_unset = re.compile('\\.unset\\s+(?P<name>\\w+)$')
+-+re_eval = re.compile('\\.eval\\s(?P<expr>.+)$')
+-+re_print_info_warn_error = re.compile('\\.(?P<print_info_warn_error>print|info|warn|error)\\s(?P<message>.+)$')
+-+re_assert = re.compile('\\.assert\\s(?P<condition>.+)$')
+-+re_data = re.compile('\\.d(?P<size>[124])\\s(?P<data>.+)$')
+-+re_macro_inst = re.compile('(?P<name>\\w+)(?P<args>\\s.+|)$')
+-+re_label = re.compile(':(?P<name>:?[a-zA-Z_]\\w*|\\d+)$')
+-+re_op = re.compile('(?P<op>\\w+)(\\.(?P<cond>\\w+))??(\\.(?P<sf>setf))?(?P<args>\\s.+|)$')
+-+re_label_ref_left = re.compile('\\b([ar]):')
+-+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$')
+-+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals...
+-+
+-+# ops
+-+######
+-+
+-+aops = {
+-+    'mov': (AOP_MOV, 2),
+-+    'bra': (AOP_BRA, 2),
+-+    'brr': (AOP_BRR, 2),
+-+    'nop': (AOP_NOP, 0),
+-+    'fadd': (AOP_FADD, 3),
+-+    'fsub': (AOP_FSUB, 3),
+-+    'fmin': (AOP_FMIN, 3),
+-+    'fmax': (AOP_FMAX, 3),
+-+    'fminabs': (AOP_FMINABS, 3),
+-+    'fmaxabs': (AOP_FMAXABS, 3),
+-+    'ftoi': (AOP_FTOI, 2),
+-+    'itof': (AOP_ITOF, 2),
+-+    'add': (AOP_ADD, 3),
+-+    'sub': (AOP_SUB, 3),
+-+    'shr': (AOP_SHR, 3),
+-+    'asr': (AOP_ASR, 3),
+-+    'ror': (AOP_ROR, 3),
+-+    'shl': (AOP_SHL, 3),
+-+    'min': (AOP_MIN, 3),
+-+    'max': (AOP_MAX, 3),
+-+    'and': (AOP_AND, 3),
+-+    'or': (AOP_OR, 3),
+-+    'xor': (AOP_XOR, 3),
+-+    'not': (AOP_NOT, 2),
+-+    'clz': (AOP_CLZ, 2),
+-+    'v8adds': (AOP_V8ADDS, 3),
+-+    'v8subs': (AOP_V8SUBS, 3)}
+-+
+-+def get_aop(aop):
+-+    if aop not in aops:
+-+        asm_error('invalid aop')
+-+    return aops[aop]
+-+
+-+mops = {
+-+    'mov': (MOP_MOV, 2),
+-+    'nop': (MOP_NOP, 0),
+-+    'fmul': (MOP_FMUL, 3),
+-+    'mul24': (MOP_MUL24, 3),
+-+    'v8muld': (MOP_V8MULD, 3),
+-+    'v8min': (MOP_V8MIN, 3),
+-+    'v8max': (MOP_V8MAX, 3),
+-+    'v8adds': (MOP_V8ADDS, 3),
+-+    'v8subs': (MOP_V8SUBS, 3)}
+-+
+-+def get_mop(mop):
+-+    if mop not in mops:
+-+        asm_error('invalid mop')
+-+    return mops[mop]
+-+
+-+# conds
+-+########
+-+
+-+conds = {
+-+    'ifz': COND_IFZ,
+-+    'ifnz': COND_IFNZ,
+-+    'ifn': COND_IFN,
+-+    'ifnn': COND_IFNN,
+-+    'ifc': COND_IFC,
+-+    'ifnc': COND_IFNC}
+-+
+-+def get_cond(cond):
+-+    if not cond:
+-+        return COND_ALWAYS
+-+    if cond not in conds:
+-+        asm_error('invalid cond')
+-+    return conds[cond]
+-+
+-+bconds = {
+-+    'allz': BCOND_ALLZ,
+-+    'allnz': BCOND_ALLNZ,
+-+    'anyz': BCOND_ANYZ,
+-+    'anynz': BCOND_ANYNZ,
+-+    'alln': BCOND_ALLN,
+-+    'allnn': BCOND_ALLNN,
+-+    'anyn': BCOND_ANYN,
+-+    'anynn': BCOND_ANYNN,
+-+    'allc': BCOND_ALLC,
+-+    'allnc': BCOND_ALLNC,
+-+    'anyc': BCOND_ANYC,
+-+    'anync': BCOND_ANYNC}
+-+
+-+def get_bcond(bcond):
+-+    if not bcond:
+-+        return BCOND_ALWAYS
+-+    if bcond not in bconds:
+-+        asm_error('invalid bcond')
+-+    return bconds[bcond]
+-+
+-+def get_setf(setf):
+-+    if not setf:
+-+        return False
+-+    return True
+-+
+-+# packing/unpacking
+-+####################
+-+
+-+packs = {
+-+    '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A),
+-+    '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A),
+-+    '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A),
+-+    '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A),
+-+    '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A),
+-+    '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER,
PACK_MODE_A), +-+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} +-+ +-+def get_pack(pack): +-+ if not pack: +-+ return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) +-+ if pack not in packs: +-+ asm_error('invalid pack') +-+ return packs[pack] +-+ +-+a_unpacks = { +-+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), +-+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), +-+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), +-+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), +-+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), +-+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), +-+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), +-+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), +-+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), +-+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), +-+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), +-+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), +-+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} +-+ +-+def get_a_unpack(unpack): +-+ if not unpack: +-+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) +-+ if unpack not in a_unpacks: +-+ asm_error('invalid ra unpack') +-+ return a_unpacks[unpack] + (UNPACK_LOC_A,) +-+ +-+r4_unpacks = { +-+ '16af': UNPACK_R4_16A, +-+ '16bf': UNPACK_R4_16B, +-+ '8dr': UNPACK_R4_8R, +-+ '8ac': UNPACK_R4_8A, +-+ '8bc': UNPACK_R4_8B, +-+ '8cc': UNPACK_R4_8C, +-+ '8dc': UNPACK_R4_8D} +-+ +-+def get_r4_unpack(unpack): +-+ if not unpack: +-+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) +-+ if unpack not in r4_unpacks: +-+ asm_error('invalid r4 unpack') +-+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) +-+ +-+# args +-+####### +-+ +-+class loc_t: +-+ def __init__(self, mux, i, rot, r5_rot, pack, rw): +-+ self.mux = mux +-+ self.i = i +-+ self.rot = rot % 16 +-+ self.r5_rot = r5_rot % 16 +-+ self.pack = pack +-+ self.rw = rw +-+ +-+ def copy(self): +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __add__(self, i): +-+ if not is_int(i): +-+ raise Exception('can only add integer to loc') +-+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __sub__(self, i): +-+ if not is_int(i): +-+ raise Exception('can only subtract integer from loc') +-+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __cmp__(self, other): +-+ if is_int(other): +-+ return cmp(self.i, other) +-+ if not isinstance(other, loc_t): +-+ raise Exception('can only compare loc to integer or other loc') +-+ if self.mux != other.mux: +-+ return cmp(self.mux, other.mux) +-+ if self.i != other.i: +-+ return cmp(self.i, other.i) +-+ if self.rot != other.rot: +-+ return cmp(self.rot, other.rot) +-+ if self.r5_rot != other.r5_rot: +-+ return cmp(self.r5_rot, other.r5_rot) +-+ return cmp(self.pack, other.pack) +-+ +-+ def is_r5(self): +-+ return (self.mux == MUX_AC) and (self.i == 5) +-+ +-+ def shift(self, rot, left): +-+ if isinstance(rot, loc_t) and rot.is_r5(): +-+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: +-+ raise Exception('can\'t rotate by rotated/unpacked r5') +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) +-+ if not is_int(rot): +-+ raise 
Exception('can only rotate by integer or r5') +-+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) +-+ +-+ def __lshift__(self, rot): +-+ return self.shift(rot, True) +-+ +-+ def __rshift__(self, rot): +-+ return self.shift(rot, False) +-+ +-+ def __getattr__(self, name): +-+ # discard the first character if it is an underscore. this is a total hack +-+ # to allow packs starting with a digit to work +-+ if name[0] == '_': +-+ name = name[1:] +-+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): +-+ if self.pack: +-+ raise Exception('can\'t specify two packs') +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) +-+ raise AttributeError() +-+ +-+ def __str__(self): +-+ if self.mux == MUX_AC: +-+ return 'r%d' % self.i +-+ if self.mux == MUX_ANY: +-+ return 'rany%d' % self.i +-+ if self.mux == MUX_A: +-+ return 'ra%d' % self.i +-+ if self.mux == MUX_B: +-+ return 'rb%d' % self.i +-+ assert 0 +-+ +-+class sema_t: +-+ def __init__(self, acq, i): +-+ if not is_int(i): +-+ raise Exception('semaphore index must be integer') +-+ self.acq = acq +-+ self.i = i +-+ +-+class label_t: +-+ def __init__(self, rel, name, offset): +-+ self.rel = rel +-+ self.name = name +-+ self.offset = offset +-+ +-+ def __add__(self, offset): +-+ return label_t(self.rel, self.name, self.offset + offset) +-+ +-+ def __sub__(self, offset): +-+ return label_t(self.rel, self.name, self.offset - offset) +-+ +-+class label_maker_t: +-+ def __init__(self, rel): +-+ self.rel = rel +-+ +-+ def __getattr__(self, name): +-+ # we discard the first character. this is a total hack to allow numeric labels to work +-+ if not re_label_ref_right.match(name[1:]): +-+ raise Exception('invalid label reference') +-+ return label_t(self.rel, name[1:], 0) +-+ +-+def bits(x, n): +-+ if (x >> n) != 0: +-+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) +-+ return x +-+ +-+def bitsw(x, n): +-+ if x == (1 << n): +-+ x = 0 +-+ return bits(x, n) +-+ +-+def bitsws(x, n): +-+ if x == (1 << (n - 1)): +-+ x = 0 +-+ if -(1 << (n - 1)) <= x < 0: +-+ x += 1 << n +-+ return bits(x, n) +-+ +-+def vpm_setup(n, stride, addr, v2 = False): +-+ horiz, laned, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ if horiz: +-+ if x != 0: +-+ raise Exception('horizontal accesses must have x of 0') +-+ else: +-+ if (y & 0xf) != 0: +-+ raise Exception('vertical accesses must be 16 row aligned') +-+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) +-+ if v2: +-+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | +-+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) +-+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | +-+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) +-+ +-+def vdw_setup_0(n, m, addr): +-+ horiz, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | +-+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) +-+ +-+def vdr_setup_0(n, m, addr, vpm_stride, stride): +-+ horiz, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ if (stride < 8) or (stride & (stride - 1)): +-+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') +-+ log2_stride = 3 +-+ while (1 << log2_stride) != stride: +-+ 
log2_stride += 1 +-+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | +-+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | +-+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) +-+ +-+class allocator_t: +-+ def __init__(self, *available): +-+ self.available = list(available) +-+ self.allocated = {} +-+ self.reserved = [] +-+ +-+ def copy(self): +-+ a = allocator_t() +-+ a.available = self.available[:] +-+ a.allocated = self.allocated.copy() +-+ a.reserved = self.reserved[:] +-+ return a +-+ +-+ def forget(self): +-+ self.__init__(self.available + self.allocated.values() + self.reserved) +-+ +-+ def reserve(self, *rs): +-+ for r in rs: +-+ self.available.remove(r) +-+ self.reserved.append(r) +-+ +-+ def retire(self, name): +-+ r = self.allocated.pop(name) +-+ del r.__invert__ +-+ del r.retire +-+ self.available.append(r) +-+ return r +-+ +-+ def __getattr__(self, name): +-+ if name not in self.allocated: +-+ r = self.available.pop() +-+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax +-+ r.__invert__ = r.retire +-+ self.allocated[name] = r +-+ return self.allocated[name] +-+ +-+def pragma_allow_xor_0(x): +-+ global allow_xor_0 +-+ +-+ if not isinstance(x, bool): +-+ raise Exception('allow_xor_0 must be bool') +-+ x, allow_xor_0 = allow_xor_0, x +-+ return x +-+ +-+def pragma_dont_warn_when_mul_rot_inp_r5(x): +-+ global dont_warn_when_mul_rot_inp_r5 +-+ +-+ if not isinstance(x, bool): +-+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') +-+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x +-+ return x +-+ +-+arg_defs = { +-+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) +-+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), +-+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), +-+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), +-+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), +-+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), +-+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), +-+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), +-+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), +-+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), +-+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), +-+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), +-+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), +-+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), +-+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), +-+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), +-+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), +-+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), +-+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), +-+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), +-+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), +-+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), +-+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), +-+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), +-+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), +-+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), +-+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), +-+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), +-+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), +-+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), +-+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), +-+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), +-+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, 
RW_WRITE), +-+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), +-+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), +-+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), +-+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), +-+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), +-+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), +-+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), +-+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), +-+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), +-+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), +-+ +-+ # semaphore acq/rel +-+ 'sacq': lambda i: sema_t(True, i), +-+ 'srel': lambda i: sema_t(False, i), +-+ +-+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) +-+ 'r_label_maker': label_maker_t(True), +-+ 'a_label_maker': label_maker_t(False), +-+ +-+ # handy functions +-+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], +-+ 'sqrt': math.sqrt, +-+ 'sin': math.sin, +-+ 'cos': math.cos, +-+ 'atan2': math.atan2, +-+ 'pi': math.pi, +-+ 'rseed': random.seed, +-+ 'rand': lambda: int(random.getrandbits(32)), +-+ 'bits': bits, +-+ 'bitsw': bitsw, +-+ 'bitsws': bitsws, +-+ +-+ # handy vpm/vdw/vdr stuff +-+ 'h32': lambda y: (1, 0, 0, y, 0, 0), +-+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), +-+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), +-+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), +-+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), +-+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), +-+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), +-+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), +-+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), +-+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), +-+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), +-+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), +-+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), +-+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), +-+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), +-+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), +-+ 'vpm_setup': vpm_setup, +-+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), +-+ 'vdw_setup_0': vdw_setup_0, +-+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), +-+ 'vdr_setup_0': vdr_setup_0, +-+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride +-+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), +-+ +-+ # annotations +-+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), +-+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), +-+ 'preserve_cond': ('preserve_cond', 1), +-+ +-+ # somewhat experimental register allocator +-+ 'allocator_t': allocator_t, +-+ +-+ # pragmas +-+ 'pragma_allow_xor_0': pragma_allow_xor_0, +-+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} +-+ +-+# accumulators and regs (regular names -- r0, ra0, etc) +-+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) +-+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+ +-+def arg_eval(arg, sets): +-+ s = (arg.strip().split('.', 1) + [None])[:2] +-+ if s[0] == '-': +-+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) +-+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... 
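+-+    # (illustrative examples, not from the original source: these two
+-+    # rewrites turn "mov r0, a:label" into "mov r0, a_label_maker._label"
+-+    # and "ra0.8a" into "ra0._8a", so the plain eval() below can resolve
+-+    # label references and packs through __getattr__)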
+-+ arg = re_pack.sub('._\\1', arg) +-+ try: +-+ # todo: i would like to be able to pass both arg_defs and sets in here +-+ # (with sets hiding arg_defs in the case of conflicts), but the obvious +-+ # dict(arg_defs, **sets) won't permit things such as: +-+ # .set f, lambda x: y +-+ # .set y, 4 +-+ # (the y in the lambda will be looked up in the temporary dict we created +-+ # when evaluating the f .set, which doesn't contain y) +-+ # +-+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the +-+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. +-+ # also, before dumping sets at the end, we strip out the arg_defs stuff +-+ # (this isn't entirely correct as we want to dump sets that are hiding +-+ # arg_defs) +-+ return eval(arg, sets) +-+ except Exception, e: +-+ asm_error(e) +-+ except: +-+ asm_error('unknown error while evaluating argument') +-+ +-+# doesn't check/fixup pack +-+def check_and_fixup_loc(loc, read): +-+ if (not read) and (loc.rw == RW_READ): +-+ asm_error('writing to read-only hardware register') +-+ if read and (loc.rw == RW_WRITE): +-+ asm_error('reading from write-only hardware register') +-+ if not read: +-+ # conceptually, we are writing to a location rotated right by +-+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by +-+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location +-+ loc.rot = -loc.rot % 16 +-+ loc.r5_rot = -loc.r5_rot % 16 +-+ if (loc.rot != 0) and (loc.r5_rot != 0): +-+ asm_error('can\'t rotate by both r5 and immediate') +-+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): +-+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) +-+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later +-+ if not read: +-+ asm_error('target doesn\'t support write rotation') +-+ if loc.mux == MUX_ANY: +-+ loc.mux = MUX_A # can't do rotated read from regfile b +-+ if loc.mux != MUX_A: +-+ asm_error('rotation on read only allowed from regfile a') +-+ if loc.i >= 32: +-+ asm_warning('rotation only works from physical regfile') +-+ if loc.mux == MUX_AC: +-+ if (loc.i < 0) or (loc.i >= 6): +-+ asm_error('reg out of range') +-+ if not read: +-+ if loc.i == 4: +-+ asm_error('not allowed to write to r4') +-+ if loc.i == 5: +-+ +-+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') +-+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): +-+ if (loc.i < 0) or (loc.i >= 64): +-+ asm_error('reg out of range') +-+ else: +-+ assert 0 +-+ +-+def get_dst(dst, sets): +-+ if not dst: +-+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 +-+ dst = arg_eval(dst, sets) +-+ if not isinstance(dst, loc_t): +-+ asm_error('invalid dst') +-+ dst = dst.copy() +-+ check_and_fixup_loc(dst, False) +-+ pack = get_pack(dst.pack) +-+ if dst.mux == MUX_AC: +-+ if pack[2] == PACK_MODE_A: +-+ asm_warning('ra packing only works when writing to physical regfile') +-+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot +-+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot +-+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation +-+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): +-+ asm_warning('ra packing only works when writing to physical regfile') +-+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot +-+ if dst.mux == MUX_ANY: +-+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot +-+ if dst.mux == MUX_B: +-+ if pack[2] 
== PACK_MODE_A: +-+ asm_error('this packing operation can only be used for regfile a') +-+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot +-+ assert 0 +-+ +-+def get_src(src, sets): +-+ if not src: +-+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None +-+ src = arg_eval(src, sets) +-+ if isinstance(src, sema_t): +-+ if not have_sema: +-+ asm_error('target does not support semaphores') +-+ if (src.i < 0) or (src.i >= 16): +-+ asm_error('semaphore number must be in [0, 16)') +-+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if isinstance(src, label_t): +-+ return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if isinstance(src, list): +-+ if len(src) != 16: +-+ asm_error('vector immediate must have length 16') +-+ src = src[:] +-+ for i in xrange(16): +-+ if not is_int(src[i]): +-+ asm_error('all elements of vector immediate must be integers') +-+ src[i] &= (1 << 32) - 1 +-+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if is_int(src): +-+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if not isinstance(src, loc_t): +-+ asm_error('invalid src') +-+ src = src.copy() +-+ check_and_fixup_loc(src, True) +-+ if mulw_rotate: +-+ srot, sr5rot = 0, 0 +-+ drot, dr5rot = src.rot, src.r5_rot +-+ else: +-+ srot, sr5rot = src.rot, src.r5_rot +-+ drot, dr5rot = 0, 0 +-+ if src.mux == MUX_AC: +-+ if src.i == 4: +-+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot +-+ if src.pack: +-+ asm_error('unpack only allowed for regfile a or r4') +-+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot +-+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b +-+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot +-+ if src.mux == MUX_ANY: +-+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot +-+ if src.mux == MUX_B: +-+ if src.pack: +-+ asm_error('unpack only allowed for regfile a or r4') +-+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot +-+ assert 0 +-+ +-+# signals +-+########## +-+ +-+sigs = { +-+ 'bkpt': SIG_BKPT, +-+ 'thrsw': SIG_THRSW, +-+ 'thrend': SIG_THREND, +-+ 'sbwait': SIG_SBWAIT, +-+ 'sbdone': SIG_SBDONE, +-+ 'int': SIG_INT, +-+ 'loadcv': SIG_LOADCV, +-+ 'loadc': SIG_LOADC, +-+ 'ldcend': SIG_LDCEND, +-+ 'ldtmu0': SIG_LDTMU0, +-+ 'ldtmu1': SIG_LDTMU1} +-+ +-+def get_sig(sig): +-+ if sig not in sigs: +-+ return SIG_NORMAL +-+ return sigs[sig] +-+ +-+# annotations +-+############## +-+ +-+def get_annots(annot, sets): +-+ annots = arg_eval(annot, sets) +-+ if isinstance(annots, list): +-+ annots = annots[:] +-+ else: +-+ annots = [annots] +-+ for i, annot in enumerate(annots): +-+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or +-+ (not is_int(annot[1]))): +-+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') +-+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) +-+ return annots +-+ +-+############################################################################### +-+# core +-+############################################################################### +-+ +-+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): +-+ needfloat = PACK_TYPE_EITHER +-+ havefloata = False +-+ havefloatr4 = False +-+ unpacka = None +-+ unpackr4 = None +-+ forcebs = [False, False, False, False] +-+ 
forcerafloat = False +-+ +-+ pm = PACK_MODE_EITHER +-+ for i in (0, 1, 2, 3): +-+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): +-+ assert rpacks[i][0] == 0 +-+ else: +-+ if rpacks[i][2] == UNPACK_LOC_A: +-+ if unpacka is None: +-+ unpacka = rpacks[i][0] +-+ elif unpacka != rpacks[i][0]: +-+ asm_error('conflicting unpack operations on regfile a') +-+ havefloata = havefloata or rfloats[i] +-+ elif rpacks[i][2] == UNPACK_LOC_R4: +-+ if unpackr4 is None: +-+ unpackr4 = rpacks[i][0] +-+ elif unpackr4 != rpacks[i][0]: +-+ asm_error('conflicting unpack operations on r4') +-+ havefloatr4 = havefloatr4 or rfloats[i] +-+ else: +-+ assert 0 +-+ +-+ if rpacks[i][1] != PACK_TYPE_EITHER: +-+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): +-+ asm_error('conflicting unpack float requirements') +-+ needfloat = rpacks[i][1] +-+ for i in (0, 1, 2, 3): +-+ if rpacks[i][2] == UNPACK_LOC_AB: +-+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): +-+ forcebs[i] = True # non-nop unpack from regfile a. must use b +-+ +-+ if unpacka: +-+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: +-+ havefloata = True +-+ forcerafloat = True +-+ havefloat = havefloata +-+ else: +-+ havefloat = havefloatr4 +-+ +-+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): +-+ asm_error('float unpack operation used in integer alu operations') +-+ if (needfloat == PACK_TYPE_INT) and havefloat: +-+ asm_error('integer unpack operation used in float alu operation') +-+ +-+ unpack = 0 +-+ if unpacka and unpackr4: +-+ asm_error('cannot specify pack operation for both regfile a and r4') +-+ if unpacka: +-+ pm = PACK_MODE_A +-+ unpack = unpacka +-+ elif unpackr4: +-+ pm = PACK_MODE_M +-+ unpack = unpackr4 +-+ +-+ pack = 0 +-+ if wpacks[0][2] == PACK_MODE_M: +-+ asm_error('mul-unit pack operation used on add result') +-+ for i in (0, 1): +-+ if wpacks[i][2] == PACK_MODE_A: +-+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): +-+ asm_error('conflicting pack modes') +-+ pm = PACK_MODE_A +-+ pack = wpacks[i][0] +-+ elif wpacks[i][2] == PACK_MODE_M: +-+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): +-+ asm_error('conflicting pack modes') +-+ pm = PACK_MODE_M +-+ pack = wpacks[i][0] +-+ +-+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): +-+ asm_error('float pack operation used with integer alu result') +-+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: +-+ asm_error('integer pack operation used with float alu result') +-+ +-+ if pm == PACK_MODE_EITHER: +-+ pm = PACK_MODE_A +-+ return pm, pack, unpack, forcebs, forcerafloat +-+ +-+# immediates that can be encoded with SIG_SMALLIMMED +-+bimms = {} +-+bimms.update((i, i) for i in xrange(16)) +-+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) +-+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) +-+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) +-+ +-+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): +-+ if rmux == RMUX_SEMA: +-+ asm_error('semaphore op can only be used with mov') +-+ if rmux == RMUX_LABEL: +-+ asm_error('label not allowed here') +-+ if rmux == RMUX_IMMV: +-+ asm_error('vector immediate can only be used with mov') +-+ if rmux == RMUX_IMM: +-+ if raddr not in bimms: +-+ asm_error('can\'t encode immediate 0x%08x' % raddr) +-+ raddr = bimms[raddr] +-+ if not immb: +-+ if raddr_b is not None: +-+ asm_error('regfile b and immediates don\'t mix') +-+ raddr_b = raddr +-+ immb = True +-+ elif raddr_b != raddr: +-+ 
asm_error('can only encode one rotation/immediate') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ if rmux == RMUX_AC: +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr +-+ if rmux == RMUX_ANY: +-+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if (not immb) and (raddr_b == raddr): +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ if raddr_a is None: +-+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) +-+ raddr_a = raddr +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if raddr_b is None: +-+ assert not immb +-+ raddr_b = raddr +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ asm_error('no free read slots') +-+ if rmux == RMUX_A: +-+ if (not mulw_rotate) and (raddr_a is not None) and ( +-+ ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): +-+ asm_error('conflicting rotations from regfile a') +-+ if raddr_a is None: +-+ raddr_a = raddr[0] +-+ elif raddr_a != raddr[0]: +-+ asm_error('can only read from one location in each regfile') +-+ arot_r5 = raddr[2] +-+ if raddr[1] == 0: +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ raddr = 48 + raddr[1] +-+ if not immb: +-+ if raddr_b is not None: +-+ asm_error('regfile b and rotation don\'t mix') +-+ raddr_b = raddr +-+ immb = True +-+ elif raddr_b != raddr: +-+ asm_error('can only encode one rotation/immediate') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if rmux == RMUX_B: +-+ if immb: +-+ asm_error('regfile b and rotation/immediates don\'t mix') +-+ if raddr_b is None: +-+ raddr_b = raddr +-+ elif raddr_b != raddr: +-+ asm_error('can only read from one location in each regfile') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ assert 0 +-+ +-+# ok if: +-+# - accumulator (r0-r3) +-+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, +-+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it +-+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: +-+# what about vr_wait/vw_wait/mutex? 
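+-+# (illustrative examples, not from the original source:
+-+#   read_rot_ok(0, None, None) -> True: r0 is an accumulator
+-+#   read_rot_ok(RMUX_A, 32, None) -> True: raddr_a 32 is unif
+-+#   read_rot_ok(RMUX_A, 0, None) -> False: ra0 can differ per element)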
+-+def read_rot_ok(rmux, raddr_a, raddr_b): +-+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or +-+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy +-+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy +-+ +-+def asm_flush_prog_data(): +-+ global prog_data +-+ +-+ while len(prog_data) & 7: +-+ prog_data.append(0) +-+ for i in xrange(0, len(prog_data), 8): +-+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), +-+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) +-+ prog_data = [] +-+ +-+def asm_line(sets, location, line): +-+ global current_location, construct, nwarn_level +-+ +-+ prev_location = current_location +-+ current_location = location +-+ +-+ try: +-+ if construct != None: +-+ if re_macro.match(line): +-+ construct_stack.append(CONSTRUCT_MACRO) +-+ elif re_if.match(line): +-+ construct_stack.append(CONSTRUCT_IF) +-+ elif re_rep.match(line): +-+ construct_stack.append(CONSTRUCT_REP) +-+ else: +-+ else_m = line == '.else' +-+ elif_m = re_elif.match(line) +-+ if elif_m: +-+ end_construct = CONSTRUCT_IF +-+ else: +-+ end_construct = { +-+ '.endm': CONSTRUCT_MACRO, +-+ '.else': CONSTRUCT_IF, +-+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, +-+ '.endr': CONSTRUCT_REP}.get(line) +-+ if end_construct is not None: +-+ end_construct &= construct_stack.pop() +-+ if end_construct == 0: +-+ if elif_m: +-+ asm_error('unexpected .elif') +-+ asm_error('unexpected %s' % line) +-+ if len(construct_stack) == 0: +-+ lines = construct +-+ construct = None +-+ if end_construct == CONSTRUCT_MACRO: +-+ return +-+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): +-+ condition_if, condition_else = lines[0] +-+ lines = lines[1:] +-+ if condition_if: +-+ for location, line in lines: +-+ asm_line(sets, location, line) +-+ if else_m: +-+ construct = [(condition_else, False)] +-+ construct_stack.append(CONSTRUCT_ELSE) +-+ elif elif_m: +-+ if elif_m.group('set'): +-+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) +-+ else: +-+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) +-+ condition_else = condition_else and (not condition_if) +-+ construct = [(condition_if, condition_else)] +-+ construct_stack.append(CONSTRUCT_IF) +-+ return +-+ if end_construct == CONSTRUCT_REP: +-+ name, count = lines[0] +-+ lines = lines[1:] +-+ for i in xrange(count): +-+ sets[name] = i +-+ for location, line in lines: +-+ asm_line(sets, location, line) +-+ return +-+ assert 0 +-+ if else_m: +-+ construct_stack.append(CONSTRUCT_ELSE) +-+ elif elif_m: +-+ construct_stack.append(CONSTRUCT_IF) +-+ construct.append((current_location, line)) +-+ return +-+ +-+ if line in ('.endm', '.else', '.endif', '.endr'): +-+ asm_error('unexpected %s' % line) +-+ if re_elif.match(line): +-+ asm_error('unexpected .elif') +-+ +-+ m = re_macro.match(line) +-+ if m: +-+ construct = [] +-+ construct_stack.append(CONSTRUCT_MACRO) +-+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) +-+ return +-+ +-+ m = re_if.match(line) +-+ if m: +-+ if m.group('set'): +-+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) +-+ else: +-+ # not not forces condition to a bool (this matters if condition is +-+ # something mutable like a list) +-+ condition = not not arg_eval(m.group('condition'), sets) +-+ construct = 
[(condition, not condition)] +-+ construct_stack.append(CONSTRUCT_IF) +-+ return +-+ +-+ m = re_rep.match(line) +-+ if m: +-+ count = arg_eval(m.group('count'), sets) +-+ if not is_int(count): +-+ asm_error('.rep count must be integer') +-+ construct = [(m.group('name'), count)] +-+ construct_stack.append(CONSTRUCT_REP) +-+ return +-+ +-+ m = re_include.match(line) +-+ if m: +-+ filename = arg_eval(m.group('filename'), sets) +-+ if not isinstance(filename, str): +-+ asm_error('expected string') +-+ asm_file(sets, '%s: %s' % (current_location, filename), filename) +-+ return +-+ +-+ m = re_set.match(line) +-+ if m: +-+ sets[m.group('name')] = arg_eval(m.group('val'), sets) +-+ return +-+ +-+ m = re_unset.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name not in sets: +-+ asm_error('%s not set' % name) +-+ if name in arg_defs: # todo: see arg_eval +-+ sets[name] = arg_defs[name] +-+ else: +-+ del sets[name] +-+ return +-+ +-+ m = re_eval.match(line) +-+ if m: +-+ arg_eval(m.group('expr'), sets) +-+ return +-+ +-+ m = re_print_info_warn_error.match(line) +-+ if m: +-+ def print_fn(message): +-+ print message +-+ def info_fn(message): +-+ sys.stderr.write('%s\n' % message) +-+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ +-+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) +-+ return +-+ +-+ m = re_assert.match(line) +-+ if m: +-+ if not arg_eval(m.group('condition'), sets): +-+ asm_error('assertion failure: \'%s\'' % m.group('condition')) +-+ return +-+ +-+ m = re_data.match(line) +-+ if m: +-+ size = int(m.group('size')) +-+ for datum in smart_split(m.group('data')): +-+ datum = arg_eval(datum, sets) +-+ if not is_int(datum): +-+ asm_error('datum must be integer') +-+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) +-+ return +-+ +-+ m = re_macro_inst.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name in macros: +-+ params, lines = macros[name] +-+ args = smart_split(m.group('args')) +-+ if len(args) > len(params): +-+ asm_error('too many arguments to macro') +-+ sets = sets.copy() +-+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) +-+ for param in params[len(args):]: +-+ if param in sets: +-+ if param in arg_defs: # todo: see arg_eval +-+ sets[param] = arg_defs[param] +-+ else: +-+ del sets[param] +-+ for location, line in lines: +-+ asm_line(sets, '%s: %s' % (current_location, location), line) +-+ return +-+ +-+ if line == '.pushnwarn': +-+ nwarn_level += 1 +-+ return +-+ if line == '.popnwarn': +-+ if nwarn_level == 0: +-+ asm_error('.popnwarn without .pushnwarn') +-+ nwarn_level -= 1 +-+ return +-+ +-+ # everything below assumes prog is up to date +-+ asm_flush_prog_data() +-+ +-+ m = re_label.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name[0].isdigit(): +-+ labels.setdefault(name, []).append(len(prog)) +-+ else: +-+ if name[0] == ':': +-+ undecorated_name = name[1:] +-+ else: +-+ undecorated_name = name +-+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): +-+ asm_error('named label defined twice') +-+ labels[name] = len(prog) +-+ return +-+ +-+ annots = line.split('@') +-+ ops = [op.strip() for op in annots[0].split(';')] +-+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) +-+ sig = get_sig(ops[-1]) +-+ if sig != SIG_NORMAL: +-+ ops = ops[:-1] +-+ if len(ops) > 2: +-+ asm_error('too many ops') +-+ elif (len(ops) == 1) and (ops[0] == ''): +-+ ops = [] +-+ ops = (ops + ['nop', 'nop'])[:2] +-+ m = re_op.match(ops[0]) +-+ if 
not m: +-+ asm_error('invalid syntax') +-+ aop, aargs_n = get_aop(m.group('op')) +-+ if (aop == AOP_BRA) or (aop == AOP_BRR): +-+ acond = get_bcond(m.group('cond')) +-+ else: +-+ acond = get_cond(m.group('cond')) +-+ asf = get_setf(m.group('sf')) +-+ aargs = smart_split(m.group('args')) +-+ if len(aargs) != aargs_n: +-+ asm_error('wrong operand count') +-+ ard, ara, arb = (aargs + [None, None, None])[:3] +-+ m = re_op.match(ops[1]) +-+ if not m: +-+ asm_error('invalid syntax') +-+ mop, margs_n = get_mop(m.group('op')) +-+ mcond = get_cond(m.group('cond')) +-+ msf = get_setf(m.group('sf')) +-+ margs = smart_split(m.group('args')) +-+ if len(margs) != margs_n: +-+ asm_error('wrong operand count') +-+ mrd, mra, mrb = (margs + [None, None, None])[:3] +-+ # eval srcs first so allocator can retire and reuse registers for dst +-+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) +-+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) +-+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) +-+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) +-+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) +-+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) +-+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or +-+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): +-+ asm_error('cannot have 2 arguments with different rotations') +-+ if aarmux is not None: +-+ awrot = (awrot + aadrot) % 16 +-+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 +-+ if (awrot != 0) or awrot_r5: +-+ asm_error('rotate not allowed on add write') +-+ if marmux is not None: +-+ mwrot = (mwrot + madrot) % 16 +-+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 +-+ +-+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) +-+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) +-+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( +-+ [aarpack, abrpack, marpack, mbrpack], +-+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], +-+ aop == AOP_FTOI, +-+ [awpack, mwpack], +-+ [afloatw, mop == MOP_FMUL]) +-+ if forcebs[0]: +-+ aarmux = RMUX_B +-+ if forcebs[1]: +-+ abrmux = RMUX_B +-+ if forcebs[2]: +-+ marmux = RMUX_B +-+ if forcebs[3]: +-+ mbrmux = RMUX_B +-+ +-+ # extend nops to 3 operands +-+ if aop == AOP_NOP: +-+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC +-+ if mop == MOP_NOP: +-+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC +-+ +-+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) +-+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): +-+ if forcerafloat: +-+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand +-+ # instead of duplicating the 2nd operand, take the ra operand from +-+ # the mul op thus forcing the ra value to be considered a float for +-+ # the purposes of unpacking +-+ if marmux == RMUX_A: +-+ abraddr, abrmux = maraddr, marmux +-+ else: +-+ assert mbrmux == RMUX_A +-+ abraddr, abrmux = mbraddr, mbrmux +-+ else: +-+ abraddr, abrmux = aaraddr, aarmux +-+ else: +-+ assert not forcerafloat # can only forcerafloat if we have an unused operand +-+ +-+ # handle write addrs +-+ if (awmux == mwmux) and (awmux != WMUX_ANY): +-+ asm_error('add/mul ops not allowed to write to same regfile') +-+ ws = (awmux == WMUX_B) or (mwmux == 
WMUX_A) +-+ +-+ # handle branch +-+ if (aop == AOP_BRA) or (aop == AOP_BRR): +-+ # check setf +-+ if asf: +-+ asm_error('setf not allowed on bra/brr') +-+ +-+ # check pack/unpack +-+ if (pack != 0) or (unpack != 0): +-+ asm_error('pack/unpack not allowed with bra/brr') +-+ +-+ # handle read address +-+ if aarmux == RMUX_LABEL: +-+ if (aop == AOP_BRA) and aaraddr[1]: +-+ asm_warning('bra with rel label') +-+ if (aop == AOP_BRR) and (not aaraddr[1]): +-+ asm_warning('brr with abs label') +-+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM +-+ if aarmux == RMUX_ANY: +-+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A +-+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): +-+ asm_error('branch destination must be either label, immediate, or from regfile a') +-+ if aarmux == RMUX_IMM: +-+ imm = aaraddr +-+ raddr = 0 # can't use RADDR_NOP +-+ elif aarmux == RMUX_A: +-+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): +-+ asm_error('rotation of read from regfile a not allowed with branch') +-+ if aop == AOP_BRR: +-+ asm_warning('brr with ra') +-+ imm = 0 +-+ raddr = aaraddr[0] +-+ else: +-+ assert 0 +-+ +-+ # check mul op is nop +-+ if mop != MOP_NOP: +-+ asm_error('mul op not allowed with branch') +-+ +-+ # check sig +-+ if sig != SIG_NORMAL: +-+ asm_error('no signal allowed with branch') +-+ +-+ if raddr >= 32: +-+ asm_error('can only branch to register locations in physical regfile') +-+ if raddr & 1: +-+ asm_warning('branch instruction will destroy flags (see hw-2780)') +-+ +-+ # construct branch instruction +-+ prog.append((imm, +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), +-+ line, annots)) +-+ +-+ return +-+ +-+ # use COND_NEVER when possible (might save power / allow mul setf) +-+ if not dict(annots).get('preserve_cond', 0): +-+ if (awaddr == WADDR_NOP) and (not asf): +-+ acond = COND_NEVER +-+ if (mwaddr == WADDR_NOP) and (not msf): +-+ mcond = COND_NEVER +-+ +-+ # attempt to convert movs to ldi +-+ if (# no mul setf +-+ (not msf) and +-+ # ops must either be nop or mov of sema/label/imm/immv +-+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and +-+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and +-+ # but we don't want 2 nops +-+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and +-+ # if both ops are movs, srcs must be identical +-+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and +-+ # no signal +-+ (sig == SIG_NORMAL)): +-+ # make sure aarmux/aaraddr contains the value +-+ if aop != AOP_MOV: +-+ aarmux = marmux +-+ aaraddr = maraddr +-+ +-+ # convert immediate +-+ if aarmux == RMUX_SEMA: +-+ ldi_mode = LDI_SEMA +-+ elif aarmux == RMUX_LABEL: +-+ ldi_mode = LDI_32 +-+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM +-+ elif aarmux == RMUX_IMMV: +-+ signed, unsigned = True, True +-+ imm = 0 +-+ for i, elem in enumerate(aaraddr): +-+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): +-+ signed = False +-+ if elem not in (0, 1, 2, 3): +-+ unsigned = False +-+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) +-+ if not (signed or unsigned): +-+ asm_error('can\'t encode vector immediate') +-+ if signed: +-+ ldi_mode = LDI_EL_SIGNED +-+ else: +-+ ldi_mode = LDI_EL_UNSIGNED +-+ aaraddr, aarmux = imm, RMUX_IMM +-+ elif aarmux == RMUX_IMM: +-+ ldi_mode = LDI_32 +-+ else: +-+ assert 0 +-+ +-+ # construct ldi instruction 
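+-+        # (illustrative note, not from the original source: SIG_IMMED in
+-+        # bits 28..31 selects the load-immediate format, and ldi_mode picks
+-+        # LDI_32, the LDI_EL_SIGNED/LDI_EL_UNSIGNED per-element vector
+-+        # forms, or LDI_SEMA)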
+-+ prog.append((aaraddr, +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), +-+ line, annots)) +-+ +-+ return +-+ +-+ # convert movs to alu ops +-+ if aop == AOP_MOV: +-+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): +-+ aop = AOP_XOR +-+ aaraddr, aarmux = 0, RMUX_AC +-+ abraddr, abrmux = 0, RMUX_AC +-+ else: +-+ aop = AOP_OR +-+ abraddr, abrmux = aaraddr, aarmux +-+ if mop == MOP_MOV: +-+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): +-+ mop = MOP_V8SUBS +-+ maraddr, marmux = 0, RMUX_AC +-+ mbraddr, mbrmux = 0, RMUX_AC +-+ else: +-+ mop = MOP_V8MIN +-+ mbraddr, mbrmux = maraddr, marmux +-+ +-+ # normal alu instruction... +-+ +-+ # handle setf +-+ if asf and (aop == AOP_NOP): +-+ asm_error('nop.setf is not allowed in add pipe') +-+ if msf and (mop == MOP_NOP): +-+ asm_warning('nop.setf, really?') +-+ if (aop == AOP_NOP) or (acond == COND_NEVER): +-+ sf = msf +-+ else: +-+ if msf: +-+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') +-+ sf = asf +-+ +-+ # handle read addrs +-+ raddr_a = None +-+ raddr_b = None +-+ immb = False +-+ arot_r5 = False +-+ muxes = [0, 0, 0, 0] +-+ if mwrot != 0: +-+ raddr_b = 48 + mwrot +-+ immb = True +-+ if mwrot_r5 and have_am: +-+ raddr_b = 48 +-+ immb = True +-+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last +-+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): +-+ if f(rmux): +-+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) +-+ add_a, add_b, mul_a, mul_b = muxes +-+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): +-+ # some output elements might not be as expected +-+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): +-+ bad_elems = 0xffff +-+ else: +-+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 +-+ if mwrot > 12: +-+ bad_elems ^= 0xffff +-+ bad_elems &= dict(annots).get('mul_used', 0xffff) +-+ if not msf: +-+ if mwaddr == WADDR_NOP: +-+ # not writing anywhere and not setting flags. no elements used +-+ bad_elems = 0 +-+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or +-+ ((not ws) and (mwaddr == 37))): +-+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ +-+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. +-+ # only use element 0 +-+ bad_elems &= 0x0001 +-+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or +-+ ((not ws) and (mwaddr == 42))): +-+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting +-+ # flags. only use elements 0, 4, 8, and 12 +-+ bad_elems &= 0x1111 +-+ if bad_elems: +-+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). 
output may not be as expected') +-+ if raddr_a is None: +-+ raddr_a = RADDR_NOP +-+ if raddr_b is None: +-+ raddr_b = RADDR_NOP +-+ if immb: +-+ if sig != SIG_NORMAL: +-+ asm_error('rotation/immediates and signal don\'t mix') +-+ sig = SIG_SMALLIMMED +-+ if arot_r5 or (mwrot_r5 and (not have_am)): +-+ if sig != SIG_NORMAL: +-+ asm_error('rotation/immediates/signal don\'t mix') +-+ sig = SIG_ROTATE +-+ +-+ # construct instruction +-+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), +-+ line, annots)) +-+ finally: +-+ current_location = prev_location +-+ +-+def preprocess_passthrough(file): +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ yield line_number, line +-+ +-+def asm_file(sets, location, filename, preprocess = None): +-+ global current_dir, current_location +-+ +-+ if filename is None: +-+ location = '' +-+ file = sys.stdin +-+ +-+ prev_dir = current_dir +-+ else: +-+ filename = os.path.normpath(os.path.join(current_dir, filename)) +-+ +-+ try: +-+ file = open(filename) +-+ except Exception, e: +-+ asm_error(e) +-+ except: +-+ asm_error('unknown error while opening file %s' % filename) +-+ +-+ prev_dir = current_dir +-+ current_dir = os.path.dirname(filename) +-+ +-+ prev_location = current_location +-+ current_location = location +-+ +-+ if preprocess is None: +-+ preprocess = preprocess_passthrough +-+ +-+ try: +-+ for line_number, line in preprocess(file): +-+ # strip off comments and whitespace +-+ line = line.split('#')[0].strip() +-+ if line == '': +-+ continue +-+ +-+ asm_line(sets, '%s: %d' % (current_location, line_number), line) +-+ finally: +-+ current_dir = prev_dir +-+ current_location = prev_location +-+ +-+def asm_end_prog(): +-+ # check we aren't in a multi-line construct (eg .macro or .rep) +-+ if construct != None: +-+ asm_error({ +-+ CONSTRUCT_MACRO: '.macro without .endm', +-+ CONSTRUCT_IF: '.if/.elif without .endif', +-+ CONSTRUCT_ELSE: '.else without .endif', +-+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) +-+ +-+ # check no warnings level back to 0 +-+ if nwarn_level != 0: +-+ asm_error('.pushnwarn without .popnwarn') +-+ +-+ # flush queued up data +-+ asm_flush_prog_data() +-+ +-+ # fixup all the label references we can +-+ for pc in xrange(len(prog)): +-+ if isinstance(prog[pc][0], tuple): +-+ location, label, rel, offset = prog[pc][0] +-+ if label[0].isdigit(): +-+ label_pcs = labels.get(label[:-1], []) +-+ if label[-1] == 'b': +-+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] +-+ else: +-+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] +-+ if label_pcs == []: +-+ asm_error('search for label reached begin/end of file', location = location) +-+ imm = label_pcs[0] +-+ elif label in labels: +-+ imm = labels[label] +-+ elif (':' + label) in labels: +-+ imm = labels[':' + label] +-+ elif external_link: +-+ continue # let the external linker deal with it +-+ else: +-+ asm_error('undefined label', location = location) +-+ imm = (imm * 8) + offset +-+ if rel: +-+ imm -= (pc + 4) * 8 # relative to instruction after delay slots +-+ imm &= (1 << 32) - 1 +-+ else: +-+ if not external_link: +-+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) +-+ imm = (location, label, rel, offset, imm) +-+ prog[pc] = (imm,) + prog[pc][1:] +-+ +-+def asm_init(): +-+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level +-+ +-+ current_dir = os.getcwd() +-+ current_location = '' +-+ prog = [] +-+ prog_data = [] +-+ macros = { +-+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), +-+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} +-+ labels = {} +-+ construct = None +-+ construct_stack = [] +-+ nwarn_level = 0 +-+ +-+def asm_reset_prog(): +-+ global prog, labels +-+ +-+ prog = [] +-+ labels = {} +-+ +-+############################################################################### +-+# dumping +-+############################################################################### +-+ +-+def print_lines(lines): +-+ for line in lines: +-+ print line +-+ +-+class dumper_t: +-+ def external_link(self): return False +-+ def begin(self): pass +-+ def label(self, pc, name): pass +-+ def line(self, pc, ls, ms, line, annots, first): pass +-+ def end(self): pass +-+ def sets(self, sets): pass +-+ def direct(self, line): pass +-+ +-+class clif_dumper_t(dumper_t): +-+ def __init__(self): +-+ self.annot_mode = 0 +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def parse_annot_mode(self, line): +-+ l = line.split(',') +-+ self.annot_mode = int(l[0]) +-+ if self.annot_mode not in (0, 1, 2): +-+ asm_error('bad annot mode') +-+ if self.annot_mode == 2: +-+ if len(l) != 2: +-+ asm_error('expected buffer name') +-+ self.annot_name = l[1].strip() +-+ self.annot_offset = 0 +-+ elif len(l) != 1: +-+ asm_error('unexpected comma') +-+ +-+ def label(self, pc, name): +-+ if (self.annot_mode != 1) and (name[0] == ':'): +-+ if self.annot_mode == 2: +-+ name = name + '_annotations' +-+ print '@label %s' % name[1:] +-+ else: +-+ print '// :%s' % name +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if self.annot_mode == 0: +-+ if isinstance(ls, tuple): +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog) +-+ else: +-+ location, label, rel, offset = ls +-+ if rel: +-+ asm_error('relative external label references not allowed in this mode', location = location) +-+ ls = '[%s + %d]' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ print '%s 0x%08x // %s' % (ls, ms, line) +-+ elif self.annot_mode == 1: +-+ print '// %s' % line +-+ for annot in annots: +-+ print '0x%08x 0x%08x // %s' % ({ +-+ # todo: would rather not have these hard coded +-+ 'mul_used': 1, +-+ 'preserve_cond': 2, +-+ 'geomd_open': 3, +-+ 'geomd_i': 4, +-+ 'geomd_tris_clear': 5, +-+ 'geomd_verts': 6, +-+ 'geomd_tris_add': 7, +-+ 'geomd_tris_set_center': 8, +-+ 'geomd_region_clear': 9, +-+ 'geomd_region_set': 10, +-+ 'geomd_images_clear': 11, +-+ 'geomd_images_l': 12, +-+ 'geomd_images_b': 13, +-+ 'geomd_images_r': 14, +-+ 'geomd_images_t': 15, +-+ 'geomd_images_add_vpm': 16, +-+ 'trace_4c': 17, +-+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0]) +-+ if len(annots) != 0: +-+ print '0x00000000 // end' +-+ else: +-+ assert self.annot_mode == 2 +-+ if len(annots) == 0: +-+ print '0x00000000 // %s' % line +-+ else: +-+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line) +-+ self.annot_offset += (len(annots) * 8) + 4 +-+ +-+ def direct(self, line): +-+ print line +-+ +-+class plain_dumper_t(dumper_t): +-+ def line(self, pc, ls, ms, line, annots, first): +-+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line) +-+ +-+class c_c_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, array_name): +-+ self.header_name = header_name +-+ self.array_name = array_name +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ self.external_labels = set() +-+ self.lines = [] +-+ +-+ print '#include "%s.h"' % self.header_name +-+ print '' +-+ print '#ifdef _MSC_VER' +-+ print ' #include ' +-+ print ' /* cast through uintptr_t to avoid warnings */' +-+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))' +-+ print '#else' +-+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))' +-+ print '#endif' +-+ print '' +-+ print '#ifdef __cplusplus' +-+ print 'extern "C" { /* the types are probably wrong... 
*/' +-+ print '#endif' +-+ +-+ def label(self, pc, name): +-+ self.lines.append('// :%s' % name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple): +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog) +-+ else: +-+ location, label, rel, offset = ls +-+ if rel: +-+ asm_error('relative external label references not allowed in this mode', location = location) +-+ if label not in self.external_labels: +-+ self.external_labels.add(label) +-+ print 'extern uint8_t %s[];' % label +-+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) +-+ +-+ def end(self): +-+ print '#ifdef __cplusplus' +-+ print '}' +-+ print '#endif' +-+ print '' +-+ print '#ifdef _MSC_VER' +-+ print '__declspec(align(8))' +-+ print '#elif defined(__GNUC__)' +-+ print '__attribute__((aligned(8)))' +-+ print '#endif' +-+ print 'unsigned int %s[] = {' % self.array_name +-+ print_lines(self.lines) +-+ print '};' +-+ print '#ifdef __HIGHC__' +-+ print '#pragma Align_to(8, %s)' % self.array_name +-+ print '#endif' +-+ +-+class c_h_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, array_name): +-+ self.full_header_name = full_header_name +-+ self.array_name = array_name +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '#ifndef %s_H' % self.full_header_name +-+ print '#define %s_H' % self.full_header_name +-+ print '' +-+ print 'extern unsigned int %s[];' % self.array_name +-+ print '' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) +-+ +-+ def end(self): +-+ print '' +-+ print '#endif' +-+ +-+class ml_c_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, name, annots): +-+ self.header_name = header_name +-+ self.name = name +-+ self.annots = annots +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ if self.annots: +-+ self.annot_lines = [] +-+ self.lines = [] +-+ self.external_labels = set() +-+ self.link_lines = [] +-+ +-+ print '#include "%s.h"' % self.header_name +-+ print '#include ' +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print '#include ' +-+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"' +-+ print '' +-+ +-+ def label(self, pc, name): +-+ self.lines.append('// :%s' % name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if self.annots: +-+ if len(annots) == 0: +-+ self.annot_lines.append('NULL,') +-+ else: +-+ print 'static unsigned int const annotations_%d[] = {' % pc +-+ for annot in annots: +-+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]) +-+ print ' SIMPENROSE_SHADER_ANNOTATION_END};' +-+ print '' +-+ self.annot_lines.append('annotations_%d,' % pc) +-+ if isinstance(ls, tuple): +-+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2)) +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog)) +-+ else: +-+ location, label, rel, offset = ls +-+ self.external_labels.add(label) +-+ if rel: +-+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8)) +-+ else: +-+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset)) 
+-+ ls = '0xdeadbeef' +-+ else: +-+ ls = '0x%08x' % ls +-+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) +-+ +-+ def end(self): +-+ if self.annots: +-+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name +-+ print_lines(self.annot_lines) +-+ print '};' +-+ print '#endif' +-+ print '' +-+ print 'static unsigned int const array[] = {' +-+ print_lines(self.lines) +-+ print '};' +-+ print '' +-+ print 'void %s_link(void *p_in, unsigned int base' % self.name +-+ for label in sorted(self.external_labels): +-+ print ' , unsigned int %s' % label +-+ print ' )' +-+ print '{' +-+ print ' unsigned int *p = (unsigned int *)p_in;' +-+ print ' unsigned int i;' +-+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() +-+ print ' p[i] = array[i];' +-+ print ' }' +-+ print_lines(self.link_lines) +-+ print '}' +-+ +-+class ml_h_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, name, annots): +-+ self.full_header_name = full_header_name +-+ self.name = name +-+ self.annots = annots +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ self.external_labels = set() +-+ self.lines_n = 0 +-+ +-+ print '#ifndef %s_H' % self.full_header_name +-+ print '#define %s_H' % self.full_header_name +-+ print '' +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name +-+ print '#endif' +-+ print '' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) +-+ print '#endif' +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple) and (len(ls) != 5): +-+ self.external_labels.add(ls[1]) +-+ self.lines_n += 1 +-+ +-+ def end(self): +-+ print '' +-+ print 'extern void %s_link(void *p, unsigned int base' % self.name +-+ for label in sorted(self.external_labels): +-+ print ' , unsigned int %s' % label +-+ print ' );' +-+ print '' +-+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) +-+ print '' +-+ print '#endif' +-+ +-+def print_lines_lc(lines): +-+ for line in lines: +-+ print '%s \\' % line +-+ +-+def print_groups_lc(groups): +-+ first = True +-+ for group in groups: +-+ if first: +-+ print '{ \\' +-+ else: +-+ print ', { \\' +-+ print_lines_lc(group) +-+ print '} \\' +-+ first = False +-+ +-+class inline_c_dumper_t(dumper_t): +-+ def __init__(self, annots): +-+ self.annots = annots +-+ self.iteration = False +-+ +-+ def begin_iteration(self): +-+ assert not self.iteration +-+ self.iteration = True +-+ self.iteration_lines = [] +-+ if self.annots: +-+ self.iteration_annot_lines = [] +-+ self.annot_arrs = [] +-+ +-+ def end_iteration(self): +-+ assert self.iteration +-+ self.iteration = False +-+ print '%d, \\' % self.iteration_n +-+ if self.annots: +-+ print '( \\' +-+ print_groups_lc(self.iteration_lines) +-+ if self.annots: +-+ print '), ( \\' +-+ print_groups_lc(self.iteration_annot_lines) +-+ print '), ( \\' +-+ for annot_arr in self.annot_arrs: +-+ print_lines_lc(annot_arr) +-+ print ') \\' +-+ +-+ def begin(self): +-+ self.n = 0 +-+ self.lines = [] +-+ if self.annots: +-+ self.annot_lines = [] +-+ if not self.iteration: +-+ self.annot_arrs = [] +-+ +-+ def label(self, pc, name): +-+ self.lines.append('/* :%s */' % name) +-+ if self.annots: +-+ self.annot_lines.append('/* :%s */' % 
name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ self.n += 1 +-+ if first: +-+ prefix = '' +-+ else: +-+ prefix = ', ' +-+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) +-+ if self.annots: +-+ if len(annots) == 0: +-+ a = 'NULL' +-+ else: +-+ a = 'annotations_%d' % len(self.annot_arrs) +-+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] +-+ for annot in annots: +-+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) +-+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') +-+ self.annot_arrs.append(annot_arr) +-+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) +-+ +-+ def end(self): +-+ if self.iteration: +-+ if len(self.iteration_lines) == 0: +-+ self.iteration_n = self.n +-+ elif self.iteration_n != self.n: +-+ asm_error('number of instructions differs between iterations') +-+ self.iteration_lines.append(self.lines) +-+ if self.annots: +-+ self.iteration_annot_lines.append(self.annot_lines) +-+ else: +-+ if self.annots: +-+ print '( \\' +-+ print_lines_lc(self.lines) +-+ if self.annots: +-+ print '), ( \\' +-+ print_lines_lc(self.annot_lines) +-+ print '), ( \\' +-+ for annot_arr in self.annot_arrs: +-+ print_lines_lc(annot_arr) +-+ print ') \\' +-+ +-+ def direct(self, line): +-+ print line +-+ +-+class asvc_dumper_t(dumper_t): +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '.align 8' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '%s::' % name[1:] +-+ else: +-+ print '%s:' % name +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple): +-+ location, label, rel, offset = ls[:4] +-+ if rel: +-+ ls = '%s + %d - (. + 32)' % (label, offset) +-+ else: +-+ ls = '%s + %d' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) +-+ +-+def is_ra_or_rb(val): +-+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) +-+ +-+class aliases_dumper_t(dumper_t): +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '#ifndef JUST_DQASM_ARGS' +-+ +-+ def label(self, pc, name): +-+ if not name[0].isdigit(): +-+ if name[0] == ':': +-+ name = name[1:] +-+ print '"bs%s", "bs%x",' % (name, pc * 8) +-+ print '"bu%s", "bu%x",' % (name, pc * 8) +-+ +-+ def end(self): +-+ print '#endif' +-+ +-+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm +-+ def sets(self, sets): +-+ dqasm_args = [] +-+ print '#ifndef JUST_DQASM_ARGS' +-+ for name in sets: +-+ if is_ra_or_rb(sets[name]): +-+ dqasm_args.append('-r%s=%s' % (sets[name], name)) +-+ print '"%s", "%s",' % (name, sets[name]) +-+ elif isinstance(sets[name], list): +-+ for i, val in enumerate(sets[name]): +-+ if is_ra_or_rb(val): +-+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) +-+ print '"%s[%d]", "%s",' % (name, i, val) +-+ print '#endif' +-+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) +-+ +-+def dump(dumper): +-+ if (len(prog) != 0) or (len(labels) != 0): +-+ dumper.begin() +-+ +-+ sorted_labels = [] +-+ for name in labels: +-+ if name[0].isdigit(): +-+ for pc in labels[name]: +-+ sorted_labels.append((pc, name)) +-+ else: +-+ sorted_labels.append((labels[name], name)) +-+ sorted_labels.sort(reverse = True) +-+ +-+ first = True +-+ for pc in xrange(len(prog)): +-+ ls, ms, line, annots = prog[pc] +-+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): +-+ dumper.label(*sorted_labels.pop()) +-+ dumper.line(pc, ls, ms, line, annots, first) +-+ first = False +-+ for sorted_label in sorted_labels: +-+ assert sorted_label[0] == len(prog) +-+ dumper.label(*sorted_label) +-+ +-+ dumper.end() +-+ +-+############################################################################### +-+# preprocessing +-+############################################################################### +-+ +-+def preprocess_inline_c(dumper): +-+ def preprocess(file): +-+ ls = None +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ while True: +-+ if ls is None: +-+ l = line.split('%[', 1) +-+ if len(l) == 1: +-+ dumper.direct(l[0].rstrip()) +-+ break +-+ dumper.direct('%s \\' % l[0].rstrip()) +-+ line = l[1] +-+ ls = [] +-+ else: +-+ l = line.split('%]', 1) +-+ ls.append((line_number, l[0])) +-+ if len(l) == 1: +-+ break +-+ line = l[1] +-+ l = ls[-1][1].split('%|', 1) +-+ if len(l) == 1: +-+ for l_number, l in ls: +-+ yield l_number, l +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ else: +-+ ls[-1] = (ls[-1][0], l[0]) +-+ if hasattr(dumper, 'begin_iteration'): +-+ dumper.begin_iteration() +-+ for repls in l[1].split('%,'): +-+ repls = [repl.strip() for repl in repls.split('%/')] +-+ for l_number, l in ls: +-+ for i, repl in enumerate(repls): +-+ l = l.replace('%' + str(i), repl) +-+ yield l_number, l +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ if hasattr(dumper, 'end_iteration'): +-+ dumper.end_iteration() +-+ ls = None +-+ return preprocess +-+ +-+def preprocess_clif(dumper): +-+ def preprocess(file): +-+ in_asm = False +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ if in_asm: +-+ if line.strip() == '%]': +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ in_asm = False +-+ else: +-+ yield line_number, line +-+ else: +-+ if line.strip() == '%[': +-+ in_asm = True +-+ elif (line[:1] == '%') and (line[:2] != '%@'): +-+ yield line_number, line[1:] +-+ else: +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ if line[:2] == '%@': +-+ if hasattr(dumper, 'parse_annot_mode'): +-+ dumper.parse_annot_mode(line[2:]) +-+ else: +-+ dumper.direct(line.rstrip()) +-+ return preprocess +-+ +-+############################################################################### +-+# main +-+############################################################################### +++class tstats: +++ close_threshold = 0.01 +++ +++ def __init__(self, stats_dict=None): +++ if stats_dict != None: +++ 
self.name = stats_dict["name"] +++ self.elapsed = float(stats_dict["elapsed"]) +++ self.user = float(stats_dict["user"]) +++ self.sys = float(stats_dict["sys"]) +++ +++ def times_str(self): +++ ctime = self.sys + self.user +++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed) +++ +++ def dict(self): +++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys} +++ +++ def is_close(self, other): +++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold +++ +++ def __lt__(self, other): +++ return self.elapsed < other.elapsed +++ def __gt__(self, other): +++ return self.elapsed > other.elapsed +++ +++ def time_file(name, prefix): +++ stats = tstats() +++ stats.name = name +++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC); +++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name, +++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog); +++ pinfo = os.wait4(cproc.pid, 0) +++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC); +++ stats.elapsed = end_time - start_time +++ stats.user = pinfo[2].ru_utime +++ stats.sys = pinfo[2].ru_stime +++ return stats +++ +++ +++def common_prefix(s1, s2): +++ for i in range(min(len(s1),len(s2))): +++ if s1[i] != s2[i]: +++ return s1[:i] +++ return s1[:i+1] + + + +def main(): +-+ global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5 +-+ global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate +-+ +-+ asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work +-+ +-+ # parse command line +-+ parser = optparse.OptionParser(usage = 'usage: %prog [options] ') +-+ parser.add_option('-m', '--mode', dest = 'mode', +-+ help = ' should be clif, plain, ' + +-+ 'c_c:,,, ' + +-+ 'c_h:,,, ' + +-+ 'ml_c:,,[,annots], ' + +-+ 'ml_h:,,[,annots], ' + +-+ 'inline_c[:annots], asvc, or aliases[:]', metavar = '') +-+ parser.add_option('-t', '--target', dest = 'target', +-+ help = ' should be a0, b0, or hera', metavar = '') +-+ parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False) +-+ parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False) +-+ parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False) +-+ parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False) +-+ parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '=') +-+ options, args = parser.parse_args() +-+ if len(args) == 0: +-+ filename = None +-+ elif len(args) == 1: +-+ filename = args[0] +-+ else: +-+ parser.print_help() +-+ sys.exit(-1) +-+ +-+ # handle mode +-+ mode = options.mode or 'clif' # assume clif if no mode specified +-+ if mode == 'clif': +-+ dumper = clif_dumper_t() +-+ preprocess = preprocess_clif(dumper) +-+ elif mode == 'plain': +-+ dumper = plain_dumper_t() +-+ preprocess = None +-+ elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'): +-+ mode_options = mode[4:].split(',') +-+ if len(mode_options) != 3: +-+ asm_error('badly formatted mode on command line') +-+ dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options) +-+ preprocess = None +-+ elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'): +-+ mode_options = mode[5:].split(',') +-+ if (len(mode_options) != 3) and ((len(mode_options) 
!= 4) or (mode_options[3] != 'annots')): +-+ asm_error('badly formatted mode on command line') +-+ dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t +-+ }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4])) +-+ preprocess = None +-+ elif mode == 'inline_c': +-+ dumper = inline_c_dumper_t(False) +-+ preprocess = preprocess_inline_c(dumper) +-+ elif mode == 'inline_c:annots': +-+ dumper = inline_c_dumper_t(True) +-+ preprocess = preprocess_inline_c(dumper) +-+ elif mode == 'asvc': +-+ dumper = asvc_dumper_t() +-+ preprocess = None +-+ elif mode == 'aliases': +-+ dumper = aliases_dumper_t() +-+ preprocess = None +-+ elif mode == 'aliases:inline_c': +-+ dumper = aliases_dumper_t() +-+ preprocess = preprocess_inline_c(dumper) +-+ else: +-+ asm_error('invalid mode') +-+ external_link = dumper.external_link() +-+ +-+ # handle target +-+ target = options.target or 'b0' # assume b0 if no target specified +-+ if target == 'a0': +-+ have_sema = False +-+ have_am = False +-+ mulw_rotate = False +-+ have_lthrsw = False +-+ elif target == 'b0': +-+ have_sema = True +-+ have_am = True +-+ mulw_rotate = True +-+ have_lthrsw = True +-+ elif target == 'hera': +-+ have_sema = True +-+ have_am = False +-+ mulw_rotate = True +-+ have_lthrsw = True +-+ else: +-+ asm_error('invalid target') +-+ if have_am: +-+ sigs['loadam'] = SIG_LOADAM +-+ arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE) +-+ if have_lthrsw: +-+ sigs['lthrsw'] = SIG_LTHRSW +-+ del sigs['int'] +-+ arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE) +-+ +-+ # handle misc options +-+ allow_xor_0 = options.allow_xor_0 +-+ dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5 +-+ warnings_are_errors = options.warnings_are_errors +-+ disable_warnings = options.disable_warnings +-+ +-+ # make options visible to asm +-+ arg_defs['mode'] = mode +-+ arg_defs['target'] = target +-+ +-+ # arg_defs all setup at this point +-+ sets = arg_defs.copy() # todo: see arg_eval +-+ +-+ # handle command line sets +-+ re_options_set = re.compile('(?P\\w+)=(?P.+)$') +-+ for options_set in options.sets: +-+ m = re_options_set.match(options_set) +-+ if not m: +-+ asm_error('badly formatted set on command line') +-+ sets[m.group('name')] = arg_eval(m.group('val'), sets) +-+ +-+ # assemble input file and dump +-+ asm_file(sets, filename, filename, preprocess) +-+ asm_end_prog() +-+ dump(dumper) +-+ for name in arg_defs: # todo: see arg_eval +-+ del sets[name] +-+ dumper.sets(sets) +++ global flog + + +-+if __name__ == '__main__': +-+ main() +-diff --git a/pi-util/qem.sh b/pi-util/qem.sh +-new file mode 100644 +-index 0000000..47dd071 +---- /dev/null +-+++ b/pi-util/qem.sh +-@@ -0,0 +1,9 @@ +-+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex +-+QASM=python\ pi-util/qasm.py +-+SRC_FILE=libavcodec/rpi_shader.qasm +-+DST_BASE=shader +++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog=""" +++To blank the screen before starting use "xdg-screensaver activate" +++(For some reason this doesn't seem to work from within python). 
+++""") + + +-+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR +-+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +-+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h +++ argp.add_argument("streams", nargs='*') +++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") +++ argp.add_argument("--csv_in", help="CSV input filename") +++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") + + +-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py +-new file mode 100755 +-index 0000000..6a9a33f +---- /dev/null +-+++ b/pi-util/rebase_liblinks.py +-@@ -0,0 +1,37 @@ +-+#!/usr/bin/env python +++ args = argp.parse_args() + + +-+import os, sys +-+from stat import * +++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) +++ csv_out.writeheader() +++ +++ stats_in = {} +++ if args.csv_in != None: +++ with open(args.csv_in, 'r', newline='') as f_in: +++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} +++ +++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") +++ +++ streams = args.streams +++ if not streams: +++ if not stats_in: +++ print ("No source streams specified") +++ return 1 +++ prefix = "" if args.prefix == None else args.prefix +++ streams = [k for k in stats_in] +++ elif args.prefix != None: +++ prefix = args.prefix +++ else: +++ prefix = streams[0] +++ for f in streams[1:]: +++ prefix = common_prefix(prefix, f) +++ pp = prefix.rpartition(os.sep) +++ prefix = pp[0] + pp[1] +++ streams = [s[len(prefix):] for s in streams] +++ +++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): +++ print ("====", f) +++ +++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) +++ for i in range(3): +++ t = tstats.time_file(f, prefix) +++ print ("...", t.times_str()) +++ if t0 > t: +++ t0 = t +++ +++ if t0.name in stats_in: +++ pstat = stats_in[t0.name] +++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) +++ +++ csv_out.writerow(t0.dict()) +++ +++ print () +++ +++ return 0 + + +-+def walktree(top, callback, n, prefix): +-+ '''recursively descend the directory tree rooted at top, +-+ calling the callback function for each regular file''' +-+ +-+ for f in os.listdir(top): +-+ pathname = os.path.join(top, f) +-+ mode = os.lstat(pathname).st_mode +-+ if S_ISDIR(mode): +-+ # It's a directory, recurse into it +-+ walktree(pathname, callback, n+1, prefix) +-+ elif S_ISLNK(mode): +-+ # It's a file, call the callback function +-+ callback(pathname, os.readlink(pathname), n, prefix) +-+ +-+def visitfile(file, linkname, n, prefix): +-+ if (linkname.startswith(prefix + 'lib/')): +-+ newlink = "../" * n + linkname[len(prefix):] +-+ print 'relinking', file, "->", newlink +-+ os.remove(file) +-+ os.symlink(newlink, file) + + + +if __name__ == '__main__': +-+ argc = len(sys.argv) +-+ if argc == 2: +-+ walktree(sys.argv[1], visitfile, 0, "/") +-+ elif argc == 3: +-+ walktree(sys.argv[1], visitfile, 0, sys.argv[2]) +-+ else: +-+ print "rebase_liblinks.py []" +++ exit(main()) + + ++diff --git a/pi-util/make_array.py b/pi-util/make_array.py ++new file mode 100755 ++index 0000000000..864fa5e704 ++--- /dev/null +++++ b/pi-util/make_array.py ++@@ -0,0 +1,19 @@ +++#!/usr/bin/env python + + +++# Usage +++# make_array file.bin +++# Produces file.h with array of bytes. 
+++# +++import sys +++for file in sys.argv[1:]: +++ prefix,suffix = file.split('.') +++ assert suffix=='bin' +++ name=prefix.split('/')[-1] +++ print 'Converting',file +++ with open(prefix+'.h','wb') as out: +++ print >>out, 'static const unsigned char',name,'[] = {' +++ with open(file,'rb') as fd: +++ for byte in fd.read(): +++ print >>out, '%d,' % ord(byte) +++ print >>out,'};' + + +-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh ++diff --git a/pi-util/qem.sh b/pi-util/qem.sh + new file mode 100755 +-index 0000000..d8bdd91 ++index 0000000000..5ce2eeaf72 + --- /dev/null +-+++ b/pi-util/syncroot.sh +-@@ -0,0 +1,43 @@ +-+set -e +-+ +-+if [ "$1" == "" ]; then +-+ echo Usage: $0 \ [\] +-+ echo src_dir is a source for rsync so may contain m/c name. +-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing +-+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 +-+ exit 1 +-+fi +-+ +-+SYSROOT_NAME=$2 +-+if [ "$SYSROOT_NAME" == "" ]; then +-+ SYSROOT_NAME=raspian_jessie_pi1 +-+fi +-+ +-+DST_ROOT=`pwd` +-+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot +-+SRC=$1 +-+ +-+echo Sync src: $SRC +-+echo Sync dest: $DST +-+ +-+mkdir -p $DST/lib +-+mkdir -p $DST/opt/vc/include +-+mkdir -p $DST/usr/lib/pkgconfig +-+mkdir -p $DST/usr/bin +-+mkdir -p $DST/usr/share +-+ +-+#### MUST NOT include /opt/vc/include/*GL* +-+# Creates conflicts with GL includes inside Chrome +-+ +-+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib +-+rsync -rl $SRC/opt/vc/lib $DST/opt/vc +-+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include +-+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include +-+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include +-+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib +-+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib +-+rsync -rl $SRC/usr/include $DST/usr +-+ +-+pi-util/rebase_liblinks.py $DST +++++ b/pi-util/qem.sh ++@@ -0,0 +1,9 @@ +++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex +++QASM=python\ ../local/bin/qasm.py +++SRC_FILE=libavcodec/rpi_shader.qasm +++DST_BASE=shader + + +++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR +++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h + + + diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py +-new file mode 100644 +-index 0000000..5935a11 ++new file mode 100755 ++index 0000000000..5935a11ca5 + --- /dev/null + +++ b/pi-util/v3dusage.py + @@ -0,0 +1,128 @@ + +From 78b1b2b50f01ae8a61aec3b8efb839aa3b120827 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 8 Sep 2017 17:06:43 +0100 +Subject: [PATCH 77/78] RBP: Request allocation are pre-pinned + +--- + xbmc/linux/RBP.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp +index 79f932378cf37747be79e65fd0c2e2476f95474f..ee4a1d71fd8cc4517907952b14db86f310cb7ab0 100644 +--- a/xbmc/linux/RBP.cpp ++++ b/xbmc/linux/RBP.cpp +@@ -406,7 +406,7 @@ static int get_image_params(int file_desc, VC_IMAGE_T * img) + CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached) + { + m_numbytes = numbytes; +- m_vcsm_handle = vcsm_malloc_cache(numbytes, cached ? VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE, (char *)"CGPUMEM"); ++ m_vcsm_handle = vcsm_malloc_cache(numbytes, (VCSM_CACHE_TYPE_T)(0x80 | (unsigned)(cached ? 
VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE)), (char *)"CGPUMEM"); + assert(m_vcsm_handle); + m_vc_handle = vcsm_vc_hdl_from_hdl(m_vcsm_handle); + assert(m_vc_handle); + +From d8396450f95c8119a99c27cb3b60730ff4f170af Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 13 Oct 2017 20:29:23 +0100 +Subject: [PATCH 78/78] MMALFFMpeg: Avoid crash with 10bit HEVC by accepting + format but failing + +--- + xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +index 8444d0df598caef958e4ac3254419f3b4f95c513..a5a28ab25a97417d8524e68b46d3e44fa8b35bad 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +@@ -241,7 +241,7 @@ enum AVPixelFormat CDVDVideoCodecFFmpeg::GetFormat(struct AVCodecContext * avctx + #endif + + #ifdef HAS_MMAL +- if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128) ++ if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128 || *cur == AV_PIX_FMT_SAND64_10) + { + MMAL::CDecoder* dec = new MMAL::CDecoder(ctx->m_processInfo, ctx->m_hints); + if(dec->Open(avctx, ctx->m_pCodecContext, *cur, ctx->m_uSurfacesCount)) diff --git a/projects/RPi2/patches/kodi/kodi-001-backport.patch b/projects/RPi2/patches/kodi/kodi-001-backport.patch index ed3b647051..2f28a65810 100644 --- a/projects/RPi2/patches/kodi/kodi-001-backport.patch +++ b/projects/RPi2/patches/kodi/kodi-001-backport.patch @@ -1,7 +1,7 @@ -From 6cebd3b7186d58ee1dd14263f532f9a8c6f005bd Mon Sep 17 00:00:00 2001 +From 9eeffaec4cf147576df92e0e97cd9fd8ca248c53 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 28 Oct 2014 00:19:40 +0000 -Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats +Subject: [PATCH 01/78] [cec] Add settings for configuring button repeats --- addons/resource.language.en_gb/resources/strings.po | 15 +++++++++++++++ @@ -10,10 +10,10 @@ Subject: [PATCH 01/75] [cec] Add settings for configuring button repeats 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index e0060d1fae556de529274dbc6be07455701573a3..6443f3dd885bf0aa8e031039e36e273972a310ae 100644 +index 9009023f4d39d10b180cdbe981c0329cc3a3c3b2..7735c3cb0f010bb824896f5fb70ff28e9548b5ac 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19745,3 +19745,18 @@ msgstr "" +@@ -19787,3 +19787,18 @@ msgstr "" msgctxt "#39010" msgid "Select sort method" msgstr "" @@ -48,7 +48,7 @@ index d5704b249c3065b2980dc92c7c81dc7b384187bc..02b1a9ed6fce1986bd864bba09a9df06 diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e829f35046f 100644 +index c78d1c206c14ea6d7ee92cd7fd03fbc62f0fd1d5..88289b3cbabacbe51aab3ab2ed0e1f2d46b5be79 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -1296,6 +1296,20 @@ void CPeripheralCecAdapter::SetConfigurationFromLibCEC(const CEC::libcec_configu @@ -82,10 +82,10 @@ index d032ffd707fee5eec035e90bdf618530f7215c37..30367a3fde956090afdca9930fa52e82 if (GetSettingBool("pause_playback_on_deactivate")) { -From 
0fdeeb63794764ebdd628e52d170bf8bac330efd Mon Sep 17 00:00:00 2001 +From 527184b27a880ec5bc2722d8c1e3075416889818 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 26 Apr 2014 17:27:52 +0100 -Subject: [PATCH 02/75] [cec] Don't suspend pi on tv switch off - it can't wake +Subject: [PATCH 02/78] [cec] Don't suspend pi on tv switch off - it can't wake up --- @@ -106,10 +106,10 @@ index 02b1a9ed6fce1986bd864bba09a9df0621f9e041..54f9b70cfd5c8c82ceb99932e1b3e325 -From 36f4544b7ac9c810c875e8ae19ab92b3f3dafb59 Mon Sep 17 00:00:00 2001 +From 9bc5c32ef31ccd55b48689b7287cf5afa003514f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 7 Apr 2014 18:19:32 +0100 -Subject: [PATCH 03/75] [rbp/omxplayer] When opening a stream don't try to +Subject: [PATCH 03/78] [rbp/omxplayer] When opening a stream don't try to update gui so often --- @@ -133,10 +133,10 @@ index c8fe0706d128b3c67a4000894129ae0fa08bb223..8a5916299575661743131b921a27a76f dialog->ProcessRenderLoop(false); if (allowCancel && dialog->IsCanceled()) -From 2be0471046b5e75078f1a284348b3d2fbd033555 Mon Sep 17 00:00:00 2001 +From eb6f9850358766675b79c8724d9f645ac8d9c280 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 8 Mar 2014 15:36:06 +0000 -Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958 +Subject: [PATCH 04/78] [hifiberry] Hack: force it to be recognised as IEC958 capable to enable passthrough options --- @@ -144,10 +144,10 @@ Subject: [PATCH 04/75] [hifiberry] Hack: force it to be recognised as IEC958 1 file changed, 4 insertions(+) diff --git a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp -index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5ca4f62f5 100644 +index 4d87afa2c94c4e18e8001b9c105e0b5e6cc379d8..274000806ed7dc43130f4282cc0aedb3ae4ee209 100644 --- a/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp +++ b/xbmc/cores/AudioEngine/Sinks/AESinkALSA.cpp -@@ -1351,6 +1351,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev +@@ -1356,6 +1356,10 @@ void CAESinkALSA::EnumerateDevice(AEDeviceInfoList &list, const std::string &dev if (snd_card_get_name(cardNr, &cardName) == 0) info.m_displayName = cardName; @@ -159,10 +159,10 @@ index d66993a09583d8f9f54f5f97c18fbba45dddee9b..3c0b691860ace57e0a25f01013df01a5 info.m_displayName.substr(info.m_displayName.size()-5) == " HDMI") { -From e2b718b239b65f2132406355dfdf9c66da744b9c Mon Sep 17 00:00:00 2001 +From 3d01a30955c492a7992442dd493eecbfb2f4a4c6 Mon Sep 17 00:00:00 2001 From: Ben Avison Date: Thu, 1 May 2014 16:28:39 +0100 -Subject: [PATCH 05/75] Improved file buffering in CArchive +Subject: [PATCH 05/78] Improved file buffering in CArchive Even though memcpy is typically inlined by the compiler into byte/word loads and stores (at least for release builds), the frequency with which 1, 2 and 4 @@ -222,10 +222,10 @@ index 23cac2759fb10d532da56fa75c5528c5589e9010..89d31d4db1afa7340ed8cd51a7a9fa7a } -From e59492cefc6ebc66027e7fb96475f14ad14a650c Mon Sep 17 00:00:00 2001 +From 3db1ae9a40311ab19b2b31a6b311b1e9e95db224 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 10 Aug 2014 16:45:16 +0100 -Subject: [PATCH 06/75] filesystem: Make support of browsing into archives +Subject: [PATCH 06/78] filesystem: Make support of browsing into archives optional The ability to browse, scan and play content in archives can cause problems on low powered/low memory devices. 
@@ -244,10 +244,10 @@ We'll let people who don't use archives disable it manually 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 6443f3dd885bf0aa8e031039e36e273972a310ae..7dfc5355cc0d85d94360ba21bc738733e4878f3d 100644 +index 7735c3cb0f010bb824896f5fb70ff28e9548b5ac..6b275ba8dba63f6d09a92c92a4d599af8ef46bec 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19388,6 +19388,15 @@ msgstr "" +@@ -19430,6 +19430,15 @@ msgstr "" #: system/settings/rbp.xml msgctxt "#38010" msgid "GPU accelerated" @@ -284,10 +284,10 @@ index 62e9c8ed2199f8c57a640b06b0216ee4c8f0ca1e..e8b0d3d472b02fd161a4b51e957b9129 + diff --git a/xbmc/Util.cpp b/xbmc/Util.cpp -index c3567941192c724f2600494a8d7e355584b57b52..da1508dcedbd196789988d895e64548a08439d8f 100644 +index ca99cf148057f44883ca6be08d340956bbe40f80..4fdac55278ee3a7e4c88f038bb6a39ddb54211cd 100644 --- a/xbmc/Util.cpp +++ b/xbmc/Util.cpp -@@ -1899,7 +1899,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, +@@ -1904,7 +1904,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, URIUtils::RemoveExtension(strCandidate); if (StringUtils::StartsWithNoCase(strCandidate, videoName)) { @@ -296,7 +296,7 @@ index c3567941192c724f2600494a8d7e355584b57b52..da1508dcedbd196789988d895e64548a CUtil::ScanArchiveForAssociatedItems(pItem->GetPath(), "", item_exts, associatedFiles); else { -@@ -1909,7 +1909,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, +@@ -1914,7 +1914,7 @@ void CUtil::ScanPathsForAssociatedItems(const std::string& videoName, } else { @@ -335,10 +335,10 @@ index a0fd0a9011e71f4af1535110c696b6ea5c4b37db..688b71a297c7c617c6764bfe6be157d7 { CURL xbtUrl = URIUtils::CreateArchivePath("xbt", url); -From 73698542aed16c452fc15f5cd5a438e127676b68 Mon Sep 17 00:00:00 2001 +From d32758d3aef8d023416c0911983901fb85912bfc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Oct 2014 13:06:57 +0000 -Subject: [PATCH 07/75] [rbp] Make cachemembuffersize default depend on memory +Subject: [PATCH 07/78] [rbp] Make cachemembuffersize default depend on memory size --- @@ -402,7 +402,7 @@ index a35a509a91483f13e2cf0e688fc7e9528f254290..fffa5182126159f6dfcf750b21fa0464 void Deinitialize(); int GetArmMem() { return m_arm_mem; } diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176abebd27c 100644 +index aa802635ba3c295bd5d425af204e9ea98dee0a17..96021d579fe144d0050a7bb813e7a0dbc9d3c804 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -50,6 +50,9 @@ @@ -440,10 +440,10 @@ index 91574029c28c4fabacb4bc022aa028dcaf299adb..46d72aa072d34119f4a7273dc8f71176 } -From 48eb57a16b9d386dc54b42ab04700f8f7f85fab9 Mon Sep 17 00:00:00 2001 +From d31cd7eb3c58e0478dd1f388162aa3c665cc918b Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 30 May 2014 14:58:43 +0100 -Subject: [PATCH 08/75] [settings] Experiment: Report DESKTOP resolution in +Subject: [PATCH 08/78] [settings] Experiment: Report DESKTOP resolution in video settings --- @@ -465,10 +465,10 @@ index ef95bc286fa982790248bad26da3c3e00c1da002..da69c6960867621d4ebe9267929664d9 StringUtils::Format("%dx%d%s", resolution->width, resolution->height, ModeFlagsToString(resolution->flags, false).c_str()), -From 
952474c036385667d8ec894c178f58490af6f69c Mon Sep 17 00:00:00 2001 +From 883c180b3e1e5faf2391e2a5770a20c086608893 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 24 Sep 2014 23:13:52 +0100 -Subject: [PATCH 09/75] [audio] Add settings option to boost centre channel +Subject: [PATCH 09/78] [audio] Add settings option to boost centre channel when downmixing This allows a dB volume increase to be added to centre channel. @@ -486,10 +486,10 @@ Should work with Pi Sink (dvdplayer/paplayer) and omxplayer 5 files changed, 46 insertions(+) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390ada04bca8 100644 +index 6b275ba8dba63f6d09a92c92a4d599af8ef46bec..a1b537ff0d3b1d72f0b4e122c93f2e9fbfc4f4ac 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19608,6 +19608,21 @@ msgstr "" +@@ -19650,6 +19650,21 @@ msgstr "" #empty strings from id 38062 to 38099 @@ -512,10 +512,10 @@ index 7dfc5355cc0d85d94360ba21bc738733e4878f3d..c67fc9a16f303a822dadfb4f558a390a #: system/settings/settings.xml msgctxt "#38100" diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 301e7276e5b79e00457db1f33b1cd576bdef4c85..5f1f3ca48342ef1a4eeed7432221d7b2dda354e8 100644 +index 5ff71b9741c5d4d3c555042929e6764f3c6426da..536c2881d73e36ebb42ef495b426fc3fc34ba8ee 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml -@@ -2358,6 +2358,18 @@ +@@ -2374,6 +2374,18 @@ @@ -594,10 +594,10 @@ index f16b822ed7b4aebe18b5d339b3f71ee66e97c23f..993d4b33a294e88c2c004b7943895ba5 // stereo upmix if (upmix && m_src_channels == 2 && m_dst_channels > 2) -From 1296ca8ae16f160bd8bdf00491582f94577122c5 Mon Sep 17 00:00:00 2001 +From a11f649848327bd03eaed9224112c14a59e092cc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Oct 2014 15:23:51 +0000 -Subject: [PATCH 10/75] [rbp] Default extract thumbnails to false +Subject: [PATCH 10/78] [rbp] Default extract thumbnails to false It can take 80 seconds for a single file on a Pi. It can cause crashes with out-of-memory errors. It genereates a lot of support issues. Best to default to disabled and let users enable it if they must @@ -623,10 +623,10 @@ index e8b0d3d472b02fd161a4b51e957b9129e3cb9792..289dc55ec41aa44848519a05f8ee1ccc
-From 221907efb819c990488518eb9c4b7cfd91151e4e Mon Sep 17 00:00:00 2001 +From 1917960dc4fd495cb2b180d8a36235b6a1879773 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 27 Nov 2014 16:31:56 +0000 -Subject: [PATCH 11/75] [languageinvoker] Reduce priority of python threads +Subject: [PATCH 11/78] [languageinvoker] Reduce priority of python threads --- xbmc/interfaces/generic/LanguageInvokerThread.cpp | 5 +++++ @@ -649,10 +649,10 @@ index fcdd0633f30cd9595ae6cc4ed293677cdcb1f422..16f0c8916b5e0a9e90973d194cf2ebd1 } -From cf222655784da191a022a153fa5614cfbb4d79bd Mon Sep 17 00:00:00 2001 +From e9010a2ae44ca3ea0175a19721fd2dcd010e3019 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 29 Nov 2014 15:25:16 +0000 -Subject: [PATCH 12/75] [rbp] hack: wait for splash to complete before changing +Subject: [PATCH 12/78] [rbp] hack: wait for splash to complete before changing hdmi mode --- @@ -736,10 +736,10 @@ index ee297700f8583dbb15cbe53baf8c887b36bd2ea0..bbe501d40c5e101f1d0d64b8b59b1928 RENDER_STEREO_MODE stereo_mode = g_graphicsContext.GetStereoMode(); -From 7c77d589e065637bb0644889b520f3902b44b880 Mon Sep 17 00:00:00 2001 +From cc4bfd3f49bf4829e781fdc0a01743c9fa3927f5 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 11 Dec 2014 17:00:57 +0000 -Subject: [PATCH 13/75] Fix for UI not showing both extractflags and +Subject: [PATCH 13/78] Fix for UI not showing both extractflags and extractthumb --- @@ -748,7 +748,7 @@ Subject: [PATCH 13/75] Fix for UI not showing both extractflags and 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7 100644 +index a1b537ff0d3b1d72f0b4e122c93f2e9fbfc4f4ac..78ef8335f01cf1b023416a536155fdb5f3f62458 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po @@ -12451,7 +12451,7 @@ msgstr "" @@ -778,7 +778,7 @@ index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea msgstr "" #: system/settings/settings.xml -@@ -19784,3 +19784,7 @@ msgstr "" +@@ -19826,3 +19826,7 @@ msgstr "" msgctxt "#38052" msgid "Remote button press release time (ms)" msgstr "" @@ -787,7 +787,7 @@ index c67fc9a16f303a822dadfb4f558a390ada04bca8..b2f17db119a179e3e2bf4c8c186a19ea +msgid "Extract thumbnails from video files" +msgstr "" diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b29942315a 100644 +index 536c2881d73e36ebb42ef495b426fc3fc34ba8ee..eb96ec79d8c14a5a17af5228dd953699ae867008 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml @@ -974,8 +974,8 @@ @@ -802,10 +802,10 @@ index 5f1f3ca48342ef1a4eeed7432221d7b2dda354e8..2ed5fb217c6b9f63f28d760e2a2c00b2
-From 9e7d22b484cbccf5d54293a36c3cae38ce7426dd Mon Sep 17 00:00:00 2001 +From f61ebfb47fb09969e0f4e2ada140c08c1b5f08f0 Mon Sep 17 00:00:00 2001 From: anaconda Date: Thu, 11 Sep 2014 21:30:43 +0200 -Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while +Subject: [PATCH 14/78] Disable autoscrolling while on screensaver and while opening streams. --- @@ -818,10 +818,10 @@ Subject: [PATCH 14/75] Disable autoscrolling while on screensaver and while 6 files changed, 24 insertions(+), 3 deletions(-) diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp -index 947f0937d73cde5e4a4f39ed1a7932bd1e8eb0fe..593acafd15bb0409b4446b6e598f7aa4d7baf434 100644 +index a2448dc49e3be651761d5d6357ee946b46163ca9..1575f31827b842b19beea072b01ce3234c5d31b8 100644 --- a/xbmc/Application.cpp +++ b/xbmc/Application.cpp -@@ -5232,3 +5232,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const +@@ -5241,3 +5241,13 @@ bool CApplication::NotifyActionListeners(const CAction &action) const return false; } @@ -936,10 +936,10 @@ index d7bc1c5ba6067af9a460589920367288c640a915..ac766293f1c47c7f145cb46f6b152144 if (m_lastRenderTime) m_autoScrollDelayTime += currentTime - m_lastRenderTime; -From 831794fa04a8589069317953f813ada9f0d3bf54 Mon Sep 17 00:00:00 2001 +From add9791eb2b60ba9a0269a2d8e749bc8c4e58d5c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 13 Dec 2014 18:35:20 +0000 -Subject: [PATCH 15/75] [demuxer] Avoid memcpy on every demuxer packet +Subject: [PATCH 15/78] [demuxer] Avoid memcpy on every demuxer packet Avoids an unnecessary memcpy on every demuxer packet which for high bitrate videos can be significant. @@ -1039,10 +1039,10 @@ index df0f35bd49c65b302de4ccd110d859e8b881ea5f..b4b591ae4c4dd4fb0b36d4d00fedca96 } catch(...) { -From 9673bb4533c0a82f4712752b6f6d28f5f1ceb24e Mon Sep 17 00:00:00 2001 +From ce7935004d48f8ac5fa752e3eb08bcdba156ff23 Mon Sep 17 00:00:00 2001 From: anaconda Date: Wed, 25 Feb 2015 18:22:21 +0100 -Subject: [PATCH 16/75] Load OSD dialogs on startup. +Subject: [PATCH 16/78] Load OSD dialogs on startup. Fixes skipped frames the first time they're loaded in memory on less powered devices, like a Raspberry Pi, when using DVDPlayer. 
@@ -1137,10 +1137,10 @@ index 0534828dd85520134f7a6890e43a873e223062c1..5a86dfc1e2a54c8fe8d82cb75b612d8e CGUIDialogVideoSettings::~CGUIDialogVideoSettings() { } -From 19b2018244c328f5f88f90271e31de66bea486e3 Mon Sep 17 00:00:00 2001 +From 72df8a38ceba40e384e32f035e92f74685d5f5ef Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Apr 2015 20:51:14 +0100 -Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen +Subject: [PATCH 17/78] [gui] Also limit GUI updates when in non full-screen video mode --- @@ -1148,10 +1148,10 @@ Subject: [PATCH 17/75] [gui] Also limit GUI updates when in non full-screen 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xbmc/Application.cpp b/xbmc/Application.cpp -index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea61b2ddcc 100644 +index 1575f31827b842b19beea072b01ce3234c5d31b8..7d597841f438ad6175444e1d6da601e479ee445d 100644 --- a/xbmc/Application.cpp +++ b/xbmc/Application.cpp -@@ -2771,7 +2771,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) +@@ -2780,7 +2780,7 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) #if defined(TARGET_RASPBERRY_PI) || defined(HAS_IMXVPU) // This code reduces rendering fps of the GUI layer when playing videos in fullscreen mode // it makes only sense on architectures with multiple layers @@ -1160,7 +1160,7 @@ index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea fps = CSettings::GetInstance().GetInt(CSettings::SETTING_VIDEOPLAYER_LIMITGUIUPDATE); #endif -@@ -2784,6 +2784,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) +@@ -2793,6 +2793,8 @@ void CApplication::FrameMove(bool processEvents, bool processGUI) { if (!m_skipGuiRender) g_windowManager.Process(CTimeUtils::GetFrameTime()); @@ -1170,10 +1170,10 @@ index 593acafd15bb0409b4446b6e598f7aa4d7baf434..f9aed6476b069ccf391697642e7999ea g_windowManager.FrameMove(); } -From b7e74e740581f7e6ab94609171000b747da9c911 Mon Sep 17 00:00:00 2001 +From 5a463508bcd3a5c3863227f1c4fa53891b00da57 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 5 May 2015 23:58:06 +0100 -Subject: [PATCH 18/75] [screensaver] Leave GUI contents available for +Subject: [PATCH 18/78] [screensaver] Leave GUI contents available for screensaver --- @@ -1203,10 +1203,10 @@ index 5808f7ed1e94d68ead7305ba6d284edd4df12bdd..2a3b7f16531c9822e79c77efabdd30ac // Add window to the history list (we must do this before we activate it, -From fe4cef6b6e2a35352ede135ac84ff3539d1ff09e Mon Sep 17 00:00:00 2001 +From d18202353c69d022f480e48d8a6b1457ea7bd162 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 6 Jun 2015 18:43:57 +0100 -Subject: [PATCH 19/75] ffmpeg: Automatic switch to software decode for GMC +Subject: [PATCH 19/78] ffmpeg: Automatic switch to software decode for GMC with more than one warp point --- @@ -1434,10 +1434,10 @@ index f135d423c0ca76fd70e79ae5b7d035f0cb79fc75..d9b576bc46055fdab1c134e5f2c63cd4 else if ((hint.codec == AV_CODEC_ID_VC1 || hint.codec == AV_CODEC_ID_WMV3) && g_RBP.GetCodecWvc1()) supported = true; -From f5dabe10623f19cd9e8ea015e2d248d47c03900c Mon Sep 17 00:00:00 2001 +From 352bf85b5b1e081fb0222625869943fc136f2a6e Mon Sep 17 00:00:00 2001 From: Claudio-Sjo Date: Mon, 16 Feb 2015 14:51:26 +0100 -Subject: [PATCH 20/75] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer +Subject: [PATCH 20/78] - allow reads < CDIO_CD_FRAMESIZE_RAW by using a buffer - fixes #15794 --- @@ -1629,10 +1629,10 @@ index 
0427af4534bfe59a343f0518c7f4242d93299836..e99236294fa8b9b613e465a8ecaf3ad3 lsn_t m_lsnCurrent; // Position inside the track in logical sector number lsn_t m_lsnEnd; // End of m_iTrack in logical sector number -From 9e3b4fd8c161b01d324220252289a5b3a49fb7e8 Mon Sep 17 00:00:00 2001 +From ef632a0c5391db64b1bd65181141a8bab14af2e6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 24 Jun 2016 19:38:13 +0100 -Subject: [PATCH 21/75] codecoverlay: Include codec name in overlay +Subject: [PATCH 21/78] codecoverlay: Include codec name in overlay --- xbmc/cores/VideoPlayer/VideoPlayerAudio.cpp | 4 ++++ @@ -1726,10 +1726,10 @@ index 0df7e72cc9d1947173c2bac5e72eb09976b51aa5..b5050081c360d29b1b478c27e6b88291 double m_iSubtitleDelay; bool m_bRenderSubs; -From 119f7291d3b7c1a57d3a86b3836c8a73a7cd1211 Mon Sep 17 00:00:00 2001 +From e207be7c755180ebf556c455e70fd9cbba0e9540 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 8 Mar 2016 21:20:58 +0300 -Subject: [PATCH 22/75] [DebugInfo] Add cpu usage info. +Subject: [PATCH 22/78] [DebugInfo] Add cpu usage info. --- .../VideoPlayer/VideoRenderers/DebugRenderer.cpp | 56 ++++++++-------------- @@ -1899,10 +1899,10 @@ index 420b5b5d8e6089e1049ef9af25e23d915df50dc1..fd8a0a2447c40357a9e13003f2ef45ef m_debugTimer.Set(1000); -From 21927619971ef137030d64a0dd102a90a7effaf0 Mon Sep 17 00:00:00 2001 +From d5f5094b8c01bbfe96a13d56bdf4bbdd680f1876 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 22 May 2015 13:56:29 +0100 -Subject: [PATCH 23/75] ffmpeg: Allow neon to be enabled in unified builds +Subject: [PATCH 23/78] ffmpeg: Allow neon to be enabled in unified builds --- tools/depends/target/ffmpeg/Makefile | 4 ++++ @@ -1925,10 +1925,10 @@ index 8dd14cdfd053f142f386b6dee1fc0b21bb1f8d93..b5f38a458dfb341c43089e07afded153 ifeq ($(OS), linux) ffmpg_config += --target-os=$(OS) --cpu=$(CPU) -From 7c9767ac163fada0423cf8cc27b05f0d74482220 Mon Sep 17 00:00:00 2001 +From c652b5f7d6541c5a6110f8003ea85162d36418b7 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 27 Feb 2015 14:37:27 +0000 -Subject: [PATCH 24/75] ffmpeg: Add some upstream HEVC optimisations +Subject: [PATCH 24/78] ffmpeg: Add some upstream HEVC optimisations --- tools/depends/target/ffmpeg/Makefile | 6 +- @@ -5726,10 +5726,10 @@ index 0000000000000000000000000000000000000000..5e8e07d407f045fc99554f0f061d1e81 +2.5.0 + -From f15eaf9000104c97d5bfc5ea046b4407cab2a261 Mon Sep 17 00:00:00 2001 +From 246118b5d0b263cc78efc5fedf3bfb0dc87727b6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 7 May 2015 14:04:18 +0100 -Subject: [PATCH 25/75] [ffmpeg] Add GPU acceleration to hevc +Subject: [PATCH 25/78] [ffmpeg] Add GPU acceleration to hevc --- tools/depends/target/ffmpeg/Makefile | 4 +- @@ -43915,10 +43915,10 @@ index 0000000000000000000000000000000000000000..e172ebf157aebffe1ae50b4a2b25fd71 +2.7.4 + -From 88b331888a7677058bb3dfb064d7eb952b0ce1a9 Mon Sep 17 00:00:00 2001 +From 2fb19cb4de89eb353132eddf05725bb802cf4a15 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 12 Jan 2016 16:29:57 +0000 -Subject: [PATCH 26/75] ffmpeg: Add cabac opimisations for hevc +Subject: [PATCH 26/78] ffmpeg: Add cabac opimisations for hevc --- .../0001-Squashed-commit-of-the-following.patch | 2179 ++++++++++++++++++++ @@ -46163,10 +46163,10 @@ index d6856dbd4fb4957ace700cbc08332223c01938f6..a61357f14cb2139e8125ae04684bed1b make -j ${BUILDTHREADS} -From ce532b19d18df015cecb0e2e2ec85f0c89885a25 Mon Sep 17 00:00:00 2001 +From 9f9d02837471153722950ee6455ce8cd0b92b4fc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 16 Sep 2015 
19:05:12 +0100 -Subject: [PATCH 27/75] [3d] Make MVC a valid 3D filename tag +Subject: [PATCH 27/78] [3d] Make MVC a valid 3D filename tag --- xbmc/guilib/StereoscopicsManager.cpp | 9 +++++++++ @@ -46195,7 +46195,7 @@ index b34873cba6534086ae243326550385867a03256a..1443acaf0f25df458ae49766e13dd032 } diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 46d72aa072d34119f4a7273dc8f71176abebd27c..cca5c7f932241d146291d2bb0a0042f99fa0d596 100644 +index 96021d579fe144d0050a7bb813e7a0dbc9d3c804..0c636c9ed2f57b7a39d58c361012337c862128bc 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -402,6 +402,7 @@ void CAdvancedSettings::Initialize() @@ -46227,10 +46227,10 @@ index fc526d11c3a78bc74125429120e29bf295bd3b16..6b0e3b8cf9e3ff40e6af758c54fe7eef bool m_useDisplayControlHWStereo; -From df4fc81637ca4b47d4ce0e64110d8bab4bd77cd4 Mon Sep 17 00:00:00 2001 +From f9e13ac675d5d62b461ed4c6bb54e52dd1c85685 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 5 Oct 2015 14:58:05 +0100 -Subject: [PATCH 28/75] [3d] Swap top/bottom sides of GUI +Subject: [PATCH 28/78] [3d] Swap top/bottom sides of GUI --- xbmc/guilib/GraphicContext.cpp | 2 +- @@ -46250,10 +46250,10 @@ index 3706e4d80b3b31da4c5be0a1b21f36e59d2910f2..e170b3fb05279ffa316794dbce1d4f9d } if(m_stereoMode == RENDER_STEREO_MODE_SPLIT_VERTICAL) -From 2373df61c862bc62538391596c098a80968d1c0d Mon Sep 17 00:00:00 2001 +From 67c2ae9f396aa82586b5fd896ee36447ceb5cdba Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 11 Oct 2015 20:51:37 +0100 -Subject: [PATCH 29/75] Revert "Revert "Disable extra logging by default"" +Subject: [PATCH 29/78] Revert "Revert "Disable extra logging by default"" This reverts commit a880554325be187b877cd8f0e2b338e7267da636. --- @@ -46261,10 +46261,10 @@ This reverts commit a880554325be187b877cd8f0e2b338e7267da636. 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2fdbe11b2 100644 +index eb96ec79d8c14a5a17af5228dd953699ae867008..3a0d9bd1274b0664e34eb8865f41caf816bc2c30 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml -@@ -2834,12 +2834,12 @@ +@@ -2850,12 +2850,12 @@
1 @@ -46280,10 +46280,10 @@ index 2ed5fb217c6b9f63f28d760e2a2c00b29942315a..850abcd174cc8773319639c7e337f2e2 loggingcomponents , -From a0543043a26699a0e4a8bed989481ab1320e3f0c Mon Sep 17 00:00:00 2001 +From 620756eeb948638bc08d6ca831cd12d82955dfa4 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 21 Dec 2015 22:17:25 +0000 -Subject: [PATCH 30/75] [omximage] Fall back to arm jpeg encode/decode when gpu +Subject: [PATCH 30/78] [omximage] Fall back to arm jpeg encode/decode when gpu is busy --- @@ -46526,10 +46526,10 @@ index a93aa82663903fb1bf712058c2e259290ee742e6..6f38dbc7e5cc721c59a3633935f08218 extern COMXImage g_OMXImage; -From 72ad7c69c3f847ade231f29ac23ffb96ebaf2ae4 Mon Sep 17 00:00:00 2001 +From 3492f4e80fc154aeaebf4178079db75b536cb9d7 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 9 Dec 2015 13:31:14 +0000 -Subject: [PATCH 31/75] [mmalcodec] Fail to open when width is invalid. Can +Subject: [PATCH 31/78] [mmalcodec] Fail to open when width is invalid. Can happen with mpegts files --- @@ -46551,10 +46551,10 @@ index 822b7bf75f2e732b5eed8687403d0eda503fa641..c43952d4d29b42f3a5c7605573294568 if (!CSettings::GetInstance().GetBool(CSettings::SETTING_VIDEOPLAYER_USEMMAL) || hints.software) return false; -From 0e735b38e2891c582c5a37dc5ded26cb954948a8 Mon Sep 17 00:00:00 2001 +From 93000006d2c2edaf16d7b3302914f3f32135830f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 Sep 2014 11:54:49 +0100 -Subject: [PATCH 32/75] [videoplayer/rbp] Add pi specific option to maintain +Subject: [PATCH 32/78] [videoplayer/rbp] Add pi specific option to maintain vsync with pll adjustment New A/V sync option in settings/video/playback to do "Adjust PLL". @@ -46576,10 +46576,10 @@ or drop/dupe audio packets which is normally required. 12 files changed, 143 insertions(+), 21 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index b2f17db119a179e3e2bf4c8c186a19ea4e6d49a7..55ec0a9985a8e77873d787e879d73c076e13b2c6 100644 +index 78ef8335f01cf1b023416a536155fdb5f3f62458..097464415a596cf13b3c245bbedd616f5a4e49ef 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po -@@ -19788,3 +19788,35 @@ msgstr "" +@@ -19830,3 +19830,35 @@ msgstr "" msgctxt "#38190" msgid "Extract thumbnails from video files" msgstr "" @@ -46641,7 +46641,7 @@ index 289dc55ec41aa44848519a05f8ee1ccc72740085..2572e25753712186f69390965ee1448b diff --git a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp -index f5671b8dfb03216301d936ae3b08bfc3e8225729..68399ab14faf813bd195d2fdf03a4a376307b4cd 100644 +index a1ea0791f48888257db50ce3b7807fd6ced6e8c1..45a293d1d34c12d77c03027cc282c3a3a7940354 100644 --- a/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp +++ b/xbmc/cores/AudioEngine/Engines/ActiveAE/ActiveAE.cpp @@ -363,11 +363,12 @@ void CActiveAE::StateMachine(int signal, Protocol *port, Message *msg) @@ -46995,10 +46995,10 @@ index fffa5182126159f6dfcf750b21fa0464e229e545..815d758e7086d73b4d4eb16849fdbb50 extern CRBP g_RBP; -From d4a5c46043ced09c53dea24e6ca090a574806e3b Mon Sep 17 00:00:00 2001 +From e9ca06686d8335b2292676c0f1cbc7b6ab66b1fc Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 7 May 2015 15:35:43 +0100 -Subject: [PATCH 33/75] rbp: Support zero copy interface with hevc acceleration +Subject: [PATCH 33/78] rbp: Support zero copy interface with hevc acceleration --- 
xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 9 +++++++++ @@ -47042,10 +47042,10 @@ index 77ae3273bc8e224fe6c193300ccef32fb7fbafe1..c0b3f19f2ef9cdef9adf00cf81154803 if (g_advancedSettings.CanLogComponent(LOGVIDEO)) CLog::Log(LOGDEBUG, "%s::%s - mmal:%p dts:%.3f pts:%.3f buf:%p gpu:%p", CLASSNAME, __FUNCTION__, picture->MMALBuffer->mmal_buffer, 1e-6*picture->dts, 1e-6*picture->pts, picture->MMALBuffer, gmem); -From 0fbf365c6de020f0d094c8ab221b159593eecce5 Mon Sep 17 00:00:00 2001 +From c666a1a3c43b1dd4bf03cf64a433a945140597d3 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 16 May 2015 18:26:04 +0100 -Subject: [PATCH 34/75] ffmpeg: use upstream mvc patches +Subject: [PATCH 34/78] ffmpeg: use upstream mvc patches --- ...vcodec-add-h264_mvc-codec-id-and-profiles.patch | 68 ++++++++++++ @@ -47355,10 +47355,10 @@ index 0000000000000000000000000000000000000000..b39480ad098b9cd0882fcf75b96afb1b +2.7.4 + -From f303faf857227cee88db21f5e95bd0a7d2f8c06e Mon Sep 17 00:00:00 2001 +From af658a2b7faaf843451a5ccac8113949c8cd0de7 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 29 Jan 2016 17:18:50 +0300 -Subject: [PATCH 35/75] [win32] Settings: Added setting to enable/disable MVC +Subject: [PATCH 35/78] [win32] Settings: Added setting to enable/disable MVC decoder. --- @@ -47388,10 +47388,10 @@ index a017d30c24232fb01220b87b29398403b8ed9662..2fcee72a64e8b701c8e895143410bbe9 -From 9f1937bc8941347695d09078e624cc30beab4a6d Mon Sep 17 00:00:00 2001 +From 9f1ecf9e4e7dab6b7649d3ee549e0dee28274a92 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Wed, 20 Jan 2016 17:02:16 +0300 -Subject: [PATCH 36/75] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc +Subject: [PATCH 36/78] [VideoPlayer] DemuxFFmpeg: Properly demuxing h264_mvc streams. --- @@ -47454,10 +47454,10 @@ index 54a18c669a058b705e0276cb7e14522ae6cd04ae..55431978dcfabee8da95e2e76292ff81 } case AVMEDIA_TYPE_DATA: -From a451efc2d79422565ef1cbf931444c3ef5165125 Mon Sep 17 00:00:00 2001 +From c9931f6dca15e99b1557269f4277ebfb68bfc52c Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 25 Feb 2016 11:21:25 +0300 -Subject: [PATCH 37/75] [Stereo3D] Added block_lr and block_rl to supported +Subject: [PATCH 37/78] [Stereo3D] Added block_lr and block_rl to supported modes. --- @@ -47507,10 +47507,10 @@ index 1443acaf0f25df458ae49766e13dd0323454f2eb..6aaa82f4d883b8cae0ccdedf6c5a6814 i++; } -From 39522c63603fb5bf00b95a0eba5df6a626ea240f Mon Sep 17 00:00:00 2001 +From 368c41c30868369865b59bea96f21a21b01bf9b9 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Sat, 23 Jan 2016 10:21:32 +0300 -Subject: [PATCH 38/75] [VideoPlayer] Fix possible wrong aspect. +Subject: [PATCH 38/78] [VideoPlayer] Fix possible wrong aspect. 
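Patch 37 above adds the block_lr and block_rl layout tokens (full-frame packing, as produced by MVC) to the set of recognised stereoscopic modes. A minimal sketch of the kind of token-to-mode lookup being extended; the enum and map below are illustrative stand-ins, not Kodi's actual RENDER_STEREO_MODE identifiers.

#include <string>
#include <unordered_map>

// Stand-in names only -- Kodi's real values are the RENDER_STEREO_MODE_*
// constants touched by the patch, not this enum.
enum class StereoLayout { Off, SideBySide, TopBottom, BlockLR, BlockRL };

static StereoLayout ParseStereoTag(const std::string& tag)
{
    static const std::unordered_map<std::string, StereoLayout> kModes = {
        {"left_right", StereoLayout::SideBySide},
        {"top_bottom", StereoLayout::TopBottom},
        {"block_lr",   StereoLayout::BlockLR},  // full-frame packing, left view first (MVC)
        {"block_rl",   StereoLayout::BlockRL},  // full-frame packing, right view first
    };
    const auto it = kModes.find(tag);
    return it == kModes.end() ? StereoLayout::Off : it->second;
}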
--- xbmc/cores/VideoPlayer/VideoPlayerVideo.cpp | 2 +- @@ -47530,10 +47530,10 @@ index 903f0d83527d9088ff1bf0ba056f357f6abfda81..a5a33d34c70892cde77ad4d8f3cb65fd else m_fForcedAspectRatio = 0.0; -From b362a9d5e20db180bc6fce923188a921e7a0e985 Mon Sep 17 00:00:00 2001 +From db6058178103680919790f134a21bc7898c6561a Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 22 Jan 2016 18:18:33 +0300 -Subject: [PATCH 39/75] [VideoPlayer] DemuxFFmpeg: ssif remux +Subject: [PATCH 39/78] [VideoPlayer] DemuxFFmpeg: ssif remux --- xbmc/cores/VideoPlayer/DVDDemuxers/CMakeLists.txt | 2 + @@ -47954,7 +47954,7 @@ index e4f8aed0af96fe0dceec4d8517087742f2c7df81..30076937bd084936571abf0e6eeecf5a LIB = DVDDemuxers.a diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555c927d455 100644 +index 0c636c9ed2f57b7a39d58c361012337c862128bc..1c23e5b0f25f0c6f2e5f7cab166aac825af5a30e 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -391,7 +391,7 @@ void CAdvancedSettings::Initialize() @@ -47967,10 +47967,10 @@ index cca5c7f932241d146291d2bb0a0042f99fa0d596..edbc96f7be3ae4dae994320f8c137555 m_discStubExtensions = ".disc"; // internal music extensions -From 0bd2f0f4af5d90cd685380e36379590a378d024d Mon Sep 17 00:00:00 2001 +From 327cb97b6787b445345bb92befc6edb937fd11f7 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 23 Feb 2016 16:02:46 +0300 -Subject: [PATCH 40/75] [3DBD] Added support of 3D-BluRay playback. +Subject: [PATCH 40/78] [3DBD] Added support of 3D-BluRay playback. --- lib/DllLibbluray.h | 8 + @@ -48960,10 +48960,10 @@ index b967a85e6557e42a7f1235cdd804d5a0263b866f..561fb5cd4f971bc9ee4f41218a60bb3d typedef std::shared_ptr SOverlay; typedef std::list SOverlays; -From 913cd365b12a9730cb04bb8a9d5ebddde02d5503 Mon Sep 17 00:00:00 2001 +From d308e52086d930545d3411b0280cfb552d237c8e Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 11 Mar 2016 16:58:53 +0300 -Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream +Subject: [PATCH 41/78] [VideoPlayer] HasVideo returns true if video stream exists. This don't allow start visualization if audio is opened before video. --- @@ -48971,7 +48971,7 @@ Subject: [PATCH 41/75] [VideoPlayer] HasVideo returns true if video stream 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp -index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45fb2497640 100644 +index f909c8e451a057aa9f1d7d4c3264c8a7059185c1..69ce875fd44606d55e3186868927aaaec99e934c 100644 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp @@ -3074,7 +3074,7 @@ void CVideoPlayer::Pause() @@ -48984,10 +48984,10 @@ index 0285de264b4abc9433d70ae056b80c3db4b318c9..b244a21ac083c6f7b0e2d455e2b7a45f bool CVideoPlayer::HasAudio() const -From e8a09603950b958dd1934cb460fda960759485f8 Mon Sep 17 00:00:00 2001 +From 206d1d4f0b845f29638f2139c26d3461215c3b43 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 10 Mar 2016 18:11:33 +0300 -Subject: [PATCH 42/75] fixup! Revert supporting crappy tab/sbs subtitles. this +Subject: [PATCH 42/78] fixup! Revert supporting crappy tab/sbs subtitles. this fixes regular subtitles. 
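Patch 40 above wires 3D-BluRay playback through libbluray's MVC-aware build. As a standalone illustration of the library surface that Kodi's DllLibbluray wrapper binds (this is not Kodi code, and it assumes the bumped 0.9.2-mvc headers are on the include path):

#include <cstdint>
#include <cstdio>
#include <libbluray/bluray.h>

int main(int argc, char** argv)
{
    if (argc < 2)
        return 1;
    BLURAY* bd = bd_open(argv[1], nullptr);        // disc root or ISO, no keyfile
    if (!bd)
        return 1;
    const uint32_t titles = bd_get_titles(bd, TITLES_RELEVANT, 0);
    std::printf("relevant titles: %u\n", titles);
    if (titles > 0 && bd_select_title(bd, 0)) {
        unsigned char unit[6144];                  // one BluRay aligned unit: 32 * 192 bytes
        const int n = bd_read(bd, unit, sizeof unit);  // raw m2ts; on 3D discs this carries the MVC extension packets
        std::printf("read %d bytes\n", n);
    }
    bd_close(bd);
    return 0;
}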
--- @@ -49024,10 +49024,10 @@ index 3a080d06c90b0762482816928642e6de7810b539..a8323f419e404037c4e5fb4d78fa1b45 CDVDOverlayImage* overlay = new CDVDOverlayImage(); -From f10689878e33dc69a2ebbd559f41de12e72784c5 Mon Sep 17 00:00:00 2001 +From c4eb63ed988135ca18ab0b77357def3513bd3585 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 7 Apr 2016 17:28:50 +0300 -Subject: [PATCH 43/75] [VideoPlayer] Disable reading extension stream from +Subject: [PATCH 43/78] [VideoPlayer] Disable reading extension stream from input stream if decoder doesn't support it. --- @@ -49257,7 +49257,7 @@ index 0b676c9b611fe956f1aa721013412e41ff5b62f6..6762e733848d1298a75a862b0aaf81aa class CDVDAudioCodec; diff --git a/xbmc/cores/VideoPlayer/VideoPlayer.cpp b/xbmc/cores/VideoPlayer/VideoPlayer.cpp -index b244a21ac083c6f7b0e2d455e2b7a45fb2497640..69b031a5623888a1b9a8c0ca7fe34fe3b1900fdc 100644 +index 69ce875fd44606d55e3186868927aaaec99e934c..abdaef946b4155a74ea4abe9f8bf0db9403be710 100644 --- a/xbmc/cores/VideoPlayer/VideoPlayer.cpp +++ b/xbmc/cores/VideoPlayer/VideoPlayer.cpp @@ -3802,6 +3802,10 @@ bool CVideoPlayer::OpenVideoStream(CDVDStreamInfo& hint, bool reset) @@ -49284,10 +49284,10 @@ index 0d4100e58e9db7e5035bcf9ae23b0147f80cec8f..69570153f0810a5840f3780c7a6681a1 // classes CDVDOverlayContainer* m_pOverlayContainer; -From 74d399ad03a76c6f63c4fab2ba8ba2760a2f2180 Mon Sep 17 00:00:00 2001 +From edc458879b9e964f886c17cee950951ac3d1000e Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 16 Sep 2016 11:37:48 +0300 -Subject: [PATCH 44/75] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from +Subject: [PATCH 44/78] [Settings] move SETTING_VIDEOPLAYER_SUPPORTMVC from platform settings to common settings. --- @@ -49317,7 +49317,7 @@ index 2572e25753712186f69390965ee1448bff3fadd5..7098edf32dff8c00e192229c3ffb060b
diff --git a/system/settings/settings.xml b/system/settings/settings.xml -index 850abcd174cc8773319639c7e337f2e2fdbe11b2..0fb9464a598cad05893bff627cbd7ddee7341ca8 100644 +index 3a0d9bd1274b0664e34eb8865f41caf816bc2c30..e18bc802a49be8b12fcaac2af583c8b3c167b249 100644 --- a/system/settings/settings.xml +++ b/system/settings/settings.xml @@ -343,6 +343,12 @@ @@ -49383,10 +49383,10 @@ index 473ca093f45f6a5779cade1268269bb7ba483e9d..11a422b1a5cbfde9914d3bfd23b5b540 m_simpleConditions.insert("have_lcms2"); #endif -From 1f0f86550e8cfed2a5de0d436c5c1e1e2ea642a1 Mon Sep 17 00:00:00 2001 +From 3b9367571d460fefdb993c055d6a5a618976ed61 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Fri, 4 Nov 2016 22:56:56 +0300 -Subject: [PATCH 45/75] [VideoPlayer] SSIF: fix for corner case when mvc stream +Subject: [PATCH 45/78] [VideoPlayer] SSIF: fix for corner case when mvc stream is switched before the last packet is read from previous stream. --- @@ -49575,17 +49575,17 @@ index f70657c9e31fb2460d12910c635dba5163282e74..a11ec77903d2a9b2c68106a8e2301af9 typedef std::shared_ptr SOverlay; typedef std::list SOverlays; -From ddc42633af64cfc6e9447d40f988c86a9a04250d Mon Sep 17 00:00:00 2001 +From 2cfde24377717ce4f3f10879b5f7fc547a00b324 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Tue, 23 Feb 2016 16:01:08 +0300 -Subject: [PATCH 46/75] [libbluray] bump libbluray to 0.9.2-mvc. +Subject: [PATCH 46/78] [libbluray] bump libbluray to 0.9.2-mvc. --- project/BuildDependencies/scripts/0_package.list | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project/BuildDependencies/scripts/0_package.list b/project/BuildDependencies/scripts/0_package.list -index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c619c436f 100644 +index 2d15488f24703223db57848459e536cc08eb22cf..9c0c60ce3447e4d0e992457e5ca3be95d4296ea9 100644 --- a/project/BuildDependencies/scripts/0_package.list +++ b/project/BuildDependencies/scripts/0_package.list @@ -17,7 +17,7 @@ freetype-db5a22-win32-vc140.7z @@ -49598,10 +49598,10 @@ index 6f53a2785027cf6c34d084402f3f1aee7cf5860a..e4a67e91b0a6b9fafad972b0f6f8e86c libcec-4.0.1-win32-vc140-2.7z libfribidi-0.19.2-win32.7z -From 30060bc20c7f25701009d77d6b566e26ef77fa14 Mon Sep 17 00:00:00 2001 +From 2ca5b9e1beff48b0eb69c9d73e44acc7b8bc36bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 29 Feb 2016 17:00:50 +0000 -Subject: [PATCH 47/75] libbluray: Bump to Nevcairie's v0.9.2 +Subject: [PATCH 47/78] libbluray: Bump to Nevcairie's v0.9.2 This includes 3D support --- @@ -51258,10 +51258,10 @@ index 0000000000000000000000000000000000000000..5ef0124e35c9d81143921a328e272220 + + return fp; -From d3ad5d1c9d8da1ee7c63cd9302bef058b1da1135 Mon Sep 17 00:00:00 2001 +From 38193615b554a1fccc44f24dc43586c22bf59637 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 6 Mar 2016 12:54:59 +0000 -Subject: [PATCH 48/75] mvc: Automatically enable stereo mode +Subject: [PATCH 48/78] mvc: Automatically enable stereo mode --- xbmc/cores/VideoPlayer/DVDCodecs/Video/MMALCodec.cpp | 6 +++++- @@ -51319,10 +51319,10 @@ index 311dd6689236d660919c4c4483c51dca2752514a..536332c43e22ccb229e72b88518e54dd break; case AV_CODEC_ID_MPEG4: -From f1b065ebbb0f130da3e28a6a4375f9458cee3fd3 Mon Sep 17 00:00:00 2001 +From ba2b2c2b0373f3b598d32fef5beb1de84043a7f0 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 24 Mar 2016 13:02:58 +0000 -Subject: [PATCH 49/75] ffmpeg: mvc: fix for pixelation from packets with no +Subject: [PATCH 49/78] ffmpeg: mvc: fix for pixelation from packets with no pts/dts --- @@ 
-51384,10 +51384,10 @@ index 7e97e4d91a443d46d933df528763422ff5e8f4fa..d4f279fd4f2ceb260698cd6fedb124ba cd $(PLATFORM);\ CFLAGS="$(CFLAGS)" CXXFLAGS="$(CXXFLAGS)" CPPFLAGS="$(CPPFLAGS)" LDFLAGS="$(LDFLAGS)" \ -From 332a8c9c8739a159f62542856c686ee14e996bdd Mon Sep 17 00:00:00 2001 +From 7d414c38f171bb4bc394725b3c9057cd5ef0e2af Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 11 Nov 2016 15:53:53 +0000 -Subject: [PATCH 50/75] stereoscopicmanager: fixups for rbp +Subject: [PATCH 50/78] stereoscopicmanager: fixups for rbp --- xbmc/cores/VideoPlayer/DVDCodecs/DVDCodecUtils.cpp | 61 ++++++++++++++++++++++ @@ -51625,10 +51625,10 @@ index 6aaa82f4d883b8cae0ccdedf6c5a6814e7aaa720..cc929b599125a44ac128713fd4331782 }; -From 2d81f94dcaf52e951bb7e203ea248b48c24d15aa Mon Sep 17 00:00:00 2001 +From 28a24fe8f76d7a8506ec472a2075bcb110009471 Mon Sep 17 00:00:00 2001 From: Anton Fedchin Date: Thu, 10 Mar 2016 18:11:33 +0300 -Subject: [PATCH 51/75] fixup! Revert supporting crappy tab/sbs subtitles. this +Subject: [PATCH 51/78] fixup! Revert supporting crappy tab/sbs subtitles. this fixes regular subtitles. --- @@ -51648,10 +51648,10 @@ index a8323f419e404037c4e5fb4d78fa1b45409337a7..7c0b70777556ac7694e7fc511cd4bb18 } -From 48664856527a85a6d242649a5dcebf85d9420171 Mon Sep 17 00:00:00 2001 +From 669cd51bee072e99b3b0d2b2a911b9738b1f8e75 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 26 Nov 2016 18:24:18 +0000 -Subject: [PATCH 52/75] DemuxMVC: fixup after SeekTime API change +Subject: [PATCH 52/78] DemuxMVC: fixup after SeekTime API change --- xbmc/cores/VideoPlayer/DVDDemuxers/DemuxMVC.cpp | 2 +- @@ -51685,17 +51685,17 @@ index bbb836a61344689a83af68c821c05c212a86b097..54f91a02391368fbfbb4d669c003f425 virtual int GetStreamLength() { return 0; }; virtual CDemuxStream* GetStream(int iStreamId) const override { return nullptr; }; -From 945b547c444e7ec5039c88e31b612c57b25edd1b Mon Sep 17 00:00:00 2001 +From e9a6484af92ae3058b3e1afbdfb96c3bc8434a12 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 3 Nov 2014 23:17:46 +0000 -Subject: [PATCH 53/75] [cec] Don't discard buttons when repeat mode is enabled +Subject: [PATCH 53/78] [cec] Don't discard buttons when repeat mode is enabled --- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27d911b4da 100644 +index 88289b3cbabacbe51aab3ab2ed0e1f2d46b5be79..543a65716cd2eec73210fa80ba992ed5acc13b6b 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -803,7 +803,10 @@ void CPeripheralCecAdapter::PushCecKeypress(const CecButtonPress &key) @@ -51711,17 +51711,17 @@ index 30367a3fde956090afdca9930fa52e829f35046f..febacb3b7964eab3b8615a6a807e0f27 if (m_currentButton.iButton == key.iButton && m_currentButton.iDuration == 0) { -From 70d24188f34e2846d42f18146baf43952c31aae3 Mon Sep 17 00:00:00 2001 +From a4f0a7285c72b817b79cbe5ee5ff0c30b00ab758 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 4 Nov 2014 18:50:00 +0000 -Subject: [PATCH 54/75] [cec] Temp - more logging +Subject: [PATCH 54/78] [cec] Temp - more logging --- xbmc/peripherals/devices/PeripheralCecAdapter.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp -index 
febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7adde96cb8 100644 +index 543a65716cd2eec73210fa80ba992ed5acc13b6b..7192a651afef38e34f0cd6def89160c86ea39ee0 100644 --- a/xbmc/peripherals/devices/PeripheralCecAdapter.cpp +++ b/xbmc/peripherals/devices/PeripheralCecAdapter.cpp @@ -800,12 +800,15 @@ void CPeripheralCecAdapter::GetNextKey(void) @@ -51766,10 +51766,10 @@ index febacb3b7964eab3b8615a6a807e0f27d911b4da..52d6e6a7ab68ce91faf5a3881b23ea7a } -From 0d75b80f8862d67a4edc9f769acc0d18448ad268 Mon Sep 17 00:00:00 2001 +From b5674b0835773d0e8f95a882b706f69de2f4f7a6 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 25 May 2016 18:31:17 +0100 -Subject: [PATCH 55/75] rbp: Hard code the number of buffers to improve audio +Subject: [PATCH 55/78] rbp: Hard code the number of buffers to improve audio sync --- @@ -51811,10 +51811,10 @@ index fd8a0a2447c40357a9e13003f2ef45ef20ccb205..be0de0d962fd374bc17bfa48a27ca17d } -From b7bcc39b920c47e7c4273895feae92d4a82ba08f Mon Sep 17 00:00:00 2001 +From e4311d9bb897c794054551aa3b2b2a4715daa93b Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 4 Jul 2016 18:30:03 +0100 -Subject: [PATCH 56/75] rbp: Update the GL libs to new naming scheme +Subject: [PATCH 56/78] rbp: Update the GL libs to new naming scheme As the opensource mesa GL library is getting more usable, the name collision wih the firmware GL driver is causing issues. As such we are renaming the firmware GL driver to avoid this. @@ -51828,7 +51828,7 @@ will be dropped at some point 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configure.ac b/configure.ac -index cbaefbe0a6a42f7d863800d87281a3f680cfea5b..2329e126f807b3eccb8cfd4e6ef3117ec20c85b5 100644 +index 71e942b1c3236a686ad6ff9fc930fff8b2019e0a..0336f766234f0825c164de17fec8e074120f1828 100644 --- a/configure.ac +++ b/configure.ac @@ -949,7 +949,7 @@ if test "$use_gles" = "yes"; then @@ -51879,10 +51879,10 @@ index 3626ea5204eb561dc1ae0b64c6bb7253d2ec59ec..100ff3178bafe7434bd5456100b5bb71 fi -From e63ee8ac3fd87a12bdcf197827a182043e58b4af Mon Sep 17 00:00:00 2001 +From 22915642ef576f666ed2976230d15e23c5d153a8 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 28 Jun 2016 14:46:01 +0100 -Subject: [PATCH 57/75] ffmpeg: hacky fix for files with GMC +Subject: [PATCH 57/78] ffmpeg: hacky fix for files with GMC --- xbmc/cores/VideoPlayer/DVDDemuxers/DVDDemuxFFmpeg.cpp | 4 ++-- @@ -51904,10 +51904,10 @@ index 9149698884c8ae6a23649abbaa0e659587dfe982..84d515e9e2df6a4c1c448a52a42f4675 { if (pStream->codec->codec_id == AV_CODEC_ID_PROBE) -From 73498b227b428c32c7e5ebc5623d094020fe98a7 Mon Sep 17 00:00:00 2001 +From 1e9adac965b29cac5e640b15c5aedb1dc2908114 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 19 Jul 2016 20:39:18 +0100 -Subject: [PATCH 58/75] mmalrender: Add sharpness control +Subject: [PATCH 58/78] mmalrender: Add sharpness control --- addons/resource.language.en_gb/resources/strings.po | 2 +- @@ -51916,7 +51916,7 @@ Subject: [PATCH 58/75] mmalrender: Add sharpness control 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po -index 55ec0a9985a8e77873d787e879d73c076e13b2c6..eea89feb0f698619623ec67ed0078d30d18c22fc 100644 +index 097464415a596cf13b3c245bbedd616f5a4e49ef..8b9af01094e5f2e6e47008ab8cc6fd07c95574e3 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po @@ -8694,7 +8694,7 @@ msgstr "" @@ -51979,10 +51979,10 @@ 
index e0e6f7c0e0546013ca74265aef54704fd332f8e4..69eae6cbef0131d20dc979dcb35915cd CCriticalSection m_sharedSection; MMAL_COMPONENT_T *m_vout; -From 57c94de16036e00a6822e374cc8ebbc8a042dc6b Mon Sep 17 00:00:00 2001 +From 9d38391944a046fe348943ff09ba6d340e22079d Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 14 Oct 2016 15:37:53 +0100 -Subject: [PATCH 59/75] MMALFFMpeg: Report as SW decode in codec overlay info +Subject: [PATCH 59/78] MMALFFMpeg: Report as SW decode in codec overlay info --- xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- @@ -52002,10 +52002,10 @@ index 8bace5b3eb98b3b1ddad7f56af83a41ae067bc75..c820a04c903866862b5ff04b38124ff0 CLog::Log(LOGDEBUG, "CDVDVideoCodecFFmpeg - Updated codec: %s", m_name.c_str()); } -From 43c6b165b6d0f754f938d54bba00655d436679fd Mon Sep 17 00:00:00 2001 +From 74be792acc04d3bcc6c8f4cfc788b28f682af79f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 7 Nov 2016 18:28:01 +0000 -Subject: [PATCH 60/75] advancedsettings: Add option to set cache size on +Subject: [PATCH 60/78] advancedsettings: Add option to set cache size on libass E.g to set total cache size in libass to 32M @@ -52071,7 +52071,7 @@ index f9de4f15e7c612d69ef46e7cad870ecb61afaec3..b5303fd100f1a930eb5c010a95193206 END_METHOD_RESOLVE() }; diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index edbc96f7be3ae4dae994320f8c137555c927d455..7f3325392993823b8d2d6a915579c48401ca2c12 100644 +index 1c23e5b0f25f0c6f2e5f7cab166aac825af5a30e..173cefba5b1e7f364d364020ad9ac1496f20c583 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -364,6 +364,8 @@ void CAdvancedSettings::Initialize() @@ -52107,10 +52107,10 @@ index 6b0e3b8cf9e3ff40e6af758c54fe7eefb89a131c..35bf38719f0eaaa5ac29e9495480ae97 unsigned int m_jsonTcpPort; -From 84623dff0ea921cf494fb9f15379b1bbc43844a0 Mon Sep 17 00:00:00 2001 +From 5bd43949ffaf2781febfbbea59ce5fcc7bfa3298 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 13 Nov 2016 20:30:15 +0000 -Subject: [PATCH 61/75] [rbp] Experimental limit libass cache size depending on +Subject: [PATCH 61/78] [rbp] Experimental limit libass cache size depending on arm memory size --- @@ -52141,7 +52141,7 @@ index 6e8529001b1a464b4547a846f553d98f5bc0b6c0..238eba372af2cbab11d7543c857ee476 response[sizeof(response) - 1] = '\0'; CLog::Log(LOGNOTICE, "Config:\n%s", response); diff --git a/xbmc/settings/AdvancedSettings.cpp b/xbmc/settings/AdvancedSettings.cpp -index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73f0b3bedd 100644 +index 173cefba5b1e7f364d364020ad9ac1496f20c583..bc6945c09397c3bd7d2107d96cbb3bc7fbd1cd7f 100644 --- a/xbmc/settings/AdvancedSettings.cpp +++ b/xbmc/settings/AdvancedSettings.cpp @@ -361,8 +361,10 @@ void CAdvancedSettings::Initialize() @@ -52156,10 +52156,10 @@ index 7f3325392993823b8d2d6a915579c48401ca2c12..410ad30aeb60316e9438ee56aaca7e73 m_libAssCache = 0; -From b5d95824c6e029b58aaf3b1d6dd2774661925096 Mon Sep 17 00:00:00 2001 +From 96c80f46bcba665013551f6a946a17d7f6b31046 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 22 Jun 2015 21:46:57 +0100 -Subject: [PATCH 62/75] [rbp] Use default resampling setting on Pi2 +Subject: [PATCH 62/78] [rbp] Use default resampling setting on Pi2 --- system/settings/rbp2.xml | 5 +++++ @@ -52182,10 +52182,10 @@ index 50bd55e9c90864c1ff4c36c4650e9ec247737a44..f218216e615d9723e5a163aab9c42ca5
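Patches 60 and 61 above introduce the <libassCache> advanced setting (total cache size in MB, optionally derived from ARM memory split on the Pi). A hedged sketch of how such a value can be handed through to libass; only the setting itself and ass_set_cache_limits() come from the sources, the wrapper name and call site are assumptions:

#include <ass/ass.h>

// Hypothetical glue, not Kodi's code. glyph_max = 0 keeps libass's default
// glyph-count limit; bitmap_max_size is the total bitmap cache budget in MB.
static void ApplyLibassCacheSetting(ASS_Renderer* renderer, int cacheMB)
{
    if (renderer && cacheMB > 0)
        ass_set_cache_limits(renderer, 0, cacheMB);
}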
-From c6165dc89c629abd2583eb7181e0543d6b69c255 Mon Sep 17 00:00:00 2001 +From 2dd3e5dcde97a08d92e03003bdecfedffa8e634a Mon Sep 17 00:00:00 2001 From: popcornmix Date: Thu, 1 Dec 2016 17:06:01 +0000 -Subject: [PATCH 63/75] MMALRender: Allow advanced deinterlace with software +Subject: [PATCH 63/78] MMALRender: Allow advanced deinterlace with software decode Uses YUV420 directly which improves performance. @@ -52208,10 +52208,10 @@ index f5f0f0d01227b3b4dcebb4a22a54dbcaac2d5ee9..05cbd8eeaef1a21fc32ea1fa23ea686e status = mmal_port_format_commit(m_deint_output); if (status != MMAL_SUCCESS) -From 15e9791cb79c6c3b5f8c09bba979761451bea04c Mon Sep 17 00:00:00 2001 +From f26960cc83c044117dbd4d5f3458f24b1dd88e79 Mon Sep 17 00:00:00 2001 From: Nuno Senica Date: Tue, 27 Dec 2016 20:59:56 +0000 -Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading +Subject: [PATCH 64/78] Apply ffmpeg patches automatically after downloading and extracting the ffmpeg tar ball --- @@ -52219,10 +52219,10 @@ Subject: [PATCH 64/75] Apply ffmpeg patches automatically after downloading 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccbc452297d 100644 +index 28cc80710ea4a1e29f5d7050e3797d7c28901b70..88c976efe765f24034238b9933871d90a08704d4 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -261,7 +261,17 @@ if(NOT FFMPEG_FOUND) +@@ -264,7 +264,17 @@ if(NOT FFMPEG_FOUND) && ${CMAKE_COMMAND} -E copy ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake @@ -52242,10 +52242,10 @@ index 7c68b4c3d09a037d3b85c81604d47a7ea6dd1c21..eec635ef493d13ea97c9b806eb57cccb file(WRITE ${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/ffmpeg/ffmpeg-link-wrapper "#!/bin/bash -From 358df1970de1f6f107e1681785ed723db0756f0e Mon Sep 17 00:00:00 2001 +From b2e11686321e5d6d2504085f4e49e272aa813e12 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sun, 1 May 2016 19:56:43 +0100 -Subject: [PATCH 65/75] omxplayer: Avoid CAEFactory::Suspend which should only +Subject: [PATCH 65/78] omxplayer: Avoid CAEFactory::Suspend which should only be called by application --- @@ -52345,10 +52345,10 @@ index db7f98ddbc2db2f20bdc42379df3f08eba165bfc..02acfc8cfe57446be4e00b991ef6fde9 COMXCoreComponent m_omx_render_analog; COMXCoreComponent m_omx_render_hdmi; -From dd69c1880f97b81981df1ad50f09bfb457ad8532 Mon Sep 17 00:00:00 2001 +From d824f2b09a72ee5d74b558513908a0b68f0bce1a Mon Sep 17 00:00:00 2001 From: popcornmix Date: Wed, 1 Mar 2017 21:40:22 +0000 -Subject: [PATCH 66/75] MMALRender: default to bob (x2) deinterlace for HD +Subject: [PATCH 66/78] MMALRender: default to bob (x2) deinterlace for HD There are still issues with some dvb dongles run on the same Pi as playback. Default to bob. Users who aren't using these devices will have to manually enable advanced. 
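A sketch of the policy this commit describes -- bob (x2) by default for HD content on MMAL, advanced deinterlace only on explicit request. Names and the HD threshold below are illustrative stand-ins; Kodi's real constants are the VS_INTERLACEMETHOD_* values visible in the hunks that follow.

// Stand-in names, not Kodi's actual enum.
enum class DeintMethod { None, Bob, Advanced };

static DeintMethod DefaultDeinterlace(unsigned width, unsigned height,
                                      bool userChoseAdvanced)
{
    if (userChoseAdvanced)
        return DeintMethod::Advanced;                // explicit opt-in, per the commit message
    const bool isHD = width * height > 720u * 576u;  // threshold is an assumption
    return isHD ? DeintMethod::Bob : DeintMethod::Advanced;
}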
@@ -52390,10 +52390,10 @@ index 39bc0530cecd54ae8c3a5481c92f1a6a18a4d9c5..cb0a06888a919879155fea2a689c1bae if (m_deinterlace && interlace_method != VS_INTERLACEMETHOD_NONE) { -From b96bf65f71bca91e4e029ed64c7e3dc86c0d0dad Mon Sep 17 00:00:00 2001 +From 11190e20978752245187e681aab3b089a2b504e4 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 17 Feb 2017 17:58:13 +0000 -Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service +Subject: [PATCH 67/78] ffmpeg: Update hevc optimisation to use the gpu service --- project/cmake/modules/FindFFMPEG.cmake | 14 +- @@ -52403,10 +52403,10 @@ Subject: [PATCH 67/75] ffmpeg: Update hevc optimisation to use the gpu service 4 files changed, 16342 insertions(+), 35924 deletions(-) diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index eec635ef493d13ea97c9b806eb57cccbc452297d..301534c5b0113815f9a196fdd0ed9db6ff587cda 100644 +index 88c976efe765f24034238b9933871d90a08704d4..db2d4465e3182363a812325d6bd1aeb146018e01 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -262,14 +262,14 @@ if(NOT FFMPEG_FOUND) +@@ -265,14 +265,14 @@ if(NOT FFMPEG_FOUND) ${CMAKE_COMMAND} -E copy ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake && @@ -105705,10 +105705,10 @@ index e172ebf157aebffe1ae50b4a2b25fd71bc708c93..852815d5f4ae80771c5304f6f3520b5e ++ ++ -From 1ec8569a01645467680e3090afba9927cea120d0 Mon Sep 17 00:00:00 2001 +From 700c032b538e91c8a138f4c71bebd310340ce1bb Mon Sep 17 00:00:00 2001 From: popcornmix Date: Sat, 4 Mar 2017 19:25:40 +0000 -Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL +Subject: [PATCH 68/78] ffmpeg: Call get_format to fix an issue with MMAL rendering --- @@ -105720,10 +105720,10 @@ Subject: [PATCH 68/75] ffmpeg: Call get_format to fix an issue with MMAL create mode 100644 tools/depends/target/ffmpeg/0001-ffmpeg-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch diff --git a/project/cmake/modules/FindFFMPEG.cmake b/project/cmake/modules/FindFFMPEG.cmake -index 301534c5b0113815f9a196fdd0ed9db6ff587cda..2cfd61642d52f05a84bea5ca7eb1766ad8e8ddbd 100644 +index db2d4465e3182363a812325d6bd1aeb146018e01..7e0aeefab542fb063595b09c47e69514a656bd85 100644 --- a/project/cmake/modules/FindFFMPEG.cmake +++ b/project/cmake/modules/FindFFMPEG.cmake -@@ -270,7 +270,8 @@ if(NOT FFMPEG_FOUND) +@@ -273,7 +273,8 @@ if(NOT FFMPEG_FOUND) #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-avcodec-add-h264_mvc-codec-id-and-profiles.patch && #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-h264_parser-add-support-for-parsing-h264-mvc-NALUs.patch && #patch -p1 < ${CORE_SOURCE_DIR}/tools/depends/target/ffmpeg/h264_parser_fix_parsing_of_mvc_slices_in_some_corner_cases.patch && @@ -105830,10 +105830,10 @@ index 3d970429012c1f3aede4df0545ced5006c165d50..e070d96fc340f5bff94d72ae9004c4a9 CFLAGS="$CFLAGS" CXXFLAGS="$CXXFLAGS" LDFLAGS="$LDFLAGS" \ ./configure --prefix=$FFMPEG_PREFIX \ -From b230c015d539db71bb2eb04232b25805703014c6 Mon Sep 17 00:00:00 2001 +From a3e069e98940801c936f03908275d5f67a47d847 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Mar 2017 22:52:37 +0000 -Subject: [PATCH 69/75] MMAL: Remove periodic prime calls and handle from +Subject: [PATCH 69/78] MMAL: Remove periodic prime calls and handle from buffer destructor If a number of buffers are released at once we can end up stalled in GetPicture with the buffers @@ -105927,10 +105927,10 @@ index 
9279966fa634f6f5a3e00f12dd528337392cf038..c6ba9b024b3c3bbe53d3f0870dd8c839 CLog::Log(LOGDEBUG, "%s::%s - stopping", CLASSNAME, __func__); } -From 6f29617ca776bb2e6459a55710a4a569311c8d7e Mon Sep 17 00:00:00 2001 +From f3393e3c78e5ffa39d26e88b9ff28207217e408f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 14 Mar 2017 23:22:43 +0000 -Subject: [PATCH 70/75] MMALCodec: Include a timeout of GetPicture in default +Subject: [PATCH 70/78] MMALCodec: Include a timeout of GetPicture in default debug logging --- @@ -105951,10 +105951,10 @@ index 22d594cdc217f32f820e3618b4d90a1d75fc769b..e8bc3b930e84e058460b6cfd7caca0d7 return ret; -From a3185132fc1828162ad59e09155464b26a7f35b0 Mon Sep 17 00:00:00 2001 +From b558e52052ebab00434f5d44f36f2c7b8be212c2 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Tue, 21 Mar 2017 20:15:55 +0000 -Subject: [PATCH 71/75] ffmpeg: Add calls to init and deinit gpu service +Subject: [PATCH 71/78] ffmpeg: Add calls to init and deinit gpu service --- tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 6 ++++-- @@ -105994,10 +105994,10 @@ index 852815d5f4ae80771c5304f6f3520b5e49b18a67..b4c15b782a4deb36c35a006e8547ce69 + + mbox_close(mb); -From 9ef1f2fdde0e49ae3c5da03defa83d32ab2e432d Mon Sep 17 00:00:00 2001 +From 02a75c682b1f599af712cd67a625d708281d878c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Mon, 27 Mar 2017 20:06:42 +0100 -Subject: [PATCH 72/75] squash: ffmpeg: hevc: Remove rules that require qasm +Subject: [PATCH 72/78] squash: ffmpeg: hevc: Remove rules that require qasm --- tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch | 12 ------------ @@ -106027,10 +106027,10 @@ index b4c15b782a4deb36c35a006e8547ce69665a10fe..58379fb0874521205184c53be5aae893 index 54efaad..02a89c3 100644 --- a/libavcodec/allcodecs.c -From 38a49f21a7430779830d9d4e2468e76de6faf92c Mon Sep 17 00:00:00 2001 +From 5493a835c968dda113c577afa0ba879371023f15 Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:11:37 +0100 -Subject: [PATCH 73/75] RBP: Add api to query gpu frame geometry +Subject: [PATCH 73/78] RBP: Add api to query gpu frame geometry --- xbmc/linux/RBP.cpp | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++- @@ -106204,10 +106204,10 @@ index 815d758e7086d73b4d4eb16849fdbb509a3c251d..a7f07403854b81996cca72eff82e3a7d double GetAdjustHDMIClock() { return m_actual_pll_adjust; } -From 1856e86917eef62f3069c465d7c8ff2f8e213395 Mon Sep 17 00:00:00 2001 +From 450096d2ae373ac00618675372926275cef37e6f Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:12:28 +0100 -Subject: [PATCH 74/75] MMALFFmpeg: Add Sand/YUVUV128 support +Subject: [PATCH 74/78] MMALFFmpeg: Add Sand/YUVUV128 support --- .../DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- @@ -106349,10 +106349,10 @@ index f9b7172c45d5a0158259ebfb53ea75696f0acb6d..456214a679779469ea52db7ce846a387 return false; -From ed215d6a95935eabbbb5f56d9259b24e8ab4929d Mon Sep 17 00:00:00 2001 +From 00ac50fa4ab6c087dce909394efd6d0a33d2151c Mon Sep 17 00:00:00 2001 From: popcornmix Date: Fri, 19 May 2017 15:10:42 +0100 -Subject: [PATCH 75/75] ffmpeg: hevc: Update to latest version +Subject: [PATCH 75/78] ffmpeg: hevc: Update to latest version --- .../target/ffmpeg/pfcd_hevc_optimisations.patch | 11940 ++++++++++++------- @@ -121057,3 +121057,41750 @@ index 58379fb0874521205184c53be5aae893cfd39d49..96cfa9ae30e72b377b2561cf7a329e02 ++ ++ do_logparse(args.logfile) ++ + +From d2b967d8c520b416fb30a2162dfe40a9aad3d6c0 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 13 Oct 2017 17:33:36 +0100 
+Subject: [PATCH 76/78] ffmpeg: hevc: Update to latest version + +--- + .../target/ffmpeg/pfcd_hevc_optimisations.patch | 39170 +++++++++++-------- + 1 file changed, 23811 insertions(+), 15359 deletions(-) + +diff --git a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch +index 96cfa9ae30e72b377b2561cf7a329e02b9212ceb..abd1499a6d9b2500fe379c8754bdeac54e44006d 100644 +--- a/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch ++++ b/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch +@@ -1,8 +1,16 @@ + diff --git a/.gitignore b/.gitignore +-index 524fb73..305632b 100644 ++index 524fb73c16..bcc983739f 100644 + --- a/.gitignore + +++ b/.gitignore +-@@ -23,6 +23,7 @@ ++@@ -1,6 +1,7 @@ ++ *.a ++ *.o ++ *.o.* +++*.bin ++ *.d ++ *.def ++ *.dll ++@@ -23,6 +24,7 @@ + .\#* + /.config + /.version +@@ -11,7 +19,7 @@ index 524fb73..305632b 100644 + /ffplay + /ffprobe + diff --git a/ffmpeg.c b/ffmpeg.c +-index 9ffd833..e2474e5 100644 ++index cdded8673f..5eee7dfd40 100644 + --- a/ffmpeg.c + +++ b/ffmpeg.c + @@ -23,6 +23,11 @@ +@@ -20,13 +28,21 @@ index 9ffd833..e2474e5 100644 + + +#ifdef RPI + +#define RPI_DISPLAY +-+#define RPI_ZERO_COPY +++#define RPI_DISPLAY_ALL 0 + +#endif + + + #include "config.h" + #include + #include +-@@ -66,6 +71,25 @@ ++@@ -42,6 +47,7 @@ ++ #include "libavformat/avformat.h" ++ #include "libavdevice/avdevice.h" ++ #include "libswresample/swresample.h" +++#include "libavutil/atomic.h" ++ #include "libavutil/opt.h" ++ #include "libavutil/channel_layout.h" ++ #include "libavutil/parseutils.h" ++@@ -66,6 +72,25 @@ + # include "libavfilter/buffersrc.h" + # include "libavfilter/buffersink.h" + +@@ -38,21 +54,21 @@ index 9ffd833..e2474e5 100644 + +#include + +#include + +#include +++#include + +#include + +#include + +#include + +#include + +#pragma GCC diagnostic pop +-+#ifdef RPI_ZERO_COPY + +#include "libavcodec/rpi_qpu.h" +-+#endif +++#include "libavutil/rpi_sand_fns.h" + +#include "libavcodec/rpi_zc.h" + +#endif + + + #if HAVE_SYS_RESOURCE_H + #include + #include +-@@ -158,6 +182,182 @@ static int restore_tty; ++@@ -158,6 +183,241 @@ static int restore_tty; + static void free_input_threads(void); + #endif + +@@ -60,39 +76,36 @@ index 9ffd833..e2474e5 100644 + + + +#define NUM_BUFFERS 4 + + +-+static MMAL_COMPONENT_T* rpi_display = NULL; +-+static MMAL_POOL_T *rpi_pool = NULL; +-+static volatile int rpi_display_count = 0; + + +-+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port, size_t w, size_t h) +++typedef struct rpi_display_env_s +++{ +++ MMAL_COMPONENT_T* display; +++ MMAL_COMPONENT_T* isp; +++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup +++ MMAL_CONNECTION_T * conn; +++ +++ MMAL_POOL_T *rpi_pool; +++ volatile int rpi_display_count; +++ enum AVPixelFormat avfmt; +++} rpi_display_env_t; +++ +++static rpi_display_env_t * rpi_display_env = NULL; +++ +++ +++static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port) + +{ + + MMAL_POOL_T* pool; +-+ size_t i; +-+ size_t size = (w*h*3)/2; +-+#ifdef RPI_ZERO_COPY + + mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
+ + pool = mmal_port_pool_create(port, NUM_BUFFERS, 0); + + assert(pool); +-+#else +-+ pool = mmal_port_pool_create(port, NUM_BUFFERS, size); +-+ +-+ for (i = 0; i < NUM_BUFFERS; ++i) +-+ { +-+ MMAL_BUFFER_HEADER_T* buffer = pool->header[i]; +-+ char * bufPtr = buffer->data; +-+ memset(bufPtr, i*30, w*h); +-+ memset(bufPtr+w*h, 128, (w*h)/2); +-+ } +-+#endif + + + + return pool; + +} + + + +static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { +-+#ifdef RPI_ZERO_COPY +++ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata; + + av_rpi_zc_unref(buffer->user_data); +-+ --rpi_display_count; +-+#endif +++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, -1); + + mmal_buffer_header_release(buffer); + +} + + +@@ -100,9 +113,12 @@ index 9ffd833..e2474e5 100644 + + mmal_buffer_header_release(buffer); + +} + + +-+static MMAL_COMPONENT_T* display_init(const enum AVPixelFormat fmt, size_t x, size_t y, size_t w, size_t h) +++#define DISPLAY_PORT_DEPTH 4 +++ +++static rpi_display_env_t * +++display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h) + +{ +-+ MMAL_COMPONENT_T* display; +++ MMAL_STATUS_T err; + + MMAL_DISPLAYREGION_T region = + + { + + .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, +@@ -111,51 +127,113 @@ index 9ffd833..e2474e5 100644 + + .fullscreen = 0, + + .dest_rect = {x, y, w, h} + + }; +++#if RPI_ZC_SAND_8_IN_10_BUF +++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt; +++#else +++ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt; +++#endif + + const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h); +++ rpi_display_env_t * de; +++ int isp_req = (fmt == AV_PIX_FMT_SAND64_10); +++ +++ bcm_host_init(); // Needs to be done by someone... +++ +++ if ((de = av_mallocz(sizeof(*de))) == NULL) { +++ return NULL; +++ } + + +-+ bcm_host_init(); // TODO is this needed? +-+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &display); +-+ assert(display); +++ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display); +++ av_assert0(de->display); +++ de->port_in = de->display->input[0]; +++ +++ if (isp_req) +++ { +++ mmal_component_create("vc.ril.isp", &de->isp); +++ de->port_in = de->isp->input[0]; +++ } + + +-+ mmal_port_parameter_set(display->input[0], ®ion.hdr); +++ mmal_port_parameter_set(de->display->input[0], ®ion.hdr); + + + + { +-+ MMAL_ES_FORMAT_T* format = display->input[0]->format; +-+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : MMAL_ENCODING_I420; +++ MMAL_PORT_T * const port = de->port_in; +++ MMAL_ES_FORMAT_T* const format = port->format; +++ port->userdata = (struct MMAL_PORT_USERDATA_T *)de; +++ port->buffer_num = DISPLAY_PORT_DEPTH; +++ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 : +++ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 : +++ MMAL_ENCODING_I420; + + format->es->video.width = geo.stride_y; +-+ format->es->video.height = geo.height_y; +++ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ? 
+++ (h + 15) & ~15 : geo.height_y; // Magic + + format->es->video.crop.x = 0; + + format->es->video.crop.y = 0; + + format->es->video.crop.width = w; + + format->es->video.crop.height = h; +-+ mmal_port_format_commit(display->input[0]); +++ mmal_port_format_commit(port); + + } + + +-+ mmal_component_enable(display); +++ de->rpi_pool = display_alloc_pool(de->port_in); +++ mmal_port_enable(de->port_in,display_cb_input); +++ +++ if (isp_req) { +++ MMAL_PORT_T * const port_out = de->isp->output[0]; +++ mmal_log_dump_port(de->port_in); +++ mmal_format_copy(port_out->format, de->port_in->format); +++ if (fmt == AV_PIX_FMT_SAND64_10) { +++ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS || +++ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS) +++ { +++ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n"); +++ } +++ else +++ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n"); + + +-+ rpi_pool = display_alloc_pool(display->input[0], geo.stride_y, geo.height_y); +++ } +++ port_out->format->encoding = MMAL_ENCODING_I420; +++ mmal_log_dump_port(port_out); +++ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS) +++ { +++ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n"); +++ goto fail; +++ } +++ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) { +++ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n"); +++ goto fail; +++ } +++ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) { +++ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n"); +++ goto fail; +++ } +++ mmal_port_enable(de->isp->control,display_cb_control); +++ mmal_component_enable(de->isp); +++ } + + +-+ mmal_port_enable(display->input[0],display_cb_input); +-+ mmal_port_enable(display->control,display_cb_control); +++ mmal_component_enable(de->display); +++ mmal_port_enable(de->display->control,display_cb_control); +++ de->avfmt = fmt; + + + + printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt); + + +-+ return display; +++ return de; +++ +++fail: +++ // **** Free stuff +++ return NULL; + +} + + +-+static void display_frame(struct AVCodecContext * const s, MMAL_COMPONENT_T* const display, const AVFrame* const fr) +++static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) + +{ + + MMAL_BUFFER_HEADER_T* buf; + + +-+ if (!display || !rpi_pool) +++ if (de == NULL) + + return; + + +-+ if (rpi_display_count >= 3) { +++ if (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + + av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); + + return; + + } + + +-+ buf = mmal_queue_get(rpi_pool->queue); +++ buf = mmal_queue_get(de->rpi_pool->queue); + + if (!buf) { + + // Running too fast so drop the frame + + printf("Q alloc failure\n"); +@@ -165,67 +243,64 @@ index 9ffd833..e2474e5 100644 + + buf->cmd = 0; + + buf->offset = 0; // Offset to valid data + + buf->flags = 0; +-+#ifdef RPI_ZERO_COPY +-+{ +-+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, 1); +-+ if (fr_buf == NULL) { +-+ mmal_buffer_header_release(buf); +-+ return; +-+ } +-+ +-+ buf->user_data = fr_buf; +-+ buf->data = av_rpi_zc_vc_handle(fr_buf); +-+ buf->offset = av_rpi_zc_offset(fr_buf); +-+ buf->length = av_rpi_zc_length(fr_buf); +-+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); +-+#if 0 + + { +-+ unsigned int n; +-+ 
for (n = 0; n < fr->width; n += 128) { +-+ memset(fr->data[1] + n * fr->linesize[3], 0x80, 128 * fr->height / 2); +++ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1); +++ if (fr_buf == NULL) { +++ mmal_buffer_header_release(buf); +++ return; + + } +-+ } +-+#endif +-+ ++rpi_display_count; +-+} +-+#else +-+{ +-+#error YYY +-+ int w = fr->width; +-+ int h = fr->height; +-+ int w2 = (w+31)&~31; +-+ int h2 = (h+15)&~15; +-+ +-+ buf->length = (w2 * h2 * 3)/2; +-+ buf->user_data = NULL; +-+ +-+ //mmal_buffer_header_mem_lock(buf); +-+ memcpy(buf->data, fr->data[0], w2 * h); +-+ memcpy(buf->data+w2*h2, fr->data[1], w2 * h / 4); +-+ memcpy(buf->data+w2*h2*5/4, fr->data[2], w2 * h / 4); +-+ //mmal_buffer_header_mem_unlock(buf); +-+} +-+#endif + + +-+ while (rpi_display_count >= 3) { +++ buf->user_data = fr_buf; +++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal +++ buf->offset = av_rpi_zc_offset(fr_buf); +++ buf->length = av_rpi_zc_length(fr_buf); +++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); +++ avpriv_atomic_int_add_and_fetch(&de->rpi_display_count, 1); +++ } +++#if RPI_DISPLAY_ALL +++ while (avpriv_atomic_int_get(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { + + usleep(5000); + + } +++#endif + + +-+ if (mmal_port_send_buffer(display->input[0], buf) != MMAL_SUCCESS) +++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) + + { +-+ printf("** send failed: depth=%d\n", rpi_display_count); +-+ display_cb_input(NULL, buf); +++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); +++ display_cb_input(de->port_in, buf); + + } + +} + + +-+static void display_exit(MMAL_COMPONENT_T* display) +++static void display_exit(rpi_display_env_t ** const pde) + +{ +++ rpi_display_env_t * const de = *pde; +++ *pde = NULL; +++ +++ if (de != NULL) { + +// sleep(120); +-+ if (display) { +-+ mmal_component_destroy(display); +-+ } +-+ if (rpi_pool) { +-+ mmal_port_pool_destroy(display->input[0], rpi_pool); +++ +++ if (de->port_in != NULL) { +++ mmal_port_disable(de->port_in); +++ } +++ +++ // The above disable should kick out all buffers - check that +++ if (avpriv_atomic_int_get(&de->rpi_display_count) != 0) { +++ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", avpriv_atomic_int_get(&de->rpi_display_count)); +++ } +++ +++ if (de->conn != NULL) { +++ mmal_connection_destroy(de->conn); +++ } +++ if (de->isp != NULL) { +++ mmal_component_destroy(de->isp); +++ } +++ if (de->display != NULL) { +++ mmal_component_destroy(de->display); +++ } +++ if (de->rpi_pool != NULL) { +++ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool); +++ } +++ +++ av_free(de); + + } + +} + + +@@ -235,29 +310,29 @@ index 9ffd833..e2474e5 100644 + /* sub2video hack: + Convert subtitles to video with alpha to insert them in filter graphs. + This is a temporary solution until libavfilter gets real subtitles support. 
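The display path added above boils down to a single zero-copy submit step: take a pre-allocated header from the port pool, point it at the VideoCore handle of the decoded frame, and hand it to the render/ISP input port. A compilable distillation, assuming the Pi userland MMAL headers; the zc_ref/handle/offset/length values would come from this patch's own av_rpi_zc_* helpers:

#include <cstdint>
#include <interface/mmal/mmal.h>

static bool submit_zero_copy(MMAL_PORT_T* port, MMAL_POOL_T* pool,
                             void* zc_ref, uint32_t vc_handle,
                             uint32_t offset, uint32_t length, uint32_t alloc_size)
{
    MMAL_BUFFER_HEADER_T* buf = mmal_queue_get(pool->queue);
    if (!buf)
        return false;                    // pool exhausted: drop this frame
    buf->cmd        = 0;
    buf->flags      = 0;
    buf->user_data  = zc_ref;            // unref'd again in the input-port callback
    buf->data       = (uint8_t*)(uintptr_t)vc_handle;  // a VideoCore handle, not a CPU pointer
    buf->offset     = offset;
    buf->length     = length;
    buf->alloc_size = alloc_size;
    if (mmal_port_send_buffer(port, buf) != MMAL_SUCCESS) {
        mmal_buffer_header_release(buf); // give the header back to the pool on failure
        return false;
    }
    return true;
}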
+-@@ -540,6 +740,11 @@ static void ffmpeg_cleanup(int ret) ++@@ -540,6 +800,11 @@ static void ffmpeg_cleanup(int ret) + avformat_close_input(&input_files[i]->ctx); + av_freep(&input_files[i]); + } + + + +#ifdef RPI_DISPLAY +-+ display_exit(rpi_display); +++ display_exit(&rpi_display_env); + +#endif + + + for (i = 0; i < nb_input_streams; i++) { + InputStream *ist = input_streams[i]; + +-@@ -551,6 +756,9 @@ static void ffmpeg_cleanup(int ret) ++@@ -551,6 +816,9 @@ static void ffmpeg_cleanup(int ret) + av_freep(&ist->filters); + av_freep(&ist->hwaccel_device); + +-+#ifdef RPI_ZERO_COPY +++#ifdef RPI_DISPLAY + + av_rpi_zc_uninit(ist->dec_ctx); + +#endif + avcodec_free_context(&ist->dec_ctx); + + av_freep(&input_streams[i]); +-@@ -581,6 +789,7 @@ static void ffmpeg_cleanup(int ret) ++@@ -581,6 +849,7 @@ static void ffmpeg_cleanup(int ret) + } + term_exit(); + ffmpeg_exited = 1; +@@ -265,28 +340,28 @@ index 9ffd833..e2474e5 100644 + } + + void remove_avoptions(AVDictionary **a, AVDictionary *b) +-@@ -944,6 +1153,15 @@ static void do_video_out(AVFormatContext *s, ++@@ -944,6 +1213,15 @@ static void do_video_out(AVFormatContext *s, + if (ost->source_index >= 0) + ist = input_streams[ost->source_index]; + + +#ifdef RPI_DISPLAY + + if (next_picture && ist != NULL) + + { +-+ if (!rpi_display) +-+ rpi_display = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); +-+ display_frame(ist->dec_ctx, rpi_display, next_picture); +++ if (rpi_display_env == NULL) +++ rpi_display_env = display_init(next_picture->format, 0, 0, next_picture->width, next_picture->height); +++ display_frame(ist->dec_ctx, rpi_display_env, next_picture); + + } + +#endif + + + if (filter->inputs[0]->frame_rate.num > 0 && + filter->inputs[0]->frame_rate.den > 0) + duration = 1/(av_q2d(filter->inputs[0]->frame_rate) * av_q2d(enc->time_base)); +-@@ -2549,6 +2767,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) ++@@ -2544,6 +2822,12 @@ static int init_input_stream(int ist_index, char *error, int error_len) + ist->dec_ctx->opaque = ist; + ist->dec_ctx->get_format = get_format; + ist->dec_ctx->get_buffer2 = get_buffer; + + +-+#ifdef RPI_ZERO_COPY +++#ifdef RPI_DISPLAY + + // Overrides the above get_buffer2 + + av_rpi_zc_init(ist->dec_ctx); + +#endif +@@ -295,66 +370,74 @@ index 9ffd833..e2474e5 100644 + + av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0); + diff --git a/libavcodec/Makefile b/libavcodec/Makefile +-index fd0d1f0..1740768 100644 ++index bb28aea1e2..741aa0bdc4 100644 + --- a/libavcodec/Makefile + +++ b/libavcodec/Makefile +-@@ -5,6 +5,12 @@ NAME = avcodec ++@@ -5,6 +5,16 @@ NAME = avcodec + HEADERS = avcodec.h \ + avdct.h \ + avfft.h \ +++ rpi_opts.h \ + + rpi_qpu.h \ + + rpi_shader.h \ +-+ rpi_shader_cmd.h \ +++ rpi_shader_cmd.h \ +++ rpi_shader_template.h \ +++ rpi_shader_template_fn.h \ + + rpi_mailbox.h \ +-+ rpi_hevc_transform.h \ +++ rpi_hevc_transform8.h \ +++ rpi_hevc_transform10.h \ + + rpi_zc.h \ + d3d11va.h \ + dirac.h \ + dv_profile.h \ +-@@ -43,6 +49,10 @@ OBJS = allcodecs.o \ ++@@ -43,6 +53,11 @@ OBJS = allcodecs.o \ + resample.o \ + resample2.o \ + utils.o \ + + rpi_qpu.o \ + + rpi_shader.o \ +++ rpi_shader_template.o \ + + rpi_mailbox.o \ + + rpi_zc.o \ + vorbis_parser.o \ + xiph.o \ + +-@@ -1078,3 +1088,15 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h ++@@ -1079,3 +1094,30 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h + $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h + $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h + endif + + +-+QASM := 
$(SUBDIR)../pi-util/qasm.py +++QASM_PY := ../local/bin/qasm.py +++VASMVIDCORE := ../local/bin/vasmvidcore_std + + +-+ifneq ("$(wildcard $(QASM))","") +++ifneq ("$(wildcard $(QASM_PY))","") + +$(SUBDIR)rpi_shader.c: $(SUBDIR)rpi_shader.qasm +-+ python $(QASM) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ +++ $(QASM_PY) -mc_c:rpi_shader,rpi_shader,rpi_shader $< > $@ + + + +$(SUBDIR)rpi_shader.h: $(SUBDIR)rpi_shader.qasm +-+ python $(QASM) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +++ $(QASM_PY) -mc_h:rpi_shader,rpi_shader,rpi_shader $< > $@ +++endif +++ +++ifneq ("$(wildcard $(VASMVIDCORE))","") +++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s +++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ +++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s +++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ +++ +++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin +++ python pi-util/make_array.py $< +++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin +++ python pi-util/make_array.py $< +++ + +endif + + +-+$(SUBDIR)rpi_qpu.o $(SUBDIR)hevc.o: $(SUBDIR)rpi_shader.h +-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +-index 54efaad..02a89c3 100644 +---- a/libavcodec/allcodecs.c +-+++ b/libavcodec/allcodecs.c +-@@ -667,6 +667,7 @@ void avcodec_register_all(void) +- REGISTER_PARSER(H261, h261); +- REGISTER_PARSER(H263, h263); +- REGISTER_PARSER(H264, h264); +-+ REGISTER_PARSER(H264_MVC, h264_mvc); +- REGISTER_PARSER(HEVC, hevc); +- REGISTER_PARSER(MJPEG, mjpeg); +- REGISTER_PARSER(MLP, mlp); +++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h +++$(SUBDIR)hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_shader.h + diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +-index a4ceca7..cafd25d 100644 ++index a4ceca7f46..f8229a80e2 100644 + --- a/libavcodec/arm/Makefile + +++ b/libavcodec/arm/Makefile +-@@ -131,9 +131,12 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ ++@@ -131,9 +131,14 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o + NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ +@@ -363,13 +446,15 @@ index a4ceca7..cafd25d 100644 + + arm/hevcdsp_epel_neon.o \ + arm/hevcdsp_idct_neon.o \ + - arm/hevcdsp_qpel_neon.o +++ arm/hevcdsp_cres_neon.o \ +++ arm/hevcdsp_res16_neon.o \ + + arm/hevcdsp_qpel_neon.o \ + + arm/hevcdsp_sao_neon.o + NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o + NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ + arm/rv40dsp_neon.o + diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +-index fdbf86b..0a3980a 100644 ++index fdbf86b45e..0a3980a1ef 100644 + --- a/libavcodec/arm/cabac.h + +++ b/libavcodec/arm/cabac.h + @@ -26,13 +26,34 @@ +@@ -552,7 +637,7 @@ index fdbf86b..0a3980a 100644 + #endif /* AVCODEC_ARM_CABAC_H */ + diff --git a/libavcodec/arm/hevc_cabac.h b/libavcodec/arm/hevc_cabac.h + new file mode 100644 +-index 0000000..31d3c59 ++index 0000000000..31d3c59205 + --- /dev/null + +++ b/libavcodec/arm/hevc_cabac.h + @@ -0,0 +1,491 @@ +@@ -1047,9 +1132,239 @@ index 0000000..31d3c59 + +#endif /* HAVE_ARMV6T2_INLINE */ + + + +#endif /* AVCODEC_ARM_HEVC_CABAC_H */ ++diff --git a/libavcodec/arm/hevc_idct_fn_neon.S b/libavcodec/arm/hevc_idct_fn_neon.S ++new file mode 100644 ++index 0000000000..380d3c8d3b ++--- /dev/null +++++ b/libavcodec/arm/hevc_idct_fn_neon.S 
++@@ -0,0 +1,224 @@ +++@ Included multiple times from hevc_idct_neon.S +++@ Macros defined there +++ +++#define DC_SHIFT (15 - BIT_DEPTH) +++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) +++#define TRN_SHIFT (20 - BIT_DEPTH) +++ +++function JOIN(ff_hevc_idct_4x4_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q0, r1 +++ vdup.16 q1, r1 +++ vst1.16 {q0, q1}, [r0] +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_8x8_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++ vstm r0, {q8-q15} +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_16x16_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++ vstm r0!, {q8-q15} +++ vstm r0!, {q8-q15} +++ vstm r0!, {q8-q15} +++ vstm r0, {q8-q15} +++ bx lr +++endfunc +++ +++function JOIN(ff_hevc_idct_32x32_dc_neon_, BIT_DEPTH), export=1 +++ ldrsh r1, [r0] +++ add r1, #DC_ADD +++ asr r1, #DC_SHIFT +++ mov r3, #16 +++ vdup.16 q8, r1 +++ vdup.16 q9, r1 +++ vmov.16 q10, q8 +++ vmov.16 q11, q8 +++ vmov.16 q12, q8 +++ vmov.16 q13, q8 +++ vmov.16 q14, q8 +++ vmov.16 q15, q8 +++1: subs r3, #1 +++ vstm r0!, {q8-q15} +++ bne 1b +++ bx lr +++endfunc +++ +++ +++function JOIN(ff_hevc_transform_4x4_neon_, BIT_DEPTH), export=1 +++ vpush {d8-d15} +++ vld1.16 {q14, q15}, [r0] // coeffs +++ ldr r3, =0x00240053 // 36 and 83 +++ vmov.32 d0[0], r3 +++ +++ tr4_shift d28, d29, d30, d31, #7 +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ tr4_shift d28, d29, d30, d31, #(TRN_SHIFT) +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ vst1.16 {q14, q15}, [r0] +++ vpop {d8-d15} +++ bx lr +++endfunc +++ +++ +++ +++function JOIN(ff_hevc_transform_luma_4x4_neon_, BIT_DEPTH), export=1 +++ vpush {d8-d15} +++ vld1.16 {q14, q15}, [r0] // coeffs +++ ldr r3, =0x4a // 74 +++ vmov.32 d0[0], r3 +++ ldr r3, =0x1d // 29 +++ vmov.32 d0[1], r3 +++ ldr r3, =0x37 // 55 +++ vmov.32 d1[0], r3 +++ +++ tr4_luma_shift d28, d29, d30, d31, #7 +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ +++ tr4_luma_shift d28, d29, d30, d31, #(TRN_SHIFT) +++ +++ vtrn.16 d28, d29 +++ vtrn.16 d30, d31 +++ vtrn.32 q14, q15 +++ vst1.16 {q14, q15}, [r0] +++ vpop {d8-d15} +++ bx lr +++endfunc +++ +++ +++ +++function JOIN(ff_hevc_transform_8x8_neon_, BIT_DEPTH), export=1 +++ push {r4-r8} +++ vpush {d8-d15} +++ mov r5, #16 +++ +++ adrl r3, tr4f +++ vld1.16 {d0, d1}, [r3] +++ +++ // left half +++ vld1.16 {d24}, [r0], r5 +++ vld1.16 {d25}, [r0], r5 +++ vld1.16 {d26}, [r0], r5 +++ vld1.16 {d27}, [r0], r5 +++ vld1.16 {d28}, [r0], r5 +++ vld1.16 {d29}, [r0], r5 +++ vld1.16 {d30}, [r0], r5 +++ vld1.16 {d31}, [r0], r5 +++ sub r0, #128 +++ tr8_begin d25, d27, d29, d31 +++ tr4 d24, d26, d28, d30 +++ tr8_end #7 +++ vst1.16 {d2}, [r0], r5 +++ vst1.16 {d3}, [r0], r5 +++ vst1.16 {d4}, [r0], r5 +++ vst1.16 {d5}, [r0], r5 +++ vst1.16 {d6}, [r0], r5 +++ vst1.16 {d7}, [r0], r5 +++ vst1.16 {d8}, [r0], r5 +++ vst1.16 {d9}, [r0], r5 +++ sub r0, #128 +++ //skip right half if col_limit in r1 is less than 4 +++ cmp r1, #4 +++ blt 1f +++ //right half +++ add r0, #8 +++ vld1.16 {d24}, [r0], r5 +++ vld1.16 {d25}, [r0], r5 +++ 
vld1.16 {d26}, [r0], r5 +++ vld1.16 {d27}, [r0], r5 +++ vld1.16 {d28}, [r0], r5 +++ vld1.16 {d29}, [r0], r5 +++ vld1.16 {d30}, [r0], r5 +++ vld1.16 {d31}, [r0], r5 +++ sub r0, #128 +++ tr8_begin d25, d27, d29, d31 +++ tr4 d24, d26, d28, d30 +++ tr8_end #7 +++ vst1.16 {d2}, [r0], r5 +++ vst1.16 {d3}, [r0], r5 +++ vst1.16 {d4}, [r0], r5 +++ vst1.16 {d5}, [r0], r5 +++ vst1.16 {d6}, [r0], r5 +++ vst1.16 {d7}, [r0], r5 +++ vst1.16 {d8}, [r0], r5 +++ vst1.16 {d9}, [r0], r5 +++ sub r0, #136 +++1: +++ // top half +++ vldm r0, {q12-q15} // coeffs +++ transpose_16b_4x4 d24, d26, d28, d30 +++ transpose_16b_4x4 d25, d27, d29, d31 +++ tr8_begin d26, d30, d27, d31 +++ tr4 d24, d28, d25, d29 +++ tr8_end #(TRN_SHIFT) +++ transpose_16b_4x4 d2, d3, d4, d5 +++ transpose_16b_4x4 d6, d7, d8, d9 +++ vswp d7, d5 +++ vswp d7, d8 +++ vswp d3, d6 +++ vswp d6, d4 +++ vstm r0!, {q1-q4} +++ +++ // bottom half +++ vldm r0, {q12-q15} // coeffs +++ transpose_16b_4x4 d24, d26, d28, d30 +++ transpose_16b_4x4 d25, d27, d29, d31 +++ tr8_begin d26, d30, d27, d31 +++ tr4 d24, d28, d25, d29 +++ tr8_end #(TRN_SHIFT) +++ transpose_16b_4x4 d2, d3, d4, d5 +++ transpose_16b_4x4 d6, d7, d8, d9 +++ vswp d7, d5 +++ vswp d7, d8 +++ vswp d3, d6 +++ vswp d6, d4 +++ //vstm r0, {q1-q4} +++ vst1.16 {q1-q2}, [r0] +++ add r0, #32 +++ vst1.16 {q3-q4}, [r0] +++ sub r0, #32 +++ vpop {d8-d15} +++ pop {r4-r8} +++ bx lr +++endfunc +++ +++#undef DC_SHIFT +++#undef DC_ADD +++#undef TRN_SHIFT +++ + diff --git a/libavcodec/arm/hevc_misc_neon.S b/libavcodec/arm/hevc_misc_neon.S + new file mode 100644 +-index 0000000..373576b ++index 0000000000..373576b4cb + --- /dev/null + +++ b/libavcodec/arm/hevc_misc_neon.S + @@ -0,0 +1,62 @@ +@@ -1115,8 +1430,310 @@ index 0000000..373576b + + + +endfunc + + ++diff --git a/libavcodec/arm/hevcdsp_cres_neon.S b/libavcodec/arm/hevcdsp_cres_neon.S ++new file mode 100644 ++index 0000000000..bafefd4318 ++--- /dev/null +++++ b/libavcodec/arm/hevcdsp_cres_neon.S ++@@ -0,0 +1,296 @@ +++#include "libavutil/arm/asm.S" +++#include "neon.S" +++ +++@ General notes: +++@ +++@ Residual is only guaranteed to be cliped to 16 bits +++@ This means that we do need to do movul, qadd, qmovun +++@ rather than addw, qmovun (if we were clipped to 15 then we could get away +++@ with this) +++ +++@ ============================================================================ +++@ U add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_4x4_u_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vld1.16 {q0, q1}, [r1] +++ vdup.16 q2, r3 +++ vdup.16 q3, r3 +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_8x8_u_neon_8, export=1 +++ mov r12, #4 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {d16, d17}, [r0, :128], r2 +++ vld2.8 {d18, d19}, [r0, :128] +++ vld1.16 {q0, 
q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ sub r0, r2 +++ vmovl.u8 q11, d18 +++ vqadd.s16 q0, q10 +++ vaddw.u8 q2, q15, d17 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q3, q15, d19 +++ vqmovun.s16 d16, q0 +++ vqmovun.s16 d17, q2 +++ vqmovun.s16 d18, q1 +++ vqmovun.s16 d19, q3 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ vst2.8 {d18, d19}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_u( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++@ int dc_v) [r3] +++ +++function ff_hevc_add_residual_16x16_u_neon_8, export=1 +++ mov r12, #16 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q2, q15, d18 +++ vaddw.u8 q3, q15, d19 +++ vqmovun.s16 d16, q0 +++ vqmovun.s16 d17, q1 +++ vqmovun.s16 d18, q2 +++ vqmovun.s16 d19, q3 +++ vst2.8 {q8, q9}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ V add +++ +++@ add_residual4x4_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_4x4_v_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vld1.16 {q2, q3}, [r1] +++ vdup.16 q0, r3 +++ vdup.16 q1, r3 +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_8x8_v_neon_8, export=1 +++ mov r12, #4 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {d16, d17}, [r0, :128], r2 +++ vld2.8 {d18, d19}, [r0, :128] +++ vld1.16 {q0, q1}, [r1, :256]! +++ subs r12, #1 +++ vmovl.u8 q10, d17 +++ sub r0, r2 +++ vmovl.u8 q11, d19 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vaddw.u8 q2, q15, d16 +++ vaddw.u8 q3, q15, d18 +++ vqmovun.s16 d17, q0 +++ vqmovun.s16 d16, q2 +++ vqmovun.s16 d19, q1 +++ vqmovun.s16 d18, q3 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ vst2.8 {d18, d19}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_16x16_v_neon_8, export=1 +++ mov r12, #16 +++ vdup.16 q15, r3 +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! 
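+++ @ (editorial note, not in the original patch) vld2 has deinterleaved
+++ @ q8 = U bytes, q9 = V bytes, and q0/q1 hold the V residual.  Below,
+++ @ the residual is added to V with a saturating add while the constant
+++ @ dc_u (q15, replicated from r3) is added to U - the mirror image of
+++ @ the _u_ functions above.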
+++ subs r12, #1 +++ vmovl.u8 q10, d18 +++ vmovl.u8 q11, d19 +++ vaddw.u8 q2, q15, d16 +++ vaddw.u8 q3, q15, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqmovun.s16 d16, q2 +++ vqmovun.s16 d17, q3 +++ vqmovun.s16 d18, q0 +++ vqmovun.s16 d19, q1 +++ vst2.8 {q8, q9}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ U & V add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_4x4_c_neon_8, export=1 +++ vld1.8 {d16}, [r0, :64], r2 +++ vld1.8 {d17}, [r0, :64], r2 +++ vld1.8 {d18}, [r0, :64], r2 +++ vld1.8 {d19}, [r0, :64], r2 +++ vldm r1, {q0-q3} @ Q0/1 gets all of U, Q2/3 gets all of V +++ vmovl.u8 q10, d16 +++ sub r0, r0, r2, lsl #2 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vzip.16 q0, q2 +++ vzip.16 q1, q3 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q2 +++ vqmovun.s16 d2, q1 +++ vqmovun.s16 d3, q3 +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ vst1.8 {d2}, [r0, :64], r2 +++ vst1.8 {d3}, [r0, :64] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_8x8_c_neon_8, export=1 +++ mov r12, #8 +++ add r3, r1, #(8*8*2) @ Offset to V +++1: +++ vld2.8 {d16, d17}, [r0, :128] +++ vld1.16 {q0}, [r1, :128]! +++ vld1.16 {q1}, [r3, :128]! +++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vst2.8 {d0, d1}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function ff_hevc_add_residual_16x16_c_neon_8, export=1 +++ mov r12, #16 +++ add r3, r1, #(16*16*2) @ Offset to V +++1: +++ vld2.8 {q8, q9}, [r0, :256] +++ vld1.16 {q0, q1}, [r1, :256]! +++ vld1.16 {q2, q3}, [r3, :256]! 
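+++ @ (editorial note, not in the original patch) r1 walks the U residual
+++ @ block and r3 the V block that starts 16*16*2 bytes after it (see the
+++ @ "Offset to V" add above); vld2/vst2 de/re-interleave the U/V samples
+++ @ around the saturating adds.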
+++ subs r12, #1 +++ vmovl.u8 q10, d16 +++ vmovl.u8 q11, d17 +++ vmovl.u8 q12, d18 +++ vmovl.u8 q13, d19 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vqmovun.s16 d2, q2 +++ vqmovun.s16 d3, q3 +++ vst2.8 {q0, q1}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ 32x32 chroma never occurs so NIF +++ +++@ ============================================================================ + diff --git a/libavcodec/arm/hevcdsp_deblock_neon.S b/libavcodec/arm/hevcdsp_deblock_neon.S +-index 166bddb..9bd0a42 100644 ++index 166bddb104..15c4329cdb 100644 + --- a/libavcodec/arm/hevcdsp_deblock_neon.S + +++ b/libavcodec/arm/hevcdsp_deblock_neon.S + @@ -15,7 +15,7 @@ +@@ -1128,58 +1745,235 @@ index 166bddb..9bd0a42 100644 + */ + + +-@@ -31,6 +31,9 @@ ++@@ -24,70 +24,238 @@ ++ ++ .macro hevc_loop_filter_chroma_start ++ ldr r12, [r2] ++- ldr r3, [r2, #4] ++- add r2, r3, r12 ++- cmp r2, #0 +++ ldr r2, [r2, #4] +++ orrs r2, r12, r2, lsl #16 ++ it eq + bxeq lr + .endm + ++-.macro hevc_loop_filter_chroma_body ++- vsubl.u8 q3, d4, d2 ++- vsubl.u8 q11, d18, d19 ++- vshl.i16 q3, #2 ++- vadd.i16 q11, q3 ++- vdup.16 d0, r12 ++- vdup.16 d1, r3 ++- vrshr.s16 q11, q11, #3 ++- vneg.s16 q12, q0 + +@ Uses: d2, d4, d18, d19 + +@ Returns: d2, d4 +-+@ Modifies: d0-d7, d22-d25 +- .macro hevc_loop_filter_chroma_body +- vsubl.u8 q3, d4, d2 +- vsubl.u8 q11, d18, d19 +-@@ -49,6 +52,33 @@ +- vqmovun.s16 d4, q2 +- .endm +- +-+ +-+@ Uses r2[0:7], r2[8:15] +-+@ Modifies: d0-d7, d22-d25 +-+.macro hevc_loop_filter_uv_body P1, P0, Q0, Q1 +-+ vsubl.u8 q3, \Q0, \P0 +-+ vsubl.u8 q11, \P1, \Q1 +-+ vshl.i16 q3, #2 +-+ vadd.i16 q11, q3 +-+ +-+ @ r2[0:7] -> d0.16 (all), r2[8:15] -> d1.16(all) +-+ vdup.16 d0, r2 +-+ vmovl.u8 q0, d0 +-+ vuzp.16 d0, d1 +-+ +-+ vrshr.s16 q11, q11, #3 +-+ vneg.s16 q12, q0 +++@ Modifies: d0-d7, d22-d25, r12 +++ +++.macro hevc_loop_filter_chroma_body P1, P0, Q0, Q1 +++ vsubl.u8 q0, \Q0, \P0 +++ vsubl.u8 q1, \P1, \Q1 +++ vdup.16 d4, r2 +++ lsr r2, r2, #16 +++ vshl.i16 q0, #2 +++ ldr r12, [sp, #0] @ r12 = &no_q +++ vadd.i16 q0, q1 +++ ldrh r3, [r3] @ r3[0:8] = no_p[0], r3[8:15] = no_p[1] +++ vdup.16 d5, r2 +++ +++ vrshr.s16 q0, q0, #3 +++ ldrh r12, [r12] +++ vneg.s16 q3, q2 +++ vmin.s16 q0, q0, q2 + + vmovl.u8 q2, \Q0 +-+ vmin.s16 q11, q11, q0 +-+ vmax.s16 q11, q11, q12 +-+ vaddw.u8 q1, q11, \P0 +-+ vsub.i16 q2, q11 +++ vmax.s16 q0, q0, q3 +++ vaddw.u8 q1, q0, \P0 +++ vsub.i16 q2, q0 +++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + + vqmovun.s16 \P0, q1 + + vqmovun.s16 \Q0, q2 + +.endm + + +++@ Uses r2 (tc a;b) +++@ Modifies: q0-q3 +++@ On exit +++@ r12 (and flags) contain no_p;no_q +++.macro hevc_loop_filter_chroma_body_16 P1, P0, Q0, Q1, bit_depth +++ vsub.i16 q0, \Q0, \P0 +++ lsl r12, r2, #(\bit_depth - 8) +++ vsub.i16 q1, \P1, \Q1 +++ vshl.i16 q0, #2 +++ vdup.16 d4, r12 +++ lsr r12, r12, #16 +++ vadd.i16 q0, q1 +++ ldrh r3, [r3] +++ vdup.16 d5, r12 +++ +++ vrshr.s16 q0, q0, #3 +++ vneg.s16 q3, q2 +++ movw r12, #(1 << \bit_depth) - 1 +++ vmin.s16 q0, q0, q2 +++ vmax.s16 q0, q0, q3 +++ vdup.i16 q3, r12 +++ ldr r12, [sp, #0] +++ +++ vadd.i16 \P0, q0, \P0 +++ vsub.i16 \Q0, q0 +++ +++ vmov.i64 q2, #0 +++ ldrh r12, [r12] +++ vmin.s16 \P0, q3 +++ vmin.s16 \Q0, q3 +++ orrs r12, r3, r12, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] +++ vmax.s16 \P0, q2 +++ vmax.s16 \Q0, q2 +++.endm +++ +++ +++@ Preserves r12 +++@ Clobbers r2 +++.macro 
hevc_loop_filter_uv_body2 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v +++ vsubl.u8 q0, \Q0u, \P0u +++ vsubl.u8 q1, \Q0v, \P0v +++ vsubl.u8 q2, \P1u, \Q1u +++ vsubl.u8 q3, \P1v, \Q1v +++ vshl.i16 q0, #2 +++ vshl.i16 q1, #2 +++ vadd.i16 q0, q2 +++ vdup.16 d4, r2 +++ lsr r2, #16 +++ vadd.i16 q1, q3 +++ +++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) +++ vrshr.s16 q0, #3 +++ vdup.16 d6, r2 ++ vmovl.u8 q2, d4 ++- vmin.s16 q11, q11, q0 ++- vmax.s16 q11, q11, q12 ++- vaddw.u8 q1, q11, d2 ++- vsub.i16 q2, q11 ++- vqmovun.s16 d2, q1 ++- vqmovun.s16 d4, q2 +++ vmovl.u8 q3, d6 +++ vuzp.16 d4, d5 +++ vrshr.s16 q1, #3 +++ vuzp.16 d6, d7 +++ +++ vmin.s16 q0, q2 +++ vneg.s16 q2, q2 +++ vmin.s16 q1, q3 +++ vneg.s16 q3, q3 +++ vmax.s16 q0, q2 +++ vaddw.u8 q2, q0, \P0u +++ vmax.s16 q1, q3 +++ vaddw.u8 q3, q1, \P0v +++ +++ vqmovun.s16 \P0u, q2 +++ vmovl.u8 q2, \Q0u +++ vqmovun.s16 \P0v, q3 +++ vmovl.u8 q3, \Q0v +++ vsub.i16 q2, q0 +++ vsub.i16 q3, q1 +++ +++ vqmovun.s16 \Q0u, q2 +++ vqmovun.s16 \Q0v, q3 ++ .endm ++ +++@ Preserves r12 +++@ Clobbers r2 +++.macro hevc_loop_filter_uv_body2_16 P1u, P1v, P0u, P0v, Q0u, Q0v, Q1u, Q1v, bit_depth +++ vsub.i16 q0, \Q0u, \P0u +++ vsub.i16 q1, \Q0v, \P0v +++ vsub.i16 q2, \P1u, \Q1u +++ vsub.i16 q3, \P1v, \Q1v +++ vshl.i16 q0, #2 +++ vshl.i16 q1, #2 +++ vadd.i16 q0, q2 +++ vdup.16 d4, r2 +++ lsr r2, #16 +++ vadd.i16 q1, q3 +++ +++ @ r2[0:7] -> d4.16 (all), r2[8:15] -> d5.16(all) +++ vrshr.s16 q0, #3 +++ vdup.16 d6, r2 +++ vshll.u8 q2, d4, #\bit_depth - 8 +++ vshll.u8 q3, d6, #\bit_depth - 8 +++ vuzp.16 d4, d5 +++ vrshr.s16 q1, #3 +++ vuzp.16 d6, d7 +++ +++ movw r2, #(1 << \bit_depth) - 1 +++ vmin.s16 q0, q2 +++ vneg.s16 q2, q2 +++ vmin.s16 q1, q3 +++ vneg.s16 q3, q3 +++ vmax.s16 q0, q2 +++ vmov.i64 q2, #0 +++ vmax.s16 q1, q3 +++ vdup.i16 q3, r2 +++ vadd.i16 \P0u, q0 +++ vsub.i16 \Q0u, q0 +++ vadd.i16 \P0v, q1 +++ vsub.i16 \Q0v, q1 +++ +++ vmax.s16 \P0u, q2 +++ vmax.s16 \Q0u, q2 +++ vmax.s16 \P0v, q2 +++ vmax.s16 \Q0v, q2 +++ vmin.s16 \P0u, q3 +++ vmin.s16 \Q0u, q3 +++ vmin.s16 \P0v, q3 +++ vmin.s16 \Q0v, q3 +++.endm +++ + + + + + .macro hevc_loop_filter_luma_start + ldr r12, [r3] + ldr r3, [r3, #4] +-@@ -60,15 +90,17 @@ +- lsr r3, #16 ++- lsl r3, #16 ++- orr r3, r12 ++- cmp r3, #0 +++ orrs r3, r12, r3, lsl #16 ++ it eq ++ bxeq lr ++- lsr r3, #16 + .endm + + -.macro hevc_loop_filter_luma_body ++- vmovl.u8 q8, d16 ++- vmovl.u8 q9, d18 ++- vmovl.u8 q10, d20 ++- vmovl.u8 q11, d22 ++- vmovl.u8 q12, d24 ++- vmovl.u8 q13, d26 ++- vmovl.u8 q14, d28 ++- vmovl.u8 q15, d30 + +@ Uses: r2, r3, r12 + +@ Modifies: r5, r6, r7, r8, r9 +-+function hevc_loop_filter_luma_body +++ +++@ Input: +++@ r2 beta (raw: needs shift for bitdepth > 8) +++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) +++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) +++@ [sp,#96] &no_p[0] +++@ [sp,#100] &no_q[0] +++@ +++@ Input & output +++@ 8-bit: d16-d23 +++@ 16-bit: q8-q15 +++@ +++@ Output +++@ Z r10==0 +++@ r10[ 0:7 ] no_p[0] +++@ r10[ 8:15] no_p[1] +++@ r10[16:23] no_q[0] +++@ r10[24:31] no_q[1] +++ ++ +++.macro m_filter_luma bit_depth +++.if \bit_depth == 8 + + vmovl.u8 q15, d23 + + vmovl.u8 q14, d22 + + vmovl.u8 q13, d21 +@@ -1187,54 +1981,72 @@ index 166bddb..9bd0a42 100644 + + vmovl.u8 q11, d19 + + vmovl.u8 q10, d18 + + vmovl.u8 q9, d17 +- vmovl.u8 q8, d16 +-- vmovl.u8 q9, d18 +-- vmovl.u8 q10, d20 +-- vmovl.u8 q11, d22 +-- vmovl.u8 q12, d24 +-- vmovl.u8 q13, d26 +-- vmovl.u8 q14, d28 +-- vmovl.u8 q15, d30 +- +++ vmovl.u8 q8, d16 +++.endif + vadd.i16 q7, q9, q11 +++.if \bit_depth > 8 +++ lsl r2, 
r2, #(\bit_depth - 8) +++.endif + vadd.i16 q6, q14, q12 +-@@ -77,7 +109,6 @@ +++.if \bit_depth > 8 +++ lsl r3, r3, #(\bit_depth - 8) +++.endif ++ vsub.i16 q7, q10 +++ ldr r5, [sp, #96] @ Bolt no_x values together into r10 ++ vsub.i16 q6, q13 + vabd.s16 q7, q7, q10 + vabd.s16 q6, q6, q13 +- + - +++ ldrh r10, [r5] ++ + vdup.16 q0, r2 + vmov q4, q7 + vmov q5, q6 +-@@ -152,7 +183,7 @@ ++- vdup.16 d4, r12 +++ ldr r5, [sp, #100] +++ vdup.16 d4, r3 +++ lsr r3, r3, #16 ++ vtrn.16 q7, q4 +++ ldrh r5, [r5] ++ vtrn.16 q6, q5 ++ ++ vshl.u64 q7, #32 ++ vshr.u64 q4, #32 ++ vshl.u64 q6, #32 +++ orr r10, r10, r5, lsl #16 ++ vshr.u64 q5, #32 ++ vshr.u64 q7, #32 ++ vshr.u64 q6, #32 ++@@ -152,7 +320,7 @@ + + and r9, r8, r7 + cmp r9, #0 + - beq weakfilter_\@ +-+ beq weakfilter_ +++ beq 1f + + vadd.i16 q2, q11, q12 + vadd.i16 q4, q9, q8 +-@@ -210,11 +241,11 @@ ++@@ -210,11 +378,11 @@ + vbit q13, q3, q5 + vbit q14, q2, q5 + + -weakfilter_\@: +-+weakfilter_: +++1: + mvn r8, r8 + and r9, r8, r7 + cmp r9, #0 + - beq ready_\@ +-+ beq ready_ +++ beq 2f + + vdup.16 q4, r2 + +-@@ -275,75 +306,345 @@ weakfilter_\@: ++@@ -275,111 +443,1041 @@ weakfilter_\@: + vbit q11, q0, q5 + vbit q12, q4, q5 + + -ready_\@: +-+ready_: +++2: +++.if \bit_depth == 8 + vqmovun.s16 d16, q8 + - vqmovun.s16 d18, q9 + - vqmovun.s16 d20, q10 +@@ -1243,7 +2055,7 @@ index 166bddb..9bd0a42 100644 + - vqmovun.s16 d26, q13 + - vqmovun.s16 d28, q14 + - vqmovun.s16 d30, q15 +--.endm +++ cmp r10, #0 + + vqmovun.s16 d17, q9 + + vqmovun.s16 d18, q10 + + vqmovun.s16 d19, q11 +@@ -1251,7 +2063,30 @@ index 166bddb..9bd0a42 100644 + + vqmovun.s16 d21, q13 + + vqmovun.s16 d22, q14 + + vqmovun.s16 d23, q15 +++.else +++ movw r12, #(1 << \bit_depth - 1) +++ vmov.i64 q0, #0 +++ vdup.i16 q1, r12 +++ @ q8 & q15 should be unaltered and so don't require clipping +++ vmax.s16 q9, q0 +++ cmp r10, #0 +++ vmax.s16 q10, q0 +++ vmax.s16 q11, q0 +++ vmax.s16 q12, q0 +++ vmax.s16 q13, q0 +++ vmax.s16 q14, q0 +++ vmin.s16 q9, q1 +++ vmin.s16 q10, q1 +++ vmin.s16 q11, q1 +++ vmin.s16 q12, q1 +++ vmin.s16 q13, q1 +++ vmin.s16 q14, q1 +++.endif + + mov pc, lr ++ .endm ++ +++function hevc_loop_filter_luma_body +++ m_filter_luma 8 + +endfunc + + + +@ ff_hevc_v_loop_filter_luma2_neon(src (r0), stride (r1), beta (r2), tc (r3), np_p (sp[0]), no_q (sp[4]), src2 (sp[8])) +@@ -1263,7 +2098,16 @@ index 166bddb..9bd0a42 100644 + + b v_loop_luma_common + +endfunc + + +- +++ +++@ void ff_hevc_v_loop_filter_luma_neon( +++@ uint8_t *_pix, [r0] +++@ ptrdiff_t _stride, [r1] +++@ int _beta, [r2] +++@ int *_tc, [r3] +++@ uint8_t *_no_p, [sp+0] +++@ uint8_t *_no_q) [sp+4] +++ +++ + function ff_hevc_v_loop_filter_luma_neon, export=1 + hevc_loop_filter_luma_start + - push {r5-r11} +@@ -1271,14 +2115,6 @@ index 166bddb..9bd0a42 100644 + + + + sub r4, r0, #4 + +v_loop_luma_common: +-+ @ Why this isn't a bitmask to start with I have no idea... 
+-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 +-+ ldr r5, [sp, #32] +-+ ldrh r10, [r5] +-+ ldr r5, [sp, #36] +-+ ldrh r5, [r5] +-+ orr r10, r10, r5, lsl #16 @ So should have b0:no_p[0], b8:no_p[1], b16: no_q[0], b24:no_q[1] +-+ + vpush {d8-d15} + - sub r0, #4 + - vld1.8 {d16}, [r0], r1 +@@ -1335,44 +2171,38 @@ index 166bddb..9bd0a42 100644 + + + + @ no_p[1] + + tst r10, #0xff00 +-+ itt ne +-+ addne r4, r4, r1, lsl #2 +++ add r2, r4, r1, lsl #2 + + bne 1f + + vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 + + vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r4:32], r1 + + vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 +-+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32], r1 +-+ +++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r4:32] +++1: +++ @ no_p[0] +++ tst r10, #0xff +++ bne 1f +++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r2:32], r1 +++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r2:32], r1 +++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r2:32], r1 +++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r2:32] + +1: + + @ no_q[1] + + tst r10, #0xff000000 +-+ itt ne +-+ addne r0, r0, r1, lsl #2 +-+ bne 2f +++ add r2, r0, r1, lsl #2 +++ bne 1f + + vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 + + vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r0:32], r1 + + vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 +-+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32], r1 +-+ +-+2: +-+ @ no_p[0] +-+ tst r10, #0xff +-+ bne 3f +-+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 +-+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r4:32], r1 +-+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 +-+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r4:32] +-+ +-+3: +++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r0:32] +++1: + + @ no_q[0] + + tst r10, #0xff0000 +-+ bne 4f +-+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 +-+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r0:32], r1 +-+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 +-+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r0:32] +-+ +-+4: +++ bne 1f +++ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r2:32], r1 +++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 +++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r2:32], r1 +++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] +++1: + +bypasswrite: + vpop {d8-d15} + - pop {r5-r11} +@@ -1380,6 +2210,81 @@ index 166bddb..9bd0a42 100644 + + pop {r4-r10,pc} + endfunc + +++.macro m_filter_v_luma_common_16 bit_depth +++ vpush {d8-d15} +++ +++ @ Uses slightly fewer instructions to do laned loads than unlaned +++ @ and transpose. 
This also means that we can use the same code for +++ @ both split & unsplit deblock +++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 +++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 +++ +++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 +++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 +++ +++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 +++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 +++ +++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 +++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 +++ +++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 +++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 +++ +++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 +++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 +++ +++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 +++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 +++ +++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] +++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] +++ +++ bl hevc_loop_filter_luma_body_\bit_depth +++ +++ neg r1, r1 +++ +++ @ p[1] +++ tst r10, #0xff00 +++ add r2, r4, r1, lsl #2 +++ bne 1f +++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 +++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 +++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 +++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4] +++1: +++ @ p[0] +++ tst r10, #0xff +++ bne 1f +++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r2], r1 +++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r2], r1 +++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r2], r1 +++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r2] +++1: +++ @ q[1] +++ tst r10, #0xff000000 +++ add r2, r0, r1, lsl #2 +++ bne 1f +++ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 +++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 +++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 +++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0] +++1: +++ @ q[0] +++ tst r10, #0xff0000 +++ bne 1f +++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r2], r1 +++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 +++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r2], r1 +++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] +++1: +++ vpop {d8-d15} +++ pop {r4-r10,pc} +++.endm +++ +++ +++ +++ + +@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] + +@ ptrdiff_t stride, [r1] + +@ int beta, [r2] +@@ -1429,13 +2334,6 @@ index 166bddb..9bd0a42 100644 + + neg r1, r1 + + add r0, r0, r1 + + +-+ @ Why this isn't a bitmask to start with I have no idea... 
+-+ @ Beware that no_x[] seems to be loaded with 2/0 rather than 1/0 +-+ ldr r5, [sp, #32] +-+ ldrh r10, [r5] +-+ ldr r5, [sp, #36] +-+ ldrh r5, [r5] +-+ orrs r10, r10, r5, lsl #16 @ So should have b1:no_p[0], b9:no_p[1], b17: no_q[0], b25:no_q[1] + + bne 1f + + + + vst1.8 {d22}, [r0], r1 +@@ -1486,8 +2384,81 @@ index 166bddb..9bd0a42 100644 + + + + pop {r4-r10,pc} + + +- endfunc +- +++endfunc +++ +++ +++.macro m_filter_h_luma_16 bit_depth +++ hevc_loop_filter_luma_start +++ push {r4-r10,lr} +++ +++ vpush {d8-d15} +++ sub r0, r0, r1, lsl #2 +++ +++ vld1.16 { q8}, [r0], r1 +++ vld1.16 { q9}, [r0], r1 +++ vld1.16 {q10}, [r0], r1 +++ vld1.16 {q11}, [r0], r1 +++ vld1.16 {q12}, [r0], r1 +++ vld1.16 {q13}, [r0], r1 +++ vld1.16 {q14}, [r0], r1 +++ vld1.16 {q15}, [r0] +++ +++ bl hevc_loop_filter_luma_body_\bit_depth +++ +++ vpop {d8-d15} +++ +++ sub r0, r1 +++ neg r1, r1 +++ bne 1f +++ +++ vst1.16 {q14}, [r0], r1 +++ vst1.16 {q13}, [r0], r1 +++ vst1.16 {q12}, [r0], r1 +++ vst1.16 {q11}, [r0], r1 +++ vst1.16 {q10}, [r0], r1 +++ vst1.16 { q9}, [r0] +++ pop {r4-r10,pc} +++ +++@ Partial write +++1: +++ tst r10, #0xff0000 +++ mov r2, r0 +++ bne 1f +++ vst1.16 {d28}, [r2], r1 +++ vst1.16 {d26}, [r2], r1 +++ vst1.16 {d24}, [r2] +++ +++1: +++ tst r10, #0xff000000 +++ add r2, r0, #8 +++ bne 1f +++ vst1.16 {d29}, [r2], r1 +++ vst1.16 {d27}, [r2], r1 +++ vst1.16 {d25}, [r2] +++ +++1: +++ tst r10, #0xff +++ @ r0 = r0 + r1 * 3 +++ add r0, r0, r1 +++ add r0, r0, r1, lsl # 1 +++ add r2, r0, #8 +++ bne 1f +++ vst1.16 {d22}, [r0], r1 +++ vst1.16 {d20}, [r0], r1 +++ vst1.16 {d18}, [r0] +++ +++1: +++ tst r10, #0xff00 +++ bne 1f +++ vst1.16 {d23}, [r2], r1 +++ vst1.16 {d21}, [r2], r1 +++ vst1.16 {d19}, [r2] +++ +++1: +++ pop {r4-r10,pc} +++.endm +++ +++ + +@ void ff_hevc_h_loop_filter_uv_neon(uint8_t * src_r, // r0 + +@ unsigned int stride, // r1 + +@ uint32_t tc4, // r2 +@@ -1501,9 +2472,7 @@ index 166bddb..9bd0a42 100644 + + vld2.8 {d26,d27}, [r0], r1 + + vld2.8 {d28,d29}, [r0] + + sub r0, r0, r1, lsl #1 +-+ hevc_loop_filter_uv_body d16, d18, d26, d28 +-+ lsr r2, r2, #16 +-+ hevc_loop_filter_uv_body d17, d19, d27, d29 +++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29 + + cmp r3, #0 + + bne 1f + + vst2.8 {d18,d19}, [r0], r1 +@@ -1513,122 +2482,509 @@ index 166bddb..9bd0a42 100644 + + @ At least one no_f bit is set + + @ Which means we need to break this apart in an ugly fashion + +1: vzip.8 d18, d19 +++ lsls r2, r3, #31 @ b0 -> N, b1 -> C + + vzip.8 d26, d27 + + sub r1, r1, #8 + + +-+ tst r3, #1 +-+ bne 1f +++ bmi 1f + + vst1.8 {d18}, [r0] + +1: add r0, r0, #8 +-+ tst r3, #2 +-+ bne 2f +++ bcs 2f + + vst1.8 {d19}, [r0] +-+2: add r0, r0, r1 +++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C +++ add r0, r0, r1 + + +-+ tst r3, #4 +-+ bne 1f +++ bmi 1f + + vst1.8 {d26}, [r0] +-+1: add r0, r0, #8 +-+ tst r3, #8 +-+ it ne +-+ bxne lr +++1: it cs +++ bxcs lr +++ add r0, r0, #8 + + vst1.8 {d27}, [r0] + + bx lr + + + +endfunc + + + + +++@ void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 +++@ unsigned int stride, // r1 +++@ uint32_t tc4, // r2 +++@ unsigned int no_f); // r3 +++@ +++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] +++@ +++@ Macro here actual function near bottom +++ +++.macro m_filter_h_uv_16 bit_depth +++ sub r0, r0, r1, lsl #1 +++ vld2.16 {q8, q9 }, [r0], r1 +++ vld2.16 {q10, q11}, [r0], r1 +++ vld2.16 {q12, q13}, [r0], r1 +++ vld2.16 {q14, q15}, [r0] +++ sub r0, r0, r1, lsl #1 +++ +++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth +++ +++ cmp r3, 
#0 +++ bne 1f +++ vst2.16 {q10, q11}, [r0], r1 +++ vst2.16 {q12, q13}, [r0] +++ bx lr +++ +++ @ At least one no_f bit is set +++ @ Which means we need to break this apart in an ugly fashion +++1: vzip.16 q10, q11 +++ lsls r2, r3, #31 @ b0 -> N, b1 -> C +++ vzip.16 q12, q13 +++ sub r1, r1, #16 +++ +++ bmi 1f +++ vst1.16 {q10}, [r0] +++1: add r0, r0, #16 +++ bcs 2f +++ vst1.16 {q11}, [r0] +++2: lsls r2, r3, #29 @ b2 -> N, b3 -> C +++ add r0, r0, r1 +++ +++ bmi 1f +++ vst1.16 {q12}, [r0] +++1: it cs +++ bxcs lr +++ add r0, r0, #16 +++ vst1.16 {q13}, [r0] +++ bx lr +++.endm +++ +++ + +@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 + +@ unsigned int stride, // r1 + +@ uint32_t tc4, // r2 + +@ uint8_t * src_l, // r3 + +@ unsigned int no_f); // sp[0] + +@ +-+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] +++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] +++ + +function ff_hevc_v_loop_filter_uv2_neon_8, export=1 + + vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3], r1 +-+ vld4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0], r1 +++ vld4.8 {d20[0], d21[0], d22[0], d23[0]}, [r0], r1 +++ sub r12, r0, r3 + + + + vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 +-+ vld4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 +++ vld4.8 {d20[1], d21[1], d22[1], d23[1]}, [r0], r1 +++ cmp r12, #4 + + + + vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 +-+ vld4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 +++ vld4.8 {d20[2], d21[2], d22[2], d23[2]}, [r0], r1 + + + + vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 +-+ vld4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 +++ vld4.8 {d20[3], d21[3], d22[3], d23[3]}, [r0], r1 + + + + vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 +-+ vld4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 +++ vld4.8 {d20[4], d21[4], d22[4], d23[4]}, [r0], r1 + + + + vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 +-+ vld4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 +++ vld4.8 {d20[5], d21[5], d22[5], d23[5]}, [r0], r1 + + + + vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 +-+ vld4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 +++ vld4.8 {d20[6], d21[6], d22[6], d23[6]}, [r0], r1 + + + + vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3] +-+ vld4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0] +++ vld4.8 {d20[7], d21[7], d22[7], d23[7]}, [r0] +++ it eq +++ ldreq r12, [sp, #0] +++ +++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 +++ cmp r12, #0 +++ add r3, #2 +++ neg r1, r1 +++ bne 1f +++ +++@ Much/most of the time r0 == r3 + 4 and no_f == 0 +++@ so it is worth having this special case +++ vst4.8 {d18[7], d19[7], d20[7], d21[7]}, [r3], r1 +++ vst4.8 {d18[6], d19[6], d20[6], d21[6]}, [r3], r1 +++ vst4.8 {d18[5], d19[5], d20[5], d21[5]}, [r3], r1 +++ vst4.8 {d18[4], d19[4], d20[4], d21[4]}, [r3], r1 +++ vst4.8 {d18[3], d19[3], d20[3], d21[3]}, [r3], r1 +++ vst4.8 {d18[2], d19[2], d20[2], d21[2]}, [r3], r1 +++ vst4.8 {d18[1], d19[1], d20[1], d21[1]}, [r3], r1 +++ vst4.8 {d18[0], d19[0], d20[0], d21[0]}, [r3] +++ bx lr +++ +++@ Either split or partial +++1: +++ ldr r12, [sp, #0] +++ lsls r12, #29 @ b2 -> N, b3 -> C +++ add r2, r0, r1, lsl #2 +++ bcs 1f +++ vst2.8 {d20[7], d21[7]}, [r0], r1 +++ vst2.8 {d20[6], d21[6]}, [r0], r1 +++ vst2.8 {d20[5], d21[5]}, [r0], r1 +++ vst2.8 {d20[4], d21[4]}, [r0] +++1: +++ bmi 2f +++ vst2.8 {d20[3], d21[3]}, [r2], r1 +++ vst2.8 {d20[2], d21[2]}, [r2], r1 +++ vst2.8 {d20[1], d21[1]}, [r2], r1 +++ vst2.8 {d20[0], d21[0]}, [r2] +++ +++2: +++ lsls r12, #2 +++ add r2, r3, r1, lsl #2 +++ bcs 3f +++ vst2.8 {d18[7], d19[7]}, [r3], r1 
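+++ @ (editorial note, not in the original patch) after the second lsls,
+++ @ C holds no_f b1 (no_p[1], guarding these four P0 rows, lanes 7..4)
+++ @ and N holds b0 (no_p[0], guarding lanes 3..0 via the bxmi below).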
+++ vst2.8 {d18[6], d19[6]}, [r3], r1 +++ vst2.8 {d18[5], d19[5]}, [r3], r1 +++ vst2.8 {d18[4], d19[4]}, [r3] +++3: +++ it mi +++ bxmi lr +++ vst2.8 {d18[3], d19[3]}, [r2], r1 +++ vst2.8 {d18[2], d19[2]}, [r2], r1 +++ vst2.8 {d18[1], d19[1]}, [r2], r1 +++ vst2.8 {d18[0], d19[0]}, [r2] +++ bx lr ++ endfunc ++ +++ +++@ void ff_hevc_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 +++@ unsigned int stride, // r1 +++@ uint32_t tc4, // r2 +++@ uint8_t * src_l, // r3 +++@ unsigned int no_f); // sp[0] +++@ +++@ no_f = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] +++.macro m_filter_v_uv2_16 bit_depth +++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r3], r1 +++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 +++ sub r12, r0, r3 +++ +++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r3], r1 +++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 +++ cmp r12, #8 +++ +++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r3], r1 +++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 +++ +++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r3], r1 +++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 + + +-+ hevc_loop_filter_uv_body d16, d18, d26, d28 +-+ lsr r2, r2, #16 +-+ hevc_loop_filter_uv_body d17, d19, d27, d29 +++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r3], r1 +++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 + + +++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r3], r1 +++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 +++ +++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r3], r1 +++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 +++ +++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r3] +++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] +++ it eq +++ ldreq r12, [sp, #0] +++ +++ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth +++ cmp r12, #0 +++ add r3, #4 + + neg r1, r1 +++ bne 1f + + +-+ ldr r2, [sp, #0] +++@ Much/most of the time r0 == r3 + 4 and no_f == 0 +++@ so it is worth having this special case +++ vst4.16 {d21[3], d23[3],d25[3], d27[3]}, [r3], r1 +++ vst4.16 {d21[2], d23[2],d25[2], d27[2]}, [r3], r1 +++ vst4.16 {d21[1], d23[1],d25[1], d27[1]}, [r3], r1 +++ vst4.16 {d21[0], d23[0],d25[0], d27[0]}, [r3], r1 +++ vst4.16 {d20[3], d22[3],d24[3], d26[3]}, [r3], r1 +++ vst4.16 {d20[2], d22[2],d24[2], d26[2]}, [r3], r1 +++ vst4.16 {d20[1], d22[1],d24[1], d26[1]}, [r3], r1 +++ vst4.16 {d20[0], d22[0],d24[0], d26[0]}, [r3], r1 +++ bx lr + + +-+ @ p[1] +-+ tst r2, #2 +-+ itt ne +-+ addne r3, r3, r1, lsl #2 +++@ Either split or partial +++1: +++ ldr r12, [sp, #0] +++ lsls r12, #29 @ b2 -> N, b3 -> C +++ add r2, r0, r1, lsl #2 +++ bcs 1f +++ vst2.16 {d25[3], d27[3]}, [r0], r1 +++ vst2.16 {d25[2], d27[2]}, [r0], r1 +++ vst2.16 {d25[1], d27[1]}, [r0], r1 +++ vst2.16 {d25[0], d27[0]}, [r0] +++1: +++ bmi 2f +++ vst2.16 {d24[3], d26[3]}, [r2], r1 +++ vst2.16 {d24[2], d26[2]}, [r2], r1 +++ vst2.16 {d24[1], d26[1]}, [r2], r1 +++ vst2.16 {d24[0], d26[0]}, [r2] +++ +++2: +++ lsls r12, #2 +++ add r2, r3, r1, lsl #2 +++ bcs 3f +++ vst2.16 {d21[3], d23[3]}, [r3], r1 +++ vst2.16 {d21[2], d23[2]}, [r3], r1 +++ vst2.16 {d21[1], d23[1]}, [r3], r1 +++ vst2.16 {d21[0], d23[0]}, [r3] +++3: +++ it mi +++ bxmi lr +++ vst2.16 {d20[3], d22[3]}, [r2], r1 +++ vst2.16 {d20[2], d22[2]}, [r2], r1 +++ vst2.16 {d20[1], d22[1]}, [r2], r1 +++ vst2.16 {d20[0], d22[0]}, [r2] +++ bx lr +++.endm +++ +++ +++ ++ function ff_hevc_v_loop_filter_chroma_neon, export=1 ++ hevc_loop_filter_chroma_start +++ +++ sub r0, #2 +++ vld4.8 {d16[0], d17[0], d18[0], d19[0]}, [r0], r1 +++ vld4.8 {d16[1], d17[1], d18[1], d19[1]}, 
[r0], r1 +++ vld4.8 {d16[2], d17[2], d18[2], d19[2]}, [r0], r1 +++ vld4.8 {d16[3], d17[3], d18[3], d19[3]}, [r0], r1 +++ vld4.8 {d16[4], d17[4], d18[4], d19[4]}, [r0], r1 +++ vld4.8 {d16[5], d17[5], d18[5], d19[5]}, [r0], r1 +++ vld4.8 {d16[6], d17[6], d18[6], d19[6]}, [r0], r1 +++ vld4.8 {d16[7], d17[7], d18[7], d19[7]}, [r0], r1 +++ +++ sub r0, r0, r1, lsl #3 +++ add r0, r0, #1 +++ hevc_loop_filter_chroma_body d16, d17, d18, d19 + + bne 1f +-+ vst4.8 {d16[7], d17[7], d18[7], d19[7]}, [r3], r1 +-+ vst4.8 {d16[6], d17[6], d18[6], d19[6]}, [r3], r1 +-+ vst4.8 {d16[5], d17[5], d18[5], d19[5]}, [r3], r1 +-+ vst4.8 {d16[4], d17[4], d18[4], d19[4]}, [r3], r1 +++ +++ vst2.8 {d17[0], d18[0]}, [r0], r1 +++ vst2.8 {d17[1], d18[1]}, [r0], r1 +++ vst2.8 {d17[2], d18[2]}, [r0], r1 +++ vst2.8 {d17[3], d18[3]}, [r0], r1 +++ vst2.8 {d17[4], d18[4]}, [r0], r1 +++ vst2.8 {d17[5], d18[5]}, [r0], r1 +++ vst2.8 {d17[6], d18[6]}, [r0], r1 +++ vst2.8 {d17[7], d18[7]}, [r0], r1 +++ bx lr + + + +1: +-+ @ q[1] +-+ tst r2, #8 +-+ itt ne +-+ addne r0, r0, r1, lsl #2 +-+ bne 2f +-+ vst4.8 {d26[7], d27[7], d28[7], d29[7]}, [r0], r1 +-+ vst4.8 {d26[6], d27[6], d28[6], d29[6]}, [r0], r1 +-+ vst4.8 {d26[5], d27[5], d28[5], d29[5]}, [r0], r1 +-+ vst4.8 {d26[4], d27[4], d28[4], d29[4]}, [r0], r1 +++ tst r12, #0xff @ P0a +++ bne 2f +++ +++ vst1.8 {d17[0]}, [r0], r1 +++ vst1.8 {d17[1]}, [r0], r1 +++ vst1.8 {d17[2]}, [r0], r1 +++ vst1.8 {d17[3]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 + + + +2: +-+ @ p[0] +-+ tst r2, #1 +++ tst r12, #0xff0000 @ Q0a +++ add r0, #1 + + bne 3f +-+ vst4.8 {d16[3], d17[3], d18[3], d19[3]}, [r3], r1 +-+ vst4.8 {d16[2], d17[2], d18[2], d19[2]}, [r3], r1 +-+ vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3], r1 +-+ vst4.8 {d16[0], d17[0], d18[0], d19[0]}, [r3] +++ vst1.8 {d18[0]}, [r0], r1 +++ vst1.8 {d18[1]}, [r0], r1 +++ vst1.8 {d18[2]}, [r0], r1 +++ vst1.8 {d18[3]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 + + + +3: +-+ @ q[0] +-+ tst r2, #4 +++ tst r12, #0xff000000 @ Q0b +++ add r0, r0, r1, lsl #2 +++ bne 4f +++ vst1.8 {d18[4]}, [r0], r1 +++ vst1.8 {d18[5]}, [r0], r1 +++ vst1.8 {d18[6]}, [r0], r1 +++ vst1.8 {d18[7]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 +++ +++4: +++ tst r12, #0xff00 @ P0b + + it ne + + bxne lr +-+ vst4.8 {d26[3], d27[3], d28[3], d29[3]}, [r0], r1 +-+ vst4.8 {d26[2], d27[2], d28[2], d29[2]}, [r0], r1 +-+ vst4.8 {d26[1], d27[1], d28[1], d29[1]}, [r0], r1 +-+ vst4.8 {d26[0], d27[0], d28[0], d29[0]}, [r0] + + +++ sub r0, #1 +++ vst1.8 {d17[4]}, [r0], r1 +++ vst1.8 {d17[5]}, [r0], r1 +++ vst1.8 {d17[6]}, [r0], r1 +++ vst1.8 {d17[7]}, [r0], r1 + + bx lr +++ + +endfunc + + + + +- function ff_hevc_v_loop_filter_chroma_neon, export=1 +- hevc_loop_filter_chroma_start +++.macro m_filter_v_chroma_16 bit_depth +++ hevc_loop_filter_chroma_start +++ + sub r0, #4 +-@@ -383,3 +684,128 @@ function ff_hevc_h_loop_filter_chroma_neon, export=1 +- vst1.8 {d4}, [r0] +++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r0], r1 +++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0], r1 +++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r0], r1 +++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r0], r1 +++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r0], r1 +++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r0], r1 +++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r0], r1 +++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0], r1 +++ +++ sub r0, r0, r1, lsl #3 +++ add r0, r0, #2 +++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth +++ bne 1f +++ +++ vst2.16 {d18[0], d20[0]}, [r0], r1 +++ vst2.16 {d18[1], d20[1]}, [r0], r1 +++ vst2.16 
{d18[2], d20[2]}, [r0], r1 +++ vst2.16 {d18[3], d20[3]}, [r0], r1 +++ vst2.16 {d19[0], d21[0]}, [r0], r1 +++ vst2.16 {d19[1], d21[1]}, [r0], r1 +++ vst2.16 {d19[2], d21[2]}, [r0], r1 +++ vst2.16 {d19[3], d21[3]}, [r0], r1 +++ bx lr +++ +++1: +++ tst r12, #0xff @ P0a +++ bne 2f +++ +++ vst1.16 {d18[0]}, [r0], r1 +++ vst1.16 {d18[1]}, [r0], r1 +++ vst1.16 {d18[2]}, [r0], r1 +++ vst1.16 {d18[3]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 +++ +++2: +++ tst r12, #0xff0000 @ Q0a +++ add r0, #1 +++ bne 3f +++ vst1.16 {d20[0]}, [r0], r1 +++ vst1.16 {d20[1]}, [r0], r1 +++ vst1.16 {d20[2]}, [r0], r1 +++ vst1.16 {d20[3]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 +++ +++3: +++ tst r12, #0xff000000 @ Q0b +++ add r0, r0, r1, lsl #2 +++ bne 4f +++ vst1.16 {d21[0]}, [r0], r1 +++ vst1.16 {d21[1]}, [r0], r1 +++ vst1.16 {d21[2]}, [r0], r1 +++ vst1.16 {d21[3]}, [r0], r1 +++ sub r0, r0, r1, lsl #2 +++ +++4: +++ tst r12, #0xff00 @ P0b +++ it ne +++ bxne lr +++ +++ sub r0, #1 +++ vst1.16 {d19[0]}, [r0], r1 +++ vst1.16 {d19[1]}, [r0], r1 +++ vst1.16 {d19[2]}, [r0], r1 +++ vst1.16 {d19[3]}, [r0], r1 +++ bx lr +++.endm +++ +++ +++@ void ff_hevc_h_loop_filter_chroma_neon( +++@ uint8_t *_pix, [r0] +++@ ptrdiff_t _stride, [r1] +++@ int *_tc, [r2] +++@ uint8_t *_no_p, [r3] +++@ uint8_t *_no_q); [sp+0] +++ +++function ff_hevc_h_loop_filter_chroma_neon, export=1 +++ hevc_loop_filter_chroma_start +++ sub r0, r0, r1, lsl #1 ++ vld1.8 {d16}, [r0], r1 ++ vld1.8 {d17}, [r0], r1 ++ vld1.8 {d18}, [r0], r1 ++- vld1.8 {d2}, [r0], r1 ++- vld1.8 {d4}, [r0], r1 ++- vld1.8 {d19}, [r0], r1 ++- vld1.8 {d20}, [r0], r1 ++- vld1.8 {d21}, [r0], r1 ++- sub r0, r0, r1, lsl #3 ++- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 ++- hevc_loop_filter_chroma_body ++- transpose_8x8 d16, d17, d18, d2, d4, d19, d20, d21 ++- vst1.8 {d16}, [r0], r1 +++ vld1.8 {d19}, [r0] +++ sub r0, r0, r1, lsl #1 +++ hevc_loop_filter_chroma_body d16, d17, d18, d19 +++ bne 1f @ Partial write ++ vst1.8 {d17}, [r0], r1 ++- vst1.8 {d18}, [r0], r1 ++- vst1.8 {d2}, [r0], r1 ++- vst1.8 {d4}, [r0], r1 ++- vst1.8 {d19}, [r0], r1 ++- vst1.8 {d20}, [r0], r1 ++- vst1.8 {d21}, [r0] +++ vst1.8 {d18}, [r0] +++ bx lr +++1: +++ tst r12, #0xff +++ vmov r2, r3, d17 +++ it eq +++ streq r2, [r0] +++ tst r12, #0xff00 +++ it eq +++ streq r3, [r0, #4] +++ +++ add r0, r1 +++ tst r12, #0xff0000 +++ vmov r2, r3, d18 +++ it eq +++ streq r2, [r0] +++ tst r12, #0xff000000 +++ it eq +++ streq r3, [r0, #4] +++ + bx lr + endfunc ++ ++-function ff_hevc_h_loop_filter_chroma_neon, export=1 +++.macro m_filter_h_chroma_16 bit_depth ++ hevc_loop_filter_chroma_start ++ sub r0, r0, r1, lsl #1 ++- vld1.8 {d18}, [r0], r1 ++- vld1.8 {d2}, [r0], r1 ++- vld1.8 {d4}, [r0], r1 ++- vld1.8 {d19}, [r0] +++ vld1.16 {q8}, [r0], r1 +++ vld1.16 {q9}, [r0], r1 +++ vld1.16 {q10}, [r0], r1 +++ vld1.16 {q11}, [r0] ++ sub r0, r0, r1, lsl #1 ++- hevc_loop_filter_chroma_body ++- vst1.8 {d2}, [r0], r1 ++- vst1.8 {d4}, [r0] +++ hevc_loop_filter_chroma_body_16 q8, q9, q10, q11, \bit_depth +++ bne 1f @ Partial write +++ vst1.16 {q9}, [r0], r1 +++ vst1.16 {q10}, [r0] +++ bx lr +++1: +++ tst r12, #0xff +++ bne 2f +++ vst1.16 {d18}, [r0] +++2: +++ tst r12, #0xff00 +++ bne 3f +++ add r0, #8 +++ vst1.16 {d19}, [r0] +++ sub r0, #8 +++3: +++ tst r12, #0xff0000 +++ add r0, r1 +++ bne 4f +++ vst1.16 {d20}, [r0] +++4: +++ tst r12, #0xff000000 +++ it ne +++ bxne lr +++ add r0, #8 +++ vst1.16 {d21}, [r0] +++ ++ bx lr +++.endm +++ + + + +/* ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_i + + * int *curr_rpl0, int *curr_ +@@ 
-1754,9 +3110,54 @@ index 166bddb..9bd0a42 100644 + + b 11b + +endfunc + + +++@ ============================================================================= +++@ +++@ 10 bit +++ +++function hevc_loop_filter_luma_body_10 +++ m_filter_luma 10 +++endfunc +++ +++function ff_hevc_h_loop_filter_luma_neon_10, export=1 +++ m_filter_h_luma_16 10 +++endfunc +++ +++function ff_hevc_v_loop_filter_luma2_neon_10, export=1 +++ hevc_loop_filter_luma_start +++ push {r4-r10,lr} @ 8 regs = 32 bytes +++ +++ ldr r4, [sp, #40] +++ b v_loop_luma_common_10 +++endfunc +++ +++function ff_hevc_v_loop_filter_luma_neon_10, export=1 +++ hevc_loop_filter_luma_start +++ push {r4-r10,lr} +++ +++ sub r4, r0, #8 +++v_loop_luma_common_10: +++ m_filter_v_luma_common_16 10 +++endfunc +++ +++function ff_hevc_h_loop_filter_uv_neon_10, export=1 +++ m_filter_h_uv_16 10 +++endfunc +++ +++function ff_hevc_v_loop_filter_uv2_neon_10, export=1 +++ m_filter_v_uv2_16 10 +++endfunc +++ +++function ff_hevc_h_loop_filter_chroma_neon_10, export=1 +++ m_filter_h_chroma_16 10 +++endfunc +++ +++function ff_hevc_v_loop_filter_chroma_neon_10, export=1 +++ m_filter_v_chroma_16 10 ++ endfunc +++ + diff --git a/libavcodec/arm/hevcdsp_epel_neon.S b/libavcodec/arm/hevcdsp_epel_neon.S + new file mode 100644 +-index 0000000..00eab9e ++index 0000000000..00eab9eeee + --- /dev/null + +++ b/libavcodec/arm/hevcdsp_epel_neon.S + @@ -0,0 +1,337 @@ +@@ -2097,70 +3498,620 @@ index 0000000..00eab9e + + .byte 4, 28, 46, 6 + + .byte 2, 16, 54, 4 + + .byte 2, 10, 58, 2 +-diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c +-index 5591807..b6c48ee 100644 +---- a/libavcodec/arm/hevcdsp_init_neon.c +-+++ b/libavcodec/arm/hevcdsp_init_neon.c +-@@ -22,11 +22,26 @@ +- #include "libavutil/arm/cpu.h" +- #include "libavcodec/hevcdsp.h" +- #include "hevcdsp_arm.h" +-+#include "libavcodec/avcodec.h" +-+#include "libavcodec/bit_depth_template.c" ++diff --git a/libavcodec/arm/hevcdsp_idct_neon.S b/libavcodec/arm/hevcdsp_idct_neon.S ++index 13d540e5ff..9b6d745556 100644 ++--- a/libavcodec/arm/hevcdsp_idct_neon.S +++++ b/libavcodec/arm/hevcdsp_idct_neon.S ++@@ -21,82 +21,6 @@ ++ #include "libavutil/arm/asm.S" ++ #include "neon.S" ++ ++-function ff_hevc_idct_4x4_dc_neon_8, export=1 ++- ldrsh r1, [r0] ++- ldr r2, =0x20 ++- add r1, #1 ++- asr r1, #1 ++- add r1, r2 ++- asr r1, #6 ++- vdup.16 q0, r1 ++- vdup.16 q1, r1 ++- vst1.16 {q0, q1}, [r0] ++- bx lr ++-endfunc ++- ++-function ff_hevc_idct_8x8_dc_neon_8, export=1 ++- ldrsh r1, [r0] ++- ldr r2, =0x20 ++- add r1, #1 ++- asr r1, #1 ++- add r1, r2 ++- asr r1, #6 ++- vdup.16 q8, r1 ++- vdup.16 q9, r1 ++- vmov.16 q10, q8 ++- vmov.16 q11, q8 ++- vmov.16 q12, q8 ++- vmov.16 q13, q8 ++- vmov.16 q14, q8 ++- vmov.16 q15, q8 ++- vstm r0, {q8-q15} ++- bx lr ++-endfunc ++- ++-function ff_hevc_idct_16x16_dc_neon_8, export=1 ++- ldrsh r1, [r0] ++- ldr r2, =0x20 ++- add r1, #1 ++- asr r1, #1 ++- add r1, r2 ++- asr r1, #6 ++- vdup.16 q8, r1 ++- vdup.16 q9, r1 ++- vmov.16 q10, q8 ++- vmov.16 q11, q8 ++- vmov.16 q12, q8 ++- vmov.16 q13, q8 ++- vmov.16 q14, q8 ++- vmov.16 q15, q8 ++- vstm r0!, {q8-q15} ++- vstm r0!, {q8-q15} ++- vstm r0!, {q8-q15} ++- vstm r0, {q8-q15} ++- bx lr ++-endfunc ++- ++-function ff_hevc_idct_32x32_dc_neon_8, export=1 ++- ldrsh r1, [r0] ++- ldr r2, =0x20 ++- add r1, #1 ++- asr r1, #1 ++- add r1, r2 ++- asr r1, #6 ++- mov r3, #16 ++- vdup.16 q8, r1 ++- vdup.16 q9, r1 ++- vmov.16 q10, q8 ++- vmov.16 q11, q8 ++- vmov.16 q12, q8 ++- vmov.16 q13, q8 ++- vmov.16 q14, q8 ++- vmov.16 q15, q8 ++-1: 
subs r3, #1 ++- vstm r0!, {q8-q15} ++- bne 1b ++- bx lr ++-endfunc ++- ++ function ff_hevc_transform_add_4x4_neon_8, export=1 ++ vldm r1, {q0-q1} ++ vld1.32 d4[0], [r0], r2 ++@@ -168,6 +92,131 @@ function ff_hevc_transform_add_32x32_neon_8, export=1 ++ bx lr ++ endfunc + +- void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +- void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +- void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +- void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); + + +-+#ifdef RPI +-+void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, +-+ unsigned int _stride, unsigned int beta, const int32_t tc[2], +-+ const uint8_t no_p[2], const uint8_t no_q[2], +-+ uint8_t * _pix_l); +-+void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, +-+ unsigned int no_f); +-+void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, +-+ uint8_t * src_l, +-+ unsigned int no_f); +-+#endif +++@ ff_hevc_add_residual_4x4_dc_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] +++ +++function ff_hevc_add_residual_4x4_dc_neon_8, export=1 +++ vdup.16 q15, r2 +++ +++ vld1.32 d4[0], [r0], r1 +++ vld1.32 d4[1], [r0], r1 +++ vld1.32 d5[0], [r0], r1 +++ vld1.32 d5[1], [r0], r1 +++ sub r0, r0, r1, lsl #2 +++ vaddw.u8 q0, q15, d4 +++ vaddw.u8 q1, q15, d5 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vst1.32 d0[0], [r0], r1 +++ vst1.32 d0[1], [r0], r1 +++ vst1.32 d1[0], [r0], r1 +++ vst1.32 d1[1], [r0], r1 +++ bx lr +++endfunc + + +- void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); +- void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); +- void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); +-@@ -43,6 +58,31 @@ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, +- void ff_hevc_transform_add_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride); +- +-+void ff_hevc_sao_band_w8_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +-+void ff_hevc_sao_band_w16_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +-+void ff_hevc_sao_band_w32_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); +-+void ff_hevc_sao_band_w64_neon_8(uint8_t *_dst, uint8_t *_src, int8_t * offset_table, ptrdiff_t stride_src, ptrdiff_t stride_dst, int height); + + +-+void ff_hevc_sao_edge_eo0_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo1_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo2_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo3_w32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +++@ ff_hevc_add_residual_4x4_dc_c_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] + + +-+void 
ff_hevc_sao_edge_eo0_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo1_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo2_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +-+void ff_hevc_sao_edge_eo3_w64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, int8_t *sao_offset_table); +++function ff_hevc_add_residual_4x4_dc_c_neon_8, export=1 +++ vdup.32 q15, r2 +++ mov r3, #4 +++ b 1f +++endfunc + + +-+void ff_hevc_sao_edge_c_w64_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, int height, +-+ const int16_t *sao_offset_table_u, const int16_t *sao_offset_table_v, int eo); +++@ ff_hevc_add_residual_8x8_dc_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] +++ +++function ff_hevc_add_residual_8x8_dc_neon_8, export=1 +++ vdup.16 q15, r2 +++ mov r3, #8 +++ +++1: subs r3, #1 +++ vld1.8 d16, [r0] +++ vaddw.u8 q0, q15, d16 +++ vqmovun.s16 d0, q0 +++ vst1.32 d0, [r0], r1 +++ bne 1b +++ bx lr +++endfunc + + +-+void ff_hevc_sao_band_c_neon_8(uint8_t *_dst, const uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height); + + +++@ ff_hevc_add_residual_8x8_dc_c_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] + + +++function ff_hevc_add_residual_8x8_dc_c_neon_8, export=1 +++ vdup.32 q15, r2 +++ mov r3, #8 +++ b 1f +++endfunc +++ +++@ ff_hevc_add_residual_16x16_dc_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] +++ +++function ff_hevc_add_residual_16x16_dc_neon_8, export=1 +++ vdup.16 q15, r2 +++ mov r3, #16 +++ +++1: subs r3, #1 +++ vld1.8 {q8}, [r0] +++ vaddw.u8 q0, q15, d16 +++ vaddw.u8 q1, q15, d17 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vst1.8 {q0}, [r0], r1 +++ bne 1b +++ bx lr +++endfunc +++ +++ +++@ ff_hevc_add_residual_16x16_dc_c_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] +++ +++function ff_hevc_add_residual_16x16_dc_c_neon_8, export=1 +++ vdup.32 q15, r2 +++ mov r3, #16 +++ b 1f +++endfunc +++ +++@ ff_hevc_add_residual_32x32_dc_neon_8( +++@ uint8_t * dst, // [r0] +++@ unsigned int stride, // [r1] +++@ int dc) // [r2] +++ +++function ff_hevc_add_residual_32x32_dc_neon_8, export=1 +++ vdup.16 q15, r2 +++ mov r3, #32 +++ +++1: subs r3, #1 +++ vld1.8 {q8, q9}, [r0] +++ vaddw.u8 q0, q15, d16 +++ vaddw.u8 q1, q15, d17 +++ vaddw.u8 q2, q15, d18 +++ vaddw.u8 q3, q15, d19 +++ vqmovun.s16 d0, q0 +++ vqmovun.s16 d1, q1 +++ vqmovun.s16 d2, q2 +++ vqmovun.s16 d3, q3 +++ vst1.8 {q0, q1}, [r0], r1 +++ bne 1b +++ bx lr +++endfunc +++ +++ +++ ++ .macro transpose_16b_8x8 r0, r1, r2, r3, r4, r5, r6, r7 ++ vtrn.64 \r0, \r4 ++ vtrn.64 \r1, \r5 ++@@ -263,55 +312,6 @@ endfunc ++ vqrshrn.s32 \r3, q3, \shift ++ .endm ++ ++-function ff_hevc_transform_4x4_neon_8, export=1 ++- vpush {d8-d15} ++- vld1.16 {q14, q15}, [r0] // coeffs ++- ldr r3, =0x00240053 // 36 and 83 ++- vmov.32 d0[0], r3 ++- ++- tr4_shift d28, d29, d30, d31, #7 ++- ++- vtrn.16 d28, d29 ++- vtrn.16 d30, d31 ++- vtrn.32 q14, q15 ++- ++- tr4_shift d28, d29, d30, d31, #12 ++- ++- 
vtrn.16 d28, d29 ++- vtrn.16 d30, d31 ++- vtrn.32 q14, q15 ++- ++- vst1.16 {q14, q15}, [r0] ++- vpop {d8-d15} ++- bx lr ++-endfunc ++- ++-function ff_hevc_transform_luma_4x4_neon_8, export=1 ++- vpush {d8-d15} ++- vld1.16 {q14, q15}, [r0] // coeffs ++- ldr r3, =0x4a // 74 ++- vmov.32 d0[0], r3 ++- ldr r3, =0x1d // 29 ++- vmov.32 d0[1], r3 ++- ldr r3, =0x37 // 55 ++- vmov.32 d1[0], r3 ++- ++- tr4_luma_shift d28, d29, d30, d31, #7 ++- ++- vtrn.16 d28, d29 ++- vtrn.16 d30, d31 ++- vtrn.32 q14, q15 ++- ++- tr4_luma_shift d28, d29, d30, d31, #12 ++- ++- vtrn.16 d28, d29 ++- vtrn.16 d30, d31 ++- vtrn.32 q14, q15 ++- vst1.16 {q14, q15}, [r0] ++- vpop {d8-d15} ++- bx lr ++-endfunc ++- ++ .macro tr8_begin in0, in1, in2, in3 ++ vmull.s16 q7, \in0, d1[1] // 89 * src1 ++ vmull.s16 q8, \in0, d1[0] // 75 * src1 ++@@ -356,100 +356,6 @@ endfunc ++ vqrshrn.s32 d8, q5, \shift ++ .endm ++ ++-function ff_hevc_transform_8x8_neon_8, export=1 ++- push {r4-r8} ++- vpush {d8-d15} ++- mov r5, #16 ++- ++- adr r3, tr4f ++- vld1.16 {d0, d1}, [r3] ++- ++- // left half ++- vld1.16 {d24}, [r0], r5 ++- vld1.16 {d25}, [r0], r5 ++- vld1.16 {d26}, [r0], r5 ++- vld1.16 {d27}, [r0], r5 ++- vld1.16 {d28}, [r0], r5 ++- vld1.16 {d29}, [r0], r5 ++- vld1.16 {d30}, [r0], r5 ++- vld1.16 {d31}, [r0], r5 ++- sub r0, #128 ++- tr8_begin d25, d27, d29, d31 ++- tr4 d24, d26, d28, d30 ++- tr8_end #7 ++- vst1.16 {d2}, [r0], r5 ++- vst1.16 {d3}, [r0], r5 ++- vst1.16 {d4}, [r0], r5 ++- vst1.16 {d5}, [r0], r5 ++- vst1.16 {d6}, [r0], r5 ++- vst1.16 {d7}, [r0], r5 ++- vst1.16 {d8}, [r0], r5 ++- vst1.16 {d9}, [r0], r5 ++- sub r0, #128 ++- //skip right half if col_limit in r1 is less than 4 ++- cmp r1, #4 ++- blt 1f ++- //right half ++- add r0, #8 ++- vld1.16 {d24}, [r0], r5 ++- vld1.16 {d25}, [r0], r5 ++- vld1.16 {d26}, [r0], r5 ++- vld1.16 {d27}, [r0], r5 ++- vld1.16 {d28}, [r0], r5 ++- vld1.16 {d29}, [r0], r5 ++- vld1.16 {d30}, [r0], r5 ++- vld1.16 {d31}, [r0], r5 ++- sub r0, #128 ++- tr8_begin d25, d27, d29, d31 ++- tr4 d24, d26, d28, d30 ++- tr8_end #7 ++- vst1.16 {d2}, [r0], r5 ++- vst1.16 {d3}, [r0], r5 ++- vst1.16 {d4}, [r0], r5 ++- vst1.16 {d5}, [r0], r5 ++- vst1.16 {d6}, [r0], r5 ++- vst1.16 {d7}, [r0], r5 ++- vst1.16 {d8}, [r0], r5 ++- vst1.16 {d9}, [r0], r5 ++- sub r0, #136 ++-1: ++- // top half ++- vldm r0, {q12-q15} // coeffs ++- transpose_16b_4x4 d24, d26, d28, d30 ++- transpose_16b_4x4 d25, d27, d29, d31 ++- tr8_begin d26, d30, d27, d31 ++- tr4 d24, d28, d25, d29 ++- tr8_end #12 ++- transpose_16b_4x4 d2, d3, d4, d5 ++- transpose_16b_4x4 d6, d7, d8, d9 ++- vswp d7, d5 ++- vswp d7, d8 ++- vswp d3, d6 ++- vswp d6, d4 ++- vstm r0!, {q1-q4} ++- ++- // bottom half ++- vldm r0, {q12-q15} // coeffs ++- transpose_16b_4x4 d24, d26, d28, d30 ++- transpose_16b_4x4 d25, d27, d29, d31 ++- tr8_begin d26, d30, d27, d31 ++- tr4 d24, d28, d25, d29 ++- tr8_end #12 ++- transpose_16b_4x4 d2, d3, d4, d5 ++- transpose_16b_4x4 d6, d7, d8, d9 ++- vswp d7, d5 ++- vswp d7, d8 ++- vswp d3, d6 ++- vswp d6, d4 ++- //vstm r0, {q1-q4} ++- vst1.16 {q1-q2}, [r0] ++- add r0, #32 ++- vst1.16 {q3-q4}, [r0] ++- sub r0, #32 ++- vpop {d8-d15} ++- pop {r4-r8} ++- bx lr ++-endfunc ++ ++ .align 4 ++ tr4f: ++@@ -463,3 +369,11 @@ tr16: ++ .word 0x00500046 // 80, d2[2] = 70 ++ .word 0x0039002b // 57, d2[0] = 43 ++ .word 0x00190009 // 25, d2[2] = 9 +++ +++#define BIT_DEPTH 8 +++#include "hevc_idct_fn_neon.S" +++ +++#undef BIT_DEPTH +++#define BIT_DEPTH 10 +++#include "hevc_idct_fn_neon.S" +++ ++diff --git a/libavcodec/arm/hevcdsp_init_neon.c b/libavcodec/arm/hevcdsp_init_neon.c 
++index 55918077e2..e708b7c074 100644 ++--- a/libavcodec/arm/hevcdsp_init_neon.c +++++ b/libavcodec/arm/hevcdsp_init_neon.c ++@@ -22,11 +22,41 @@ ++ #include "libavutil/arm/cpu.h" ++ #include "libavcodec/hevcdsp.h" ++ #include "hevcdsp_arm.h" +++#include "libavcodec/avcodec.h" +++#include "libavcodec/bit_depth_template.c" ++ ++ void ff_hevc_v_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ void ff_hevc_h_loop_filter_luma_neon(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ void ff_hevc_v_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ void ff_hevc_h_loop_filter_chroma_neon(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +++ +++void ff_hevc_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +++void ff_hevc_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +++void ff_hevc_v_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +++void ff_hevc_h_loop_filter_chroma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int *_tc, uint8_t *_no_p, uint8_t *_no_q); +++ +++#ifdef RPI +++void ff_hevc_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, +++ unsigned int _stride, unsigned int beta, const int32_t tc[2], +++ const uint8_t no_p[2], const uint8_t no_q[2], +++ uint8_t * _pix_l); +++void ff_hevc_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, +++ unsigned int no_f); +++void ff_hevc_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, +++ uint8_t * src_l, +++ unsigned int no_f); +++ +++void ff_hevc_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, +++ unsigned int _stride, unsigned int beta, const int32_t tc[2], +++ const uint8_t no_p[2], const uint8_t no_q[2], +++ uint8_t * _pix_l); +++void ff_hevc_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, +++ unsigned int no_f); +++void ff_hevc_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, +++ uint8_t * src_l, +++ unsigned int no_f); +++#endif +++ ++ void ff_hevc_transform_4x4_neon_8(int16_t *coeffs, int col_limit); ++ void ff_hevc_transform_8x8_neon_8(int16_t *coeffs, int col_limit); ++ void ff_hevc_idct_4x4_dc_neon_8(int16_t *coeffs); ++@@ -34,14 +64,174 @@ void ff_hevc_idct_8x8_dc_neon_8(int16_t *coeffs); ++ void ff_hevc_idct_16x16_dc_neon_8(int16_t *coeffs); ++ void ff_hevc_idct_32x32_dc_neon_8(int16_t *coeffs); ++ void ff_hevc_transform_luma_4x4_neon_8(int16_t *coeffs); +++ +++void ff_hevc_transform_4x4_neon_10(int16_t *coeffs, int col_limit); +++void ff_hevc_transform_8x8_neon_10(int16_t *coeffs, int col_limit); +++void ff_hevc_idct_4x4_dc_neon_10(int16_t *coeffs); +++void ff_hevc_idct_8x8_dc_neon_10(int16_t *coeffs); +++void ff_hevc_idct_16x16_dc_neon_10(int16_t *coeffs); +++void ff_hevc_idct_32x32_dc_neon_10(int16_t *coeffs); +++void ff_hevc_transform_luma_4x4_neon_10(int16_t *coeffs); +++ ++ void ff_hevc_transform_add_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, ++- ptrdiff_t stride); +++ ptrdiff_t stride); ++ void ff_hevc_transform_add_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, ++- ptrdiff_t stride); +++ ptrdiff_t stride); ++ void ff_hevc_transform_add_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, ++- ptrdiff_t stride); +++ ptrdiff_t stride); ++ void ff_hevc_transform_add_32x32_neon_8(uint8_t 
*_dst, int16_t *coeffs, ++- ptrdiff_t stride); +++ ptrdiff_t stride); +++ +++void ff_hevc_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); +++ +++ +++void ff_hevc_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride); +++ +++void ff_hevc_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +++void ff_hevc_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); +++ +++ +++#if RPI_HEVC_SAND +++void ff_hevc_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++void ff_hevc_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++void ff_hevc_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++ +++ +++void ff_hevc_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_v); +++void ff_hevc_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride, int dc_u); +++void ff_hevc_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void ff_hevc_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, +++ ptrdiff_t stride); +++void 
ff_hevc_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++void ff_hevc_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++void ff_hevc_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); +++#endif +++ +++void ff_hevc_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++ +++void ff_hevc_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++void ff_hevc_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); +++ +++#if RPI_HEVC_SAND +++void ff_hevc_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++void ff_hevc_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++void ff_hevc_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++ +++void ff_hevc_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++void ff_hevc_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++void ff_hevc_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height); +++ +++void ff_hevc_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++void ff_hevc_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++void ff_hevc_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++ +++void ff_hevc_sao_band_c_8_neon_10(uint8_t 
*_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++void ff_hevc_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++void ff_hevc_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++#endif +++ +++void ff_hevc_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++ +++void ff_hevc_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++void ff_hevc_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++ ++ + #define PUT_PIXELS(name) \ + void name(int16_t *dst, uint8_t *src, \ +- ptrdiff_t srcstride, int height, \ +-@@ -58,6 +98,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); ++@@ -58,6 +248,15 @@ PUT_PIXELS(ff_hevc_put_pixels_w32_neon_8); + PUT_PIXELS(ff_hevc_put_pixels_w48_neon_8); + PUT_PIXELS(ff_hevc_put_pixels_w64_neon_8); + #undef PUT_PIXELS +@@ -2176,227 +4127,110 @@ index 5591807..b6c48ee 100644 + + static void (*put_hevc_qpel_neon[4][4])(int16_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride, + int height, int width); +-@@ -142,14 +191,239 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t ++@@ -142,25 +341,181 @@ void ff_hevc_put_qpel_bi_neon_wrapper(uint8_t *dst, ptrdiff_t dststride, uint8_t + put_hevc_qpel_uw_neon[my][mx](dst, dststride, src, srcstride, width, height, src2, MAX_PB_SIZE); + } + +-+static void ff_hevc_sao_band_neon_wrapper(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ int16_t *sao_offset_val, int sao_left_class, int width, int height) +++void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, +++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +++ MvField *curr, MvField *neigh, uint8_t *bs); +++ +++ +++static void 
ff_hevc_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) + +{ +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int8_t offset_table[32] = { 0 }; +-+ int k, y, x; +-+ int shift = 3; // BIT_DEPTH - 5 +-+ int cwidth = 0; +++ ff_hevc_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); +++ ff_hevc_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); +++} +++static void ff_hevc_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +++{ +++ ff_hevc_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); +++ ff_hevc_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); +++} + + +-+ stride_src /= sizeof(pixel); +-+ stride_dst /= sizeof(pixel); +++static void ff_hevc_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height) +++{ +++ ff_hevc_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); +++ ff_hevc_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +++} +++static void ff_hevc_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height) +++{ +++ ff_hevc_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); +++ ff_hevc_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +++} + + +-+ for (k = 0; k < 4; k++) +-+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; +++#if SAO_FILTER_N == 6 +++static void ff_hevc_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +++{ +++ ff_hevc_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); +++ ff_hevc_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); +++} +++static void ff_hevc_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) +++{ +++ ff_hevc_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); +++ ff_hevc_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); +++} + + +-+ if (height % 8 == 0) +-+ cwidth = width; +++static void ff_hevc_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height) +++{ +++ ff_hevc_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +++ ff_hevc_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); +++} +++static void ff_hevc_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ int16_t *sao_offset_val, int sao_left_class, int width, int height) +++{ +++ ff_hevc_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); +++ ff_hevc_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); +++} + + +-+ 
switch(cwidth){ +-+ case 8: +-+ ff_hevc_sao_band_w8_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); +-+ break; +-+ case 16: +-+ ff_hevc_sao_band_w16_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); +-+ break; +-+ case 32: +-+ ff_hevc_sao_band_w32_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); +-+ break; +-+ case 64: +-+ ff_hevc_sao_band_w64_neon_8(_dst, _src, offset_table, stride_src, stride_dst, height); +-+ break; +-+ default: +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) +-+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); +-+ dst += stride_dst; +-+ src += stride_src; +-+ } +-+ } +++#if RPI_HEVC_SAND +++static void ff_hevc_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height) +++{ +++ ff_hevc_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); +++ ff_hevc_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); +++} +++static void ff_hevc_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +++ int eo, int width, int height) +++{ +++ ff_hevc_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); +++ ff_hevc_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); + +} + + +-+static void ff_hevc_sao_band_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, +++static void ff_hevc_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, + + ptrdiff_t stride_dst, ptrdiff_t stride_src, + + const int16_t *sao_offset_val_u, int sao_left_class_u, + + const int16_t *sao_offset_val_v, int sao_left_class_v, + + int width, int height) + +{ +-+ // Width 32 already dealt with +-+ // width 16 code works in double lines +-+ if (width == 16 && (height & 1) == 0) { +-+ ff_hevc_sao_band_c_neon_8(_dst, _src, stride_src, stride_dst, +-+ sao_offset_val_u, sao_left_class_u, +-+ sao_offset_val_v, sao_left_class_v, +-+ width, height); +-+ } +-+ else +-+ { +-+ const int shift = 3; // BIT_DEPTH - 5 +-+ int k, y, x; +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int8_t offset_table_u[32] = { 0 }; +-+ int8_t offset_table_v[32] = { 0 }; +-+ +-+ stride_src /= sizeof(pixel); +-+ stride_dst /= sizeof(pixel); +-+ +-+ for (k = 0; k < 4; k++) +-+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; +-+ for (k = 0; k < 4; k++) +-+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; +-+ +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width * 2; x += 2) +-+ { +-+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); +-+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); +-+ } +-+ dst += stride_dst; +-+ src += stride_src; +-+ +-+ } +-+ } +++ ff_hevc_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, +++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); +++ ff_hevc_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, +++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); + +} +-+ +-+#define CMP(a, b) ((a) > (b) ? 1 : ((a) == (b) ? 
0 : -1)) +-+static void ff_hevc_sao_edge_neon_wrapper(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +-+ int16_t *_sao_offset_val, int eo, int width, int height) +++static void ff_hevc_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height) + +{ +-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +-+ static const int8_t pos[4][2][2] = { +-+ { { -1, 0 }, { 1, 0 } }, // horizontal +-+ { { 0, -1 }, { 0, 1 } }, // vertical +-+ { { -1, -1 }, { 1, 1 } }, // 45 degree +-+ { { 1, -1 }, { -1, 1 } }, // 135 degree +-+ }; +-+ int8_t sao_offset_val[8]; // padding of 3 for vld +-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE); +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int a_stride, b_stride; +-+ int x, y; +-+ int cwidth = 0; +-+ +-+ for (x = 0; x < 5; x++) { +-+ sao_offset_val[x] = _sao_offset_val[edge_idx[x]]; +-+ } +-+ +-+ if (height % 8 == 0) +-+ cwidth = width; +-+ +-+ stride_src /= sizeof(pixel); +-+ stride_dst /= sizeof(pixel); +-+ +-+ switch (cwidth) { +-+ case 32: +-+ switch(eo) { +-+ case 0: +-+ ff_hevc_sao_edge_eo0_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 1: +-+ ff_hevc_sao_edge_eo1_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 2: +-+ ff_hevc_sao_edge_eo2_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 3: +-+ ff_hevc_sao_edge_eo3_w32_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ } +-+ break; +-+ case 64: +-+ switch(eo) { +-+ case 0: +-+ ff_hevc_sao_edge_eo0_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 1: +-+ ff_hevc_sao_edge_eo1_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 2: +-+ ff_hevc_sao_edge_eo2_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ case 3: +-+ ff_hevc_sao_edge_eo3_w64_neon_8(dst, src, stride_dst, stride_src, height, sao_offset_val); +-+ break; +-+ } +-+ break; +-+ default: +-+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; +-+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) { +-+ int diff0 = CMP(src[x], src[x + a_stride]); +-+ int diff1 = CMP(src[x], src[x + b_stride]); +-+ int idx = diff0 + diff1; +-+ if (idx) +-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[idx+2]); +-+ } +-+ src += stride_src; +-+ dst += stride_dst; +-+ } +-+ } +++ ff_hevc_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, +++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); +++ ff_hevc_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, +++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); + +} +++#endif +++#endif + + + + +-+static void ff_hevc_sao_edge_c_neon_wrapper(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, +-+ int eo, int width, int height) +-+{ +-+ const ptrdiff_t stride_src = (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); +-+ +-+ if (width == 32 && (height & 7) == 0) { +-+ ff_hevc_sao_edge_c_w64_neon_8(_dst, _src, stride_dst, stride_src, height, 
_sao_offset_val_u, _sao_offset_val_v, eo); +-+ } +-+ else +-+ { +-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +-+ static const int8_t pos[4][2][2] = { +-+ { { -1, 0 }, { 1, 0 } }, // horizontal +-+ { { 0, -1 }, { 0, 1 } }, // vertical +-+ { { -1, -1 }, { 1, 1 } }, // 45 degree +-+ { { 1, -1 }, { -1, 1 } }, // 135 degree +-+ }; +-+ int8_t sao_offset_val_u[8]; // padding of 3 for vld +-+ int8_t sao_offset_val_v[8]; // padding of 3 for vld +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int a_stride, b_stride; +-+ int x, y; +-+ +-+ for (x = 0; x < 5; x++) { +-+ sao_offset_val_u[x] = _sao_offset_val_u[edge_idx[x]]; +-+ sao_offset_val_v[x] = _sao_offset_val_v[edge_idx[x]]; +-+ } +-+ +-+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +-+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width * 2; x += 2) { +-+ int diff0u = CMP(src[x], src[x + a_stride]); +-+ int diff1u = CMP(src[x], src[x + b_stride]); +-+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +-+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[2 + diff0u + diff1u]); +-+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[2 + diff0v + diff1v]); +-+ } +-+ src += stride_src; +-+ dst += stride_dst; +-+ } +-+ } +-+} +-+#undef CMP + + +-+void ff_hevc_deblocking_boundary_strengths_neon(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs); +++#if (2*MAX_PB_SIZE + FF_INPUT_BUFFER_PADDING_SIZE) != 160 +++#error SAO edge src stride not 160 - value used in .S +++#endif + + + av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + { +@@ -2407,7 +4241,9 @@ index 5591807..b6c48ee 100644 + c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon; + + c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon; + c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon; +++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon; + c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon; +++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon; + +#ifdef RPI + + c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_8; + + c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_8; +@@ -2416,21 +4252,68 @@ index 5591807..b6c48ee 100644 + c->idct[0] = ff_hevc_transform_4x4_neon_8; + c->idct[1] = ff_hevc_transform_8x8_neon_8; + c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_8; +-@@ -161,6 +435,13 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) +- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; ++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_8; ++- c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; ++- c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; ++- c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; ++- c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; +++ c->transform_add[0] = ff_hevc_transform_add_4x4_neon_8; +++ c->transform_add[1] = ff_hevc_transform_add_8x8_neon_8; +++ c->transform_add[2] = ff_hevc_transform_add_16x16_neon_8; +++ c->transform_add[3] = ff_hevc_transform_add_32x32_neon_8; +++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_8; +++ c->add_residual_dc[1] = 
ff_hevc_add_residual_8x8_dc_neon_8; +++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_8; +++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_8; +++#if RPI_HEVC_SAND +++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_8; +++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_8; +++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_8; +++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_8; +++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_8; +++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_8; +++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_8; +++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_8; +++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_8; +++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_8; +++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_8; +++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_8; +++#endif + c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_8; +-+ for (x = 0; x < sizeof c->sao_band_filter / sizeof *c->sao_band_filter; x++) { +-+ c->sao_band_filter[x] = ff_hevc_sao_band_neon_wrapper; +-+ c->sao_band_filter_c[x] = ff_hevc_sao_band_c_neon_wrapper; +-+ c->sao_edge_filter[x] = ff_hevc_sao_edge_neon_wrapper; +-+ c->sao_edge_filter_c[x] = ff_hevc_sao_edge_c_neon_wrapper; +-+ } +-+ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_neon_8; // width=32 +++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_8; +++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_8; +++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_8; +++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_8; +++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_8; +++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_8; +++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_8; +++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_8; +++ c->sao_edge_filter[3] = ff_hevc_sao_edge_48_neon_8; +++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_8; +++#if SAO_FILTER_N == 6 +++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_8; +++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_8; +++#endif +++#if RPI_HEVC_SAND +++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_8; +++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_8; +++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_8; +++ +++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_8; +++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_8; +++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_8; +++ +++#if SAO_FILTER_N == 6 +++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_8; +++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_8; +++#endif +++#endif + put_hevc_qpel_neon[1][0] = ff_hevc_put_qpel_v1_neon_8; + put_hevc_qpel_neon[2][0] = ff_hevc_put_qpel_v2_neon_8; + put_hevc_qpel_neon[3][0] = ff_hevc_put_qpel_v3_neon_8; +-@@ -201,7 +482,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) ++@@ -201,7 +556,21 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->put_hevc_qpel_bi[x][1][0] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][0][1] = ff_hevc_put_qpel_bi_neon_wrapper; + c->put_hevc_qpel_bi[x][1][1] = ff_hevc_put_qpel_bi_neon_wrapper; +@@ -2452,22 +4335,711 @@ index 5591807..b6c48ee 100644 + c->put_hevc_qpel[0][0][0] = ff_hevc_put_pixels_w2_neon_8; + c->put_hevc_qpel[1][0][0] = ff_hevc_put_pixels_w4_neon_8; + c->put_hevc_qpel[2][0][0] = ff_hevc_put_pixels_w6_neon_8; +-@@ -221,4 +516,9 @@ av_cold void 
ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) ++@@ -221,4 +590,82 @@ av_cold void ff_hevcdsp_init_neon(HEVCDSPContext *c, const int bit_depth) + c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_qpel_uw_pixels_w48_neon_8; + c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_qpel_uw_pixels_w64_neon_8; + } +-+ +-+ assert(offsetof(MvField, mv) == 0); +-+ assert(offsetof(MvField, ref_idx) == 8); +-+ assert(offsetof(MvField, pred_flag) == 10); +-+ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon; +++ else if (bit_depth == 10) { +++ c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_neon_10; +++ c->hevc_v_loop_filter_luma_c = ff_hevc_v_loop_filter_luma_neon_10; +++ c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_neon_10; +++ c->hevc_h_loop_filter_luma_c = ff_hevc_h_loop_filter_luma_neon_10; +++ c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_neon_10; +++ c->hevc_v_loop_filter_chroma_c = ff_hevc_v_loop_filter_chroma_neon_10; +++ c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_neon_10; +++ c->hevc_h_loop_filter_chroma_c = ff_hevc_h_loop_filter_chroma_neon_10; +++#ifdef RPI +++ c->hevc_v_loop_filter_luma2 = ff_hevc_v_loop_filter_luma2_neon_10; +++ c->hevc_h_loop_filter_uv = ff_hevc_h_loop_filter_uv_neon_10; +++ c->hevc_v_loop_filter_uv2 = ff_hevc_v_loop_filter_uv2_neon_10; +++#endif +++ c->idct[0] = ff_hevc_transform_4x4_neon_10; +++ c->idct[1] = ff_hevc_transform_8x8_neon_10; +++ c->idct_dc[0] = ff_hevc_idct_4x4_dc_neon_10; +++ c->idct_dc[1] = ff_hevc_idct_8x8_dc_neon_10; +++ c->idct_dc[2] = ff_hevc_idct_16x16_dc_neon_10; +++ c->idct_dc[3] = ff_hevc_idct_32x32_dc_neon_10; +++ c->transform_add[0] = ff_hevc_add_residual_4x4_neon_10; +++ c->transform_add[1] = ff_hevc_add_residual_8x8_neon_10; +++ c->transform_add[2] = ff_hevc_add_residual_16x16_neon_10; +++ c->transform_add[3] = ff_hevc_add_residual_32x32_neon_10; +++ c->add_residual_dc[0] = ff_hevc_add_residual_4x4_dc_neon_10; +++ c->add_residual_dc[1] = ff_hevc_add_residual_8x8_dc_neon_10; +++ c->add_residual_dc[2] = ff_hevc_add_residual_16x16_dc_neon_10; +++ c->add_residual_dc[3] = ff_hevc_add_residual_32x32_dc_neon_10; +++#if RPI_HEVC_SAND +++ c->add_residual_u[0] = ff_hevc_add_residual_4x4_u_neon_10; +++ c->add_residual_u[1] = ff_hevc_add_residual_8x8_u_neon_10; +++ c->add_residual_u[2] = ff_hevc_add_residual_16x16_u_neon_10; +++ c->add_residual_v[0] = ff_hevc_add_residual_4x4_v_neon_10; +++ c->add_residual_v[1] = ff_hevc_add_residual_8x8_v_neon_10; +++ c->add_residual_v[2] = ff_hevc_add_residual_16x16_v_neon_10; +++ c->add_residual_c[0] = ff_hevc_add_residual_4x4_c_neon_10; +++ c->add_residual_c[1] = ff_hevc_add_residual_8x8_c_neon_10; +++ c->add_residual_c[2] = ff_hevc_add_residual_16x16_c_neon_10; +++ c->add_residual_dc_c[0] = ff_hevc_add_residual_4x4_dc_c_neon_10; +++ c->add_residual_dc_c[1] = ff_hevc_add_residual_8x8_dc_c_neon_10; +++ c->add_residual_dc_c[2] = ff_hevc_add_residual_16x16_dc_c_neon_10; +++#endif +++ c->idct_4x4_luma = ff_hevc_transform_luma_4x4_neon_10; +++ c->sao_band_filter[0] = ff_hevc_sao_band_8_neon_10; +++ c->sao_band_filter[1] = ff_hevc_sao_band_16_neon_10; +++ c->sao_band_filter[2] = ff_hevc_sao_band_32_neon_10; +++ c->sao_band_filter[3] = ff_hevc_sao_band_48_neon_10; +++ c->sao_band_filter[4] = ff_hevc_sao_band_64_neon_10; +++ +++ c->sao_edge_filter[0] = ff_hevc_sao_edge_8_neon_10; +++ c->sao_edge_filter[1] = ff_hevc_sao_edge_16_neon_10; +++ c->sao_edge_filter[2] = ff_hevc_sao_edge_32_neon_10; +++ c->sao_edge_filter[3] = 
ff_hevc_sao_edge_48_neon_10;
+++ c->sao_edge_filter[4] = ff_hevc_sao_edge_64_neon_10;
+++#if SAO_FILTER_N == 6
+++ c->sao_band_filter[5] = ff_hevc_sao_band_24_neon_10;
+++ c->sao_edge_filter[5] = ff_hevc_sao_edge_24_neon_10;
+++#endif
+++#if RPI_HEVC_SAND
+++ c->sao_band_filter_c[0] = ff_hevc_sao_band_c_8_neon_10;
+++ c->sao_band_filter_c[1] = ff_hevc_sao_band_c_16_neon_10;
+++ c->sao_band_filter_c[2] = ff_hevc_sao_band_c_32_neon_10;
+++
+++ c->sao_edge_filter_c[0] = ff_hevc_sao_edge_c_8_neon_10;
+++ c->sao_edge_filter_c[1] = ff_hevc_sao_edge_c_16_neon_10;
+++ c->sao_edge_filter_c[2] = ff_hevc_sao_edge_c_32_neon_10;
+++
+++#if SAO_FILTER_N == 6
+++ c->sao_band_filter_c[5] = ff_hevc_sao_band_c_24_neon_10;
+++ c->sao_edge_filter_c[5] = ff_hevc_sao_edge_c_24_neon_10;
+++#endif
+++#endif
+++ }
+++
+++ assert(offsetof(MvField, mv) == 0);
+++ assert(offsetof(MvField, ref_idx) == 8);
+++ assert(offsetof(MvField, pred_flag) == 10);
+++ c->hevc_deblocking_boundary_strengths = ff_hevc_deblocking_boundary_strengths_neon;
 }
++diff --git a/libavcodec/arm/hevcdsp_res16_neon.S b/libavcodec/arm/hevcdsp_res16_neon.S
++new file mode 100644
++index 0000000000..7cc5cd5e5c
++--- /dev/null
+++++ b/libavcodec/arm/hevcdsp_res16_neon.S
++@@ -0,0 +1,610 @@
+++#include "libavutil/arm/asm.S"
+++#include "neon.S"
+++
+++#define BIT_DEPTH 10
+++
+++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
+++ vmax.s16 \Q0, \Q_MIN
+++ vmax.s16 \Q1, \Q_MIN
+++ vmax.s16 \Q2, \Q_MIN
+++ vmax.s16 \Q3, \Q_MIN
+++ vmin.s16 \Q0, \Q_MAX
+++ vmin.s16 \Q1, \Q_MAX
+++ vmin.s16 \Q2, \Q_MAX
+++ vmin.s16 \Q3, \Q_MAX
+++.endm
+++
+++@ add_residual4x4(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_4x4_neon_, BIT_DEPTH), export=1
+++ vld1.16 {q10, q11}, [r1]
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vld1.16 {d0}, [r0, :64], r2
+++ vld1.16 {d1}, [r0, :64], r2
+++ vld1.16 {d2}, [r0, :64], r2
+++ vld1.16 {d3}, [r0, :64], r2
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q11
+++ sub r0, r0, r2, lsl #2
+++ vmax.s16 q0, q0, q8
+++ vmax.s16 q1, q1, q8
+++ vmin.s16 q0, q0, q9
+++ vmin.s16 q1, q1, q9
+++ vst1.16 {d0}, [r0, :64], r2
+++ vst1.16 {d1}, [r0, :64], r2
+++ vst1.16 {d2}, [r0, :64], r2
+++ vst1.16 {d3}, [r0, :64], r2
+++ bx lr
+++
+++endfunc
+++
+++@ add_residual4x4_dc(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc) [r2]
+++
+++function JOIN(ff_hevc_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vdup.i16 q9, r3
+++ vld1.16 {d0}, [r0, :64], r1
+++ vld1.16 {d1}, [r0, :64], r1
+++ vdup.16 q15, r2
+++ vld1.16 {d2}, [r0, :64], r1
+++ vld1.16 {d3}, [r0, :64], r1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ sub r0, r0, r1, lsl #2
+++ vmax.s16 q0, q0, q8
+++ vmax.s16 q1, q1, q8
+++ vmin.s16 q0, q0, q9
+++ vmin.s16 q1, q1, q9
+++ vst1.16 {d0}, [r0, :64], r1
+++ vst1.16 {d1}, [r0, :64], r1
+++ vst1.16 {d2}, [r0, :64], r1
+++ vst1.16 {d3}, [r0, :64], r1
+++ bx lr
+++
+++endfunc
+++
+++
+++@ add_residual8x8(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_8x8_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ mov r12, #2
+++1:
+++ vldm r1!, {q10-q13}
+++ vld1.16 {q0}, [r0, :128], r2
+++ subs r12, #1
+++ vld1.16 {q1}, [r0, :128], r2
+++ vqadd.s16 q0, q10
+++ vld1.16 {q2}, [r0, :128], r2
+++ vqadd.s16 q1, q11
+++ vld1.16 {q3}, [r0, :128], 
r2 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r0, r2, lsl #2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vst1.16 {q0}, [r0, :128], r2 +++ vmin.s16 q2, q2, q9 +++ vst1.16 {q1}, [r0, :128], r2 +++ vmin.s16 q3, q3, q9 +++ vst1.16 {q2}, [r0, :128], r2 +++ vst1.16 {q3}, [r0, :128], r2 +++ bne 1b +++ bx lr +++ +++endfunc +++ +++@ add_residual4x4_dc_c( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc_uv) [r2] +++ +++function JOIN(ff_hevc_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 +++ mov r12, #1 +++ vdup.32 q15, r2 +++ b 9f +++endfunc +++ +++@ add_residual8x8_dc( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 +++ mov r12, #2 +++ vdup.16 q15, r2 +++9: +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++1: +++ vld1.16 {q0}, [r0, :128], r1 +++ subs r12, #1 +++ vld1.16 {q1}, [r0, :128], r1 +++ vqadd.s16 q0, q15 +++ vld1.16 {q2}, [r0, :128], r1 +++ vqadd.s16 q1, q15 +++ vld1.16 {q3}, [r0, :128], r1 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q15 +++ sub r0, r0, r1, lsl #2 +++ vmax.s16 q0, q8 +++ vmax.s16 q1, q8 +++ vmax.s16 q2, q8 +++ vmax.s16 q3, q8 +++ vmin.s16 q0, q9 +++ vmin.s16 q1, q9 +++ vst1.16 {q0}, [r0, :128], r1 +++ vmin.s16 q2, q9 +++ vst1.16 {q1}, [r0, :128], r1 +++ vmin.s16 q3, q9 +++ vst1.16 {q2}, [r0, :128], r1 +++ vst1.16 {q3}, [r0, :128], r1 +++ bne 1b +++ bx lr +++ +++endfunc +++ +++@ add_residual16x16( +++@ uint8_t *_dst, [r0] +++@ int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ mov r12, #8 +++1: +++ vldm r1!, {q10-q13} +++ @ For RPI Sand we could guarantee :256 but not for general +++ @ non-RPI allocation. :128 is as good as we can claim +++ vld1.16 {q0, q1}, [r0, :128], r2 +++ subs r12, #1 +++ vld1.16 {q2, q3}, [r0, :128] +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q11 +++ vqadd.s16 q2, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst1.16 {q0, q1}, [r0, :128], r2 +++ vst1.16 {q2, q3}, [r0, :128], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual8x8_dc_c( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc_uv) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 +++ mov r12, #4 +++ vdup.32 q15, r2 +++ b 9f +++endfunc +++ +++@ add_residual16x16_dc( +++@ uint8_t *_dst, [r0] +++@ ptrdiff_t stride, [r1] +++@ int dc) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 +++ vdup.i16 q15, r2 +++ mov r12, #8 +++9: +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++1: +++ @ For RPI Sand we could guarantee :256 but not for general +++ @ non-RPI allocation. 
:128 is as good as we can claim
+++ vld1.16 {q0, q1}, [r0, :128], r1
+++ subs r12, #1
+++ vld1.16 {q2, q3}, [r0, :128]
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q15
+++ vqadd.s16 q3, q15
+++ sub r0, r1
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vst1.16 {q0, q1}, [r0, :128], r1
+++ vst1.16 {q2, q3}, [r0, :128], r1
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++
+++@ add_residual32x32(
+++@ uint8_t *_dst, [r0]
+++@ int16_t *res, [r1]
+++@ ptrdiff_t stride) [r2]
+++
+++function JOIN(ff_hevc_add_residual_32x32_neon_, BIT_DEPTH), export=1
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++ mov r12, #32
+++1:
+++ vldm r1!, {q10-q13}
+++ vldm r0, {q0-q3}
+++ subs r12, #1
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q11
+++ vqadd.s16 q2, q12
+++ vqadd.s16 q3, q13
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vstm r0, {q0-q3}
+++ add r0, r2
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++@ add_residual16x16_dc_c(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc_uv) [r2]
+++
+++function JOIN(ff_hevc_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
+++ mov r12, #16
+++ vdup.32 q15, r2
+++ b 9f
+++endfunc
+++
+++@ add_residual32x32_dc(
+++@ uint8_t *_dst, [r0]
+++@ ptrdiff_t stride, [r1]
+++@ int dc) [r2]
+++
+++function JOIN(ff_hevc_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
+++ vdup.i16 q15, r2
+++ mov r12, #32
+++9:
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++1:
+++ vldm r0, {q0-q3}
+++ subs r12, #1
+++ vqadd.s16 q0, q15
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q15
+++ vqadd.s16 q3, q15
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++ vstm r0, {q0-q3}
+++ add r0, r1
+++ bne 1b
+++ bx lr
+++
+++endfunc
+++
+++@ ============================================================================
+++@ U add
+++
+++@ add_residual4x4_u(
+++@ uint8_t *_dst, [r0]
+++@ const int16_t *res, [r1]
+++@ ptrdiff_t stride, [r2]
+++@ int dc) [r3]
+++
+++function JOIN(ff_hevc_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
+++ vld1.16 {q10, q11}, [r1, :256]
+++ vdup.16 q15, r3
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ vdup.i16 q9, r3
+++
+++ vld2.16 {d0, d2}, [r0, :128], r2
+++ vld2.16 {d1, d3}, [r0, :128], r2
+++ vld2.16 {d4, d6}, [r0, :128], r2
+++ vld2.16 {d5, d7}, [r0, :128], r2
+++
+++ vqadd.s16 q0, q10
+++ vqadd.s16 q1, q15
+++ vqadd.s16 q2, q11
+++ vqadd.s16 q3, q15
+++ sub r0, r0, r2, lsl #2
+++ clip16_4 q0, q1, q2, q3, q8, q9
+++
+++ vst2.16 {d0, d2}, [r0, :128], r2
+++ vst2.16 {d1, d3}, [r0, :128], r2
+++ vst2.16 {d4, d6}, [r0, :128], r2
+++ vst2.16 {d5, d7}, [r0, :128]
+++ bx lr
+++endfunc
+++
+++@ add_residual8x8_u(
+++@ uint8_t *_dst, [r0]
+++@ const int16_t *res, [r1]
+++@ ptrdiff_t stride, [r2]
+++@ int dc) [r3]
+++
+++function JOIN(ff_hevc_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
+++ vdup.16 q15, r3
+++ movw r3, #(1 << BIT_DEPTH) - 1
+++ vmov.i64 q8, #0
+++ mov r12, #4
+++ vdup.i16 q9, r3
+++1:
+++ vld2.16 {q0, q1}, [r0, :256], r2
+++ vld2.16 {q2, q3}, [r0, :256]
+++ vld1.16 {q10, q11}, [r1, :256]! 
+++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q15 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q3, q15 +++ sub r0, r2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_u( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q1, q15 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q3, q15 +++ sub r0, #32 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256]! +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ V add +++ +++@ add_residual4x4_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 +++ vld1.16 {q10, q11}, [r1, :256] +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ +++ vld2.16 {d0, d2}, [r0, :128], r2 +++ vld2.16 {d1, d3}, [r0, :128], r2 +++ vld2.16 {d4, d6}, [r0, :128], r2 +++ vld2.16 {d5, d7}, [r0, :128], r2 +++ +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, r0, r2, lsl #2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ +++ vst2.16 {d0, d2}, [r0, :128], r2 +++ vst2.16 {d1, d3}, [r0, :128], r2 +++ vst2.16 {d4, d6}, [r0, :128], r2 +++ vst2.16 {d5, d7}, [r0, :128] +++ bx lr +++endfunc +++ +++@ add_residual8x8_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #4 +++ vdup.i16 q9, r3 +++1: +++ vld2.16 {q0, q1}, [r0, :256], r2 +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, r2 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_v( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride, [r2] +++@ int dc) [r3] +++ +++function JOIN(ff_hevc_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 +++ vdup.16 q15, r3 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q15 +++ vqadd.s16 q1, q10 +++ vqadd.s16 q2, q15 +++ vqadd.s16 q3, q11 +++ sub r0, #32 +++ clip16_4 q0, q1, q2, q3, q8, q9 +++ vst2.16 {q0, q1}, [r0, :256]! 
+++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ ============================================================================ +++@ U & V add +++ +++@ add_residual4x4_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 +++ vldm r1, {q10-q13} +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ vdup.i16 q9, r3 +++ +++ vld2.16 {d0, d2}, [r0, :128], r2 +++ vld2.16 {d1, d3}, [r0, :128], r2 +++ vld2.16 {d4, d6}, [r0, :128], r2 +++ vld2.16 {d5, d7}, [r0, :128], r2 +++ +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r0, r2, lsl #2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ +++ vst2.16 {d0, d2}, [r0, :128], r2 +++ vst2.16 {d1, d3}, [r0, :128], r2 +++ vst2.16 {d4, d6}, [r0, :128], r2 +++ vst2.16 {d5, d7}, [r0, :128] +++ bx lr +++endfunc +++ +++@ add_residual8x8_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #4 +++ vdup.i16 q9, r3 +++ add r3, r1, #(8*8*2) @ Offset to V +++1: +++ vld2.16 {q0, q1}, [r0, :256], r2 +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ vld1.16 {q12, q13}, [r3, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, r2 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst2.16 {q0, q1}, [r0, :256], r2 +++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ +++@ add_residual16x16_c( +++@ uint8_t *_dst, [r0] +++@ const int16_t *res, [r1] +++@ ptrdiff_t stride) [r2] +++ +++function JOIN(ff_hevc_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 +++ movw r3, #(1 << BIT_DEPTH) - 1 +++ vmov.i64 q8, #0 +++ mov r12, #16 +++ vdup.i16 q9, r3 +++ add r3, r1, #(16*16*2) @ Offset to V +++ sub r2, #32 +++1: +++ vld2.16 {q0, q1}, [r0, :256]! +++ vld2.16 {q2, q3}, [r0, :256] +++ vld1.16 {q10, q11}, [r1, :256]! +++ vld1.16 {q12, q13}, [r3, :256]! +++ subs r12, #1 +++ vqadd.s16 q0, q10 +++ vqadd.s16 q2, q11 +++ vqadd.s16 q1, q12 +++ vqadd.s16 q3, q13 +++ sub r0, #32 +++ vmax.s16 q0, q0, q8 +++ vmax.s16 q1, q1, q8 +++ vmax.s16 q2, q2, q8 +++ vmax.s16 q3, q3, q8 +++ vmin.s16 q0, q0, q9 +++ vmin.s16 q1, q1, q9 +++ vmin.s16 q2, q2, q9 +++ vmin.s16 q3, q3, q9 +++ vst2.16 {q0, q1}, [r0, :256]! 
+++ vst2.16 {q2, q3}, [r0, :256], r2 +++ bne 1b +++ bx lr +++endfunc +++ + diff --git a/libavcodec/arm/hevcdsp_sao_neon.S b/libavcodec/arm/hevcdsp_sao_neon.S + new file mode 100644 +-index 0000000..08a021d ++index 0000000000..30113d9c93 + --- /dev/null + +++ b/libavcodec/arm/hevcdsp_sao_neon.S +-@@ -0,0 +1,862 @@ ++@@ -0,0 +1,1882 @@ + +/* + + * Copyright (c) 2014 - 2015 Seppo Tomperi + + * +@@ -2491,124 +5063,211 @@ index 0000000..08a021d + +#include "libavutil/arm/asm.S" + +#include "neon.S" + + +-+.macro init_sao_band +-+ pld [r1] +-+ vld1.8 {q0, q1}, [r2] // offset table +-+ ldr r2, [sp, #0] // stride_dst +-+ ldr r12, [sp, #4] // height +-+ vmov.u8 q3, #128 +-+.endm +++.set EDGE_SRC_STRIDE, 160 +++ +++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128 +++ vshr.u8 q12, q8, #3 +++ vadd.s8 q8, \Q_K128 +++ vshr.u8 q13, q9, #3 +++ vadd.s8 q9, \Q_K128 +++ +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT0, d25 +++ vtbl.8 d26, \XLAT1, d26 +++ vtbl.8 d27, \XLAT1, d27 + + +-+// 128 in q3 +-+// input q8 - q11 +-+.macro sao_band_64 +-+ vtbl.8 d24, {d0, d1, d2, d3}, d24 +-+ vadd.s8 q8, q3 +-+ vtbl.8 d25, {d0, d1, d2, d3}, d25 +-+ vadd.s8 q9, q3 +-+ vtbl.8 d26, {d0, d1, d2, d3}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d0, d1, d2, d3}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0, d1, d2, d3}, d28 + + vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0, d1, d2, d3}, d29 +++ vshr.u8 q12, q10, #3 +++ vadd.s8 q10, \Q_K128 + + vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d0, d1, d2, d3}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d0, d1, d2, d3}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 +++ vshr.u8 q13, q11, #3 +++ vadd.s8 q11, \Q_K128 +++ +++ vsub.s8 q8, \Q_K128 +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT0, d25 +++ vsub.s8 q9, \Q_K128 +++ vtbl.8 d26, \XLAT1, d26 +++ vtbl.8 d27, \XLAT1, d27 +++ vqadd.s8 q10, q12 +++ vqadd.s8 q11, q13 +++ vsub.s8 q10, \Q_K128 +++ vsub.s8 q11, \Q_K128 + +.endm + + +-+function ff_hevc_sao_band_w8_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #8 +-+ vld1.8 {d16}, [r1, :64], r3 +-+ vld1.8 {d17}, [r1, :64], r3 +-+ vshr.u8 q12, q8, #3 +-+ vld1.8 {d18}, [r1, :64], r3 +-+ vld1.8 {d19}, [r1, :64], r3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {d20}, [r1, :64], r3 +-+ vld1.8 {d21}, [r1, :64], r3 +-+ vshr.u8 q14, q10, #3 +-+ vld1.8 {d22}, [r1, :64], r3 +-+ vld1.8 {d23}, [r1, :64], r3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {d16}, [r0, :64], r2 +-+ vst1.8 {d17}, [r0, :64], r2 +-+ vst1.8 {d18}, [r0, :64], r2 +-+ vst1.8 {d19}, [r0, :64], r2 +-+ vst1.8 {d20}, [r0, :64], r2 +-+ vst1.8 {d21}, [r0, :64], r2 +-+ vst1.8 {d22}, [r0, :64], r2 +-+ vst1.8 {d23}, [r0, :64], r2 +-+ bne 1b +++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128 +++ vshr.u8 q12, q8, #3 +++ vadd.s8 q8, \Q_K128 + + +-+ bx lr +-+endfunc +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 + + +-+function ff_hevc_sao_band_w16_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #4 +-+ vld1.8 {q8}, [r1, :128], r3 +-+ vshr.u8 q12, q8, #3 +-+ vld1.8 {q9}, [r1, :128], r3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {q10}, [r1, :128], r3 +-+ vshr.u8 q14, q10, #3 +-+ vld1.8 {q11}, [r1, :128], r3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {q8}, [r0, :128], r2 +-+ vst1.8 {q9}, [r0, :128], r2 +-+ vst1.8 {q10}, [r0, :128], r2 +-+ vst1.8 {q11}, [r0, :128], r2 +-+ bne 1b +++ vqadd.s8 q8, q12 +++ vsub.s8 q8, \Q_K128 +++.endm + + +-+ bx lr +++ +++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX +++ vmax.s16 \Q0, \Q_MIN +++ vmax.s16 \Q1, \Q_MIN +++ vmax.s16 \Q2, \Q_MIN +++ vmax.s16 \Q3, \Q_MIN +++ 
vmin.s16 \Q0, \Q_MAX +++ vmin.s16 \Q1, \Q_MAX +++ vmin.s16 \Q2, \Q_MAX +++ vmin.s16 \Q3, \Q_MAX +++.endm +++ +++@ Clobbers q12, q13 +++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth +++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) +++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) +++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) +++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 +++ vtbl.8 d26, \XLAT0, d26 +++ vtbl.8 d27, \XLAT1, d27 +++ vaddw.s8 \Q0, d24 +++ vaddw.s8 \Q1, d25 +++ vaddw.s8 \Q2, d26 +++ vaddw.s8 \Q3, d27 +++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX +++.endm +++ +++@ Clobbers q12 +++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth +++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) +++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) +++ vtbl.8 d24, \XLAT0, d24 +++ vtbl.8 d25, \XLAT1, d25 +++ vaddw.s8 \Q0, d24 +++ vaddw.s8 \Q1, d25 +++ vmax.s16 \Q0, \Q_MIN +++ vmax.s16 \Q1, \Q_MIN +++ vmin.s16 \Q0, \Q_MAX +++ vmin.s16 \Q1, \Q_MAX +++.endm +++ +++ +++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) +++@ so we are quite safe stuffing it into a byte array +++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma +++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of +++@ precision +++ +++@ This, somewhat nasty, bit of code builds the {d0-d3} translation +++@ array via the stack +++@ Given that sao_left_class > 28 can cause wrap we can't just poke +++@ all 4 bytes in at once +++@ +++@ It also loads other common regs +++ +++function band_load_y +++ vmov.i64 q0, #0 +++ ldr r12, [sp, #8] @ &sao_offset_val[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ vmov.i64 q1, #0 +++ ldr r12, [sp, #12] @ sao_left_class +++ +++ mov r4, sp +++ sub sp, #32 +++ and sp, #~63 @ Align stack so we can wrap with a simple AND +++ vst1.8 {q0, q1}, [sp, :256] @ Put zero array on stack +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array +++ mov sp, r4 +++ +++ ldr r12, [sp, #20] @ height +++ pld [r1] +++ +++ sub r12, #1 +++ add r4, r1, r3 +++ bx lr + +endfunc + + +-+function ff_hevc_sao_band_w32_neon_8, export=1 +-+ init_sao_band +-+1: subs r12, #2 +-+ vld1.8 {q8-q9}, [r1, :128], r3 +-+ vshr.u8 q12, q8, #3 +-+ vshr.u8 q13, q9, #3 +-+ vld1.8 {q10-q11}, [r1, :128], r3 +-+ vshr.u8 q14, q10, #3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +-+ vst1.8 {q8-q9}, [r0, :128], r2 +-+ vst1.8 {q10-q11}, [r0, :128], r2 +-+ bne 1b + + +-+ bx lr +++function band_load_c +++ vmov.i64 q2, #0 +++ ldr r12, [sp, #8] @ &sao_offset_val1[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ vmov.i64 q3, #0 +++ ldr r12, [sp, #12] @ sao_left_class +++ +++ mov r4, sp @ Remember SP +++ sub sp, #32 +++ and sp, #~63 @ Align stack so we can wrap with a simple AND +++ +++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! 
+++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q0, q1}, [sp, :256] @ Pop modified array +++ +++ @ And again for the 2nd set +++ ldr r12, [r4, #16] @ &sao_offset_val2[0] +++ add r12, #2 @ 1st interesting val is [1] +++ vld1.16 {d16}, [r12] @ Unaligned +++ ldr r12, [r4, #20] @ sao_left_class2 +++ +++ vst1.8 {q2, q3}, [sp, :256] @ Put zero array on stack (again) +++ add r12, sp +++ vst1.8 {d16[0]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[2]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[4]}, [r12]! +++ and r12, #~32 +++ vst1.8 {d16[6]}, [r12] +++ vld1.8 {q2, q3}, [sp, :256] @ Pop modified array +++ +++ mov sp, r4 +++ +++ ldr r12, [sp, #28] @ height +++ pld [r1] +++ +++ subs r12, #1 +++ add r4, r1, r3 +++ bx lr + +endfunc + + +-+function ff_hevc_sao_band_w64_neon_8, export=1 +-+ init_sao_band + + +++@ ff_hevc_sao_band_64_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_64_neon_8, export=1 + + push {r4, lr} +-+ subs r12, #1 +-+ mov r4, r1 +-+ it ne +-+ addne r4, r3 +++ bl band_load_y +++ vmov.u8 q15, #128 + + + +1: subs r12, #1 + + vldm r1, {q8-q11} + + pld [r4] +-+ vshr.u8 q12, q8, #3 +-+ vshr.u8 q13, q9, #3 + + add r1, r3 +-+ vshr.u8 q14, q10, #3 +-+ vshr.u8 q15, q11, #3 +-+ sao_band_64 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ + + it ne + + addne r4, r3 + + vstm r0, {q8-q11} +@@ -2618,8 +5277,113 @@ index 0000000..08a021d + + pop {r4, pc} + +endfunc + + +++@ ff_hevc_sao_band_32_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_32_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ vmov.u8 q15, #128 +++ +++1: subs r12, #2 +++ vld1.8 { q8, q9 }, [r1, :128], r3 +++ vld1.8 {q10, q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 { q8, q9 }, [r0, :128], r2 +++ vst1.8 {q10, q11}, [r0, :128], r2 +++ bpl 1b +++ +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_16_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_16_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ vmov.u8 q15, #128 +++ +++1: subs r12, #4 +++ vld1.8 { q8}, [r1, :128], r3 +++ vld1.8 { q9}, [r1, :128], r3 +++ vld1.8 {q10}, [r1, :128], r3 +++ vld1.8 {q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 { q8}, [r0, :128], r2 +++ vst1.8 { q9}, [r0, :128], r2 +++ vst1.8 {q10}, [r0, :128], r2 +++ vst1.8 {q11}, [r0, :128], r2 +++ bpl 1b +++ +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_8_neon_8 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_band_8_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_y +++ ldr lr, [sp, #16] @ width +++ vmov.u8 q15, #128 +++ cmp lr, #8 +++ blt 4f +++ +++1: subs r12, #2 
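For reference, a scalar model of what the 8-bit band entry points compute per sample; the add/saturate/subtract dance around #128 in the NEON macros is equivalent to the plain unsigned clamp here (sketch only, with assumed signatures):

#include <stddef.h>
#include <stdint.h>

static void sao_band_scalar_8(uint8_t *dst, ptrdiff_t stride_dst,
                              const uint8_t *src, ptrdiff_t stride_src,
                              const int8_t table[32], int width, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            /* top five bits pick one of 32 bands (vshr.u8 #3 + vtbl) */
            int v = src[x] + table[src[x] >> 3];
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; /* vqadd.s8 */
        }
        src += stride_src;
        dst += stride_dst;
    }
}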
+++ vld1.8 {d16}, [r1, :64], r3 +++ vld1.8 {d17}, [r1, :64], r3 + + +-+@ ff_hevc_sao_band_c_w64_neon_8( +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.8 {d16}, [r0, :64], r2 +++ vst1.8 {d17}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} +++ +++4: +++1: subs r12, #4 +++ vld1.32 {d16[0]}, [r1, :32], r3 +++ vld1.32 {d16[1]}, [r1, :32], r3 +++ vld1.32 {d17[0]}, [r1, :32], r3 +++ vld1.32 {d17[1]}, [r1, :32], r3 +++ +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q15 +++ +++ vst1.32 {d16[0]}, [r0, :32], r2 +++ vst1.32 {d16[1]}, [r0, :32], r2 +++ vst1.32 {d17[0]}, [r0, :32], r2 +++ vst1.32 {d17[1]}, [r0, :32], r2 +++ bpl 1b +++ pop {r4, pc} +++endfunc +++ +++@ ff_hevc_sao_band_c_32_neon_8( + +@ uint8_t * dst [r0] + +@ uint8_t * src [r1] + +@ uint32_t dst_stride [r2] +@@ -2631,10868 +5395,9596 @@ index 0000000..08a021d + +@ int width sp[16] + +@ int height sp[20] + + +-+@ As this is often done in-place on the frame buffer it is worth preloading +-+@ the pixel values but we want to beware of loading ouside our buffer to avoid +-+@ loading stuff into the cache that should still be invalid (in use by QPU, VPU) +-+ +-+function ff_hevc_sao_band_c_neon_8, export=1 +-+ mov r12, sp +-+ push {r4-r8, lr} // 24 bytes +-+ +-+ ldm r12, {r4-r7} +-+ +-+ add r4, #2 +-+ add r6, #2 +-+ vld1.16 {d16}, [r4] @ Unaligned +-+ lsl r5, r5, #3 +-+ vld1.16 {d18}, [r6] +-+ pld [r1] +-+ vmov.i8 d17, #0 +-+ mov r4, r1 +-+ vmov.i8 d19, #0 +-+ lsl r7, r7, #3 +-+ vdup.8 q1, r5 +-+ ldr r5, [r12, #16] @ width +-+ vdup.8 q2, r7 +-+ ldr r12, [r12, #20] +-+ vqmovn.s16 d0, q8 +-+ cmp r5, #16 @ At some point we may want a table lookup +-+ vqmovn.s16 d1, q9 +-+ vmov.i8 q3, #128 +-+ beq 16f +-+ +-+ @ d0 U lookup +-+ @ d1 V lookup +-+ @ q1 U raw offset +-+ @ q2 V raw offset +-+ @ q3 #128 +-+ +-+ @ r4 = r1 = src - Inteded for preload pointer +-+ @ r12 = height +-+ +-+ @ Might (unlikely) be called with height == 1 +-+ subs r12, #1 +-+ it ne +-+ addne r4, r3 +++function ff_hevc_sao_band_c_32_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ +++ vmov.i8 q15, #128 +++ sub r3, #32 +++ sub r2, #32 +++ +++1: subs r12, #1 +++ vld2.8 { q8, q9 }, [r1, :128]! +++ vld2.8 {q10, q11}, [r1, :128], r3 + + +-+1: +-+ subs r12, #1 +-+ vld2.8 {q8-q9}, [r1, :128]! +-+ vsub.u8 q12, q8, q1 +-+ vld2.8 {q10-q11}, [r1, :128], r3 +-+ vsub.u8 q14, q10, q1 +-+ vsub.u8 q13, q9, q2 +-+ sub r1, #32 +-+ vsub.u8 q15, q11, q2 + + pld [r4] +-+ vshr.u8 q12, #3 +-+ vadd.s8 q8, q3 +-+ vshr.u8 q13, #3 +-+ vadd.s8 q9, q3 +-+ +-+ vtbl.8 d24, {d0}, d24 +-+ vshr.u8 q14, #3 +-+ vtbl.8 d25, {d0}, d25 +-+ vshr.u8 q15, #3 +-+ vtbl.8 d26, {d1}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d1}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0}, d28 +-+ vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0}, d29 +-+ vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d1}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d1}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 + + +-+ it ne +-+ addne r4, r3 @ Do not inc on final pass +-+ vst2.8 {q8-q9}, [r0, :128]! +-+ vst2.8 {q10-q11}, [r0, :128], r2 +-+ sub r0, #32 +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 +++ +++ vst2.8 { q8, q9 }, [r0, :128]! 
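The _c variants run on interleaved U/V data: vld2/vst2 split the pairs so each plane can use its own table (q0-q1 versus q2-q3 loaded by band_load_c). A hypothetical scalar equivalent, assuming width counts UV pairs:

#include <stddef.h>
#include <stdint.h>

static void sao_band_scalar_c(uint8_t *dst, ptrdiff_t stride_dst,
                              const uint8_t *src, ptrdiff_t stride_src,
                              const int8_t table_u[32], const int8_t table_v[32],
                              int width /* UV pairs */, int height)
{
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int u = src[2 * x + 0] + table_u[src[2 * x + 0] >> 3];
            int v = src[2 * x + 1] + table_v[src[2 * x + 1] >> 3];
            dst[2 * x + 0] = u < 0 ? 0 : u > 255 ? 255 : (uint8_t)u;
            dst[2 * x + 1] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
        }
        src += stride_src;
        dst += stride_dst;
    }
}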
+++ vst2.8 {q10, q11}, [r0, :128], r2 +++ +++ itt ne +++ addne r4, r3 +++ addne r4, #32 +++ + + bpl 1b + + +-+ pop {r4-r8, pc} +++ pop {r4, pc} +++endfunc + + +-+@ -- width 16 (UV pairs) -- +-+16: +-+ subs r12, #2 +-+ it ne +-+ addne r4, r4, r3, lsl #1 +++@ ff_hevc_sao_band_c_16_neon_8( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+1: +-+ subs r12, #2 +-+ vld2.8 {q8-q9}, [r1, :128], r3 +-+ vsub.u8 q12, q8, q1 +-+ vld2.8 {q10-q11}, [r1, :128], r3 +-+ vsub.u8 q14, q10, q1 +-+ vsub.u8 q13, q9, q2 +-+ pld [r4] +-+ vsub.u8 q15, q11, q2 +-+ pld [r4, r3] +-+ vshr.u8 q12, #3 +-+ vadd.s8 q8, q3 +-+ vshr.u8 q13, #3 +-+ vadd.s8 q9, q3 +-+ +-+ vtbl.8 d24, {d0}, d24 +-+ vshr.u8 q14, #3 +-+ vtbl.8 d25, {d0}, d25 +-+ vshr.u8 q15, #3 +-+ vtbl.8 d26, {d1}, d26 +-+ vadd.s8 q10, q3 +-+ vtbl.8 d27, {d1}, d27 +-+ vadd.s8 q11, q3 +-+ vtbl.8 d28, {d0}, d28 +-+ vqadd.s8 q8, q12 +-+ vtbl.8 d29, {d0}, d29 +-+ vqadd.s8 q9, q13 +-+ vtbl.8 d30, {d1}, d30 +-+ vqadd.s8 q10, q14 +-+ vtbl.8 d31, {d1}, d31 +-+ vsub.s8 q8, q3 +-+ vqadd.s8 q11, q15 +-+ vsub.s8 q9, q3 +-+ vsub.s8 q10, q3 +-+ vsub.s8 q11, q3 +++function ff_hevc_sao_band_c_16_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ vmov.i8 q15, #128 + + +-+ it ne +-+ addne r4, r4, r3, lsl #1 +-+ vst2.8 {q8-q9}, [r0, :128], r2 +-+ vst2.8 {q10-q11}, [r0, :128], r2 +-+ bpl 1b +++1: subs r12, #2 +++ vld2.8 { q8, q9 }, [r1, :128], r3 +++ vld2.8 {q10, q11}, [r1, :128], r3 +++ +++ sao_band_64b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+ pop {r4-r8, pc} +++ vst2.8 { q8, q9 }, [r0, :128], r2 +++ vst2.8 {q10, q11}, [r0, :128], r2 + + +++ bpl 1b +++ pop {r4, pc} + +endfunc + + +++@ ff_hevc_sao_band_c_8_neon_8( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] +++ +++function ff_hevc_sao_band_c_8_neon_8, export=1 +++ push {r4, lr} +++ bl band_load_c +++ ldr lr, [sp, #16] @ width +++ vmov.u8 q15, #128 +++ cmp lr, #8 +++ blt 4f + + +-+.macro diff32 out0, out1, tmp0, tmp1, in0, in1, in2, in3 +-+ vcgt.u8 \out0, \in2, \in0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 \tmp0, \in0, \in2 // a > c -> -1 , otherwise 0 +-+ vcgt.u8 \out1, \in3, \in1 // c > a -> -1 , otherwise 0 part 2 +-+ vcgt.u8 \tmp1, \in1, \in3 // a > c -> -1 , otherwise 0 part 2 +-+ vsub.s8 \out0, \tmp0, \out0 // diff0 +-+ vsub.s8 \out1, \tmp1, \out1 // diff0 part 2 +-+.endm +++1: subs r12, #1 +++ vld2.8 {d16, d17}, [r1, :128], r3 + + +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+// input +-+// a in q0 - q3 +-+// c in q4 - q7 +-+// b in q8 - q11 +-+// offset table r4,r5 and r6,r7 +-+// r4,r5 applied to even samples; r6 r7 applied to odd - allows filtering of C +-+// output in q0 - q3 +-+// clobbers q12 - q15 +++ vst2.8 {d16, d17}, [r0, :128], r2 +++ bpl 1b +++ pop {r4, pc} + + +-+@ a <- c <- b +-+@ +-+@ It appears that Neon can stall if you try and use results too soon so we try to +-+@ spread our instruction out +-+ +-+.macro edgeidx64 +-+ +-+ vcgt.u8 q12, q4, q0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 q13, q5, q1 +-+ vcgt.u8 q14, q6, q2 +-+ vcgt.u8 q15, q7, q3 +-+ +-+ vcgt.u8 q0, q0, q4 // a > c -> -1 , otherwise 0 +-+ vcgt.u8 q1, q1, 
q5 +-+ vcgt.u8 q2, q2, q6 +-+ vcgt.u8 q3, q3, q7 +-+ +-+ vsub.s8 q0, q0, q12 // a = sign(c-a) +-+ vsub.s8 q1, q1, q13 +-+ vsub.s8 q2, q2, q14 +-+ vsub.s8 q3, q3, q15 +-+ +-+ vcgt.u8 q12, q4, q8 // c > b -> -1 , otherwise 0 +-+ vcgt.u8 q13, q5, q9 +-+ vcgt.u8 q14, q6, q10 +-+ vcgt.u8 q15, q7, q11 +-+ +-+ vsub.s8 q0, q0, q12 +-+ vsub.s8 q1, q1, q13 +-+ vsub.s8 q2, q2, q14 +-+ vsub.s8 q3, q3, q15 +-+ +-+ vcgt.u8 q12, q8, q4 // c < b -> -1 , otherwise 0 +-+ vcgt.u8 q13, q9, q5 +-+ vcgt.u8 q14, q10, q6 +-+ vcgt.u8 q15, q11, q7 +-+ +-+ vadd.s8 q0, q0, q12 // a = sign(c-a) + sign(c-b) +-+ vadd.s8 q1, q1, q13 +-+ vmov.u8 q12, #2 +-+ vadd.s8 q2, q2, q14 +-+ vadd.s8 q3, q3, q15 +-+ +-+ vadd.s8 q0, q0, q12 +-+ vadd.s8 q1, q1, q12 +-+ @ whilst vmov dn, rm, rn exists it is a vfp instruction +-+ @ and causes a stall till neon pipe empty - so don't do that! +-+ vmov d26[0], r4 +-+ vmov d26[1], r5 +-+ vmov d27[0], r6 +-+ vmov d27[1], r7 +-+ vadd.s8 q2, q2, q12 +-+ vuzp.8 q0, q1 +-+ vmov.u8 q15, #128 +-+ vadd.s8 q3, q3, q12 // a = 2 + sign(c-a) + sign(c-b) +-+ +-+ vtbl.8 d0, {d26}, d0 +-+ vadd.s8 q12, q4, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d1, {d26}, d1 +-+ vadd.s8 q14, q5, q15 +-+ +-+ vtbl.8 d2, {d27}, d2 +-+ vuzp.8 q2, q3 +-+ +-+ vtbl.8 d3, {d27}, d3 +-+ +-+ vtbl.8 d4, {d26}, d4 +-+ vzip.8 q0, q1 +-+ +-+ vtbl.8 d5, {d26}, d5 +-+ vqadd.s8 q0, q0, q12 +-+ vqadd.s8 q1, q1, q14 +-+ vadd.s8 q12, q6, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d6, {d27}, d6 +-+ vadd.s8 q14, q7, q15 // Add -128 so we can use saturating signed add +-+ +-+ vtbl.8 d7, {d27}, d7 +-+ vzip.8 q2, q3 +-+ +-+ vsub.s8 q0, q0, q15 +-+ vqadd.s8 q2, q2, q12 +-+ vqadd.s8 q3, q3, q14 +-+ vsub.s8 q1, q1, q15 +-+ vsub.s8 q2, q2, q15 +-+ vsub.s8 q3, q3, q15 +++4: +++1: subs r12, #1 +++ vld1.8 {d16}, [r1, :64], r3 +++ vld1.8 {d17}, [r1, :64], r3 +++ vuzp.8 d16, d17 + + +-+.endm +++ sao_band_16b_8 "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q15 + + +-+function edge_w64_body +-+ edgeidx64 +-+ vstm r0, {q0-q3} +-+ add r0, r0, r2 +-+ bx lr +++ vzip.8 d16, d17 +++ vst1.8 {d16}, [r0, :64], r2 +++ vst1.8 {d17}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} + +endfunc + + +-+.macro init_edge_64 +-+ push {r4-r8,lr} +-+ ldr r12, [sp, #24] // height +-+ ldr r5, [sp, #28] // sao_offset_val_table +-+ ldrd r4, r5, [r5] +-+ mov r6, r4 +-+ mov r7, r5 +++ +++@ ff_hevc_sao_band_64_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_64_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q2, #0 +++ vdup.i16 q3, lr +++ bl band_load_y +++ vpush {q4-q7} +++ +++1: subs r12, #1 +++ vldm r1, {q4-q11} +++ add r1, r3 +++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ vstm r0, {q4-q11} +++ add r0, r2 +++ bpl 1b +++ +++ vpop {q4-q7} +++ pop {r4, pc} + +.endm + + +-+function ff_hevc_sao_edge_eo0_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+ sub r1, #8 +-+1: subs r12, #1 +-+ vld1.64 {d7}, [r1, :64]! +-+ vld1.64 {q4-q5}, [r1, :128]! // load c +-+ vld1.64 {q6-q7}, [r1, :128]! 
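Whichever direction is selected (eo0..eo3 only change which neighbours a and b are loaded), the edge-offset routines reduce to one per-pixel rule. A scalar sketch of the classification the edgeidx code implements (names illustrative):

#include <stdint.h>

static inline int sign3(int x) { return (x > 0) - (x < 0); }

/* 2 + sign(c-a) + sign(c-b) is 0..4; entry 2 is the "flat" category and
 * is normally zero.  The #128-bias plus vqadd.s8 in the asm is again
 * just unsigned clamping. */
static uint8_t sao_edge_pixel_8(uint8_t a, uint8_t c, uint8_t b,
                                const int8_t offsets[5])
{
    int v = c + offsets[2 + sign3(c - a) + sign3(c - b)];
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}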
+-+ vld1.64 {d24}, [r1, :64], r3 +-+ sub r1, #72 +-+ // load a +-+ vext.8 q0, q3, q4, #15 +-+ vext.8 q1, q4, q5, #15 +-+ vext.8 q2, q5, q6, #15 +-+ vext.8 q3, q6, q7, #15 +-+ // load b +-+ vext.8 q8, q4, q5, #1 +-+ vext.8 q9, q5, q6, #1 +-+ vext.8 q10, q6, q7, #1 +-+ vext.8 q11, q7, q12, #1 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++function ff_hevc_sao_band_64_neon_10, export=1 +++ band_64_16 10 + +endfunc + + +-+function ff_hevc_sao_edge_eo1_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+ sub r1, r3 +-+ // load a +-+ vld1.8 {q0-q1}, [r1, :128]! +-+ vld1.8 {q2-q3}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q8-q9}, [r1, :128]! +-+ vld1.8 {q10-q11}, [r1, :128], r3 +-+ sub r1, #32 +-+ bl edge_w64_body +-+ // copy c to a +-+ vmov.64 q0, q4 +-+ vmov.64 q1, q5 +-+ vmov.64 q2, q6 +-+ vmov.64 q3, q7 +-+ // copy b to c +-+ vmov.64 q4, q8 +-+ vmov.64 q5, q9 +-+ vmov.64 q6, q10 +-+ vmov.64 q7, q11 +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+endfunc +++@ ff_hevc_sao_band_32_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_32_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q2, #0 +++ vdup.i16 q3, lr +++ bl band_load_y + + +-+function ff_hevc_sao_edge_eo2_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ sub r1, #1 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #31 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ add r1, #1 +-+ vld1.8 {q8-q9}, [r1]! +-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #33 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+endfunc +++1: subs r12, #1 +++ vldm r1, {q8-q11} +++ add r1, r3 +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q2, q3, \bit_depth +++ vstm r0, {q8-q11} +++ add r0, r2 +++ bpl 1b + + +-+function ff_hevc_sao_edge_eo3_w64_neon_8, export=1 +-+ init_edge_64 +-+ vpush {d8-d15} +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ add r1, #1 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #33 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ sub r1, #1 +-+ vld1.8 {q8-q9}, [r1]! 
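band_64_16 generalises the band filter to 16-bit samples: the band index becomes v >> (bit_depth - 5) (the vshrn immediates) and results are clipped to [0, (1 << bit_depth) - 1] via the min/max register pair. A scalar sketch under the same assumptions, strides in samples:

#include <stddef.h>
#include <stdint.h>

static void sao_band_scalar_16(uint16_t *dst, ptrdiff_t stride_dst,
                               const uint16_t *src, ptrdiff_t stride_src,
                               const int8_t table[32], int width, int height,
                               int bit_depth)
{
    const int max = (1 << bit_depth) - 1;
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x++) {
            int v = src[x] + table[src[x] >> (bit_depth - 5)];
            dst[x] = (uint16_t)(v < 0 ? 0 : v > max ? max : v);
        }
        src += stride_src;
        dst += stride_dst;
    }
}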
+-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #31 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++ pop {r4, pc} +++.endm +++ +++function ff_hevc_sao_band_32_neon_10, export=1 +++ band_32_16 10 + +endfunc + + +++@ ff_hevc_sao_band_16_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_16_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ bl band_load_y +++ +++1: subs r12, #2 +++ vld1.16 { q8, q9 }, [r1, :128], r3 +++ vld1.16 {q10, q11}, [r1, :128], r3 +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 { q8, q9 }, [r0, :128], r2 +++ vst1.16 {q10, q11}, [r0, :128], r2 +++ bpl 1b + + +-+@ void ff_hevc_sao_edge_c_eo1_w64_neon_8( +-+@ uint8_t *_dst, r0 +-+@ uint8_t *_src, r1 +-+@ ptrdiff_t stride_dst, r2 +-+@ ptrdiff_t stride_src, r3 +-+@ int height, sp[0] +-+@ int16_t *sao_offset_table_u, sp[4] +-+@ int16_t *sao_offset_table_v); sp[8] +-+@ int eo sp[12] +-+ +-+function ff_hevc_sao_edge_c_w64_neon_8, export=1 +-+ push {r4-r8,lr} // 6 reg = 24 +-+ ldr r5, [sp, #28] // sao_offset_val_table_u +-+ ldr r7, [sp, #32] // sao_offset_val_table_v +-+ +-+ @ Load and rearrange offsets +-+ @ Also "convert" from 16bit to 8bit +-+ ldrb r4, [r5, #2] +-+ ldrb r8, [r5, #4] +-+ ldrb r6, [r7, #2] +-+ ldrb r12, [r7, #4] +-+ orr r4, r4, r8, lsl #8 +-+ orr r6, r6, r12, lsl #8 +-+ ldrb r8, [r5, #6] +-+ ldrb r12, [r7, #6] +-+ orr r4, r4, r8, lsl #24 +-+ orr r6, r6, r12, lsl #24 +-+ ldrb r5, [r5, #8] +-+ ldrb r7, [r7, #8] +-+ +-+ ldr r12, [sp, #36] // e0 +-+ adr r8, edge_c_tbl_w64 +-+ ldr r8, [r8, r12, lsl #2] +-+ +-+ ldr r12, [sp, #24] // height +-+ vpush {d8-d15} +-+ mov pc, r8 +-+ +-+edge_c_tbl_w64: +-+ .word ff_hevc_sao_edge_c_eo0_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo1_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo2_w64_neon_8 +-+ .word ff_hevc_sao_edge_c_eo3_w64_neon_8 +-+ +-+ff_hevc_sao_edge_c_eo0_w64_neon_8: +-+ sub r1, #8 +-+1: subs r12, #1 +-+ vld1.64 {d7}, [r1, :64]! +-+ vld1.64 {q4-q5}, [r1, :128]! // load c +-+ vld1.64 {q6-q7}, [r1, :128]! +-+ vld1.64 {d24}, [r1, :64], r3 +-+ sub r1, #72 +-+ // load a +-+ vext.8 q0, q3, q4, #14 +-+ vext.8 q1, q4, q5, #14 +-+ vext.8 q2, q5, q6, #14 +-+ vext.8 q3, q6, q7, #14 +-+ // load b +-+ vext.8 q8, q4, q5, #2 +-+ vext.8 q9, q5, q6, #2 +-+ vext.8 q10, q6, q7, #2 +-+ vext.8 q11, q7, q12, #2 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo1_w64_neon_8: +-+ sub r1, r3 +-+ // load a +-+ vldm r1, {q0-q3} +-+ add r1, r3 +-+ // load c +-+ vldm r1, {q4-q7} +-+ add r1, r3 +-+1: subs r12, #1 +-+ // load b +-+ vldm r1, {q8-q11} +-+ add r1, r3 +-+ bl edge_w64_body +-+ // copy c to a +-+ vmov.64 q0, q4 +-+ vmov.64 q1, q5 +-+ vmov.64 q2, q6 +-+ vmov.64 q3, q7 +-+ // copy b to c +-+ vmov.64 q4, q8 +-+ vmov.64 q5, q9 +-+ vmov.64 q6, q10 +-+ vmov.64 q7, q11 +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo2_w64_neon_8: +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ sub r1, #2 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #30 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ add r1, #2 +-+ vld1.8 {q8-q9}, [r1]! 
+-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #34 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +-+ +-+ff_hevc_sao_edge_c_eo3_w64_neon_8: +-+1: sub r1, r3 +-+ // load a +-+ // TODO: fix unaligned load +-+ // don't reload a like in eo1 +-+ add r1, #2 +-+ vld1.8 {q0-q1}, [r1]! +-+ vld1.8 {q2-q3}, [r1], r3 +-+ sub r1, #34 +-+ subs r12, #1 +-+ // load c +-+ vld1.8 {q4-q5}, [r1, :128]! +-+ vld1.8 {q6-q7}, [r1, :128], r3 +-+ sub r1, #32 +-+ // load b +-+ sub r1, #2 +-+ vld1.8 {q8-q9}, [r1]! +-+ vld1.8 {q10-q11}, [r1] +-+ sub r1, #30 +-+ bl edge_w64_body +-+ bne 1b +-+ vpop {d8-d15} +-+ pop {r4-r8,pc} +++ pop {r4, pc} +++.endm +++ +++function ff_hevc_sao_band_16_neon_10, export=1 +++ band_16_16 10 + +endfunc + + +++@ ff_hevc_sao_band_8_neon_10 ( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ ptrdiff_t stride_src, [r3] +++@ int16_t *sao_offset_val, [sp, #0] +++@ int sao_left_class, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++.macro band_8_16 bit_depth +++ push {r4, lr} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ bl band_load_y +++ ldr lr, [sp, #16] +++ cmp lr, #8 +++ blt 4f +++ +++1: subs r12, #2 +++ vld1.16 { q8}, [r1, :128], r3 +++ vld1.16 { q9}, [r1, :128], r3 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 { q8}, [r0, :128], r2 +++ vst1.16 { q9}, [r0, :128], r2 +++ bpl 1b +++ pop {r4, pc} + + +-+.macro init_edge_32 +-+ ldr r12, [sp, #4] // sao_offset_val_table +-+ vld1.32 {d31}, [r12] +-+ ldr r12, [sp] // height +++4: +++1: subs r12, #4 +++ vld1.16 {d16}, [r1, :64], r3 +++ vld1.16 {d17}, [r1, :64], r3 +++ vld1.16 {d18}, [r1, :64], r3 +++ vld1.16 {d19}, [r1, :64], r3 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d0,d1,d2,d3}", q14, q15, \bit_depth +++ vst1.16 {d16}, [r0, :64], r2 +++ vst1.16 {d17}, [r0, :64], r2 +++ vst1.16 {d18}, [r0, :64], r2 +++ vst1.16 {d19}, [r0, :64], r2 +++ bpl 1b +++ pop {r4, pc} + +.endm + + +-+.macro diff out0, tmp0, in0, in1 +-+ vcgt.u8 \out0, \in1, \in0 // c > a -> -1 , otherwise 0 +-+ vcgt.u8 \tmp0, \in0, \in1 // a > c -> -1 , otherwise 0 +-+ vsub.s8 \out0, \tmp0, \out0 // diff0 +-+.endm +++function ff_hevc_sao_band_8_neon_10, export=1 +++ band_8_16 10 +++endfunc + + +-+.macro table32 +-+ vmov.s8 q10, #2 +-+ vadd.s8 q0, q10 +-+ vadd.s8 q1, q10 +-+ vmov.s8 q10, #128 +-+ vtbl.8 d0, {d31}, d0 +-+ vadd.s8 q11, q2, q10 +-+ vtbl.8 d1, {d31}, d1 +-+ vadd.s8 q12, q3, q10 +-+ vtbl.8 d2, {d31}, d2 +-+ vqadd.s8 q11, q0 +-+ vtbl.8 d3, {d31}, d3 +-+ vqadd.s8 q12, q1 +-+ vsub.s8 q0, q11, q10 +-+ vsub.s8 q1, q12, q10 +-+ vst1.8 {q0-q1}, [r0, :128], r2 +-+.endm + + +-+function ff_hevc_sao_edge_eo0_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {q4-q7} +-+ sub r1, #4 +-+1: subs r12, #1 +-+ vld1.8 {q13-q14}, [r1]! 
+-+ vld1.32 d30, [r1], r3 +-+ sub r1, #32 +-+ // a +-+ vext.8 q0, q13, q14, #3 +-+ vext.8 q1, q14, q15, #3 +-+ vshr.u64 d24, d30, #24 +-+ // c +-+ vext.8 q2, q13, q14, #4 +-+ vext.8 q3, q14, q15, #4 +-+ vshr.u64 d16, d30, #32 +-+ // diff0 +-+ diff32 q13, q14, q4, q5, q0, q1, q2, q3 +-+ diff d18, d25, d24, d16 +-+ // -diff1 +-+ vext.s8 q0, q13, q14, #1 +-+ vext.s8 q1, q14, q9, #1 +-+ +-+ vsub.s8 q0, q13, q0 //diff0 + diff1 +-+ vsub.s8 q1, q14, q1 +-+ table32 +-+ bne 1b +-+ vpop {q4-q7} +++@ ff_hevc_sao_band_c_32_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+ bx lr +-+endfunc +++.macro band_c_32_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ vpush {q4-q7} +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ sub r2, #96 + + +-+function ff_hevc_sao_edge_eo1_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {q4-q7} +-+ // load a +-+ sub r1, r3 +-+ vld1.8 {q0-q1}, [r1, :128], r3 +-+ // load c +-+ vld1.8 {q2-q3}, [r1, :128], r3 +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 // CMP ( c, a ) +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q8-q9}, [r1, :128], r3 +-+ diff32 q4, q5, q10, q11, q8, q9, q2, q3 // CMP ( c, b ) +-+ vadd.s8 q0, q4, q12 //diff0 + diff1 +-+ vadd.s8 q1, q5, q13 +-+ table32 +-+ // CMP ( c, a ) +-+ vneg.s8 q12, q4 +-+ vneg.s8 q13, q5 +-+ // c +-+ vmov.64 q2, q8 +-+ vmov.64 q3, q9 +-+ bne 1b +-+ vpop {q4-q7} +-+ bx lr +-+endfunc +++1: subs r12, #1 + + +-+function ff_hevc_sao_edge_eo2_w32_neon_8, export=1 +-+ init_edge_32 +-+ vpush {d8-d15} +-+ // load a +-+ sub r1, r3 +-+ sub r1, #8 +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {d24}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q0, q10, q11, #7 +-+ vext.8 q1, q11, q12, #7 +-+ // load c +-+ vld1.8 {d9}, [r1, :64]! +-+ vld1.8 {q2-q3}, [r1, :64], r3 +-+ sub r1, #8 +-+ vext.8 q4, q4, q2, #15 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {q12}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q8, q10, q11, #9 +-+ vext.8 q9, q11, q12, #9 +-+ vext.8 q6, q10, q11, #8 +-+ vext.8 q7, q11, q12, #8 +-+ vext.8 q5, q10, q11, #7 +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 +-+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 +-+ vadd.s8 q0, q12 //diff0 + diff1 +-+ vadd.s8 q1, q13 +-+ table32 +-+ // inputs for next loop iteration +-+ // a +-+ vmov.8 q0, q4 +-+ vext.8 q1, q2, q3, #15 +-+ // c +-+ vmov.8 q2, q6 +-+ vmov.8 q3, q7 +-+ vmov.8 q4, q5 +-+ bne 1b +-+ vpop {d8-d15} +-+ bx lr +-+endfunc +++ vld2.16 { q4, q5 }, [r1, :128]! +++ vld2.16 { q6, q7 }, [r1, :128]! +++ vld2.16 { q8, q9 }, [r1, :128]! +++ vld2.16 {q10, q11}, [r1, :128], r3 + + +-+function ff_hevc_sao_edge_eo3_w32_neon_8, export=1 +-+ init_edge_32 +-+ sub r1, r3 +-+ // load a +-+ vld1.8 {q10-q11}, [r1, :64]! +-+ vld1.8 {d24}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q0, q10, q11, #1 +-+ vext.8 q1, q11, q12, #1 +-+ // load c +-+ vld1.8 {q2-q3}, [r1, :64]! +-+ vld1.8 {d30}, [r1, :64], r3 +-+ sub r1, #40 +-+1: subs r12, #1 +-+ // load b +-+ vld1.8 {q10-q11}, [r1, :64]! 
+-+ vld1.8 {q12}, [r1, :64], r3 +-+ sub r1, #32 +-+ vext.8 q8, q10, q11, #7 +-+ vext.8 q9, q11, q12, #7 +-+ vext.8 q14, q12, q10, #7 +-+ +-+ diff32 q12, q13, q0, q1, q0, q1, q2, q3 +-+ diff32 q0, q1, q10, q11, q8, q9, q2, q3 +-+ +-+ vadd.s8 q0, q12 //diff0 + diff1 +-+ vadd.s8 q1, q13 +-+ table32 +-+ +-+ // inputs for next loop iteration +-+ // a +-+ vext.8 q0, q2, q3, #1 +-+ vext.8 q1, q3, q15, #1 +-+ // c +-+ vext.8 q2, q8, q9, #1 +-+ vext.8 q3, q9, q14, #1 +-+ vext.8 d30, d28, d2, #1 +-+ bne 1b +-+ bx lr +-+endfunc +++ pld [r4] +++ sub r1, #96 + + +-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +-index 39713ed..25eb52b 100644 +---- a/libavcodec/avcodec.h +-+++ b/libavcodec/avcodec.h +-@@ -410,6 +410,8 @@ enum AVCodecID { +- AV_CODEC_ID_SHEERVIDEO, +- AV_CODEC_ID_YLC, +- +-+ AV_CODEC_ID_H264_MVC, +++ sao_band_64b_16 q4, q5, q6, q7, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +- /* various PCM "codecs" */ +- AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs +- AV_CODEC_ID_PCM_S16LE = 0x10000, +-@@ -2850,6 +2852,7 @@ typedef struct AVCodecContext { +- #define FF_BUG_DC_CLIP 4096 +- #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders. +- #define FF_BUG_TRUNCATED 16384 +-+#define FF_BUG_GMC_UNSUPPORTED 32768 +- +- /** +- * strictly follow the standard (MPEG-4, ...). +-@@ -3195,6 +3198,9 @@ typedef struct AVCodecContext { +- #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 +- #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) +- #define FF_PROFILE_H264_CAVLC_444 44 +-+#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 +-+#define FF_PROFILE_H264_STEREO_HIGH 128 +-+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 +- +- #define FF_PROFILE_VC1_SIMPLE 0 +- #define FF_PROFILE_VC1_MAIN 1 +-@@ -3505,6 +3511,12 @@ typedef struct AVCodecContext { +- #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 +- #endif +- +-+ /** +-+ * Opaque pointer for use by replacement get_buffer2 code +-+ * +-+ * @author jc (08/02/2016) +-+ */ +-+ void * get_buffer_context; +- } AVCodecContext; +- +- AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); +-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h +-index 1bf1c62..ccfa991 100644 +---- a/libavcodec/cabac.h +-+++ b/libavcodec/cabac.h +-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; +- typedef struct CABACContext{ +- int low; +- int range; +-- int outstanding_count; +-+ union +-+ { +-+ int outstanding_count; +-+ struct { +-+ uint16_t bits; +-+ uint16_t range; +-+ } by22; +-+ }; +- const uint8_t *bytestream_start; +- const uint8_t *bytestream; +- const uint8_t *bytestream_end; +-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c +-index 9d94b72..535ebf0 100644 +---- a/libavcodec/codec_desc.c +-+++ b/libavcodec/codec_desc.c +-@@ -1563,6 +1563,13 @@ static const AVCodecDescriptor codec_descriptors[] = { +- .long_name = NULL_IF_CONFIG_SMALL("YUY2 Lossless Codec"), +- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS, +- }, +-+ { +-+ .id = AV_CODEC_ID_H264_MVC, +-+ .type = AVMEDIA_TYPE_VIDEO, +-+ .name = "h264_mvc", +-+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"), +-+ .props = AV_CODEC_PROP_LOSSY, +-+ }, +- +- /* various PCM "codecs" */ +- { +-diff --git a/libavcodec/h264.h b/libavcodec/h264.h +-index efe3555..16358aa 100644 +---- a/libavcodec/h264.h +-+++ b/libavcodec/h264.h +-@@ -126,7 +126,9 @@ enum { +- 
NAL_END_STREAM = 11, +- NAL_FILLER_DATA = 12, +- NAL_SPS_EXT = 13, +-+ NAL_SPS_SUBSET = 15, +- NAL_AUXILIARY_SLICE = 19, +-+ NAL_SLICE_EXT = 20, +- NAL_FF_IGNORE = 0xff0f001, +- }; +- +-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c +-index ce4bab2..b9b0c78 100644 +---- a/libavcodec/h264_parser.c +-+++ b/libavcodec/h264_parser.c +-@@ -58,6 +58,8 @@ typedef struct H264ParseContext { +- uint8_t parse_history[6]; +- int parse_history_count; +- int parse_last_mb; +-+ int is_mvc; +-+ int slice_ext; +- } H264ParseContext; +- +- +-@@ -105,24 +107,27 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf, +- } else if (state <= 5) { +- int nalu_type = buf[i] & 0x1F; +- if (nalu_type == NAL_SEI || nalu_type == NAL_SPS || +-- nalu_type == NAL_PPS || nalu_type == NAL_AUD) { +-+ nalu_type == NAL_PPS || nalu_type == NAL_AUD || +-+ nalu_type == NAL_SPS_SUBSET) { +- if (pc->frame_start_found) { +- i++; +- goto found; +- } +- } else if (nalu_type == NAL_SLICE || nalu_type == NAL_DPA || +-- nalu_type == NAL_IDR_SLICE) { +-+ nalu_type == NAL_IDR_SLICE || (p->is_mvc && nalu_type == NAL_SLICE_EXT)) { +- state += 8; +++ it ne +++ addne r4, r3 + + +-+ p->slice_ext = (nalu_type == NAL_SLICE_EXT); +- continue; +- } +- state = 7; +- } else { +- p->parse_history[p->parse_history_count++] = buf[i]; +-- if (p->parse_history_count > 5) { +-+ if (p->parse_history_count > 8) { +- unsigned int mb, last_mb = p->parse_last_mb; +- GetBitContext gb; +- +-- init_get_bits(&gb, p->parse_history, 8*p->parse_history_count); +-+ init_get_bits8(&gb, p->parse_history + 3*p->slice_ext, p->parse_history_count - 3*p->slice_ext); +- p->parse_history_count = 0; +- mb= get_ue_golomb_long(&gb); +- p->parse_last_mb = mb; +-@@ -145,7 +150,7 @@ found: +- pc->frame_start_found = 0; +- if (p->is_avc) +- return next_avc; +-- return i - (state & 5) - 5 * (state > 7); +-+ return i - (state & 5) - 8 * (state > 7); +- } +- +- static int scan_mmco_reset(AVCodecParserContext *s, GetBitContext *gb, +-@@ -585,7 +590,8 @@ static int h264_parse(AVCodecParserContext *s, +- } +- } +- +-- parse_nal_units(s, avctx, buf, buf_size); +-+ if (!p->is_mvc) +-+ parse_nal_units(s, avctx, buf, buf_size); +- +- if (avctx->framerate.num) +- avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1})); +-@@ -622,7 +628,7 @@ static int h264_split(AVCodecContext *avctx, +- if ((state & 0xFFFFFF00) != 0x100) +- break; +- nalu_type = state & 0x1F; +-- if (nalu_type == NAL_SPS) { +-+ if (nalu_type == NAL_SPS || nalu_type == NAL_SPS_SUBSET) { +- has_sps = 1; +- } else if (nalu_type == NAL_PPS) +- has_pps = 1; +-@@ -672,3 +678,23 @@ AVCodecParser ff_h264_parser = { +- .parser_close = h264_close, +- .split = h264_split, +- }; +++ vst2.16 { q4, q5 }, [r0, :128]! +++ vst2.16 { q6, q7 }, [r0, :128]! +++ vst2.16 { q8, q9 }, [r0, :128]! 
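The parser hunks above exist because a NAL_SLICE_EXT unit carries a 3-byte MVC extension header before slice_header(), so first_mb_in_slice starts three bytes later -- hence the larger parse_history and the 3*p->slice_ext skip passed to init_get_bits8. first_mb_in_slice itself is a plain unsigned Exp-Golomb field; a minimal ue(v) reader for illustration (assumes the code fits inside the buffered bytes):

#include <stdint.h>

static unsigned read_ue(const uint8_t *buf, int *bitpos)
{
    int zeros = 0;
    while (!((buf[*bitpos >> 3] >> (7 - (*bitpos & 7))) & 1)) {
        zeros++;
        (*bitpos)++;
    }
    (*bitpos)++;                         /* consume the terminating 1 */
    unsigned v = 1;
    for (int i = 0; i < zeros; i++) {
        v = (v << 1) | ((buf[*bitpos >> 3] >> (7 - (*bitpos & 7))) & 1);
        (*bitpos)++;
    }
    return v - 1;                        /* ue(v) = (1 << n | rest) - 1 */
}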
+++ vst2.16 {q10, q11}, [r0, :128], r2 + + +-+static av_cold int init_mvc(AVCodecParserContext *s) +-+{ +-+ H264ParseContext *p = s->priv_data; +-+ int ret = init(s); +-+ if (ret < 0) +-+ return ret; +++ bpl 1b + + +-+ p->is_mvc = 1; +-+ return 0; +-+} +++ vpop {q4-q7} +++ pop {r4, pc} +++.endm + + +-+AVCodecParser ff_h264_mvc_parser = { +-+ .codec_ids = { AV_CODEC_ID_H264_MVC }, +-+ .priv_data_size = sizeof(H264ParseContext), +-+ .parser_init = init_mvc, +-+ .parser_parse = h264_parse, +-+ .parser_close = h264_close, +-+ .split = h264_split, +-+}; +-diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c +-index b478065..955e426 100644 +---- a/libavcodec/hevc.c +-+++ b/libavcodec/hevc.c +-@@ -41,8 +41,196 @@ +- #include "hevc.h" +- #include "profiles.h" +- +-+#ifdef RPI +-+ #include "rpi_qpu.h" +-+ #include "rpi_shader.h" +-+ #include "rpi_shader_cmd.h" +-+ #include "rpi_zc.h" +++function ff_hevc_sao_band_c_32_neon_10, export=1 +++ band_c_32_16 10 +++endfunc + + +-+ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to cached memory +-+ #define RPI_CACHE_UNIF_MVS 1 + + +-+ // Define RPI_SIMULATE_QPUS for debugging to run QPU code on the ARMs (*rotted*) +-+ //#define RPI_SIMULATE_QPUS +-+ #ifdef RPI_WORKER +-+ #include "pthread.h" +-+ #endif +++@ ff_hevc_sao_band_c_16_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+ static void worker_core(HEVCContext * const s); +++.macro band_c_16_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ sub r2, #32 +++ sub r3, #32 + + +-+ // We can pred any block height but annoyingly if we we do then the TMU cache +-+ // explodes and it goes even slower :-( +-+ #if 0 +-+ #define Y_P_MAX_H 16 +-+ #define Y_B_MAX_H 16 +-+ #else +-+ #define Y_P_MAX_H 64 +-+ #define Y_B_MAX_H 64 +-+ #endif +-+#endif +++1: subs r12, #1 + + +-+// #define DISABLE_MC +++ vld2.16 { q8, q9 }, [r1, :128]! +++ vld2.16 {q10, q11}, [r1, :128], r3 + + +-+#define DISABLE_CHROMA 0 +-+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards +++ sao_band_64b_16 q8, q9, q10, q11, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) +++ vst2.16 { q8, q9 }, [r0, :128]! 
+++ vst2.16 {q10, q11}, [r0, :128], r2 + + +-+#ifndef av_mod_uintp2 +-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) +-+{ +-+ return a & ((1 << p) - 1); +-+} +-+# define av_mod_uintp2 av_mod_uintp2_c +-+#endif +++ bpl 1b +++ pop {r4, pc} +++.endm + + +-+#define Y_B_ONLY 0 +++function ff_hevc_sao_band_c_16_neon_10, export=1 +++ band_c_16_16 10 +++endfunc + + +- const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; +- + + +-+#if RPI_INTER +++@ ff_hevc_sao_band_c_8_neon_10( +++@ uint8_t * dst [r0] +++@ uint8_t * src [r1] +++@ uint32_t dst_stride [r2] +++@ uint32_t src_stride [r3] +++@ const int16_t * table1 sp[0] +++@ uint32_t offset1 sp[4] +++@ const int16_t * table2 sp[8] +++@ uint32_t offset2 sp[12] +++@ int width sp[16] +++@ int height sp[20] + + +-+#define MC_DUMMY_X (-32) +-+#define MC_DUMMY_Y (-32) +++.macro band_c_8_16 bit_depth +++ push {r4, lr} +++ bl band_load_c +++ movw lr, #(1 << \bit_depth) - 1 +++ vmov.i64 q14, #0 +++ vdup.i16 q15, lr +++ ldr lr, [sp, #24] @ width +++ cmp lr, #8 +++ blt 4f + + +-+// Each luma QPU processes 2*RPI_NUM_CHUNKS 64x64 blocks +-+// Each chroma QPU processes 3*RPI_NUM_CHUNKS 64x64 blocks, but requires two commands for B blocks +-+// For each block of 64*64 the smallest block size is 8x4 +-+// We also need an extra command for the setup information +++1: subs r12, #1 +++ vld2.16 { q8, q9 }, [r1, :128], r3 + + +-+#define UV_COMMANDS_PER_QPU (1 + RPI_NUM_CHUNKS*(64*64)*2/(8*4)) +-+// The QPU code for UV blocks only works up to a block width of 8 +-+#define RPI_CHROMA_BLOCK_WIDTH 8 +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) +++ vst2.16 { q8, q9 }, [r0, :128], r2 + + +-+// TODO Chroma only needs 4 taps +++ bpl 1b +++ pop {r4, pc} + + +-+// Actual filter goes -ve, +ve, +ve, -ve using these values +-+static const uint32_t rpi_filter_coefs[8] = { +-+ ENCODE_COEFFS( 0, 64, 0, 0), +-+ ENCODE_COEFFS( 2, 58, 10, 2), +-+ ENCODE_COEFFS( 4, 54, 16, 2), +-+ ENCODE_COEFFS( 6, 46, 28, 4), +-+ ENCODE_COEFFS( 4, 36, 36, 4), +-+ ENCODE_COEFFS( 4, 28, 46, 6), +-+ ENCODE_COEFFS( 2, 16, 54, 4), +-+ ENCODE_COEFFS( 2, 10, 58, 2) +-+}; +++4: +++1: subs r12, #2 +++ vld2.16 {d16, d17}, [r1, :128], r3 +++ vld2.16 {d18, d19}, [r1, :128], r3 + + +-+#define Y_COMMANDS_PER_QPU ((1+RPI_NUM_CHUNKS*(64*64)/(8*4))) +++ sao_band_32b_16 q8, q9, "{d0,d1,d2,d3}", "{d4,d5,d6,d7}", q14, q15, \bit_depth + + +-+#endif +++ vst2.16 {d16, d17}, [r0, :128], r2 +++ vst2.16 {d18, d19}, [r0, :128], r2 + + +++ bpl 1b +++ pop {r4, pc} +++.endm + + +-+#ifdef RPI_WORKER +++function ff_hevc_sao_band_c_8_neon_10, export=1 +++ band_c_8_16 10 +++endfunc + + +-+typedef struct worker_global_env_s +-+{ +-+ volatile int arm_load; +-+ pthread_mutex_t lock; + + +-+ unsigned int arm_y; +-+ unsigned int arm_c; +-+ unsigned int gpu_y; +-+ unsigned int gpu_c; +-+} worker_global_env_t; +++@ ============================================================================= +++@ SAO EDGE + + +-+static worker_global_env_t worker_global_env = +-+{ +-+ .lock = PTHREAD_MUTEX_INITIALIZER +-+}; +++@ r0 destination address +++@ r2 stride to post-increment r0 with +++@ [r5] translate values +++@ +++@ a <- c <- b +++@ a in q0 - q3 +++@ c in q4 - q7 +++@ b in q8 - q11 +++@ +++@ q12-15 used as temp +++@ +++@ Can be used for both Y & C as we unzip/zip the 
deltas and +++@ transform "u/v" separately via d26/d27. For Y d26=d27 + + +++function edge_64b_body_8 + + +-+//#define LOG_ENTER printf("Enter %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +-+//#define LOG_EXIT printf("Exit %s: p0=%d p1=%d (%d jobs) %p\n", __func__,s->pass0_job,s->pass1_job,s->worker_tail-s->worker_head,s); +++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u8 q13, q5, q1 +++ vcgt.u8 q14, q6, q2 +++ vcgt.u8 q15, q7, q3 + + +-+#define LOG_ENTER +-+#define LOG_EXIT +++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 +++ vcgt.u8 q1, q5 +++ vcgt.u8 q2, q6 +++ vcgt.u8 q3, q7 + + +-+// Call this when we have completed pass0 and wish to trigger pass1 for the current job +-+static void worker_submit_job(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ s->worker_tail++; +-+ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +-+ pthread_cond_broadcast(&s->worker_cond_tail); // Let people know that the tail has moved +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vsub.s8 q0, q12 @ a = sign(c-a) +++ vsub.s8 q1, q13 +++ vsub.s8 q2, q14 +++ vsub.s8 q3, q15 + + +-+// Call this to say we have completed pass1 +-+static void worker_complete_job(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ s->worker_head++; +-+ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +-+ pthread_cond_broadcast(&s->worker_cond_head); // Let people know that the head has moved +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 +++ vcgt.u8 q13, q5, q9 +++ vcgt.u8 q14, q6, q10 +++ vcgt.u8 q15, q7, q11 + + +-+// Call this to wait for all jobs to have completed at the end of a frame +-+static void worker_wait(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ while( s->worker_head !=s->worker_tail) +-+ { +-+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vsub.s8 q0, q12 +++ vsub.s8 q1, q13 +++ vsub.s8 q2, q14 +++ vsub.s8 q3, q15 + + +-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +-+// available to receive the next job. 
+-+static void worker_pass0_ready(HEVCContext *s) +-+{ +-+ LOG_ENTER +-+ pthread_mutex_lock(&s->worker_mutex); +-+ // tail is number of submitted jobs +-+ // head is number of completed jobs +-+ // tail-head is number of outstanding jobs in the queue +-+ // we need to ensure there is at least 1 space left for us to use +-+ while( s->worker_tail - s->worker_head >= RPI_MAX_JOBS) +-+ { +-+ // Wait until another job is completed +-+ pthread_cond_wait(&s->worker_cond_head, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +-+ LOG_EXIT +-+} +++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 +++ vcgt.u8 q13, q9, q5 +++ vcgt.u8 q14, q10, q6 +++ vcgt.u8 q15, q11, q7 + + +-+static void *worker_start(void *arg) +-+{ +-+ HEVCContext *s = (HEVCContext *)arg; +-+ while(1) { +-+ pthread_mutex_lock(&s->worker_mutex); +++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) +++ vadd.s8 q1, q13 +++ vmov.u8 q12, #2 +++ vadd.s8 q2, q14 +++ vadd.s8 q3, q15 + + +-+ while( !s->kill_worker && s->worker_tail - s->worker_head <= 0) +-+ { +-+ pthread_cond_wait(&s->worker_cond_tail, &s->worker_mutex); +-+ } +-+ pthread_mutex_unlock(&s->worker_mutex); +++ vadd.s8 q0, q12 +++ vadd.s8 q1, q12 + + +-+ if (s->kill_worker) { +-+ break; +-+ } +-+ LOG_ENTER +-+ worker_core(s); +++ vld1.8 {d26, d27}, [r5] + + +-+ worker_complete_job(s); +-+ LOG_EXIT +-+ } +-+ return NULL; +-+} +++ vadd.s8 q2, q12 +++ vuzp.8 q0, q1 +++ vmov.u8 q15, #128 +++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) + + +-+#endif +++ vtbl.8 d0, {d26}, d0 +++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add + + +- /** +- * NOTE: Each function hls_foo correspond to the function foo in the +- * specification (HLS stands for High Level Syntax). +-@@ -55,6 +243,32 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 +- /* free everything allocated by pic_arrays_init() */ +- static void pic_arrays_free(HEVCContext *s) +- { +-+#ifdef RPI +-+ int job; +-+ for(job=0;jobcoeffs_buf_arm[job][0]) { +-+ gpu_free(&s->coeffs_buf_default[job]); +-+ s->coeffs_buf_arm[job][0] = 0; +-+ } +-+ if (s->coeffs_buf_arm[job][2]) { +-+ gpu_free(&s->coeffs_buf_accelerated[job]); +-+ s->coeffs_buf_arm[job][2] = 0; +-+ } +-+ } +-+#endif +-+#ifdef RPI_DEBLOCK_VPU +-+ { +-+ int i; +-+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { +-+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +++ vtbl.8 d1, {d26}, d1 +++ vadd.s8 q14, q5, q15 + + +-+ if (dvq->vpu_cmds_arm) { +-+ gpu_free(&dvq->deblock_vpu_gmem); +-+ dvq->vpu_cmds_arm = 0; +-+ } +-+ } +-+ } +-+#endif +- av_freep(&s->sao); +- av_freep(&s->deblock); +- +-@@ -91,6 +305,89 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) +- int ctb_count = sps->ctb_width * sps->ctb_height; +- int min_pu_size = sps->min_pu_width * sps->min_pu_height; +- +-+#ifdef RPI +-+ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); +-+ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; +-+ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; +-+ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; +-+ int job; +++ vtbl.8 d2, {d27}, d2 +++ vuzp.8 q2, q3 + + +-+ av_assert0(sps); +-+// s->max_ctu_count = sps->ctb_width; +-+// printf("CTB with=%d\n", sps->ctb_width); +-+// s->max_ctu_count = coefs_per_luma / coefs_in_ctb; +-+ s->max_ctu_count = FFMIN(coefs_per_luma / coefs_in_ctb, sps->ctb_width); +-+ s->ctu_per_y_chan = s->max_ctu_count / QPU_N_Y; +-+ s->ctu_per_uv_chan = s->max_ctu_count / QPU_N_UV; +-+ 
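The worker_* functions above form a bounded two-phase pipeline: pass 0 advances the tail on submit, the worker thread advances the head on completion, and each side sleeps on its own condition variable. A self-contained sketch of the same scheme (invented names; MAX_JOBS stands in for RPI_MAX_JOBS):

#include <pthread.h>

#define MAX_JOBS 8

typedef struct job_queue {
    pthread_mutex_t mutex;
    pthread_cond_t  cond_head, cond_tail;
    unsigned head;                      /* jobs completed (pass 1) */
    unsigned tail;                      /* jobs submitted (pass 0) */
} job_queue;

static void jq_submit(job_queue *q)     /* cf. worker_submit_job */
{
    pthread_mutex_lock(&q->mutex);
    q->tail++;
    pthread_cond_broadcast(&q->cond_tail);
    pthread_mutex_unlock(&q->mutex);
}

static void jq_complete(job_queue *q)   /* cf. worker_complete_job */
{
    pthread_mutex_lock(&q->mutex);
    q->head++;
    pthread_cond_broadcast(&q->cond_head);
    pthread_mutex_unlock(&q->mutex);
}

static void jq_wait_space(job_queue *q) /* cf. worker_pass0_ready */
{
    pthread_mutex_lock(&q->mutex);
    while (q->tail - q->head >= MAX_JOBS)
        pthread_cond_wait(&q->cond_head, &q->mutex);
    pthread_mutex_unlock(&q->mutex);
}

static void jq_drain(job_queue *q)      /* cf. worker_wait, end of frame */
{
    pthread_mutex_lock(&q->mutex);
    while (q->head != q->tail)
        pthread_cond_wait(&q->cond_head, &q->mutex);
    pthread_mutex_unlock(&q->mutex);
}

Initialise the struct with PTHREAD_MUTEX_INITIALIZER / PTHREAD_COND_INITIALIZER, as the patch does for worker_global_env.lock.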
+-+ for(job=0;jobcoeffs_buf_default[job]); +-+ s->coeffs_buf_arm[job][0] = (int16_t*) s->coeffs_buf_default[job].arm; +-+ if (!s->coeffs_buf_arm[job][0]) +-+ goto fail; +-+ +-+ gpu_malloc_cached(sizeof(int16_t) * (coefs_per_row + 32*32), &s->coeffs_buf_accelerated[job]); // We prefetch past the end so provide an extra blocks worth of data +-+ s->coeffs_buf_arm[job][2] = (int16_t*) s->coeffs_buf_accelerated[job].arm; +-+ s->coeffs_buf_vc[job][2] = s->coeffs_buf_accelerated[job].vc; +-+ if (!s->coeffs_buf_arm[job][2]) +-+ goto fail; +-+ s->coeffs_buf_arm[job][3] = coefs_per_row + s->coeffs_buf_arm[job][2]; // This points to just beyond the end of the buffer. Coefficients fill in backwards. +-+ s->coeffs_buf_vc[job][3] = sizeof(int16_t) * coefs_per_row + s->coeffs_buf_vc[job][2]; +-+ } +-+ } +-+#endif +-+#ifdef RPI_DEBLOCK_VPU +-+ { +-+ int i; +-+ s->enable_rpi_deblock = !sps->sao_enabled; +-+ s->setup_width = (sps->width+15) / 16; +-+ s->setup_height = (sps->height+15) / 16; +-+ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; +-+ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; +++ vtbl.8 d3, {d27}, d3 + + +-+ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) +-+ { +-+ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +-+ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; +-+ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; +-+ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; +-+ const unsigned int total_size =- cmd_size + y_size + uv_size; +-+ int p_vc; +-+ uint8_t * p_arm; +-+ #if RPI_VPU_DEBLOCK_CACHED +-+ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); +-+ #else +-+ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); +-+ #endif +-+ p_vc = dvq->deblock_vpu_gmem.vc; +-+ p_arm = dvq->deblock_vpu_gmem.arm; +++ vtbl.8 d4, {d26}, d4 +++ vzip.8 q0, q1 + + +-+ // Zap all +-+ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); +++ vtbl.8 d5, {d26}, d5 +++ vqadd.s8 q0, q12 +++ vqadd.s8 q1, q14 +++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add + + +-+ // Subdivide +-+ dvq->vpu_cmds_arm = (void*)p_arm; +-+ dvq->vpu_cmds_vc = p_vc; +++ vtbl.8 d6, {d27}, d6 +++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add + + +-+ p_arm += cmd_size; +-+ p_vc += cmd_size; +++ vtbl.8 d7, {d27}, d7 +++ vzip.8 q2, q3 + + +-+ dvq->y_setup_arm = (void*)p_arm; +-+ dvq->y_setup_vc = (void*)p_vc; +++ vsub.s8 q0, q15 +++ vqadd.s8 q2, q12 +++ vqadd.s8 q3, q14 +++ vsub.s8 q1, q15 +++ vsub.s8 q2, q15 +++ vsub.s8 q3, q15 + + +-+ p_arm += y_size; +-+ p_vc += y_size; +++ bx lr +++endfunc + + +-+ dvq->uv_setup_arm = (void*)p_arm; +-+ dvq->uv_setup_vc = (void*)p_vc; +-+ } +++@ r0 destination address +++@ r2 stride to post-increment r0 with +++@ r4 upper clip value +++@ [r5] translate values +++@ +++@ a <- c <- b +++@ a in q0 - q3 +++@ c in q4 - q7 +++@ b in q8 - q11 +++@ +++@ q12-15 used as temp +++@ +++@ Can be used for both Y & C as we unzip/zip the deltas and +++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 + + +-+ s->dvq_n = 0; +-+ s->dvq = s->dvq_ents + s->dvq_n; +-+ } +-+#endif +++function edge_64b_body_16 + + +- s->bs_width = (width >> 2) + 1; +- s->bs_height = (height >> 2) + 1; +- +-@@ -137,6 +434,29 @@ fail: +- return AVERROR(ENOMEM); +- } +- +-+static void default_pred_weight_table(HEVCContext * const s) +-+{ +-+ unsigned int i; +-+ s->sh.luma_log2_weight_denom = 0; +-+ s->sh.chroma_log2_weight_denom = 0; +-+ for (i = 0; i < s->sh.nb_refs[L0]; i++) { +-+ s->sh.luma_weight_l0[i] = 1; +-+ s->sh.luma_offset_l0[i] = 0; +-+ s->sh.chroma_weight_l0[i][0] = 1; +-+ s->sh.chroma_offset_l0[i][0] = 0; +-+ s->sh.chroma_weight_l0[i][1] = 1; +-+ s->sh.chroma_offset_l0[i][1] = 0; +-+ } +-+ for (i = 0; i < s->sh.nb_refs[L1]; i++) { +-+ s->sh.luma_weight_l1[i] = 1; +-+ s->sh.luma_offset_l1[i] = 0; +-+ s->sh.chroma_weight_l1[i][0] = 1; +-+ s->sh.chroma_offset_l1[i][0] = 0; +-+ s->sh.chroma_weight_l1[i][1] = 1; +-+ s->sh.chroma_offset_l1[i][1] = 0; +-+ } +-+} +++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 +++ vcgt.u16 q13, q5, q1 +++ vcgt.u16 q14, q6, q2 +++ vcgt.u16 q15, q7, q3 + + +- static void pred_weight_table(HEVCContext *s, GetBitContext *gb) +- { +- int i = 0; +-@@ -331,7 +651,7 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, +- static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) +- { +- #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) +-- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; +-+ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; +- int ret, i; +- +- pic_arrays_free(s); +-@@ -350,6 +670,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- switch (sps->pix_fmt) { +- case AV_PIX_FMT_YUV420P: +- case AV_PIX_FMT_YUVJ420P: +-+#if RPI_HEVC_SAND +-+ // Currently geometry calc is stuffed for big sizes +-+ if (sps->width < 2048 && sps->height <= 1088) { +-+ *fmt++ = AV_PIX_FMT_SAND128; +-+ } +-+#endif +- #if CONFIG_HEVC_DXVA2_HWACCEL +- *fmt++ = AV_PIX_FMT_DXVA2_VLD; +- #endif +-@@ -380,6 +706,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- ret = ff_thread_get_format(s->avctx, pix_fmts); +- if (ret < 0) +- goto fail; +++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 +++ vcgt.u16 q1, q1, q5 +++ vcgt.u16 q2, q2, q6 +++ vcgt.u16 q3, q3, q7 + + +- s->avctx->pix_fmt = ret; +- } +- else { +-@@ -402,11 +729,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm +- for(c_idx = 0; c_idx < c_count; c_idx++) { +- int w = sps->width >> sps->hshift[c_idx]; +- int h = sps->height >> sps->vshift[c_idx]; +-+ // ******** Very very nasty allocation kludge for plaited Chroma +- s->sao_pixel_buffer_h[c_idx] = +-- av_malloc((w * 2 * sps->ctb_height) << +-+ av_malloc((w * 2 * sps->ctb_height * (1 + (c_idx == 1))) << +- sps->pixel_shift); +- s->sao_pixel_buffer_v[c_idx] = +-- av_malloc((h * 2 * sps->ctb_width) << +-+ av_malloc((h * 2 * sps->ctb_width * (1 + (c_idx == 1))) << +- sps->pixel_shift); +- } +- } +-@@ -674,6 +1002,11 @@ static int hls_slice_header(HEVCContext *s) +- (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { +- pred_weight_table(s, gb); +- } +-+ else +-+ { +-+ // Give us unit weights +-+ default_pred_weight_table(s); +-+ } +- +- sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); +- if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { +-@@ -931,6 +1264,34 @@ static 
int hls_cross_component_pred(HEVCContext *s, int idx) { +- return 0; +- } +- +-+#ifdef RPI +-+static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +-+{ +-+ // U & V done on U call in the case of sliced frames +-+ if (rpi_sliced_frame(s->frame) && c_idx > 1) +-+ return; +++ vsub.s16 q0, q0, q12 // a = sign(c-a) +++ vsub.s16 q1, q1, q13 +++ vsub.s16 q2, q2, q14 +++ vsub.s16 q3, q3, q15 + + +-+ if (s->enable_rpi) { +-+ HEVCLocalContext *lc = s->HEVClc; +-+ HEVCPredCmd *cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; +-+ cmd->type = RPI_PRED_INTRA; +-+ cmd->size = log2_trafo_size; +-+ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; +-+ cmd->c_idx = c_idx; +-+ cmd->i_pred.x = x0; +-+ cmd->i_pred.y = y0; +-+ cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; +-+ } +-+ else if (rpi_sliced_frame(s->frame) && c_idx != 0) { +-+ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); +-+ } +-+ else { +-+ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); +-+ } +++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 +++ vcgt.u16 q13, q5, q9 +++ vcgt.u16 q14, q6, q10 +++ vcgt.u16 q15, q7, q11 + + +-+} +-+#endif +++ vsub.s16 q0, q0, q12 +++ vsub.s16 q1, q1, q13 +++ vsub.s16 q2, q2, q14 +++ vsub.s16 q3, q3, q15 + + +- static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- int xBase, int yBase, int cb_xBase, int cb_yBase, +- int log2_cb_size, int log2_trafo_size, +-@@ -943,8 +1304,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- int trafo_size = 1 << log2_trafo_size; +- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); +-- +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); +-+#endif +- } +- +- if (cbf_luma || cbf_cb[0] || cbf_cr[0] || +-@@ -1030,7 +1394,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); +-+#endif +- } +- if (cbf_cb[i]) +- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +-@@ -1059,7 +1427,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); +-+#endif +- } +- if (cbf_cr[i]) +- ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), +-@@ -1088,7 +1460,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); +-+#endif +- } +- if (cbf_cb[i]) +- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +-@@ -1098,7 +1474,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- if (lc->cu.pred_mode == MODE_INTRA) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); +-+#endif +- } +- if (cbf_cr[i]) +- ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), +-@@ -1110,26 +1490,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, +- int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); +- int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); +- ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); +-+#endif +- if (s->ps.sps->chroma_format_idc == 2) { +- ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); +-+ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); +- s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); +-+#endif +- } +- } else if (blk_idx == 3) { +- int trafo_size_h = 1 << (log2_trafo_size + 1); +- int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); +- ff_hevc_set_neighbour_available(s, xBase, yBase, +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); +-+#endif +- if (s->ps.sps->chroma_format_idc == 2) { +- ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), +- trafo_size_h, trafo_size_v); +-+#ifdef RPI +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); +-+ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); +-+#else +- s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); +- 
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); +-+#endif +- } +- } +- } +-@@ -1275,47 +1675,120 @@ do { +- return 0; +- } +- +--static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 +++ vcgt.u16 q13, q9, q5 +++ vcgt.u16 q14, q10, q6 +++ vcgt.u16 q15, q11, q7 + + +-+static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) +- { +-- HEVCLocalContext *lc = s->HEVClc; +- GetBitContext gb; +-- int cb_size = 1 << log2_cb_size; +-- int stride0 = s->frame->linesize[0]; +-- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +-- int stride1 = s->frame->linesize[1]; +-- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-- int stride2 = s->frame->linesize[2]; +-- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +-- +-- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + +-- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + +-- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * +-- s->ps.sps->pcm.bit_depth_chroma; +-- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); +- int ret; +- +-- if (!s->sh.disable_deblocking_filter_flag) +-- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +-- +- ret = init_get_bits(&gb, pcm, length); +- if (ret < 0) +- return ret; +- +-- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +-- if (s->ps.sps->chroma_format_idc) { +-- s->hevcdsp.put_pcm(dst1, stride1, +-+#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +-+ s->hevcdsp.put_pcm(rpi_sliced_frame_pos_y(s->frame, x0, y0), +-+ s->frame->linesize[0], +-+ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) +++ vadd.s16 q1, q1, q13 +++ vmov.u8 q12, #2 +++ vadd.s16 q2, q2, q14 +++ vadd.s16 q3, q3, q15 + + +-+ s->hevcdsp.put_pcm_c(rpi_sliced_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), +-+ s->frame->linesize[1], +- cb_size >> s->ps.sps->hshift[1], +- cb_size >> s->ps.sps->vshift[1], +- &gb, s->ps.sps->pcm.bit_depth_chroma); +-- s->hevcdsp.put_pcm(dst2, stride2, +-- cb_size >> s->ps.sps->hshift[2], +-- cb_size >> s->ps.sps->vshift[2], +-- &gb, s->ps.sps->pcm.bit_depth_chroma); +- } +-+ else +-+#endif +-+ { +-+ const int stride0 = s->frame->linesize[0]; +-+ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +-+ const int stride1 = s->frame->linesize[1]; +-+ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-+ const int stride2 = s->frame->linesize[2]; +-+ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; +++ vmovn.s16 d0, q0 +++ vmovn.s16 d1, q1 +++ vmovn.s16 d2, q2 +++ vmovn.s16 d3, q3 + + +-+ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +-+ if (s->ps.sps->chroma_format_idc) { +-+ s->hevcdsp.put_pcm(dst1, stride1, +-+ cb_size >> s->ps.sps->hshift[1], +-+ cb_size >> s->ps.sps->vshift[1], +-+ &gb, s->ps.sps->pcm.bit_depth_chroma); +-+ s->hevcdsp.put_pcm(dst2, stride2, +-+ cb_size >> 
s->ps.sps->hshift[2], +-+ cb_size >> s->ps.sps->vshift[2], +-+ &gb, s->ps.sps->pcm.bit_depth_chroma); +-+ } +- +-+ } +- return 0; +- } +- +-+#ifdef RPI +-+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +-+{ +-+ int16_t * const coeffs = (buf_no != 3) ? +-+ s->coeffs_buf_arm[s->pass0_job][buf_no] + s->num_coeffs[s->pass0_job][buf_no] : +-+ s->coeffs_buf_arm[s->pass0_job][buf_no] - s->num_coeffs[s->pass0_job][buf_no] - n; +-+ s->num_coeffs[s->pass0_job][buf_no] += n; +-+ return coeffs; +-+} +-+#endif +++ vuzp.8 q0, q1 + + +-+// x * 2^(y*2) +-+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) +-+{ +-+ return x << (y * 2); +-+} +++ vld1.8 {d26, d27}, [r5] + + +-+static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) +-+{ +-+ // Length in bits +-+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + +-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + +-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); +++ vadd.s8 q0, q0, q12 +++ vadd.s8 q1, q1, q12 + + +-+ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); +++ vtbl.8 d0, {d26}, d0 +++ vtbl.8 d1, {d26}, d1 +++ vtbl.8 d2, {d27}, d2 +++ vtbl.8 d3, {d27}, d3 + + +-+ if (!s->sh.disable_deblocking_filter_flag) +-+ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); +++ vmov.i64 q12, #0 + + +-+#ifdef RPI +-+ if (s->enable_rpi) { +-+ // Copy coeffs +-+ const int blen = (length + 7) >> 3; +-+ // Round allocated bytes up to nearest 32 to avoid alignment confusion +-+ // Allocation is in int16_t s +-+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per +-+ // sample this rounding doesn't affect the total size we need to allocate for +-+ // the coeff buffer +-+ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); +-+ memcpy(coeffs, pcm, blen); +++ vzip.8 q0, q1 + + +-+ // Our coeff stash assumes that any partially allocated 64byte lump +-+ // is zeroed so make that true. 
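A minimal, self-contained C sketch of the zero-padding idiom used in the block below (the helper name is illustrative, not from the patch): -(intptr_t)p & 63 is the byte count from p up to the next 64-byte boundary, and is 0 when p is already aligned.

    #include <stdint.h>
    #include <string.h>

    static void zero_pad_to_64(uint8_t *const end_of_data)
    {
        /* bytes from end_of_data up to the next 64-byte boundary (0 if aligned) */
        const size_t tail = -(intptr_t)end_of_data & 63;
        if (tail != 0)
            memset(end_of_data, 0, tail); /* keep the partial 64-byte lump zeroed */
    }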
+-+ { +-+ uint8_t * const eopcm = (uint8_t *)coeffs + blen; +-+ if ((-(intptr_t)eopcm & 63) != 0) +-+ memset(eopcm, 0, -(intptr_t)eopcm & 63); +-+ } +++ vdup.i16 q13, r4 + + +-+ // Add command +-+ { +-+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++; +-+ cmd->type = RPI_PRED_I_PCM; +-+ cmd->size = log2_cb_size; +-+ cmd->i_pcm.src = coeffs; +-+ cmd->i_pcm.x = x0; +-+ cmd->i_pcm.y = y0; +-+ cmd->i_pcm.src_len = length; +-+ } +-+ return 0; +-+ } +-+#endif +++ @ Avoid overwrite whilst widening +++ vaddw.s8 q2, q6, d2 +++ vaddw.s8 q3, q7, d3 +++ vaddw.s8 q1, q5, d1 +++ vaddw.s8 q0, q4, d0 + + +-+ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); +-+} +++ @ now clip +++ clip16_4 q2, q3, q1, q0, q12, q13 + + +- /** +- * 8.5.3.2.2.1 Luma sample unidirectional interpolation process +- * +-@@ -1332,6 +1805,91 @@ static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +- * @param luma_offset additive offset applied to the luma prediction value +- */ +- +-+#if RPI_INTER +-+static void rpi_luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+ AVFrame *ref, const Mv *mv, int x_off, int y_off, +-+ int block_w, int block_h, int luma_weight, int luma_offset) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_LUMA_UNI; +-+ cmd->dst = dst; +-+ cmd->dststride = dststride; +-+ cmd->src = ref->data[0]; +-+ cmd->srcstride = ref->linesize[0]; +-+ cmd->mv = *mv; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->weight = luma_weight; +-+ cmd->offset = luma_offset; +-+} +-+ +-+static void rpi_luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, +-+ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, +-+ const struct MvField * const current_mv) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_y[s->pass0_job] + s->num_mv_cmds_y[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_LUMA_BI; +-+ cmd->dst = dst; +-+ cmd->dststride = dststride; +-+ cmd->src = ref0->data[0]; +-+ cmd->srcstride = ref0->linesize[0]; +-+ cmd->mv = *mv0; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->src1 = ref1->data[0]; +-+ cmd->srcstride1 = ref1->linesize[0]; +-+ cmd->mv1 = *mv1; +-+ cmd->ref_idx[0] = current_mv->ref_idx[0]; +-+ cmd->ref_idx[1] = current_mv->ref_idx[1]; +-+} +-+ +-+static inline void rpi_chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +-+ ptrdiff_t dststride, uint8_t *src0, ptrdiff_t srcstride, +-+ int x_off, int y_off, int block_w, int block_h, const Mv * const mv, int chroma_weight, int chroma_offset) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_CHROMA_UNI; +-+ cmd->dst = dst0; +-+ cmd->dststride = dststride; +-+ cmd->src = src0; +-+ cmd->srcstride = srcstride; +-+ cmd->mv = *mv; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->weight = chroma_weight; +-+ cmd->offset = chroma_offset; +-+} +-+ +-+static inline void rpi_chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVFrame *ref0, AVFrame *ref1, +-+ int x_off, int y_off, int block_w, int block_h, const struct MvField * const current_mv, int cidx) +-+{ +-+ HEVCMvCmd *cmd = s->unif_mv_cmds_c[s->pass0_job] + s->num_mv_cmds_c[s->pass0_job]++; +-+ cmd->cmd = RPI_CMD_CHROMA_BI+cidx; +-+ cmd->dst = dst0; 
+-+ cmd->dststride = dststride; +-+ cmd->src = ref0->data[cidx+1]; +-+ cmd->srcstride = ref0->linesize[cidx+1]; +-+ cmd->mv = current_mv->mv[0]; +-+ cmd->mv1 = current_mv->mv[1]; +-+ cmd->x_off = x_off; +-+ cmd->y_off = y_off; +-+ cmd->block_w = block_w; +-+ cmd->block_h = block_h; +-+ cmd->src1 = ref1->data[cidx+1]; +-+ cmd->srcstride1 = ref1->linesize[cidx+1]; +-+ cmd->ref_idx[0] = current_mv->ref_idx[0]; +-+ cmd->ref_idx[1] = current_mv->ref_idx[1]; +-+} +-+ +-+#endif +-+ +- static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- AVFrame *ref, const Mv *mv, int x_off, int y_off, +- int block_w, int block_h, int luma_weight, int luma_offset) +-@@ -1347,6 +1905,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); +- int idx = ff_hevc_pel_weight[block_w]; +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++ bx lr +++endfunc + + +- x_off += mv->x >> 2; +- y_off += mv->y >> 2; +- src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +-@@ -1393,7 +1955,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- * @param mv1 motion vector1 (relative to block position) to get pixel data from +- * @param current_mv current motion vector structure +- */ +-- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +-+static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- AVFrame *ref0, const Mv *mv0, int x_off, int y_off, +- int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) +- { +-@@ -1417,6 +1979,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +- uint8_t *src0 = ref0->data[0] + y_off0 * src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); +- uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif + + +- if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || +- x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || +- y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { +-@@ -1502,6 +2068,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, +- intptr_t _mx = mx << (1 - hshift); +- intptr_t _my = my << (1 - vshift); +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++@ a <- c <- b +++@ a in q0 +++@ c in q1 +++@ b in q2 +++@ Temp q3, q9, q10 +++@ +++@ d16, d17 (q8) xlat U, V +++@ q14.u8 #2 +++@ q15.u8 #128 + + +- x_off += mv->x >> (2 + hshift); +- y_off += mv->y >> (2 + vshift); +- src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); +-@@ -1566,6 +2136,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF +- int hshift = s->ps.sps->hshift[1]; +- int vshift = s->ps.sps->vshift[1]; +- +-+#ifdef DISABLE_MC +-+ return; +-+#endif +++function edge_16b_body_8 +++ vcgt.u8 q3, q1, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u8 q0, q1 @ a > c -> -1 , otherwise 0 +++ vcgt.u8 q9, q1, q2 @ c > b -> -1 , otherwise 0 +++ vcgt.u8 q10, q2, q1 @ c < b -> -1 , otherwise 0 + + +- intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); +- intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); +- intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); +-@@ -1693,14 +2267,423 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, +- } +- } +- +--static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +-- int nPbW, int nPbH, +-- int log2_cb_size, int partIdx, int idx) +++ vsub.s8 q0, q3 +++ 
vsub.s8 q10, q9 +++ vadd.s8 q0, q10 @ a = sign(c-a) + + +-+#if RPI_INTER +++ vadd.s8 q0, q14 +++ vuzp.8 d0, d1 +++ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add + + +-+static HEVCRpiLumaPred * +-+rpi_nxt_pred_y(HEVCContext *const s, const unsigned int load_val) +-+{ +-+ HEVCRpiLumaPred * yp = s->curr_pred_y; +-+ HEVCRpiLumaPred * ypt = yp + 1; +-+ for (unsigned int i = 1; i != QPU_N_GRP_Y; ++i, ++ypt) { +-+ if (ypt->load < yp->load) +-+ yp = ypt; +-+ } +++ vtbl.8 d0, {d16}, d0 +++ vtbl.8 d1, {d17}, d1 + + +-+// yp->load += load_val; +-+ ++yp->load; +-+ return yp; +-+} +++ vzip.8 d0, d1 +++ vqadd.s8 q0, q3 +++ vsub.s8 q0, q15 + + +-+static void +-+rpi_pred_y(HEVCContext *const s, const int x0, const int y0, +-+ const int nPbW, const int nPbH, +-+ const Mv *const mv, +-+ const int weight_mul, +-+ const int weight_offset, +-+ AVFrame *const src_frame) +-+{ +-+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); +++ bx lr +++endfunc + + +-+// rpi_luma_mc_uni(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, +-+// mv, x0, y0, nPbW, nPbH, +-+// weight_mul, weight_offset); +++@ a <- c <- b +++@ a in q0 +++@ c in q1 +++@ b in q2 +++@ Temp q3 +++@ +++@ q12, #0 +++@ d16, d17 xlat U, V +++@ q14.u8 #2 +++@ q15.u16 max +++function edge_16b_body_16 +++ vcgt.u16 q3, q1, q0 @ c > a -> -1 , otherwise 0 +++ vcgt.u16 q0, q1 @ a > c -> -1 , otherwise 0 +++ vsub.s16 q0, q3 @ a = sign(c-a) +++ vcgt.u16 q3, q1, q2 @ c > b -> -1 , otherwise 0 +++ vsub.s16 q0, q3 +++ vcgt.u16 q3, q2, q1 @ c < b -> -1 , otherwise 0 +++ vadd.s16 q0, q3 @ a = sign(c-a) + sign(c-b) +++ +++ vmovn.s16 d0, q0 +++ @ d1 will have random contents that we transform but +++ @ that doesn't matter as we then discard them +++ vuzp.8 d0, d1 +++ +++ vadd.s8 q0, q0, q14 +++ +++ vtbl.8 d0, {d16}, d0 +++ vtbl.8 d1, {d17}, d1 +++ +++ vzip.8 d0, d1 +++ +++ vaddw.s8 q0, q1, d0 +++ +++ @ now clip +++ vmax.s16 q0, q12 +++ vmin.s16 q0, q15 +++ bx lr +++endfunc + + +-+ { +-+ const unsigned int mx = mv->x & 3; +-+ const unsigned int my = mv->y & 3; +-+ const unsigned int my_mx = (my << 8) | mx; +-+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; +-+ const int x1_m3 = x0 + (mv->x >> 2) - 3; +-+ const int y1_m3 = y0 + (mv->y >> 2) - 3; +-+ const uint32_t src_vc_address_y = get_vc_address_y(src_frame); +-+ uint32_t dst_addr = get_vc_address_y(s->frame) + y_off; +-+ const uint32_t wo = PACK2(weight_offset * 2 + 1, weight_mul); + + +-+ // Potentially we could change the assembly code to support taller sizes in one go +-+ for (int start_y = 0; start_y < nPbH; start_y += Y_P_MAX_H, dst_addr += s->frame->linesize[0] * 16) +-+ { +-+ const uint32_t src_yx_y = y1_m3 + start_y; +-+ int start_x = 0; +-+ const int bh = FFMIN(nPbH - start_y, Y_P_MAX_H); +++@ ff_hevc_sao_edge_[c_]xx_neon( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only +++@ int eo, [sp, #sp_base + 0] +++@ int width, [sp, #sp_base + 4] +++@ int height) [sp, #sp_base + 8] +++ +++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0 +++ push {r4-r6, lr} @ 16 bytes +++.set sp_base, 16 +++ +++@ Build translate registers +++@ As translate values can only be 0-4 we don't care about junk in the rest +++@ of the register +++ mov r12, #2 +++.if \is_chroma +++ ldr r4, [sp, #16] +++.set sp_base, sp_base + 4 +++.endif +++ vld1.8 {d16[2]}, [r3], r12 +++ vld1.8 
{d16[0]}, [r3], r12 +++ vld1.8 {d16[1]}, [r3], r12 +++ vld1.8 {d16[3]}, [r3], r12 +++ vld1.8 {d16[4]}, [r3] +++.if \is_chroma +++ vld1.8 {d17[2]}, [r4], r12 +++ vld1.8 {d17[0]}, [r4], r12 +++ vld1.8 {d17[1]}, [r4], r12 +++ vld1.8 {d17[3]}, [r4], r12 +++ vld1.8 {d17[4]}, [r4] +++.else +++ vmov d17, d16 +++.endif +++ +++@ Setup constant registers +++.if \bit_depth > 8 +++ movw r4, (1 << \bit_depth) - 1 +++.endif +++.if \setup_16b +++.if \bit_depth > 8 +++ vmov.i64 q12, #0 +++ vdup.16 q15, r4 +++.else +++ vmov.u8 q15, #128 +++.endif +++ vmov.u8 q14, #2 +++.endif +++ movw r3, EDGE_SRC_STRIDE +++ +++@ If setup_64b we need the xlat table on the stack and q4-q7 saved +++.if \setup_64b +++ sub r5, sp, #16 +++ vpush {q4-q8} @ 80 bytes, q8 pushed first +++.set sp_base, sp_base + 80 +++.endif +++ +++@ Get jump address +++@ We have a special case for width 4 as the calling code doesn't detect it +++@ If we may have w4 then we add a 2nd jump table after the 1st +++.if \check_w4 +++ ldr r12, [sp, #sp_base + 4] @ width +++ cmp r12, #8 +++.endif +++ ldr r12, [sp, #sp_base + 0] @ e0 +++ adr r6, \jump_tab +++.if \check_w4 +++ it lt +++ addlt r6, #16 +++.endif +++ ldr r6, [r6, r12, lsl #2] +++ +++ ldr r12, [sp, #sp_base + 8] @ height +++ +++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes +++.if \do2 +++ push {r0, r1, r6, r12} +++ blx r6 +++ pop {r0, r1, r6, r12} +++ +++ add r0, #64 +++ add r1, #64 +++.endif +++ +++ blx r6 +++ +++@ Tidy up & return +++.if \setup_64b +++ vpop {q4-q8} @ spurious but harmless load of q8 +++.endif +++ pop {r4-r6, pc} +++.endm + + +-+#if 1 +-+ // As Y-pred operates on two independant 8-wide src blocks we can merge +-+ // this pred with the previous one if it the previous one is 8 pel wide, +-+ // the same height as the current block, immediately to the left of our +-+ // current dest block and mono-pred. 
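The merge test implemented just below, restated as a self-contained C sketch for clarity (the struct is a cut-down stand-in for the patch's qpu_mc_pred_y_t, keeping only the fields the test reads; names are otherwise illustrative):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
        struct { uint16_t w, h; uint32_t dst_addr; } p;
    } y_cmd_t; /* reduced stand-in for the patch's qpu_mc_pred_y_t */

    /* Mergeable when the previous command exists, has the same block height,
     * and its 8-pel-wide destination ends exactly where the new one starts. */
    static int can_merge_y8(const y_cmd_t *const last_y8_p,
                            const unsigned int bh, const uint32_t dst_addr)
    {
        return last_y8_p != NULL &&
               last_y8_p->p.h == bh &&
               last_y8_p->p.dst_addr + 8 == dst_addr;
    }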
+ + +-+ qpu_mc_pred_y_t *const last_y8_p = s->last_y8_p; +-+ if (last_y8_p != NULL && last_y8_p->p.h == bh && last_y8_p->p.dst_addr + 8 == dst_addr) +-+ { +-+ const int bw = FFMIN(nPbW, 8); +-+ qpu_mc_pred_y_t *const last_y8_lx = s->last_y8_lx; +-+ +-+ last_y8_lx->next_src2_x = x1_m3; +-+ last_y8_lx->next_src2_y = src_yx_y; +-+ last_y8_lx->next_src2_base = src_vc_address_y; +-+ last_y8_p->p.w += bw; +-+ last_y8_p->p.mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->p.mymx21); +-+ last_y8_p->p.wo2 = wo; +-+ +-+ s->last_y8_p = NULL; +-+ s->last_y8_lx = NULL; +-+ start_x = bw; +-+#if RPI_TSTATS +-+ ++s->tstats.y_pred1_y8_merge; +-+#endif +-+ } +-+#endif +++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab +++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 +++.endm + + +-+ for (; start_x < nPbW; start_x += 16) +-+ { +-+ const int bw = FFMIN(nPbW - start_x, 16); +-+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); +-+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; +-+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; +-+#if RPI_TSTATS +-+ { +-+ HEVCRpiStats *const ts = &s->tstats; +-+ if (mx == 0 && my == 0) +-+ ++ts->y_pred1_x0y0; +-+ else if (mx == 0) +-+ ++ts->y_pred1_x0; +-+ else if (my == 0) +-+ ++ts->y_pred1_y0; +-+ else +-+ ++ts->y_pred1_xy; +++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab +++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1 +++.endm + + +-+ if (nPbW > 8) +-+ ++ts->y_pred1_wgt8; +-+ else +-+ ++ts->y_pred1_wle8; + + +-+ if (nPbH > 16) +-+ ++ts->y_pred1_hgt16; +-+ else +-+ ++ts->y_pred1_hle16; +-+ } +-+#endif +-+ cmd_y[-1].next_fn = s->qpu_filter; +-+ cmd_lx->next_src1_x = x1_m3 + start_x; +-+ cmd_lx->next_src1_y = src_yx_y; +-+ cmd_lx->next_src1_base = src_vc_address_y; +-+ if (bw <= 8) +-+ { +-+ cmd_lx->next_src2_x = MC_DUMMY_X; +-+ cmd_lx->next_src2_y = MC_DUMMY_Y; +-+ cmd_lx->next_src2_base = s->qpu_dummy_frame; +-+ } +-+ else +-+ { +-+ cmd_lx->next_src2_x = x1_m3 + start_x + 8; +-+ cmd_lx->next_src2_y = src_yx_y; +-+ cmd_lx->next_src2_base = src_vc_address_y; +-+ } +-+ cmd_y->p.w = bw; +-+ cmd_y->p.h = bh; +-+ cmd_y->p.mymx21 = my2_mx2_my_mx; +-+ cmd_y->p.wo1 = wo; +-+ cmd_y->p.wo2 = wo; +-+ cmd_y->p.dst_addr = dst_addr + start_x; +-+ yp->last_lx = cmd_y; +-+ yp->qpu_mc_curr = cmd_y + 1; +-+ +-+ if (bw == 8) { +-+ s->last_y8_lx = cmd_lx; +-+ s->last_y8_p = cmd_y; +-+ } +-+ } +-+ } +-+ } +-+} +++.macro edge_64b_e0, body_fn, pb +++ mov r6, lr +++ sub r1, #8 +++1: vldm r1, {d7-d16} +++ subs r12, #1 +++ add r1, r3 +++ // load a +++ vext.8 q0, q3, q4, #(16 - \pb) +++ vext.8 q1, q4, q5, #(16 - \pb) +++ vext.8 q2, q5, q6, #(16 - \pb) +++ vext.8 q3, q6, q7, #(16 - \pb) +++ // load b +++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite +++ vext.8 q8, q4, q5, #\pb +++ vext.8 q9, q5, q6, #\pb +++ vext.8 q10, q6, q7, #\pb +++ bl \body_fn +++ vstm r0, {q0-q3} +++ add r0, r0, r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+static void +-+rpi_pred_y_b(HEVCContext * const s, +-+ const int x0, const int y0, +-+ const int nPbW, const int nPbH, +-+ const struct MvField *const mv_field, +-+ AVFrame *const src_frame, +-+ AVFrame *const src_frame2) +-+{ +-+ const unsigned int y_off = rpi_sliced_frame_off_y(s->frame, x0, y0); +-+ const Mv * const mv = mv_field->mv + 0; +-+ const Mv * const mv2 = mv_field->mv + 1; +++.macro edge_32bx2_e0, body_fn, pb +++ mov r6, lr + + +-+// rpi_luma_mc_bi(s, s->frame->data[0] + y_off, s->frame->linesize[0], src_frame, +-+// mv, x0, y0, nPbW, nPbH, +-+// src_frame2, mv2, mv_field); +-+ { +-+ const 
unsigned int mx = mv->x & 3; +-+ const unsigned int my = mv->y & 3; +-+ const unsigned int my_mx = (my<<8) | mx; +-+ const unsigned int mx2 = mv2->x & 3; +-+ const unsigned int my2 = mv2->y & 3; +-+ const unsigned int my2_mx2 = (my2<<8) | mx2; +-+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; +-+ const int x1 = x0 + (mv->x >> 2) - 3; +-+ const int y1 = y0 + (mv->y >> 2) - 3; +-+ const int x2 = x0 + (mv2->x >> 2) - 3; +-+ const int y2 = y0 + (mv2->y >> 2) - 3; +-+ const unsigned int ref_idx0 = mv_field->ref_idx[0]; +-+ const unsigned int ref_idx1 = mv_field->ref_idx[1]; +-+ const uint32_t wt_offset = s->sh.luma_offset_l0[ref_idx0] + +-+ s->sh.luma_offset_l1[ref_idx1] + 1; +-+ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); +-+ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); +-+ +-+ uint32_t dst = get_vc_address_y(s->frame) + y_off; +-+ const uint32_t src1_base = get_vc_address_y(src_frame); +-+ const uint32_t src2_base = get_vc_address_y(src_frame2); +-+ +-+ for (int start_y=0; start_y < nPbH; start_y += Y_B_MAX_H) +-+ { +-+ const unsigned int bh = FFMIN(nPbH - start_y, Y_B_MAX_H); +++1: subs r12, #2 + + +-+ for (int start_x=0; start_x < nPbW; start_x += 8) +-+ { // B blocks work 8 at a time +-+ HEVCRpiLumaPred * const yp = rpi_nxt_pred_y(s, bh + 7); +-+ qpu_mc_pred_y_t *const cmd_lx = yp->last_lx; +-+ qpu_mc_pred_y_t *const cmd_y = yp->qpu_mc_curr; +-+#if RPI_TSTATS +-+ { +-+ HEVCRpiStats *const ts = &s->tstats; +-+ const unsigned int mmx = mx | mx2; +-+ const unsigned int mmy = my | my2; +-+ if (mmx == 0 && mmy == 0) +-+ ++ts->y_pred2_x0y0; +-+ else if (mmx == 0) +-+ ++ts->y_pred2_x0; +-+ else if (mmy == 0) +-+ ++ts->y_pred2_y0; +-+ else +-+ ++ts->y_pred2_xy; +-+ +-+ if (nPbH > 16) +-+ ++ts->y_pred2_hgt16; +-+ else +-+ ++ts->y_pred2_hle16; +-+ } +-+#endif +-+ cmd_y[-1].next_fn = s->qpu_filter_b; +-+ cmd_lx->next_src1_x = x1 + start_x; +-+ cmd_lx->next_src1_y = y1 + start_y; +-+ cmd_lx->next_src1_base = src1_base; +-+ cmd_lx->next_src2_x = x2 + start_x; +-+ cmd_lx->next_src2_y = y2 + start_y; +-+ cmd_lx->next_src2_base = src2_base; +-+ cmd_y->p.w = FFMIN(nPbW - start_x, 8); +-+ cmd_y->p.h = bh; +-+ cmd_y->p.mymx21 = my2_mx2_my_mx; +-+ cmd_y->p.wo1 = wo1; +-+ cmd_y->p.wo2 = wo2; +-+ cmd_y->p.dst_addr = dst + start_x; +-+ yp->last_lx = cmd_y; +-+ yp->qpu_mc_curr = cmd_y + 1; +-+ } +-+ dst += s->frame->linesize[0] * 16; +-+ } +-+ } +-+} +++ vld1.8 {q4-q5}, [r1] +++ sub r1, #\pb +++ vld1.8 {q0-q1}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {q8-q9}, [r1], r3 +++ sub r1, #\pb +++ vld1.8 {q6-q7}, [r1] +++ sub r1, #\pb +++ vld1.8 {q2-q3}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {q10-q11}, [r1], r3 +++ sub r1, #\pb +++ +++ bl \body_fn +++ +++ vst1.8 {q0,q1}, [r0], r2 +++ vst1.8 {q2,q3}, [r0], r2 +++ +++ bgt 1b +++ bx r6 +++.endm + + +++.macro edge_16b_e0, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ sub r3, #\pb * 2 + + +-+static HEVCRpiChromaPred * +-+rpi_nxt_pred_c(HEVCContext *const s, const unsigned int load_val) +-+{ +-+ HEVCRpiChromaPred * cp = s->curr_pred_c; +-+ HEVCRpiChromaPred * cpt = cp + 1; +-+ for (unsigned int i = 1; i != QPU_N_GRP_UV; ++i, ++cpt) { +-+ if (cpt->load < cp->load) +-+ cp = cpt; +-+ } +-+ // Actual use of load_val is noticably better but we haven't sorted Q length problems yet +-+ ++cp->load; +-+// cp->load += load_val; +-+ return cp; +-+} +++1: subs r12, #1 + + +-+static void +-+rpi_pred_c(HEVCContext * const s, const int x0_c, const int y0_c, +-+ const int nPbW_c, const int nPbH_c, +-+ const Mv * const mv, +-+ 
const int16_t * const c_weights, +-+ const int16_t * const c_offsets, +-+ AVFrame * const src_frame) +-+{ +++ vld1.64 {q0}, [r1] @ load a +++ add r1, #\pb +++ vld1.64 {q1}, [r1, :128] @ load c +++ add r1, #\pb +++ vld1.64 {q2}, [r1], r3 @ load b + + +-+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); +-+#if 0 +-+ av_assert0(s->frame->linesize[1] == s->frame->linesize[2]); +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ rpi_chroma_mc_uni(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame->data[1], src_frame->linesize[1], +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv, +-+ c_weights[0], c_offsets[0]); +++.macro edge_8bx2_e0, body_fn, pb +++ mov r6, lr + + +-+ rpi_chroma_mc_uni(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame->data[2], src_frame->linesize[2], +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv, +-+ c_weights[1], c_offsets[1]); +-+#endif +-+ { +-+ const int hshift = s->ps.sps->hshift[1]; +-+ const int vshift = s->ps.sps->vshift[1]; +-+ +-+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +-+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +-+ const uint32_t src_base_u = get_vc_address_u(src_frame); +-+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; +-+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; +-+ const uint32_t wo_u = PACK2(c_offsets[0] * 2 + 1, c_weights[0]); +-+ const uint32_t wo_v = PACK2(c_offsets[1] * 2 + 1, c_weights[1]); +-+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; +-+ +-+ for(int start_y=0;start_y < nPbH_c;start_y+=16) +-+ { +-+ const int bh = FFMIN(nPbH_c-start_y, 16); +++1: subs r12, #2 + + +-+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) +-+ { +-+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh + 3); +-+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; +-+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; +-+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +-+ +-+ u[-1].next_fn = s->qpu_filter_uv; +-+ last_l0->next_src_x = x1_c + start_x; +-+ last_l0->next_src_y = y1_c + start_y; +-+ last_l0->next_src_base_c = src_base_u; +-+ u[0].p.h = bh; +-+ u[0].p.w = bw; +-+ u[0].p.coeffs_x = x_coeffs; +-+ u[0].p.coeffs_y = y_coeffs; +-+ u[0].p.wo_u = wo_u; +-+ u[0].p.wo_v = wo_v; +-+ u[0].p.dst_addr_c = dst_base_u + start_x * 2; +-+ cp->last_l0 = u; +-+ cp->qpu_mc_curr = u + 1; +-+ } +++ vld1.8 {d2}, [r1, :64] +++ sub r1, #\pb +++ vld1.8 {d0}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {d4}, [r1], r3 +++ sub r1, #\pb +++ vld1.8 {d3}, [r1, :64] +++ sub r1, #\pb +++ vld1.8 {d1}, [r1] +++ add r1, #(\pb * 2) +++ vld1.8 {d5}, [r1], r3 +++ sub r1, #\pb +++ +++ bl \body_fn +++ +++ vst1.8 {d0}, [r0, :64], r2 +++ vst1.8 {d1}, [r0, :64], r2 +++ +++ bgt 1b +++ bx r6 +++.endm + + +-+ dst_base_u += s->frame->linesize[1] * 16; +-+ } +-+ } +-+ return; +-+} +++.macro edge_4bx4_e0, body_fn, pb +++ mov r6, lr + + +-+static void +-+rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, +-+ const int nPbW_c, const int nPbH_c, +-+ const struct MvField * const mv_field, +-+ const int16_t * const c_weights, +-+ const int16_t * const c_offsets, +-+ const int16_t * const c_weights2, +-+ const int16_t * const c_offsets2, +-+ AVFrame * const src_frame, +-+ AVFrame * const src_frame2) +-+{ +-+ const unsigned int c_off = rpi_sliced_frame_off_c(s->frame, x0_c, y0_c); +-+#if 0 +-+ rpi_chroma_mc_bi(s, s->frame->data[1] + c_off, s->frame->linesize[1], src_frame, src_frame2, +-+ x0_c, 
y0_c, nPbW_c, nPbH_c, mv_field, 0); +++1: subs r12, #4 + + +-+ rpi_chroma_mc_bi(s, s->frame->data[2] + c_off, s->frame->linesize[2], src_frame, src_frame2, +-+ x0_c, y0_c, nPbW_c, nPbH_c, mv_field, 1); +-+#endif +-+ { +-+ const int hshift = s->ps.sps->hshift[1]; +-+ const int vshift = s->ps.sps->vshift[1]; +-+ const Mv * const mv = mv_field->mv + 0; +-+ const Mv * const mv2 = mv_field->mv + 1; +-+ +-+ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); +-+ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); +-+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; +-+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector +-+ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +-+ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +-+ +-+ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); +-+ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); +-+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; +-+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector +-+ +-+ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; +-+ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; +-+ +-+ uint32_t dst_base_u = get_vc_address_u(s->frame) + c_off; +-+ +-+ for (int start_y = 0; start_y < nPbH_c; start_y += 16) { +-+ const unsigned int bh = FFMIN(nPbH_c-start_y, 16); +-+ +-+ // We are allowed 3/4 powers of two as well as powers of 2 +-+ av_assert2(bh == 16 || bh == 12 || bh == 8 || bh == 6 || bh == 4 || bh == 2); +-+ +-+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) { +-+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +-+ +-+ HEVCRpiChromaPred * const cp = rpi_nxt_pred_c(s, bh * 2 + 3); +-+ qpu_mc_pred_c_t * const u = cp->qpu_mc_curr; +-+ qpu_mc_pred_c_t * const last_l0 = cp->last_l0; +-+ qpu_mc_pred_c_t * const last_l1 = cp->last_l1; +-+ +-+ u[-1].next_fn = s->qpu_filter_uv_b0; +-+ last_l0->next_src_x = x1_c + start_x; +-+ last_l0->next_src_y = y1_c + start_y; +-+ last_l0->next_src_base_c = get_vc_address_u(src_frame); +-+ +-+ u[0].next_fn = 0; // Ignored - 2 block cmd +-+ u[0].next_src_x = x2_c + start_x; +-+ u[0].next_src_y = y2_c + start_y; +-+ u[0].next_src_base_c = get_vc_address_u(src_frame2); +-+ +-+ u[0].b0.h = (bh<16 ? 
bh : 16); +-+ u[0].b0.w = (bwnext_src_x = x2_c + start_x; +-+ last_l1->next_src_y = y2_c + start_y; +-+ last_l1->next_src_base_c = get_vc_address_u(src_frame2); +-+ +-+ u[1].b1.dummy0 = 0; // w,h inherited from b0 +-+ u[1].b1.coeffs_x = coefs1_x; +-+ u[1].b1.coeffs_y = coefs1_y; +-+ u[1].b1.wo_u = PACK2(c_offsets[0] + c_offsets2[0] + 1, c_weights2[0]); +-+ u[1].b1.wo_v = PACK2(c_offsets[1] + c_offsets2[1] + 1, c_weights2[1]); +-+ u[1].b1.dst_addr_c = dst_base_u + start_x * 2; +-+ +-+ cp->last_l0 = u; +-+ cp->last_l1 = u + 1; +-+ cp->qpu_mc_curr = u + 2; +-+ } +-+ +-+ dst_base_u += s->frame->linesize[1] * 16; +-+ } +-+ } +-+} +-+#endif + + + + +-+static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, +-+ const int nPbW, const int nPbH, +-+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) +- { +- #define POS(c_idx, x, y) \ +- &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ +- (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] +-- HEVCLocalContext *lc = s->HEVClc; +-+ HEVCLocalContext * const lc = s->HEVClc; +- int merge_idx = 0; +- struct MvField current_mv = {{{ 0 }}}; +- +-@@ -1718,8 +2701,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int y_cb = y0 >> log2_min_cb_size; +- int x_pu, y_pu; +- int i, j; +-- +-- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); +-+ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); +- +- if (!skip_flag) +- lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); +-@@ -1763,12 +2745,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW_c = nPbW >> s->ps.sps->hshift[1]; +- int nPbH_c = nPbH >> s->ps.sps->vshift[1]; +- +-- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, +-+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], +-+ ref0->frame); +-+ } else +-+#endif +-+ { +-+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, +- &current_mv.mv[0], x0, y0, nPbW, nPbH, +- s->sh.luma_weight_l0[current_mv.ref_idx[0]], +- 
s->sh.luma_offset_l0[current_mv.ref_idx[0]]); +-+ } +- +- if (s->ps.sps->chroma_format_idc) { +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, +-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +-+ ref0->frame); +-+ return; +-+ } +-+#endif +- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], +- 0, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, +- s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); +-@@ -1782,12 +2781,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW_c = nPbW >> s->ps.sps->hshift[1]; +- int nPbH_c = nPbH >> s->ps.sps->vshift[1]; +- +-- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, +-+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], +-+ ref1->frame); +-+ } else +-+#endif +-+ { +-+ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, +- &current_mv.mv[1], x0, y0, nPbW, nPbH, +- s->sh.luma_weight_l1[current_mv.ref_idx[1]], +- s->sh.luma_offset_l1[current_mv.ref_idx[1]]); +-+ } +- +- if (s->ps.sps->chroma_format_idc) { +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_c(s, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, +-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +-+ ref1->frame); +-+ return; +-+ } +-+#endif +- chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], +- 1, x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, +- s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); +-@@ -1802,11 +2818,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, +- int nPbW_c = nPbW >> s->ps.sps->hshift[1]; +- int nPbH_c = nPbH >> s->ps.sps->vshift[1]; +- +-- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame); +-+ } else +-+#endif +-+ { +-+ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, +- &current_mv.mv[0], x0, y0, nPbW, nPbH, +- ref1->frame, &current_mv.mv[1], &current_mv); +-+ } +- +- if (s->ps.sps->chroma_format_idc) { +-+#if RPI_INTER +-+ if (s->enable_rpi) { +-+ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, +-+ &current_mv, +-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], +-+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], +-+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +-+ ref0->frame, +-+ ref1->frame); +-+ return; +-+ } +-+#endif +- chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, +- x0_c, y0_c, nPbW_c, nPbH_c, &current_mv, 0); +- +-@@ -2081,7 +3117,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) +- intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); +- ret = hls_pcm_sample(s, x0, y0, log2_cb_size); +- if (s->ps.sps->pcm.loop_filter_disable_flag) +-+ { +- set_deblocking_bypass(s, x0, y0, log2_cb_size); +-+ } +- +- if (ret < 0) +- return ret; +-@@ -2304,6 +3342,529 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, +- lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]]));
} +- +-+#ifdef RPI +-+static void rpi_execute_dblk_cmds(HEVCContext *s) +-+{ +-+ int n; +-+ int job = s->pass1_job; +-+ int ctb_size = 1 << s->ps.sps->log2_ctb_size; +-+ int (*p)[2] = s->dblk_cmds[job]; +-+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,p++) { +-+ ff_hevc_hls_filters(s, (*p)[0], (*p)[1], ctb_size); +-+ } +-+ s->num_dblk_cmds[job] = 0; +-+} +++.macro edge_32bx2_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {q0-q1}, [r1, :128], r3 +++ vld1.8 {q4-q5}, [r1, :128], r3 + + +-+#if 0 +-+static void rpi_execute_transform(HEVCContext *s) +-+{ +-+ int i=2; +-+ int job = s->pass1_job; +-+ /*int j; +-+ int16_t *coeffs = s->coeffs_buf_arm[job][i]; +-+ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { +-+ s->hevcdsp.idct[4-2](coeffs, 16); +-+ } +-+ i=3; +-+ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; +-+ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { +-+ s->hevcdsp.idct[5-2](coeffs, 32); +-+ }*/ +++1: subs r12, #2 +++ @ Given the data duplication here we could obviously do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vmov q2, q4 +++ vmov q3, q5 +++ vld1.8 {q8-q9}, [r1, :128], r3 +++ vld1.8 {q10-q11}, [r1, :128], r3 +++ vmov q6, q8 +++ vmov q7, q9 + + +-+ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], +-+ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +-+ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); +-+ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); +-+ //gpu_cache_flush(&s->coeffs_buf_accelerated); +-+ //vpu_wait(s->vpu_id); +++ bl \body_fn + + +-+ for(i=0;i<4;i++) +-+ s->num_coeffs[job][i] = 0; +-+} +-+#endif +++ vst1.8 {q0,q1}, [r0], r2 +++ vst1.8 {q2,q3}, [r0], r2 + + +++ // copy c to a +++ vmov.64 q0, q8 +++ vmov.64 q1, q9 + + +-+// I-pred, transform_and_add for all blocks types done here +-+// All ARM +-+static void rpi_execute_pred_cmds(HEVCContext * const s) +-+{ +-+ int i; +-+ int job = s->pass1_job; +-+ const HEVCPredCmd *cmd = s->univ_pred_cmds[job]; +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra *lc = &s->HEVClcIntra; +-+#else +-+ HEVCLocalContext *lc = s->HEVClc; +-+#endif +++ // copy b to c +++ vmov.64 q4, q10 +++ vmov.64 q5, q11 +++ bgt 1b +++ bx r6 +++.endm + + +-+ for(i = s->num_pred_cmds[job]; i > 0; i--, cmd++) { +-+// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +++.macro edge_16b_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {q0}, [r1, :128], r3 +++ // load c +++ vld1.8 {q1}, [r1, :128], r3 +++1: subs r12, #1 +++ // load b +++ vld1.8 {q2}, [r1, :128], r3 +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ // copy c to a +++ vmov.64 q0, q1 +++ // copy b to c +++ vmov.64 q1, q2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ switch (cmd->type) +-+ { +-+ case RPI_PRED_INTRA: +-+ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; +-+ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; +-+ lc->na.cand_left = (cmd->na >> 3) & 1; +-+ lc->na.cand_up_left = (cmd->na >> 2) & 1; +-+ lc->na.cand_up = (cmd->na >> 1) & 1; +-+ lc->na.cand_up_right = (cmd->na >> 0) & 1; +-+ if (!rpi_sliced_frame(s->frame) || cmd->c_idx == 0) +-+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +-+ else +-+ 
s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +-+ break; +++.macro edge_8bx2_e1, body_fn +++ mov r6, lr +++ sub r1, r3 +++ // load a +++ vld1.8 {d0}, [r1, :64], r3 +++ vld1.8 {d2}, [r1, :64], r3 + + +-+ case RPI_PRED_ADD_RESIDUAL: +-+ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+#ifdef RPI_PRECLEAR +-+ memset(cmd->buf, 0, sizeof(int16_t) << (cmd->size * 2)); // Clear coefficients here while they are in the cache +-+#endif +-+ break; +-+ case RPI_PRED_ADD_RESIDUAL_U: +-+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+ break; +-+ case RPI_PRED_ADD_RESIDUAL_V: +-+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +-+ break; +++1: subs r12, #2 +++ @ Given the data duplication here we could obviously do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vmov.64 d1, d2 +++ vld1.8 {d4}, [r1, :64], r3 +++ vld1.8 {d5}, [r1, :64], r3 +++ vmov.64 d3, d4 + + +-+ case RPI_PRED_I_PCM: +-+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); +-+ break; +++ bl \body_fn + + +-+ default: +-+ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); +-+ abort(); +-+ } +-+ } +-+ s->num_pred_cmds[job] = 0; +-+} +++ vst1.8 {d0}, [r0], r2 +++ vst1.8 {d1}, [r0], r2 + + +-+// Do any inter-pred that we want to do in software +-+// With both RPI_INTER_QPU && RPI_LUMA_QPU defined we should do nothing here +-+// All ARM +-+static void do_yc_inter_cmds(HEVCContext * const s, const HEVCMvCmd *cmd, unsigned int n, const int b_only) +-+{ +-+ unsigned int cidx; +-+ AVFrame myref; +-+ AVFrame myref1; +-+ struct MvField mymv; +++ // copy c to a +++ vmov.64 d0, d4 +++ // copy b to c +++ vmov.64 d2, d5 +++ bgt 1b +++ bx r6 +++.endm + + +-+ for(; n>0 ; n--, cmd++) { +-+ av_assert0(0); +++.macro edge_4bx4_e1, body_fn +++ mov r6, lr +++debug_me: +++ sub r1, r3 +++ // load a +++ vld1.32 {d0[0]}, [r1], r3 +++ vld1.32 {d0[1]}, [r1], r3 + + +-+ switch(cmd->cmd) { +-+ case RPI_CMD_LUMA_UNI: +-+ if (b_only) +-+ break; +-+ myref.data[0] = cmd->src; +-+ myref.linesize[0] = cmd->srcstride; +-+ luma_mc_uni(s, cmd->dst, cmd->dststride, &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, cmd->weight, cmd->offset); +-+ break; +-+ case RPI_CMD_LUMA_BI: +-+ myref.data[0] = cmd->src; +-+ myref.linesize[0] = cmd->srcstride; +-+ myref1.data[0] = cmd->src1; +-+ myref1.linesize[0] = cmd->srcstride1; +-+ mymv.ref_idx[0] = cmd->ref_idx[0]; +-+ mymv.ref_idx[1] = cmd->ref_idx[1]; +-+ luma_mc_bi(s, cmd->dst, cmd->dststride, +-+ &myref, &cmd->mv, cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, +-+ &myref1, &cmd->mv1, &mymv); +-+ break; +-+ case RPI_CMD_CHROMA_UNI: +-+ if (b_only) +-+ break; +-+ mymv.mv[0] = cmd->mv; +-+ chroma_mc_uni(s, cmd->dst, +-+ cmd->dststride, cmd->src, cmd->srcstride, 0, +-+ cmd->x_off, cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cmd->weight, cmd->offset); +-+ break; +-+ case RPI_CMD_CHROMA_BI: +-+ case RPI_CMD_CHROMA_BI+1: +-+ cidx = cmd->cmd - RPI_CMD_CHROMA_BI; +-+ myref.data[cidx+1] = cmd->src; +-+ myref.linesize[cidx+1] = cmd->srcstride; +-+ myref1.data[cidx+1] = cmd->src1; +-+ myref1.linesize[cidx+1] = cmd->srcstride1; +-+ mymv.ref_idx[0] = cmd->ref_idx[0]; +-+ mymv.ref_idx[1] = cmd->ref_idx[1]; +-+ mymv.mv[0] = cmd->mv; +-+ mymv.mv[1] = cmd->mv1; +-+ chroma_mc_bi(s, cmd->dst, cmd->dststride, &myref, &myref1, +-+ cmd->x_off, 
cmd->y_off, cmd->block_w, cmd->block_h, &mymv, cidx); +-+ break; +-+ } +-+ } +-+} +++1: subs r12, #4 +++ @ Given the data duplication here we could probably do better than +++ @ using the generic body_fn but it almost certainly isn't worth it +++ vld1.32 {d4[0]}, [r1], r3 +++ vld1.32 {d4[1]}, [r1], r3 +++ vld1.32 {d5[0]}, [r1], r3 +++ vld1.32 {d5[1]}, [r1], r3 +++ +++ vmov.32 d1, d4 +++ vext.32 d2, d0, d4, #1 +++ vext.32 d3, d4, d5, #1 +++ +++ bl \body_fn +++ +++ vst1.32 {d0[0]}, [r0], r2 +++ vst1.32 {d0[1]}, [r0], r2 +++ vst1.32 {d1[0]}, [r0], r2 +++ vst1.32 {d1[1]}, [r0], r2 +++ +++ vmov.32 d0, d5 +++ bgt 1b +++ bx r6 +++.endm + + +-+static void rpi_execute_inter_cmds(HEVCContext *s, const int qpu_luma, const int qpu_chroma, const int luma_b_only, const int chroma_b_only) +-+{ +-+ const int job = s->pass1_job; +++.macro edge_64b_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #32 +++ sub r3, #(32 - \pb) + + +-+ if (!qpu_luma || luma_b_only) +-+ do_yc_inter_cmds(s, s->unif_mv_cmds_y[job], s->num_mv_cmds_y[job], qpu_luma); +-+ s->num_mv_cmds_y[job] = 0; +-+ if (!qpu_chroma || chroma_b_only) +-+ do_yc_inter_cmds(s, s->unif_mv_cmds_c[job], s->num_mv_cmds_c[job], qpu_chroma); +-+ s->num_mv_cmds_c[job] = 0; +-+} +++1: sub r1, r3 +++ // load a +++ // TODO: fix unaligned load +++ // don't reload a like in eo1 +++ vld1.8 {q0-q1}, [r1]! +++ vld1.8 {q2-q3}, [r1], r3 +++ subs r12, #1 +++ // load c +++ vld1.8 {q4-q5}, [r1, :128]! +++ vld1.8 {q6-q7}, [r1, :128], r3 +++ // load b +++ vld1.8 {q8-q9}, [r1]! +++ vld1.8 {q10-q11}, [r1] +++ sub r1, #(64 + \pb) +++ bl \body_fn +++ vstm r0, {q0-q3} +++ add r0, r0, r2 +++ bgt 1b +++ +++ add r3, #(32 - \pb) +++ bx r6 +++.endm + + +-+#endif +++.macro edge_32bx2_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ vld1.8 {q0-q1}, [r1], r3 +++ vld1.8 {q2-q3}, [r1] +++ subs r12, #2 +++ // load c +++ add r1, #\pb +++ vld1.8 {q4-q5}, [r1, :128], r3 +++ vld1.8 {q6-q7}, [r1, :128] +++ // load b +++ add r1, #\pb +++ vld1.8 {q8-q9}, [r1], r3 +++ vld1.8 {q10-q11}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.8 {q0-q1}, [r0], r2 +++ vst1.8 {q2-q3}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+#ifdef RPI +-+// Set initial uniform job values & zero ctu_count +-+static void rpi_begin(HEVCContext *s) +-+{ +-+#if RPI_INTER +-+ int job = s->pass0_job; +-+ int i; +++.macro edge_16b_e2, body_fn, pb +++ mov r6, lr +++ add r3, #\pb + + +-+ const uint16_t pic_width_y = s->ps.sps->width; +-+ const uint16_t pic_height_y = s->ps.sps->height; +++1: sub r1, r3 +++ // load a +++ vld1.8 {q0}, [r1], r3 +++ subs r12, #1 +++ // load c +++ vld1.8 {q1}, [r1, :128], r3 +++ // load b +++ vld1.8 {q2}, [r1] +++ sub r1, #\pb +++ bl \body_fn +++ vst1.8 {q0}, [r0], r2 +++ bgt 1b +++ bx r6 +++.endm + + +-+ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; +-+ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; +++.macro edge_8bx2_e2, body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ vld1.8 {d0}, [r1], r3 +++ vld1.8 {d1}, [r1] +++ subs r12, #2 +++ // load c +++ add r1, #\pb +++ vld1.8 {d2}, [r1, :64], r3 +++ vld1.8 {d3}, [r1, :64] +++ // load b +++ add r1, #\pb +++ vld1.8 {d4}, [r1], r3 +++ vld1.8 {d5}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.8 {d0}, [r0], r2 +++ vst1.8 {d1}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+ for(i=0; i < QPU_N_UV;i++) { +-+ HEVCRpiChromaPred * const cp = s->jobs[job].chroma_mvs + i; +-+ qpu_mc_pred_c_t * u = cp->qpu_mc_base; +++.macro edge_4bx4_e2, 
body_fn, pb +++ mov r6, lr +++ sub r1, #\pb +++ +++1: sub r1, r3 +++ @ line 0 {d0[0], -, - } r1 lo +++ vld1.32 {d0[0]}, [r1], r3 +++ subs r12, #4 +++ @ Line 1 {d0[1], d2[0], - } r1 lo +++ vld1.32 {d0[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d2[0]}, [r1], r3 +++ @ Line 2 {d1[0], d2[1], d4[0]} r1 mid +++ vld1.32 {d2[1]}, [r1] +++ sub r1, #\pb +++ vld1.32 {d1[0]}, [r1] +++ add r1, #\pb * 2 +++ vld1.32 {d4[0]}, [r1], r3 +++ @ Line 2 {d1[1], d3[0], d4[1]} r1 hi +++ vld1.32 {d4[1]}, [r1] +++ sub r1, #\pb * 2 +++ vld1.32 {d1[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d3[0]}, [r1], r3 +++ @ Line 3 {-, d3[1], d5[0]} r1 mid +++ vld1.32 {d3[1]}, [r1] +++ add r1, #\pb +++ vld1.32 {d5[0]}, [r1], r3 +++ @ Line 4 {-, -, d5[1]} r1 hi +++ vld1.32 {d5[1]}, [r1] +++ sub r1, #(\pb * 2) +++ +++ bl \body_fn +++ +++ vst1.32 {d0[0]}, [r0], r2 +++ vst1.32 {d0[1]}, [r0], r2 +++ vst1.32 {d1[0]}, [r0], r2 +++ vst1.32 {d1[1]}, [r0], r2 +++ bgt 1b +++ +++ bx r6 +++.endm + + +-+ // Chroma setup is a double block with L0 fetch +-+ // and other stuff in the 1st block and L1 fetch +-+ // in the 2nd along with a lot of dummy vars +-+ // This could be packed a lot tighter but it would make +-+ // L0, L1 management a lot harder +++.macro edge_64b_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_64b_e2 \body_fn, (-\pb) +++.endm + + +-+ u->next_fn = 0; +-+ u->next_src_x = 0; +-+ u->next_src_y = 0; +-+ u->next_src_base_c = 0; +-+ u->s0.pic_cw = pic_width_c; +-+ u->s0.pic_ch = pic_height_c; +-+ u->s0.stride2 = rpi_sliced_frame_stride2(s->frame); +-+ u->s0.stride1 = s->frame->linesize[1]; +-+ u->s0.wdenom = s->sh.chroma_log2_weight_denom + 6; +-+ u->s0.dummy0 = 0; +-+ cp->last_l0 = u; +-+ ++u; +++.macro edge_32bx2_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_32bx2_e2 \body_fn, (-\pb) +++.endm + + +-+ u->next_fn = 0; +-+ u->next_src_x = 0; +-+ u->next_src_y = 0; +-+ u->next_src_base_c = 0; +-+ u->s1.dummy0 = 0; +-+ u->s1.dummy1 = 0; +-+ u->s1.dummy2 = 0; +-+ u->s1.dummy3 = 0; +-+ u->s1.dummy4 = 0; +-+ u->s1.dummy5 = 0; +-+ cp->last_l1 = u; +-+ ++u; +-+ +-+ cp->load = 0; +-+ cp->qpu_mc_curr = u; +-+ } +-+ s->curr_pred_c = NULL; +-+ +-+ for(i=0;i < QPU_N_Y;i++) { +-+ HEVCRpiLumaPred * const yp = s->jobs[job].luma_mvs + i; +-+ qpu_mc_pred_y_t * y = yp->qpu_mc_base; +-+ +-+ y->next_src1_x = 0; +-+ y->next_src1_y = 0; +-+ y->next_src1_base = 0; +-+ y->next_src2_x = 0; +-+ y->next_src2_y = 0; +-+ y->next_src2_base = 0; +-+ y->s.pic_h = pic_height_y; +-+ y->s.pic_w = pic_width_y; +-+ y->s.stride2 = rpi_sliced_frame_stride2(s->frame); +-+ y->s.stride1 = s->frame->linesize[0]; +-+ y->s.wdenom = s->sh.luma_log2_weight_denom + 6; +-+ y->s.dummy0 = 0; +-+ y->next_fn = 0; +-+ yp->last_lx = y; +-+ ++y; +++.macro edge_16b_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_16b_e2 \body_fn, (-\pb) +++.endm + + +-+ yp->load = 0; +-+ yp->qpu_mc_curr = y; +-+ } +-+ s->curr_pred_y = NULL; +-+ s->last_y8_p = NULL; +-+ s->last_y8_lx = NULL; +-+#endif +-+ s->ctu_count = 0; +-+} +-+#endif +++.macro edge_8bx2_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_8bx2_e2 \body_fn, (-\pb) +++.endm + + +++.macro edge_4bx4_e3, body_fn, pb +++ @ e3 is the same as e2 but with the X offset reversed +++ edge_4bx4_e2 \body_fn, (-\pb) +++.endm + + +-+#if RPI_INTER +-+static unsigned int mc_terminate_y(HEVCContext * const s, const int job) +-+{ +-+ unsigned int i; +-+ const uint32_t exit_fn = qpu_fn(mc_exit); +-+ const uint32_t exit_fn2 = 
qpu_fn(mc_interrupt_exit12); +-+ unsigned int tc = 0; +-+ HEVCRpiJob * const jb = s->jobs + job; +++.macro edge_64b_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ // Add final commands to Q +-+ for(i = 0; i != QPU_N_Y; ++i) { +-+ HEVCRpiLumaPred * const yp = jb->luma_mvs + i; +-+ qpu_mc_pred_y_t *const px = yp->qpu_mc_curr - 1; // *** yp->last_lx; +++0: edge_64b_e0 \body_fn, \pb +++10: edge_64b_e1 \body_fn +++20: edge_64b_e2 \body_fn, \pb +++30: edge_64b_e3 \body_fn, \pb +++.endm + + +-+ // We will always have had L0 if we have L1 so only test L0 +-+ if (px != yp->qpu_mc_base) +-+ tc = 1; +++.macro edge_32bx2_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ yp->qpu_mc_curr[-1].next_fn = (i != QPU_N_Y - 1) ? exit_fn : exit_fn2; // Actual fn ptr +++0: edge_32bx2_e0 \body_fn, \pb +++10: edge_32bx2_e1 \body_fn +++20: edge_32bx2_e2 \body_fn, \pb +++30: edge_32bx2_e3 \body_fn, \pb +++.endm + + +-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +-+ px->next_src1_x = MC_DUMMY_X; +-+ px->next_src1_y = MC_DUMMY_Y; +-+ px->next_src1_base = s->qpu_dummy_frame; +-+ px->next_src2_x = MC_DUMMY_X; +-+ px->next_src2_y = MC_DUMMY_Y; +-+ px->next_src2_base = s->qpu_dummy_frame; +-+ +-+ yp->last_lx = NULL; +-+ } +++.macro edge_16b_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f + + +-+ return tc; +-+} +++0: edge_16b_e0 \body_fn, \pb +++10: edge_16b_e1 \body_fn +++20: edge_16b_e2 \body_fn, \pb +++30: edge_16b_e3 \body_fn, \pb +++.endm + + +-+#define MC_EXIT_FN_C2(n) mc_interrupt_exit ## n ## c +-+#define MC_EXIT_FN_C(n) MC_EXIT_FN_C2(n) +++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_32bx2_e0 \body_fn_64b, \pb +++10: edge_32bx2_e1 \body_fn_64b +++20: edge_32bx2_e2 \body_fn_64b, \pb +++30: edge_32bx2_e3 \body_fn_64b, \pb +++5: edge_16b_e0 \body_fn_16b, \pb +++15: edge_16b_e1 \body_fn_16b +++25: edge_16b_e2 \body_fn_16b, \pb +++35: edge_16b_e3 \body_fn_16b, \pb +++.endm + + +-+static unsigned int mc_terminate_uv(HEVCContext * const s, const int job) +-+{ +-+ unsigned int i; +-+ const uint32_t exit_fn = qpu_fn(mc_exit_c); +-+ const uint32_t exit_fn2 = qpu_fn(MC_EXIT_FN_C(QPU_N_UV)); +-+ unsigned int tc = 0; +-+ HEVCRpiJob * const jb = s->jobs + job; +++.macro edge_16b_8bx2_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_16b_e0 \body_fn, \pb +++10: edge_16b_e1 \body_fn +++20: edge_16b_e2 \body_fn, \pb +++30: edge_16b_e3 \body_fn, \pb +++5: edge_8bx2_e0 \body_fn, \pb +++15: edge_8bx2_e1 \body_fn +++25: edge_8bx2_e2 \body_fn, \pb +++35: edge_8bx2_e3 \body_fn, \pb +++.endm + + +-+ // Add final commands to Q +-+ for(i = 0; i != QPU_N_UV; ++i) { +-+ HEVCRpiChromaPred * const cp = jb->chroma_mvs + i; +-+ qpu_mc_pred_c_t *const p0 = cp->last_l0; +-+ qpu_mc_pred_c_t *const p1 = cp->last_l1; +++.macro edge_8bx2_4bx4_bodies, body_fn, pb +++ .word 0f +++ .word 10f +++ .word 20f +++ .word 30f +++ .word 5f +++ .word 15f +++ .word 25f +++ .word 35f +++ +++0: edge_8bx2_e0 \body_fn, \pb +++10: edge_8bx2_e1 \body_fn +++20: edge_8bx2_e2 \body_fn, \pb +++30: edge_8bx2_e3 \body_fn, \pb +++5: edge_4bx4_e0 \body_fn, \pb +++15: edge_4bx4_e1 \body_fn +++25: edge_4bx4_e2 \body_fn, \pb +++35: edge_4bx4_e3 \body_fn, \pb +++.endm + + +-+ // We will always have had L0 if we 
have L1 so only test L0 +-+ if (p0 != cp->qpu_mc_base) +-+ tc = 1; +++@ void ff_hevc_sao_edge_8_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_8_neon_8, export=1 +++ edge_16b_init 8, 0, 1, 99f +++99: +++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 +++endfunc + + +-+ cp->qpu_mc_curr[-1].next_fn = (i != QPU_N_UV - 1) ? exit_fn : exit_fn2; // Actual fn ptr +++@ void ff_hevc_sao_edge_16_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_16_neon_8, export=1 +++ edge_16b_init 8, 0, 0, 99f +++99: +++ edge_16b_bodies edge_16b_body_8, 1 +++endfunc + + +-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +-+ p0->next_src_x = MC_DUMMY_X; +-+ p0->next_src_y = MC_DUMMY_Y; +-+ p0->next_src_base_c = s->qpu_dummy_frame; +-+ p1->next_src_x = MC_DUMMY_X; +-+ p1->next_src_y = MC_DUMMY_Y; +-+ p1->next_src_base_c = s->qpu_dummy_frame;; +-+ +-+ cp->last_l0 = NULL; +-+ cp->last_l1 = NULL; +-+ } +++@ void ff_hevc_sao_edge_32_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_32_neon_8, export=1 +++ edge_64b_init 8, 0, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_8, 1 +++endfunc + + +-+ return tc; +-+} +-+#endif +++@ void ff_hevc_sao_edge_64_neon_8( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_64_neon_8, export=1 +++ edge_64b_init 8, 0, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_8, 1 +++endfunc + + +-+#ifdef RPI +++@ ff_hevc_sao_edge_c_8_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_8_neon_8, export=1 +++ edge_16b_init 8, 1, 1, 99f +++99: +++ edge_16b_8bx2_bodies edge_16b_body_8, 2 +++endfunc + + +++@ ff_hevc_sao_edge_c_16_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_16_neon_8, export=1 +++ edge_64b_init 8, 1, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_8, 2 +++endfunc + + +-+static void flush_frame(HEVCContext *s,AVFrame *frame) +-+{ +-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ rpi_cache_flush_finish(rfe); +-+} +++@ ff_hevc_sao_edge_c_32_neon_8( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_32_neon_8, export=1 +++ edge_64b_init 8, 1, 0, 99f +++99: 
+++ edge_64b_bodies edge_64b_body_8, 2 +++endfunc + + +++@ void ff_hevc_sao_edge_8_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_8_neon_10, export=1 +++ edge_16b_init 10, 0, 1, 99f +++99: +++ edge_16b_8bx2_bodies edge_16b_body_16, 2 +++endfunc + + +-+// Core execution tasks +-+static void worker_core(HEVCContext * const s) +-+{ +-+ worker_global_env_t * const wg = &worker_global_env; +-+ int arm_cost = 0; +-+// vpu_qpu_wait_h sync_c; +-+ vpu_qpu_wait_h sync_y; +-+ int qpu_luma = 0; +-+ int qpu_chroma = 0; +-+ int gpu_load; +-+ int arm_load; +-+ static const int arm_const_cost = 2; +++@ void ff_hevc_sao_edge_16_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_16_neon_10, export=1 +++ edge_64b_init 10, 0, 0, 99f +++99: +++ edge_32bx2_bodies edge_64b_body_16, 2 +++endfunc + + +-+// static int z = 0; +++@ void ff_hevc_sao_edge_64_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++@ We simply split the 32 case into 2 vertical stripes +++@ and call the fns for w32 +++@ +++@ Calling code will always have src != dst so we don't have to worry +++@ about edge effects + + +-+ const int job = s->pass1_job; +-+ unsigned int flush_start = 0; +-+ unsigned int flush_count = 0; +++function ff_hevc_sao_edge_64_neon_10, export=1 +++ edge_64b_init 10, 0, 1, 99f +++endfunc + + +-+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++@ void ff_hevc_sao_edge_32_neon_10( +++@ uint8_t *_dst, [r0] +++@ uint8_t *_src, [r1] +++@ int stride_dst, [r2] +++@ int16_t *_sao_offset_val, [r3] +++@ int eo, [sp, #0] +++@ int width, [sp, #4] +++@ int height) [sp, #8] +++ +++function ff_hevc_sao_edge_32_neon_10, export=1 +++ edge_64b_init 10, 0, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_16, 2 +++endfunc + + +-+ if (s->num_coeffs[job][3] + s->num_coeffs[job][2] != 0) { +-+ vpu_qpu_job_add_vpu(vqj, +-+ vpu_get_fn(), +-+ vpu_get_constants(), +-+ s->coeffs_buf_vc[job][2], +-+ s->num_coeffs[job][2] >> 8, +-+ s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +-+ s->num_coeffs[job][3] >> 10, +-+ 0); +-+ +-+ rpi_cache_flush_add_gm_ptr(rfe, s->coeffs_buf_accelerated + job, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+ } +++@ ff_hevc_sao_edge_c_8_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_8_neon_10, export=1 +++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 +++99: +++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 +++endfunc + + +++@ ff_hevc_sao_edge_c_32_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_32_neon_10, export=1 +++ 
edge_64b_init 10, 1, 1, 99f +++endfunc + + +-+#if RPI_INTER +-+ pthread_mutex_lock(&wg->lock); +-+ +-+// ++z; +-+ gpu_load = vpu_qpu_current_load(); +-+ arm_load = avpriv_atomic_int_get(&wg->arm_load); +-+#if 0 // Y_B_ONLY +-+ qpu_luma = gpu_load + 2 < arm_load; +-+ qpu_chroma = gpu_load < arm_load + 8; +-+#elif 0 +-+ qpu_luma = gpu_load < arm_load + 2; +-+ qpu_chroma = gpu_load < arm_load + 8; +-+#else +-+ qpu_chroma = 1; +-+ qpu_luma = 1; +-+#endif + + +-+ arm_cost = !qpu_chroma * 2 + !qpu_luma * 3; +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, arm_cost + arm_const_cost); +++@ ff_hevc_sao_edge_c_16_neon_10( +++@ uint8_t *_dst, [r0] +++@ const uint8_t *_src, [r1] +++@ ptrdiff_t stride_dst, [r2] +++@ const int16_t *_sao_offset_val_u, [r3] +++@ const int16_t *_sao_offset_val_v, [sp, #0] +++@ int eo, [sp, #4] +++@ int width, [sp, #8] +++@ int height) [sp, #12] +++ +++function ff_hevc_sao_edge_c_16_neon_10, export=1 +++ edge_64b_init 10, 1, 0, 99f +++99: +++ edge_64b_bodies edge_64b_body_16, 4 +++endfunc + + +-+ wg->gpu_c += qpu_chroma; +-+ wg->gpu_y += qpu_luma; +-+ wg->arm_c += !qpu_chroma; +-+ wg->arm_y += !qpu_luma; ++diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h ++index 1be52e7a12..bae5df4bc6 100644 ++--- a/libavcodec/avcodec.h +++++ b/libavcodec/avcodec.h ++@@ -410,6 +410,8 @@ enum AVCodecID { ++ AV_CODEC_ID_SHEERVIDEO, ++ AV_CODEC_ID_YLC, ++ +++ AV_CODEC_ID_H264_MVC, + + ++ /* various PCM "codecs" */ ++ AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs ++ AV_CODEC_ID_PCM_S16LE = 0x10000, ++@@ -3205,6 +3207,9 @@ typedef struct AVCodecContext { ++ #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244 ++ #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA) ++ #define FF_PROFILE_H264_CAVLC_444 44 +++#define FF_PROFILE_H264_MULTIVIEW_HIGH 118 +++#define FF_PROFILE_H264_STEREO_HIGH 128 +++#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138 ++ ++ #define FF_PROFILE_VC1_SIMPLE 0 ++ #define FF_PROFILE_VC1_MAIN 1 ++@@ -3515,6 +3520,13 @@ typedef struct AVCodecContext { ++ #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 ++ #endif ++ +++ /** +++ * Opaque pointer for use by replacement get_buffer2 code +++ * +++ * @author jc (08/02/2016) +++ */ +++ void * get_buffer_context; + + +-+// if ((z & 511) == 0) { +-+// printf("Arm load=%d, GPU=%d, chroma=%d/%d, luma=%d/%d \n", arm_load, gpu_load, wg->gpu_c, wg->arm_c, wg->gpu_y, wg->arm_y); +-+// } ++ } AVCodecContext; ++ ++ AVRational av_codec_get_pkt_timebase (const AVCodecContext *avctx); ++diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h ++index 1bf1c620d6..ccfa991f60 100644 ++--- a/libavcodec/cabac.h +++++ b/libavcodec/cabac.h ++@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; ++ typedef struct CABACContext{ ++ int low; ++ int range; ++- int outstanding_count; +++ union +++ { +++ int outstanding_count; +++ struct { +++ uint16_t bits; +++ uint16_t range; +++ } by22; +++ }; ++ const uint8_t *bytestream_start; ++ const uint8_t *bytestream; ++ const uint8_t *bytestream_end; ++diff --git a/libavcodec/hevc.c b/libavcodec/hevc.c ++index c1fa67f67b..6f99021339 100644 ++--- a/libavcodec/hevc.c +++++ b/libavcodec/hevc.c ++@@ -41,8 +41,346 @@ ++ #include "hevc.h" ++ #include "profiles.h" ++ +++#ifdef RPI +++ #include "rpi_qpu.h" +++ #include "rpi_shader.h" +++ #include "rpi_shader_cmd.h" +++ #include "rpi_shader_template.h" +++ #include "rpi_zc.h" +++ #include "libavutil/rpi_sand_fns.h" + + +++ // Define RPI_CACHE_UNIF_MVS to write motion vector uniform stream to 
cached memory +++ #define RPI_CACHE_UNIF_MVS 1 + + +-+ { +-+ int (*d)[2] = s->dblk_cmds[job]; +-+ unsigned int high=(*d)[1]; +-+ int n; +-+ +-+ flush_start = high; +-+ for(n = s->num_dblk_cmds[job]; n>0 ;n--,d++) { +-+ unsigned int y = (*d)[1]; +-+ flush_start = FFMIN(flush_start, y); +-+ high=FFMAX(high,y); +-+ } +-+ // Avoid flushing past end of frame +-+ flush_count = FFMIN(high + (1 << s->ps.sps->log2_ctb_size), s->frame->height) - flush_start; +-+ } +++ #include "pthread.h" +++ #include "libavutil/atomic.h" + + +-+#if !DISABLE_CHROMA +-+ if (qpu_chroma && mc_terminate_uv(s, job) != 0) +-+ { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+ const uint32_t code = qpu_fn(mc_setup_c); +-+ uint32_t * p; +-+ unsigned int i; +-+ uint32_t mail_uv[QPU_N_UV * QPU_MAIL_EL_VALS]; +++ static void worker_core(HEVCContext * const s); +++#endif + + +-+ for (p = mail_uv, i = 0; i != QPU_N_UV; ++i) { +-+ *p++ = jb->chroma_mvs_gptr.vc + ((uint8_t *)jb->chroma_mvs[i].qpu_mc_base - jb->chroma_mvs_gptr.arm); +-+ *p++ = code; +-+ } +++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards + + +-+ vpu_qpu_job_add_qpu(vqj, QPU_N_UV, 2, mail_uv); +++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) + + +-+#if RPI_CACHE_UNIF_MVS +-+ rpi_cache_flush_add_gm_ptr(rfe, &jb->chroma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+#endif +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ flush_start, flush_count, s->ps.sps->vshift[1], 0, 1); +-+ } +++#ifndef av_mod_uintp2 +++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) +++{ +++ return a & ((1 << p) - 1); +++} +++# define av_mod_uintp2 av_mod_uintp2_c + +#endif + + +-+// We can take a sync here and try to locally overlap QPU processing with ARM +-+// but testing showed a slightly negative benefit with noticable extra complexity +-+// vpu_qpu_job_add_sync_this(vqj, &sync_c); ++ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ + + +-+ if (qpu_luma && mc_terminate_y(s, job) != 0) +-+ { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+ const uint32_t code = qpu_fn(mc_setup); +-+ uint32_t * p; +-+ unsigned int i; +-+ uint32_t mail_y[QPU_N_Y * QPU_MAIL_EL_VALS]; +++#if RPI_INTER + + +-+ for (p = mail_y, i = 0; i != QPU_N_Y; ++i) { +-+ *p++ = jb->luma_mvs_gptr.vc + ((uint8_t *)jb->luma_mvs[i].qpu_mc_base - jb->luma_mvs_gptr.arm); +-+ *p++ = code; +-+ } +++#define MC_DUMMY_X (-32) +++#define MC_DUMMY_Y (-32) + + +-+ vpu_qpu_job_add_qpu(vqj, QPU_N_Y, 4, mail_y); +++// UV still has min 4x4 pred +++// Allow for even spread +1 for setup, +1 for rounding +++// As we have load sharing this can (in theory) be exceeded so we have to +++// check after each CTU, but it is a good base size + + +-+#if RPI_CACHE_UNIF_MVS +-+ rpi_cache_flush_add_gm_ptr(rfe, &jb->luma_mvs_gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +-+#endif +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ flush_start, flush_count, s->ps.sps->vshift[1], 1, 0); +-+ } +++// Worst case (all 4x4) commands per CTU +++#define QPU_Y_CMD_PER_CTU_MAX (8 * 8) +++#define QPU_C_CMD_PER_CTU_MAX (4 * 4) + + +-+ pthread_mutex_unlock(&wg->lock); +++#define QPU_C_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) / 4 + 2 * QPU_N_MAX) +++#define QPU_Y_COMMANDS (((RPI_MAX_WIDTH * 64) / (4 * 4)) + 2 * QPU_N_MAX) + + +-+#endif +++// The QPU code for UV blocks only works up to a block width of 8 +++#define RPI_CHROMA_BLOCK_WIDTH 8 + + +-+ 
vpu_qpu_job_add_sync_this(vqj, &sync_y); +++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) + + +-+ // Having accumulated some commands - do them +-+ rpi_cache_flush_finish(rfe); +-+ vpu_qpu_job_finish(vqj); + + +-+ memset(s->num_coeffs[job], 0, sizeof(s->num_coeffs[job])); //???? Surely we haven't done the smaller +++// Actual filter goes -ve, +ve, +ve, -ve using these values +++static const uint32_t rpi_filter_coefs[8] = { +++ ENCODE_COEFFS( 0, 64, 0, 0), +++ ENCODE_COEFFS( 2, 58, 10, 2), +++ ENCODE_COEFFS( 4, 54, 16, 2), +++ ENCODE_COEFFS( 6, 46, 28, 4), +++ ENCODE_COEFFS( 4, 36, 36, 4), +++ ENCODE_COEFFS( 4, 28, 46, 6), +++ ENCODE_COEFFS( 2, 16, 54, 4), +++ ENCODE_COEFFS( 2, 10, 58, 2) +++}; + + +-+#if Y_B_ONLY +-+ if (qpu_luma) +-+ vpu_qpu_wait(&sync_y); +-+#endif +-+ // Perform inter prediction +-+ rpi_execute_inter_cmds(s, qpu_luma, qpu_chroma, Y_B_ONLY, 0); +++// Function arrays by QPU + + +-+ // Wait for transform completion +++static const int * const inter_pred_setup_c_qpu[12] = { +++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, +++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, +++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn +++}; + + +-+ // Perform intra prediction and residual reconstruction +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_cost); +-+#if Y_B_ONLY +-+ if (!qpu_luma) +-+ vpu_qpu_wait(&sync_y); +-+#else +-+ vpu_qpu_wait(&sync_y); +-+#endif +-+ rpi_execute_pred_cmds(s); +++static const int * const inter_pred_setup_c10_qpu[12] = { +++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, +++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, +++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn +++}; + + +-+ // Perform deblocking for CTBs in this row +-+ rpi_execute_dblk_cmds(s); +++static const int * const inter_pred_setup_y_qpu[12] = { +++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, +++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, +++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn +++}; + + +-+ avpriv_atomic_int_add_and_fetch(&wg->arm_load, -arm_const_cost); +-+} +++static const int * const inter_pred_setup_y10_qpu[12] = { +++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, +++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, +++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn +++}; + + +-+static void rpi_do_all_passes(HEVCContext *s) +-+{ +-+ // Do the various passes - common with the worker code +-+ worker_core(s); +-+ // Prepare next batch +-+ rpi_begin(s); +-+} +++static const int * const inter_pred_sync_qpu[12] = { +++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, +++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, +++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 +++}; + + +++static const int * const inter_pred_sync10_qpu[12] = { +++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, +++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, +++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 +++}; + + +++static const int * const inter_pred_exit_c_qpu[12] = { +++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, +++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, +++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn +++}; + + +-+#endif +++static const int * const inter_pred_exit_c10_qpu[12] = { +++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, 
mc_exit_c10_qn, +++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, +++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn +++}; + + +- static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- { +- HEVCContext *s = avctxt->priv_data; +-@@ -2313,6 +3874,18 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- int y_ctb = 0; +- int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; +- +-+#ifdef RPI +-+ s->enable_rpi = s->ps.sps->bit_depth == 8 && +-+ s->frame->format == AV_PIX_FMT_SAND128 && +-+ !s->ps.pps->cross_component_prediction_enabled_flag; +++static const int * const inter_pred_exit_y_qpu[12] = { +++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, +++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, +++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn +++}; + + +-+ if (!s->enable_rpi) { +-+ if (s->ps.pps->cross_component_prediction_enabled_flag) +-+ printf("Cross component\n"); +-+ } +-+#endif +-+ //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); +++static const int * const inter_pred_exit_y10_qpu[12] = { +++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, +++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, +++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn +++}; + + +- if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { +- av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); +- return AVERROR_INVALIDDATA; +-@@ -2326,6 +3899,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- } +- } +- +-+#ifdef RPI_WORKER +-+ s->pass0_job = 0; +-+ s->pass1_job = 0; +-+#endif +-+#ifdef RPI +-+ rpi_begin(s); +-+#endif +++typedef struct ipe_chan_info_s +++{ +++ const unsigned int n; +++ const int * const * setup_fns; +++ const int * const * sync_fns; +++ const int * const * exit_fns; +++} ipe_chan_info_t; + + +- while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { +- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +- +-@@ -2333,6 +3914,7 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; +- hls_decode_neighbour(s, x_ctb, y_ctb, ctb_addr_ts); +- +++typedef struct ipe_init_info_s +++{ +++ ipe_chan_info_t luma; +++ ipe_chan_info_t chroma; +++} ipe_init_info_t; +++ +++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 +++ { // 8 +++ .luma = {QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, +++ .chroma = {QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} +++ }, +++ { // 9 +++ .luma = {0}, +++ .chroma = {0} +++ }, +++ { // 10 +++ .luma = {QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, +++ .chroma = {QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} +++ } + + +- ff_hevc_cabac_init(s, ctb_addr_ts); +- +- hls_sao_param(s, x_ctb >> s->ps.sps->log2_ctb_size, y_ctb >> s->ps.sps->log2_ctb_size); +-@@ -2341,7 +3923,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) +- s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; +- s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; +- +-+#if RPI_INTER +-+ s->curr_pred_c = s->jobs[s->pass0_job].chroma_mvs + (s->ctu_count * QPU_N_GRP_UV) % 
QPU_N_UV; +-+ s->curr_pred_y = s->jobs[s->pass0_job].luma_mvs + (s->ctu_count * QPU_N_GRP_Y) % QPU_N_Y; +-+#endif +++}; + + +- more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); +++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) +++{ +++ const unsigned int n = ici->n; +++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word +++ +++ ipe->n = n; +++ ipe->max_fill = q1_size - ipe->min_gap; +++ for(unsigned int i = 0; i < n; i++) { +++ HEVCRpiInterPredQ * const q = ipe->q + i; +++ q->qpu_mc_curr = q->qpu_mc_base = +++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); +++ q->code_setup = qpu_fn(ici->setup_fns[i]); +++ q->code_sync = qpu_fn(ici->sync_fns[i]); +++ q->code_exit = qpu_fn(ici->exit_fns[i]); +++ } +++} + + +-+#ifdef RPI +-+ if (s->enable_rpi) { +-+ //av_assert0(s->num_dblk_cmds[s->pass0_job]>=0); +-+ //av_assert0(s->num_dblk_cmds[s->pass0_job]<RPI_MAX_DEBLOCK_CMDS); +-+ //av_assert0(s->pass0_job<RPI_MAX_JOBS); +-+ //av_assert0(s->pass0_job>=0); +-+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]][0] = x_ctb; +-+ s->dblk_cmds[s->pass0_job][s->num_dblk_cmds[s->pass0_job]++][1] = y_ctb; +-+ s->ctu_count++; +-+ +-+ if ( s->ctu_count >= s->max_ctu_count ) { +-+#ifdef RPI_WORKER +-+ if (s->used_for_ref) +-+ { +-+// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); +++static void rpi_hevc_qpu_set_fns(HEVCContext * const s, const unsigned int bit_depth) +++{ +++ const ipe_init_info_t * const iii = ipe_init_infos + bit_depth - 8; + + +-+// worker_wait(s); +-+ // Split work load onto separate threads so we make as rapid progress as possible with this frame +-+ // Pass on this job to worker thread +-+ worker_submit_job(s); +++ av_assert0(bit_depth >= 8 && bit_depth <= 16); + + +-+ // Make sure we have space to prepare the next job +-+ worker_pass0_ready(s); +++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); +++ +++ for (unsigned int i = 0; i != RPI_MAX_JOBS; ++i) { +++ HEVCRpiJob *const jb = s->jobs + i; +++ set_ipe_from_ici(&jb->chroma_ip, &iii->chroma); +++ set_ipe_from_ici(&jb->luma_ip, &iii->luma); +++ } +++} + + +-+ // Prepare the next batch of commands +-+ rpi_begin(s); +-+ } else { +-+ // Non-ref frame so do it all on this thread +-+ rpi_do_all_passes(s); +-+ } +-+#else +-+ rpi_do_all_passes(s); +-+#endif +-+ } + + +-+ } + +#endif + + + + +- if (more_data < 0) { +- s->tab_slice_address[ctb_addr_rs] = -1; +- return more_data; +-@@ -2350,9 +3977,42 @@ +- +- ctb_addr_ts++; +- ff_hevc_save_states(s, ctb_addr_ts); +-+#ifdef RPI +-+ if (s->enable_rpi) +-+ continue; +-+#endif +- ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); +- } +- + +#ifdef RPI + + +-+#ifdef RPI_WORKER +-+ // Wait for the worker to finish all its jobs +-+ if (s->enable_rpi) { +-+ worker_wait(s); +-+ } +-+#endif + + +-+ // Finish off any half-completed rows +-+ if (s->enable_rpi && s->ctu_count) { +-+ rpi_do_all_passes(s); +-+ } + + +-+#if RPI_TSTATS +-+ { +-+ HEVCRpiStats *const ts = &s->tstats; + + +-+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d
h16gl:%5d/%5d\n", +-+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, +-+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, +-+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, +-+ ts->y_pred2_hgt16, ts->y_pred2_hle16); +-+ memset(ts, 0, sizeof(*ts)); +-+ } +-+#endif +++// Call this when we have completed pass0 and wish to trigger pass1 for the current job +++static void worker_submit_job(HEVCContext * const s) +++{ +++ LOG_ENTER +++ sem_post(&s->jb0->sem_in); +++ s->jb0->pending = 1; +++ s->pass0_job = (s->pass0_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +++ s->jb0 = s->jobs + s->pass0_job; +++ LOG_EXIT +++} + + +-+#endif +++// Call this to say we have completed pass1 +++static void worker_complete_job(HEVCContext * const s) +++{ +++ LOG_ENTER +++ sem_t * const sem = &s->jb1->sem_out; +++ // Must set job no before signalling as otherwise rpi_do_all_passes +++ // may call worker_core from the main thread with a bad job number +++ s->pass1_job = (s->pass1_job + 1) % RPI_MAX_JOBS; // Move onto the next slot +++ s->jb1 = s->jobs + s->pass1_job; +++ sem_post(sem); +++ LOG_EXIT +++} + + +- if (x_ctb + ctb_size >= s->ps.sps->width && +- y_ctb + ctb_size >= s->ps.sps->height) +- ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); +-@@ -2387,6 +4047,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int +- s = s1->sList[self_id]; +- lc = s->HEVClc; +- +-+#ifdef RPI +-+ s->enable_rpi = 0; +-+ //printf("Wavefront\n"); +-+#endif + + +- if(ctb_row) { +- ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); +- +-@@ -2767,6 +4432,32 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) +- if (ret < 0) +- return ret; +- +-+ // The definition of _N unit types is "non-reference for other frames +-+ // with the same temporal_id" so they may/will be ref frames for pics +-+ // with a higher temporal_id. +-+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || +-+ !(s->nal_unit_type == NAL_TRAIL_N || +-+ s->nal_unit_type == NAL_TSA_N || +-+ s->nal_unit_type == NAL_STSA_N || +-+ s->nal_unit_type == NAL_RADL_N || +-+ s->nal_unit_type == NAL_RASL_N); +++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes +++// available to receive the next job. 
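+++// (A minimal sketch, read off the helpers above, of the two-semaphore
+++// ring they implement; jb0/jb1, sem_in/sem_out and RPI_MAX_JOBS as in
+++// this file:
+++//   pass0 (main):   fill jobs[p0]; sem_post(&jb0->sem_in); advance p0;
+++//                   if (jb0->pending) sem_wait(&jb0->sem_out);   // i.e. worker_pass0_ready below
+++//   pass1 (worker): sem_wait(&jb1->sem_in); worker_core(s);
+++//                   advance p1 first, then sem_post the finished job's sem_out;
+++// so at most RPI_MAX_JOBS jobs are ever in flight between the passes.)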
+++static void worker_pass0_ready(HEVCContext *s) +++{ +++ LOG_ENTER +++ HEVCRpiJob * const jb = s->jb0; +++ if (jb->pending) { +++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) +++ /* Loop */; +++ jb->pending = 0; +++ } +++ LOG_EXIT +++} + + +-+#if DEBUG_DECODE_N +-+ { +-+ static int z = 0; +-+ if (IS_IDR(s)) { +-+ z = 1; +-+ } +-+ if (z != 0 && z++ > DEBUG_DECODE_N) { +-+ s->is_decoded = 0; +-+ break; +-+ } +-+ } +-+#endif +-+ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { +-+ s->is_decoded = 0; +-+ break; +++// Call this to wait for all jobs to have completed at the end of a frame +++static void worker_wait(HEVCContext * const s) +++{ +++ LOG_ENTER +++ unsigned int i; +++ for (i = 0; i != RPI_MAX_JOBS; ++i) { +++ HEVCRpiJob * const jb = s->jobs + i; +++ if (jb->pending) { +++ while (sem_wait(&jb->sem_out) == -1 && errno == EINTR) +++ /* Loop */; +++ jb->pending = 0; + + } +- if (s->max_ra == INT_MAX) { +- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +- s->max_ra = s->poc; +-@@ -2890,10 +4581,19 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) +- } +- } +- +--fail: +-- if (s->ref && s->threads_type == FF_THREAD_FRAME) +-+fail: // Also success path +-+ if (s->ref && s->threads_type == FF_THREAD_FRAME) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +-+#endif +- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +-- +-+ } +-+#if RPI_INTER +-+ else if (s->ref && s->enable_rpi) { +-+ // When running single threaded we need to flush the whole frame +-+ flush_frame(s,s->frame); + + } +-+#endif +- return ret; +- } +- +-@@ -3064,6 +4764,41 @@ fail: +- return AVERROR(ENOMEM); +- } +- +-+#ifdef RPI_WORKER +-+static av_cold void hevc_init_worker(HEVCContext *s) +++ LOG_EXIT +++} +++ +++static void *worker_start(void *arg) + +{ +-+ int err; +-+ pthread_cond_init(&s->worker_cond_head, NULL); +-+ pthread_cond_init(&s->worker_cond_tail, NULL); +-+ pthread_mutex_init(&s->worker_mutex, NULL); +++ HEVCContext * const s = (HEVCContext *)arg; + + +-+ s->worker_tail=0; +-+ s->worker_head=0; +-+ s->kill_worker=0; +-+ err = pthread_create(&s->worker_thread, NULL, worker_start, s); +-+ if (err) { +-+ printf("Failed to create worker thread\n"); +-+ exit(-1); +++ for (;;) +++ { +++ HEVCRpiJob * const jb = s->jb1; +++ while (sem_wait(&jb->sem_in) == -1 && errno == EINTR) +++ /* Loop */; +++ if (jb->terminate) +++ break; +++ +++ LOG_ENTER +++ worker_core(s); +++ worker_complete_job(s); +++ LOG_EXIT + + } +++ return NULL; + +} + + +-+static av_cold void hevc_exit_worker(HEVCContext *s) +++static void worker_pic_free_all(HEVCContext * const s) + +{ +-+ void *res; +-+ s->kill_worker=1; +-+ pthread_cond_broadcast(&s->worker_cond_tail); +-+ pthread_join(s->worker_thread, &res); +++ unsigned int i; + + +-+ pthread_cond_destroy(&s->worker_cond_head); +-+ pthread_cond_destroy(&s->worker_cond_tail); +-+ pthread_mutex_destroy(&s->worker_mutex); +++ // Free coeff stuff - allocation not the same for all buffers +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ { +++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; + + +-+ s->worker_tail=0; +-+ s->worker_head=0; +-+ s->kill_worker=0; +++ if (cf->s[0].buf != NULL) +++ av_freep(&cf->mptr); +++ if (cf->s[2].buf != NULL) +++ gpu_free(&cf->gptr); +++ memset(cf, 0, sizeof(*cf)); +++ } + +} +-+#endif + + +- static av_cold int hevc_decode_free(AVCodecContext *avctx) +- { +- HEVCContext *s = avctx->priv_data; +-@@ -3075,6 +4810,29 @@ static av_cold int hevc_decode_free(AVCodecContext 
*avctx) +- +- av_freep(&s->cabac_state); +- +-+#ifdef RPI +++static int worker_pic_alloc_all(HEVCContext * const s, const unsigned int coeff_count) +++{ +++ unsigned int i; + + +-+#ifdef RPI_WORKER +-+ hevc_exit_worker(s); +-+#endif +++ // Free coeff stuff - allocation not the same for all buffers +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ { +++ HEVCRpiCoeffsEnv * const cf = &s->jobs[i].coeffs; + + +-+ for(i=0;is[0].n == 0 && cf->s[0].buf == NULL); +++// av_assert0(cf->s[1].n == 0 && cf->s[1].buf == NULL); +++// av_assert0(cf->s[2].n == 0 && cf->s[2].buf == NULL); +++// av_assert0(cf->s[3].n == 0 && cf->s[3].buf == NULL); + + +-+ av_freep(&s->unif_mv_cmds_y[i]); +-+ av_freep(&s->unif_mv_cmds_c[i]); +-+ av_freep(&s->univ_pred_cmds[i]); +++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) +++ goto fail; +++ cf->s[2].buf = (int16_t *)cf->gptr.arm; +++ cf->s[3].buf = cf->s[2].buf + coeff_count; + + +-+#if RPI_INTER +-+ gpu_free(&s->jobs[i].chroma_mvs_gptr); +-+ gpu_free(&s->jobs[i].luma_mvs_gptr); +-+#endif +++ // Must be 64 byte aligned for our zero apping code so over-allocate & +++ // round +++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) +++ goto fail; +++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); + + } +++ return 0; + + +-+ vpu_qpu_term(); +++fail: +++ printf("%s: **** Failed\n", __func__); +++ worker_pic_free_all(s); +++ return -1; +++} + + +-+ av_rpi_zc_uninit(avctx); +-+#endif +-+ +- for (i = 0; i < 3; i++) { +- av_freep(&s->sao_pixel_buffer_h[i]); +- av_freep(&s->sao_pixel_buffer_v[i]); +-@@ -3116,10 +4874,25 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) +- return 0; +- } +- +-+#ifdef RPI +-+#ifdef RPI_PRECLEAR +-+static av_cold void memclear16(int16_t *p, int n) +++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) + +{ +-+ vpu_execute_code( vpu_get_fn(), p, n, 0, 0, 0, 1); +-+ //int i; +-+ //for(i=0;is[i].n = 0; +++ } + +} + +#endif +-+#endif + + +- static av_cold int hevc_init_context(AVCodecContext *avctx) +++ ++ /** ++ * NOTE: Each function hls_foo correspond to the function foo in the ++ * specification (HLS stands for High Level Syntax). ++@@ -55,6 +393,23 @@ const uint8_t ff_hevc_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12 ++ /* free everything allocated by pic_arrays_init() */ ++ static void pic_arrays_free(HEVCContext *s) + { +- HEVCContext *s = avctx->priv_data; +- int i; + +#ifdef RPI +-+ unsigned int job; +++ worker_pic_free_all(s); + +#endif +++ +++#ifdef RPI_DEBLOCK_VPU +++ { +++ int i; +++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) { +++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +++ +++ if (dvq->vpu_cmds_arm) { +++ gpu_free(&dvq->deblock_vpu_gmem); +++ dvq->vpu_cmds_arm = 0; +++ } +++ } +++ } +++#endif ++ av_freep(&s->sao); ++ av_freep(&s->deblock); + +- s->avctx = avctx; +- +-@@ -3129,6 +4902,77 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) +- s->HEVClcList[0] = s->HEVClc; +- s->sList[0] = s; ++@@ -91,6 +446,74 @@ static int pic_arrays_init(HEVCContext *s, const HEVCSPS *sps) ++ int ctb_count = sps->ctb_width * sps->ctb_height; ++ int min_pu_size = sps->min_pu_width * sps->min_pu_height; + + +#ifdef RPI +-+ // Whilst FFmpegs init fn is only called once the close fn is called as +-+ // many times as we have threads (init_thread_copy is called for the +-+ // threads). 
So to match init & term put the init here where it will be +-+ // called by both init & copy +-+ av_rpi_zc_init(avctx); +++ const int coefs_in_ctb = (1 << sps->log2_ctb_size) * (1 << sps->log2_ctb_size); +++ const int coefs_per_luma = 64*64*RPI_CHUNK_SIZE*RPI_NUM_CHUNKS; +++ const int coefs_per_chroma = (coefs_per_luma * 2) >> sps->vshift[1] >> sps->hshift[1]; +++ const int coefs_per_row = coefs_per_luma + coefs_per_chroma; + + +-+ if (vpu_qpu_init() != 0) +++ av_assert0(sps); +++ s->max_ctu_count = coefs_per_luma / coefs_in_ctb; +++#if RPI_ROUND_TO_LINES +++ // Round down to an integral quantity of lines +++ if (s->max_ctu_count > sps->ctb_width) +++ s->max_ctu_count -= s->max_ctu_count % sps->ctb_width; +++#endif +++ +++ if (worker_pic_alloc_all(s, coefs_per_row) != 0) + + goto fail; +++#endif +++#ifdef RPI_DEBLOCK_VPU +++ { +++ int i; +++ s->enable_rpi_deblock = !sps->sao_enabled; +++ s->setup_width = (sps->width+15) / 16; +++ s->setup_height = (sps->height+15) / 16; +++ s->uv_setup_width = ( (sps->width >> sps->hshift[1]) + 15) / 16; +++ s->uv_setup_height = ( (sps->height >> sps->vshift[1]) + 15) / 16; + + +-+ for(job = 0; job < RPI_MAX_JOBS; job++) { +-+ s->unif_mv_cmds_y[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_Y); +-+ if (!s->unif_mv_cmds_y[job]) +-+ goto fail; +-+ s->unif_mv_cmds_c[job] = av_mallocz(sizeof(HEVCMvCmd)*RPI_MAX_MV_CMDS_C); +-+ if (!s->unif_mv_cmds_c[job]) +-+ goto fail; +-+ s->univ_pred_cmds[job] = av_mallocz(sizeof(HEVCPredCmd)*RPI_MAX_PRED_CMDS); +-+ if (!s->univ_pred_cmds[job]) +-+ goto fail; +-+ } +++ for (i = 0; i != RPI_DEBLOCK_VPU_Q_COUNT; ++i) +++ { +++ struct dblk_vpu_q_s * const dvq = s->dvq_ents + i; +++ const unsigned int cmd_size = (sizeof(*dvq->vpu_cmds_arm) * 3 + 15) & ~15; +++ const unsigned int y_size = (sizeof(*dvq->y_setup_arm) * s->setup_width * s->setup_height + 15) & ~15; +++ const unsigned int uv_size = (sizeof(*dvq->uv_setup_arm) * s->uv_setup_width * s->uv_setup_height + 15) & ~15; +++ const unsigned int total_size =- cmd_size + y_size + uv_size; +++ int p_vc; +++ uint8_t * p_arm; +++ #if RPI_VPU_DEBLOCK_CACHED +++ gpu_malloc_cached(total_size, &dvq->deblock_vpu_gmem); +++ #else +++ gpu_malloc_uncached(total_size, &dvq->deblock_vpu_gmem); +++ #endif +++ p_vc = dvq->deblock_vpu_gmem.vc; +++ p_arm = dvq->deblock_vpu_gmem.arm; + + +-+#if RPI_INTER +-+ // We divide the image into blocks 256 wide and 64 high +-+ // We support up to 2048 widths +-+ // We compute the number of chroma motion vector commands for 4:4:4 format and 4x4 chroma blocks - assuming all blocks are B predicted +-+ // Also add space for the startup command for each stream. 
+++ // Zap all +++ memset(p_arm, 0, dvq->deblock_vpu_gmem.numbytes); + + +-+ for (job = 0; job < RPI_MAX_JOBS; job++) { +-+ HEVCRpiJob * const jb = s->jobs + job; +-+#if RPI_CACHE_UNIF_MVS +-+ gpu_malloc_cached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); +-+ gpu_malloc_cached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +-+#else +-+ gpu_malloc_uncached(QPU_N_UV * UV_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_c_t), &jb->chroma_mvs_gptr); +-+ gpu_malloc_uncached(QPU_N_Y * Y_COMMANDS_PER_QPU * sizeof(qpu_mc_pred_y_t), &jb->luma_mvs_gptr); +-+#endif +++ // Subdivide +++ dvq->vpu_cmds_arm = (void*)p_arm; +++ dvq->vpu_cmds_vc = p_vc; + + +-+ { +-+ qpu_mc_pred_c_t * p = (qpu_mc_pred_c_t *)jb->chroma_mvs_gptr.arm; +-+ for(i = 0; i < QPU_N_UV; i++) { +-+ jb->chroma_mvs[i].qpu_mc_base = p; +-+ jb->chroma_mvs[i].qpu_mc_curr = p; +-+ p += UV_COMMANDS_PER_QPU; +-+ } +-+ } +-+ { +-+ qpu_mc_pred_y_t * p = (qpu_mc_pred_y_t *)jb->luma_mvs_gptr.arm; +-+ for(i = 0; i < QPU_N_Y; i++) { +-+ jb->luma_mvs[i].qpu_mc_base = p; +-+ jb->luma_mvs[i].qpu_mc_curr = p; +-+ p += Y_COMMANDS_PER_QPU; +-+ } +++ p_arm += cmd_size; +++ p_vc += cmd_size; +++ +++ dvq->y_setup_arm = (void*)p_arm; +++ dvq->y_setup_vc = (void*)p_vc; +++ +++ p_arm += y_size; +++ p_vc += y_size; +++ +++ dvq->uv_setup_arm = (void*)p_arm; +++ dvq->uv_setup_vc = (void*)p_vc; + + } +++ +++ s->dvq_n = 0; +++ s->dvq = s->dvq_ents + s->dvq_n; + + } +-+ s->qpu_filter_uv = qpu_fn(mc_filter_uv); +-+ s->qpu_filter_uv_b0 = qpu_fn(mc_filter_uv_b0); +-+ s->qpu_dummy_frame = qpu_fn(mc_setup_c); // Use our code as a dummy frame +-+ s->qpu_filter = qpu_fn(mc_filter); +-+ s->qpu_filter_b = qpu_fn(mc_filter_b); + +#endif +-+ //gpu_malloc_uncached(2048*64,&s->dummy); + + +-+ s->enable_rpi = 0; ++ s->bs_width = (width >> 2) + 1; ++ s->bs_height = (height >> 2) + 1; ++ ++@@ -137,6 +560,29 @@ fail: ++ return AVERROR(ENOMEM); ++ } ++ +++static void default_pred_weight_table(HEVCContext * const s) +++{ +++ unsigned int i; +++ s->sh.luma_log2_weight_denom = 0; +++ s->sh.chroma_log2_weight_denom = 0; +++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { +++ s->sh.luma_weight_l0[i] = 1; +++ s->sh.luma_offset_l0[i] = 0; +++ s->sh.chroma_weight_l0[i][0] = 1; +++ s->sh.chroma_offset_l0[i][0] = 0; +++ s->sh.chroma_weight_l0[i][1] = 1; +++ s->sh.chroma_offset_l0[i][1] = 0; +++ } +++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { +++ s->sh.luma_weight_l1[i] = 1; +++ s->sh.luma_offset_l1[i] = 0; +++ s->sh.chroma_weight_l1[i][0] = 1; +++ s->sh.chroma_offset_l1[i][0] = 0; +++ s->sh.chroma_weight_l1[i][1] = 1; +++ s->sh.chroma_offset_l1[i][1] = 0; +++ } +++} + + +-+#ifdef RPI_WORKER +-+ hevc_init_worker(s); ++ static void pred_weight_table(HEVCContext *s, GetBitContext *gb) ++ { ++ int i = 0; ++@@ -337,8 +783,8 @@ static void export_stream_params(AVCodecContext *avctx, const HEVCParamSets *ps, ++ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fmt) ++ { ++ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + CONFIG_HEVC_D3D11VA_HWACCEL + CONFIG_HEVC_VAAPI_HWACCEL + CONFIG_HEVC_VDPAU_HWACCEL) ++- enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; ++- int ret, i; +++ enum AVPixelFormat pix_fmts[HWACCEL_MAX + 4], *fmt = pix_fmts; +++ int ret; ++ ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++@@ -356,6 +802,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_YUVJ420P: +++#if RPI_HEVC_SAND +++ // 
Currently geometry calc is stuffed for big sizes +++ if (sps->width < 2048 && sps->height <= 1088) { +++ *fmt++ = AV_PIX_FMT_SAND128; +++ } + +#endif +-+ ++ #if CONFIG_HEVC_DXVA2_HWACCEL ++ *fmt++ = AV_PIX_FMT_DXVA2_VLD; ++ #endif ++@@ -370,6 +822,12 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ++ #endif ++ break; ++ case AV_PIX_FMT_YUV420P10: +++#if RPI_HEVC_SAND +++ // Currently geometry calc is stuffed for big sizes +++ if (sps->width < 2048 && sps->height <= 1088) { +++ *fmt++ = AV_PIX_FMT_SAND64_10; +++ } + +#endif ++ #if CONFIG_HEVC_DXVA2_HWACCEL ++ *fmt++ = AV_PIX_FMT_DXVA2_VLD; ++ #endif ++@@ -386,6 +844,7 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ++ ret = ff_thread_get_format(s->avctx, pix_fmts); ++ if (ret < 0) ++ goto fail; + + +- s->cabac_state = av_malloc(HEVC_CONTEXTS); +- if (!s->cabac_state) +- goto fail; +-@@ -3343,9 +5187,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) ++ s->avctx->pix_fmt = ret; + } ++ else { ++@@ -395,26 +854,36 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps, enum AVPixelFormat pix_fm ++ ff_hevc_pred_init(&s->hpc, sps->bit_depth); ++ ff_hevc_dsp_init (&s->hevcdsp, sps->bit_depth); ++ ff_videodsp_init (&s->vdsp, sps->bit_depth); +++#ifdef RPI +++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); +++#endif + +- if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) +-- s->threads_type = FF_THREAD_FRAME; +-- else +-- s->threads_type = FF_THREAD_SLICE; +-+ s->threads_type = FF_THREAD_FRAME; +-+ else +-+ s->threads_type = FF_THREAD_SLICE; ++- for (i = 0; i < 3; i++) { ++- av_freep(&s->sao_pixel_buffer_h[i]); ++- av_freep(&s->sao_pixel_buffer_v[i]); ++- } +++ av_freep(&s->sao_pixel_buffer_h[0]); +++ av_freep(&s->sao_pixel_buffer_v[0]); + +- return 0; +- } +-@@ -3404,6 +5248,8 @@ AVCodec ff_hevc_decoder = { +- .update_thread_context = hevc_update_thread_context, +- .init_thread_copy = hevc_init_thread_copy, +- .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | +-+// 0, +-+// AV_CODEC_CAP_FRAME_THREADS, +- AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, +- .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), +- }; +-diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h +-index be91010..dd7d152 100644 +---- a/libavcodec/hevc.h +-+++ b/libavcodec/hevc.h +-@@ -23,6 +23,9 @@ +- #ifndef AVCODEC_HEVC_H +- #define AVCODEC_HEVC_H ++ if (sps->sao_enabled && !s->avctx->hwaccel) { ++- int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; ++- int c_idx; +++ const unsigned int c_count = (sps->chroma_format_idc != 0) ? 3 : 1; +++ unsigned int c_idx; +++ size_t vsize[3] = {0}; +++ size_t hsize[3] = {0}; + +-+// define RPI to split the CABAC/prediction/transform into separate stages +-+#include "config.h" ++ for(c_idx = 0; c_idx < c_count; c_idx++) { ++ int w = sps->width >> sps->hshift[c_idx]; ++ int h = sps->height >> sps->vshift[c_idx]; ++- s->sao_pixel_buffer_h[c_idx] = ++- av_malloc((w * 2 * sps->ctb_height) << ++- sps->pixel_shift); ++- s->sao_pixel_buffer_v[c_idx] = ++- av_malloc((h * 2 * sps->ctb_width) << ++- sps->pixel_shift); +++ // ctb height & width are a min of 8 so this must a multiple of 16 +++ // so no point rounding up! 
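+++ // (Worked example with hypothetical numbers: a 1920x1080 8-bit
+++ // stream with 64-pel CTBs has ctb_width = 30, ctb_height = 17 and
+++ // pixel_shift = 0, so below hsize[0] = 1920 * 2 * 17 = 65280 bytes
+++ // and vsize[0] = 1080 * 2 * 30 = 64800 bytes.)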
+++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; +++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; ++ } + + +- #include "libavutil/buffer.h" +- #include "libavutil/md5.h" +++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] +++ // when we have plaited chroma +++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); +++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); +++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; +++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; +++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; +++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; ++ } + +-@@ -37,6 +40,45 @@ +- #include "thread.h" +- #include "videodsp.h" ++ s->ps.sps = sps; ++@@ -680,6 +1149,11 @@ static int hls_slice_header(HEVCContext *s) ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == B_SLICE)) { ++ pred_weight_table(s, gb); ++ } +++ else +++ { +++ // Give us unit weights +++ default_pred_weight_table(s); +++ } + +-+// define RPI to split the CABAC/prediction/transform into separate stages +-+#ifndef RPI +-+ +-+ #define RPI_INTER 0 +-+ #define RPI_TSTATS 0 +-+ #define RPI_HEVC_SAND 0 +-+ +-+#else +-+ +-+ #include "rpi_qpu.h" +-+ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU ++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); ++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { ++@@ -937,6 +1411,39 @@ static int hls_cross_component_pred(HEVCContext *s, int idx) { ++ return 0; ++ } ++ +++#ifdef RPI +++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCContext * const s) +++{ +++ return s->jb0->intra.cmds + s->jb0->intra.n++; +++} + + +-+ // Define RPI_WORKER to launch a worker thread for pixel processing tasks +-+ #define RPI_WORKER +-+ // By passing jobs to a worker thread we hope to be able to catch up during slow frames +-+ // This has no effect unless RPI_WORKER is defined +-+ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as +-+ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one +-+ // free for the foreground to fill in. +-+ #define RPI_MAX_JOBS 2 +++static void rpi_intra_pred(HEVCContext *s, int log2_trafo_size, int x0, int y0, int c_idx) +++{ +++ // U & V done on U call in the case of sliced frames +++ if (av_rpi_is_sand_frame(s->frame) && c_idx > 1) +++ return; + + +-+ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs +-+ // As it stands there is something mildy broken in VPU deblock - looks mostly OK +-+ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) +-+ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +-+// #define RPI_DEBLOCK_VPU +-+ +-+ #define RPI_VPU_DEBLOCK_CACHED 1 +-+ +-+ #if HAVE_NEON +-+ #define RPI_HEVC_SAND 1 +-+ #else +-+ // Sand bust on Pi1 currently - reasons unknown +-+ #define RPI_HEVC_SAND 0 +-+ #endif +++ if (s->enable_rpi) { +++ HEVCLocalContext *lc = s->HEVClc; +++ HEVCPredCmd *cmd = rpi_new_intra_cmd(s); +++ cmd->type = RPI_PRED_INTRA; +++ cmd->size = log2_trafo_size; +++ cmd->na = (lc->na.cand_bottom_left<<4) + (lc->na.cand_left<<3) + (lc->na.cand_up_left<<2) + (lc->na.cand_up<<1) + lc->na.cand_up_right; +++ cmd->c_idx = c_idx; +++ cmd->i_pred.x = x0; +++ cmd->i_pred.y = y0; +++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; +++ } +++ else if (av_rpi_is_sand_frame(s->frame) && c_idx != 0) { +++ s->hpc.intra_pred_c[log2_trafo_size - 2](s, x0, y0, c_idx); +++ } +++ else { +++ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, c_idx); +++ } + + +-+ #define RPI_TSTATS 0 +++} + +#endif + + +- #define MAX_DPB_SIZE 16 // A.4.1 +- #define MAX_REFS 16 ++ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ int xBase, int yBase, int cb_xBase, int cb_yBase, ++ int log2_cb_size, int log2_trafo_size, ++@@ -949,8 +1456,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ int trafo_size = 1 << log2_trafo_size; ++ ff_hevc_set_neighbour_available(s, x0, y0, trafo_size, trafo_size); ++- +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, x0, y0, 0); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, x0, y0, 0); +++#endif ++ } + +-@@ -660,17 +702,6 @@ typedef struct CodingUnit { +- uint8_t cu_transquant_bypass_flag; +- } CodingUnit; ++ if (cbf_luma || cbf_cb[0] || cbf_cr[0] || ++@@ -1036,7 +1546,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 1); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 1); +++#endif ++ } ++ if (cbf_cb[i]) ++ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), ++@@ -1065,7 +1579,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ for (i = 0; i < (s->ps.sps->chroma_format_idc == 2 ? 
2 : 1); i++) { ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (i << log2_trafo_size_c), trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (i << log2_trafo_size_c), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (i << log2_trafo_size_c), 2); +++#endif ++ } ++ if (cbf_cr[i]) ++ ff_hevc_hls_residual_coding(s, x0, y0 + (i << log2_trafo_size_c), ++@@ -1094,7 +1612,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 1); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 1); +++#endif ++ } ++ if (cbf_cb[i]) ++ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), ++@@ -1104,7 +1626,11 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ if (lc->cu.pred_mode == MODE_INTRA) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (i << log2_trafo_size), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (i << log2_trafo_size), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (i << log2_trafo_size), 2); +++#endif ++ } ++ if (cbf_cr[i]) ++ ff_hevc_hls_residual_coding(s, xBase, yBase + (i << log2_trafo_size), ++@@ -1116,26 +1642,46 @@ static int hls_transform_unit(HEVCContext *s, int x0, int y0, ++ int trafo_size_h = 1 << (log2_trafo_size_c + s->ps.sps->hshift[1]); ++ int trafo_size_v = 1 << (log2_trafo_size_c + s->ps.sps->vshift[1]); ++ ff_hevc_set_neighbour_available(s, x0, y0, trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 1); +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0, 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 1); ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0, 2); +++#endif ++ if (s->ps.sps->chroma_format_idc == 2) { ++ ff_hevc_set_neighbour_available(s, x0, y0 + (1 << log2_trafo_size_c), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 1); +++ rpi_intra_pred(s, log2_trafo_size_c, x0, y0 + (1 << log2_trafo_size_c), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 1); ++ s->hpc.intra_pred[log2_trafo_size_c - 2](s, x0, y0 + (1 << log2_trafo_size_c), 2); +++#endif ++ } ++ } else if (blk_idx == 3) { ++ int trafo_size_h = 1 << (log2_trafo_size + 1); ++ int trafo_size_v = 1 << (log2_trafo_size + s->ps.sps->vshift[1]); ++ ff_hevc_set_neighbour_available(s, xBase, yBase, ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 1); +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase, 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 1); ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase, 2); +++#endif ++ if (s->ps.sps->chroma_format_idc == 2) { ++ ff_hevc_set_neighbour_available(s, xBase, yBase + (1 << (log2_trafo_size)), ++ trafo_size_h, trafo_size_v); +++#ifdef RPI +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 1); +++ rpi_intra_pred(s, log2_trafo_size, xBase, yBase + (1 << (log2_trafo_size)), 2); +++#else ++ s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 1); ++ 
s->hpc.intra_pred[log2_trafo_size - 2](s, xBase, yBase + (1 << (log2_trafo_size)), 2); +++#endif ++ } ++ } ++ } ++@@ -1281,47 +1827,119 @@ do { ++ return 0; ++ } + +--typedef struct Mv { +-- int16_t x; ///< horizontal component of motion vector +-- int16_t y; ///< vertical component of motion vector +--} Mv; ++-static int hls_pcm_sample(HEVCContext *s, int x0, int y0, int log2_cb_size) +++ +++static int pcm_extract(HEVCContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) ++ { ++- HEVCLocalContext *lc = s->HEVClc; ++ GetBitContext gb; ++- int cb_size = 1 << log2_cb_size; ++- int stride0 = s->frame->linesize[0]; ++- uint8_t *dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; ++- int stride1 = s->frame->linesize[1]; ++- uint8_t *dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++- int stride2 = s->frame->linesize[2]; ++- uint8_t *dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 >> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; + - +--typedef struct MvField { +-- DECLARE_ALIGNED(4, Mv, mv)[2]; +-- int8_t ref_idx[2]; +-- int8_t pred_flag; +--} MvField; ++- int length = cb_size * cb_size * s->ps.sps->pcm.bit_depth + ++- (((cb_size >> s->ps.sps->hshift[1]) * (cb_size >> s->ps.sps->vshift[1])) + ++- ((cb_size >> s->ps.sps->hshift[2]) * (cb_size >> s->ps.sps->vshift[2]))) * ++- s->ps.sps->pcm.bit_depth_chroma; ++- const uint8_t *pcm = skip_bytes(&lc->cc, (length + 7) >> 3); ++ int ret; ++ ++- if (!s->sh.disable_deblocking_filter_flag) ++- ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); + - +- typedef struct NeighbourAvailable { +- int cand_bottom_left; +- int cand_left; +-@@ -747,7 +778,17 @@ typedef struct HEVCFrame { +- uint8_t flags; +- } HEVCFrame; ++ ret = init_get_bits(&gb, pcm, length); ++ if (ret < 0) ++ return ret; + +-+#ifdef RPI_WORKER +-+typedef struct HEVCLocalContextIntra { +-+ TransformUnit tu; +-+ NeighbourAvailable na; +-+} HEVCLocalContextIntra; +-+#endif ++- s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++- if (s->ps.sps->chroma_format_idc) { ++- s->hevcdsp.put_pcm(dst1, stride1, +++#if RPI_HEVC_SAND +++ if (av_rpi_is_sand_frame(s->frame)) { +++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), +++ s->frame->linesize[0], +++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); + + +- typedef struct HEVCLocalContext { +-+ TransformUnit tu; +-+ NeighbourAvailable na; // WARNING tu and na must be the first two fields to match HEVCLocalContextIntra +++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> s->ps.sps->hshift[1], y0 >> s->ps.sps->vshift[1]), +++ s->frame->linesize[1], ++ cb_size >> s->ps.sps->hshift[1], ++ cb_size >> s->ps.sps->vshift[1], ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++- s->hevcdsp.put_pcm(dst2, stride2, ++- cb_size >> s->ps.sps->hshift[2], ++- cb_size >> s->ps.sps->vshift[2], ++- &gb, s->ps.sps->pcm.bit_depth_chroma); ++ } +++ else +++#endif +++ { +++ const int stride0 = s->frame->linesize[0]; +++ uint8_t * const dst0 = &s->frame->data[0][y0 * stride0 + (x0 << s->ps.sps->pixel_shift)]; +++ const int stride1 = s->frame->linesize[1]; +++ uint8_t * const dst1 = &s->frame->data[1][(y0 >> s->ps.sps->vshift[1]) * stride1 + ((x0 >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +++ const int stride2 = s->frame->linesize[2]; +++ uint8_t * const dst2 = &s->frame->data[2][(y0 >> s->ps.sps->vshift[2]) * stride2 + ((x0 
>> s->ps.sps->hshift[2]) << s->ps.sps->pixel_shift)]; + + +- uint8_t cabac_state[HEVC_CONTEXTS]; +- +- uint8_t stat_coeff[4]; +-@@ -762,7 +803,6 @@ typedef struct HEVCLocalContext { +- +- int qPy_pred; +- +-- TransformUnit tu; +- +- uint8_t ctb_left_flag; +- uint8_t ctb_up_flag; +-@@ -779,7 +819,6 @@ typedef struct HEVCLocalContext { +- int ct_depth; +- CodingUnit cu; +- PredictionUnit pu; +-- NeighbourAvailable na; +++ s->hevcdsp.put_pcm(dst0, stride0, cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); +++ if (s->ps.sps->chroma_format_idc) { +++ s->hevcdsp.put_pcm(dst1, stride1, +++ cb_size >> s->ps.sps->hshift[1], +++ cb_size >> s->ps.sps->vshift[1], +++ &gb, s->ps.sps->pcm.bit_depth_chroma); +++ s->hevcdsp.put_pcm(dst2, stride2, +++ cb_size >> s->ps.sps->hshift[2], +++ cb_size >> s->ps.sps->vshift[2], +++ &gb, s->ps.sps->pcm.bit_depth_chroma); +++ } + +- #define BOUNDARY_LEFT_SLICE (1 << 0) +- #define BOUNDARY_LEFT_TILE (1 << 1) +-@@ -790,6 +829,147 @@ typedef struct HEVCLocalContext { +- int boundary_flags; +- } HEVCLocalContext; +++ } ++ return 0; ++ } + +-+ + +#ifdef RPI +++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n) +++{ +++ HEVCRpiCoeffEnv *const cfe = s->jb0->coeffs.s + buf_no; +++ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n); +++ cfe->n += n; +++ return coeffs; +++} +++#endif + + +-+// The processing is done in chunks +-+// Each chunk corresponds to 24 64x64 luma blocks (24 so it is divisible by 8 for chroma and 12 for luma) +-+// This is a distance of 1536 pixels across the screen +-+// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +-+// but allocate more memory and increase the latency before data in the next frame can be processed +-+#define RPI_NUM_CHUNKS 4 +-+#define RPI_CHUNK_SIZE 12 +-+ +-+// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +-+#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) +++// x * 2^(y*2) +++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) +++{ +++ return x << (y * 2); +++} + + +-+// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi +-+#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) +-+#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +-+// Each block can have an intra prediction and a transform_add command +-+#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +-+// Worst case is 16x16 CTUs +-+#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) +-+ +-+#define RPI_CMD_LUMA_UNI 0 +-+#define RPI_CMD_CHROMA_UNI 1 +-+#define RPI_CMD_LUMA_BI 2 +-+#define RPI_CMD_CHROMA_BI 3 +-+#define RPI_CMD_V_BI 4 +-+ +-+// RPI_PRECLEAR is not working yet - perhaps clearing on VPUs is flawed? 
+-+// #define RPI_PRECLEAR +-+ +-+// Command for inter prediction +-+typedef struct HEVCMvCmd { +-+ uint8_t cmd; +-+ uint8_t block_w; +-+ uint8_t block_h; +-+ int8_t ref_idx[2]; +-+ uint16_t dststride; +-+ uint16_t srcstride; +-+ uint16_t srcstride1; +-+ int16_t weight; +-+ int16_t offset; +-+ int16_t x_off; +-+ int16_t y_off; +-+ uint8_t *src; +-+ uint8_t *src1; +-+ uint8_t *dst; +-+ Mv mv; +-+ Mv mv1; +-+} HEVCMvCmd; +-+ +-+ +-+// Command for intra prediction and transform_add of predictions to coefficients +-+enum rpi_pred_cmd_e +++static int hls_pcm_sample(HEVCContext * const s, const int x0, const int y0, unsigned int log2_cb_size) + +{ +-+ RPI_PRED_ADD_RESIDUAL, +-+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx +-+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx +-+ RPI_PRED_INTRA, +-+ RPI_PRED_I_PCM, +-+ RPI_PRED_CMD_MAX +-+}; +++ // Length in bits +++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + +++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[1]) + +++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - s->ps.sps->vshift[2]); + + +-+typedef struct HEVCPredCmd { +-+ uint8_t type; +-+ uint8_t size; // log2 "size" used by all variants +-+ uint8_t na; // i_pred - but left here as they pack well +-+ uint8_t c_idx; // i_pred +-+ union { +-+ struct { // TRANSFORM_ADD +-+ uint8_t * dst; +-+ const int16_t * buf; +-+ uint32_t stride; +-+ } ta; +-+ struct { // INTRA +-+ uint16_t x; +-+ uint16_t y; +-+ enum IntraPredMode mode; +-+ } i_pred; +-+ struct { // I_PCM +-+ uint16_t x; +-+ uint16_t y; +-+ const void * src; +-+ uint32_t src_len; +-+ } i_pcm; +-+ }; +-+} HEVCPredCmd; +++ const uint8_t * const pcm = skip_bytes(&s->HEVClc->cc, (length + 7) >> 3); + + +-+#endif +++ if (!s->sh.disable_deblocking_filter_flag) +++ ff_hevc_deblocking_boundary_strengths(s, x0, y0, log2_cb_size); + + + +#ifdef RPI +++ if (s->enable_rpi) { +++ // Copy coeffs +++ const int blen = (length + 7) >> 3; +++ // Round allocated bytes up to nearest 32 to avoid alignment confusion +++ // Allocation is in int16_t s +++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per +++ // sample this rounding doesn't affect the total size we need to allocate for +++ // the coeff buffer +++ int16_t * const coeffs = rpi_alloc_coeff_buf(s, 0, ((blen + 31) & ~31) >> 1); +++ memcpy(coeffs, pcm, blen); + + +-+struct qpu_mc_pred_c_s; +-+struct qpu_mc_pred_y_s; +-+ +-+typedef struct HEVCRpiLumaPred +-+{ +-+ struct qpu_mc_pred_y_s *qpu_mc_base; +-+ struct qpu_mc_pred_y_s *qpu_mc_curr; +-+ struct qpu_mc_pred_y_s *last_lx; +-+ unsigned int load; +-+} HEVCRpiLumaPred; +-+ +-+typedef struct HEVCRpiChromaPred +-+{ +-+ struct qpu_mc_pred_c_s *qpu_mc_base; +-+ struct qpu_mc_pred_c_s *qpu_mc_curr; +-+ struct qpu_mc_pred_c_s *last_l0; +-+ struct qpu_mc_pred_c_s *last_l1; +-+ unsigned int load; +-+} HEVCRpiChromaPred; +-+ +-+typedef struct HEVCRpiJob { +-+ GPU_MEM_PTR_T chroma_mvs_gptr; +-+ GPU_MEM_PTR_T luma_mvs_gptr; +-+ HEVCRpiChromaPred chroma_mvs[QPU_N_UV]; +-+ HEVCRpiLumaPred luma_mvs[QPU_N_Y]; +-+} HEVCRpiJob; +++ // Our coeff stash assumes that any partially allocated 64byte lump +++ // is zeroed so make that true. 
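+++ // (Note on the arithmetic below: -(intptr_t)eopcm & 63 is the byte
+++ // distance from eopcm up to the next 64-byte boundary, e.g. an
+++ // address ending in 0x28 gives -(0x28) & 63 = 24, so exactly the
+++ // trailing bytes of the partial 64-byte lump get zeroed.)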
+++ { +++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; +++ if ((-(intptr_t)eopcm & 63) != 0) +++ memset(eopcm, 0, -(intptr_t)eopcm & 63); +++ } + + +-+#if RPI_TSTATS +-+typedef struct HEVCRpiStats { +-+ int y_pred1_y8_merge; +-+ int y_pred1_xy; +-+ int y_pred1_x0; +-+ int y_pred1_y0; +-+ int y_pred1_x0y0; +-+ int y_pred1_wle8; +-+ int y_pred1_wgt8; +-+ int y_pred1_hle16; +-+ int y_pred1_hgt16; +-+ int y_pred2_xy; +-+ int y_pred2_x0; +-+ int y_pred2_y0; +-+ int y_pred2_x0y0; +-+ int y_pred2_hle16; +-+ int y_pred2_hgt16; +-+} HEVCRpiStats; +++ // Add command +++ { +++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(s); +++ cmd->type = RPI_PRED_I_PCM; +++ cmd->size = log2_cb_size; +++ cmd->i_pcm.src = coeffs; +++ cmd->i_pcm.x = x0; +++ cmd->i_pcm.y = y0; +++ cmd->i_pcm.src_len = length; +++ } +++ return 0; +++ } + +#endif + + +-+#endif +++ return pcm_extract(s, pcm, length, x0, y0, 1 << log2_cb_size); +++} + + +- typedef struct HEVCContext { +- const AVClass *c; // needed by private avoptions +- AVCodecContext *avctx; +-@@ -798,13 +978,103 @@ typedef struct HEVCContext { ++ /** ++ * 8.5.3.2.2.1 Luma sample unidirectional interpolation process ++ * ++@@ -1353,6 +1971,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ (s->sh.slice_type == B_SLICE && s->ps.pps->weighted_bipred_flag); ++ int idx = ff_hevc_pel_weight[block_w]; + +- HEVCLocalContext *HEVClcList[MAX_NB_THREADS]; +- HEVCLocalContext *HEVClc; +-- +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra HEVClcIntra; +++#ifdef DISABLE_MC +++ return; + +#endif +- uint8_t threads_type; +- uint8_t threads_number; +- +- int width; +- int height; +- +-+ int used_for_ref; +-+ +-+#ifdef RPI +-+ int enable_rpi; +-+ HEVCMvCmd *unif_mv_cmds_y[RPI_MAX_JOBS]; +-+ HEVCMvCmd *unif_mv_cmds_c[RPI_MAX_JOBS]; +-+ HEVCPredCmd *univ_pred_cmds[RPI_MAX_JOBS]; +-+ int buf_width; +-+ GPU_MEM_PTR_T coeffs_buf_default[RPI_MAX_JOBS]; +-+ GPU_MEM_PTR_T coeffs_buf_accelerated[RPI_MAX_JOBS]; +-+ int16_t *coeffs_buf_arm[RPI_MAX_JOBS][4]; +-+ unsigned int coeffs_buf_vc[RPI_MAX_JOBS][4]; +-+ int num_coeffs[RPI_MAX_JOBS][4]; +-+ int num_xfm_cmds[RPI_MAX_JOBS]; +-+ int num_mv_cmds_y[RPI_MAX_JOBS]; +-+ int num_mv_cmds_c[RPI_MAX_JOBS]; +-+ int num_pred_cmds[RPI_MAX_JOBS]; +-+ int num_dblk_cmds[RPI_MAX_JOBS]; +-+ int vpu_id; +-+ int pass0_job; // Pass0 does coefficient decode +-+ int pass1_job; // Pass1 does pixel processing +-+ int ctu_count; // Number of CTUs done in pass0 so far +-+ int max_ctu_count; // Number of CTUs when we trigger a round of processing +-+ int ctu_per_y_chan; // Number of CTUs per luma QPU +-+ int ctu_per_uv_chan; // Number of CTUs per chroma QPU + + +-+ HEVCRpiJob jobs[RPI_MAX_JOBS]; +-+#if RPI_TSTATS +-+ HEVCRpiStats tstats; ++ x_off += mv->x >> 2; ++ y_off += mv->y >> 2; ++ src += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); ++@@ -1399,7 +2021,7 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ * @param mv1 motion vector1 (relative to block position) to get pixel data from ++ * @param current_mv current motion vector structure ++ */ ++- static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, +++static void luma_mc_bi(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ AVFrame *ref0, const Mv *mv0, int x_off, int y_off, ++ int block_w, int block_h, AVFrame *ref1, const Mv *mv1, struct MvField *current_mv) ++ { ++@@ -1423,6 +2045,10 @@ static void luma_mc_uni(HEVCContext *s, uint8_t *dst, ptrdiff_t dststride, ++ uint8_t *src0 = ref0->data[0] + y_off0 * 
src0stride + (int)((unsigned)x_off0 << s->ps.sps->pixel_shift); ++ uint8_t *src1 = ref1->data[0] + y_off1 * src1stride + (int)((unsigned)x_off1 << s->ps.sps->pixel_shift); ++ +++#ifdef DISABLE_MC +++ return; + +#endif +-+#if RPI_INTER +-+ HEVCRpiChromaPred * curr_pred_c; +-+ HEVCRpiLumaPred * curr_pred_y; +-+ struct qpu_mc_pred_y_s * last_y8_p; +-+ struct qpu_mc_pred_y_s * last_y8_lx; + + +-+ // Function pointers +-+ uint32_t qpu_filter_uv; +-+ uint32_t qpu_filter_uv_b0; +-+ uint32_t qpu_dummy_frame; // Not a frame - just a bit of memory +-+ uint32_t qpu_filter; +-+ uint32_t qpu_filter_b; ++ if (x_off0 < QPEL_EXTRA_BEFORE || y_off0 < QPEL_EXTRA_AFTER || ++ x_off0 >= pic_width - block_w - QPEL_EXTRA_AFTER || ++ y_off0 >= pic_height - block_h - QPEL_EXTRA_AFTER) { ++@@ -1508,6 +2134,10 @@ static void chroma_mc_uni(HEVCContext *s, uint8_t *dst0, ++ intptr_t _mx = mx << (1 - hshift); ++ intptr_t _my = my << (1 - vshift); ++ +++#ifdef DISABLE_MC +++ return; + +#endif + + +-+#ifdef RPI_WORKER +-+ pthread_t worker_thread; +-+ pthread_cond_t worker_cond_head; +-+ pthread_cond_t worker_cond_tail; +-+ pthread_mutex_t worker_mutex; +-+ +-+ int worker_tail; // Contains the number of posted jobs +-+ int worker_head; // Contains the number of completed jobs +-+ int kill_worker; // set to 1 to terminate the worker ++ x_off += mv->x >> (2 + hshift); ++ y_off += mv->y >> (2 + vshift); ++ src0 += y_off * srcstride + (x_off * (1 << s->ps.sps->pixel_shift)); ++@@ -1572,6 +2202,10 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF ++ int hshift = s->ps.sps->hshift[1]; ++ int vshift = s->ps.sps->vshift[1]; ++ +++#ifdef DISABLE_MC +++ return; + +#endif + + +-+#define RPI_DEBLOCK_VPU_Q_COUNT 2 ++ intptr_t mx0 = av_mod_uintp2(mv0->x, 2 + hshift); ++ intptr_t my0 = av_mod_uintp2(mv0->y, 2 + vshift); ++ intptr_t mx1 = av_mod_uintp2(mv1->x, 2 + hshift); ++@@ -1645,13 +2279,112 @@ static void chroma_mc_bi(HEVCContext *s, uint8_t *dst0, ptrdiff_t dststride, AVF ++ _mx1, _my1, block_w); ++ } ++ ++-static void hevc_await_progress(HEVCContext *s, HEVCFrame *ref, ++- const Mv *mv, int y0, int height) +++#ifdef RPI +++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int val, const int field) ++ { ++- int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); +++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { +++ HEVCContext *const fs = ref->tf.owner->priv_data; +++ HEVCRPiFrameProgressState * const pstate = fs->progress_states + field; +++ sem_t * sem = NULL; +++ +++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); +++ if (((volatile int *)ref->tf.progress->data)[field] < val) { +++ HEVCRPiFrameProgressWait * const pwait = &jb->progress_wait; +++ +++ av_assert0(pwait->req == -1 && pwait->next == NULL); ++ ++- if (s->threads_type == FF_THREAD_FRAME ) ++- ff_thread_await_progress(&ref->tf, y, 0); +++ pwait->req = val; +++ pwait->next = NULL; +++ if (pstate->first == NULL) +++ pstate->first = pwait; +++ else +++ pstate->last->next = pwait; +++ pstate->last = pwait; +++ sem = &pwait->sem; +++ } +++ pthread_mutex_unlock(&pstate->lock); + + +-+#ifdef RPI_DEBLOCK_VPU +-+ int enable_rpi_deblock; +++ if (sem != NULL) { +++ while (sem_wait(sem) != 0) +++ av_assert0(errno == EINTR); +++ } +++ } +++} + + +-+ int uv_setup_width; +-+ int uv_setup_height; +-+ int setup_width; // Number of 16x16 blocks across the image +-+ int setup_height; // Number of 16x16 blocks down the image +++void 
ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field) +++{ +++ HEVCRPiFrameProgressState *const pstate = s->progress_states + field; + + +-+ struct dblk_vpu_q_s +-+ { +-+ GPU_MEM_PTR_T deblock_vpu_gmem; +++ ((int *)s->ref->tf.progress->data)[field] = val; + + +-+ uint8_t (*y_setup_arm)[2][2][2][4]; +-+ uint8_t (*y_setup_vc)[2][2][2][4]; +++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); +++ { +++ HEVCRPiFrameProgressWait ** ppwait = &pstate->first; +++ HEVCRPiFrameProgressWait * pwait; + + +-+ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned +-+ uint8_t (*uv_setup_vc)[2][2][2][4]; +++ while ((pwait = *ppwait) != NULL) { +++ if (pwait->req > val) +++ { +++ ppwait = &pwait->next; +++ pstate->last = pwait; +++ } +++ else +++ { +++ *ppwait = pwait->next; +++ pwait->req = -1; +++ pwait->next = NULL; +++ sem_post(&pwait->sem); +++ } +++ } +++ } +++ pthread_mutex_unlock(&pstate->lock); +++} + + +-+ int (*vpu_cmds_arm)[6]; // r0-r5 for each command +-+ int vpu_cmds_vc; +++static void ff_hevc_rpi_progress_init_state(HEVCRPiFrameProgressState * const pstate) +++{ +++ pstate->first = NULL; +++ pstate->last = NULL; +++ pthread_mutex_init(&pstate->lock, NULL); +++} + + +-+ vpu_qpu_wait_h cmd_id; +-+ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; +++static void ff_hevc_rpi_progress_init_wait(HEVCRPiFrameProgressWait * const pwait) +++{ +++ pwait->req = -1; +++ pwait->next = NULL; +++ sem_init(&pwait->sem, 0, 0); +++} + + +-+ struct dblk_vpu_q_s * dvq; +-+ unsigned int dvq_n; +++static void ff_hevc_rpi_progress_kill_state(HEVCRPiFrameProgressState * const pstate) +++{ +++ av_assert0(pstate->first == NULL); +++ pthread_mutex_destroy(&pstate->lock); +++} + + +++static void ff_hevc_rpi_progress_kill_wait(HEVCRPiFrameProgressWait * const pwait) +++{ +++ sem_destroy(&pwait->sem); +++} + +#endif + + +-+#endif +++static void hevc_await_progress(HEVCContext *s, const HEVCFrame * const ref, +++ const Mv * const mv, const int y0, const int height) +++{ +++ if (s->threads_type == FF_THREAD_FRAME) { +++ const int y = FFMAX(0, (mv->y >> 2) + y0 + height + 9); + + +- uint8_t *cabac_state; +- +- /** 1 if the independent slice segment header was successfully parsed */ +-@@ -922,6 +1192,9 @@ typedef struct HEVCContext { +- uint32_t max_mastering_luminance; +- uint32_t min_mastering_luminance; +- + +#ifdef RPI +-+ int dblk_cmds[RPI_MAX_JOBS][RPI_MAX_DEBLOCK_CMDS][2]; +++ if (s->enable_rpi) { +++ int16_t *const pr = s->jb0->progress + ref->dpb_no; +++ if (*pr < y) { +++ *pr = y; +++ } +++ } +++ else + +#endif +- } HEVCContext; +++ // It is a const ThreadFrame but the prototype isn't +++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); +++ } ++ } + +- int ff_hevc_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, +-@@ -1048,6 +1321,10 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int log2_trafo_size, enum ScanType scan_idx, +- int c_idx); ++ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, ++@@ -1699,14 +2432,542 @@ static void hevc_luma_mv_mvp_mode(HEVCContext *s, int x0, int y0, int nPbW, ++ } ++ } + +-+#if RPI_INTER +-+extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +-+#endif +-+ +- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); +- +- +-@@ -1072,4 +1349,15 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; +- extern const uint8_t 
ff_hevc_diag_scan8x8_x[64];
+- extern const uint8_t ff_hevc_diag_scan8x8_y[64];
+-
+-+#ifdef RPI
+-+int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n);
+-+
+-+// arm/hevc_misc_neon.S
+-+// Neon coeff zap fn
+-+#if HAVE_NEON
+-+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
+-+#endif
+-+
+-+#endif
+-+
+- #endif /* AVCODEC_HEVC_H */
+-diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c
+-index 05b2821..733efde 100644
+---- a/libavcodec/hevc_cabac.c
+-+++ b/libavcodec/hevc_cabac.c
+-@@ -21,14 +21,76 @@
+- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+- */
+-
+-+#define UNCHECKED_BITSTREAM_READER 1
+-+
+- #include "libavutil/attributes.h"
+- #include "libavutil/common.h"
+-
+--#include "cabac_functions.h"
+- #include "hevc.h"
+-+#include "cabac_functions.h"
+-+
+-+#ifdef RPI
+-+#include "rpi_zc.h"
+-+#endif
+-+
+-+// BY22 is probably faster than simple bypass if the processor has
+-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
+-+// x86 has fast int divide
+-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
+-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
+-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
+-+// Use native divide if we have a fast one - otherwise use mpy 1/x
+-+// x86 has a fast integer divide - arm doesn't - unsure about other
+-+// architectures
+-+#define USE_BY22_DIV ARCH_X86
+-+
+-+// Special case blocks with a single significant coeff
+-+// Decreases the complexity of the code for a common case but increases the
+-+// code size.
+-+#define USE_N_END_1 1
+-+
+-+#if ARCH_ARM
+-+#include "arm/hevc_cabac.h"
+-+#endif
+-
+- #define CABAC_MAX_BIN 31
+-
+-+
+-+#if USE_BY22 && !USE_BY22_DIV
+-+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
+-+
+-+static const uint32_t cabac_by22_inv_range[256] = {
+-+ 0, I(257), I(258), I(259),
+-+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
+-+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
+-+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
+-+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
+-+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
+-+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
+-+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
+-+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
+-+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
+-+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
+-+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
+-+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
+-+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
+-+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
+-+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
+-+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
+-+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
+-+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
+-+ I(440), I(441), I(442), I(443), I(444), I(445), I(446),
I(447), I(448), I(449), +-+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), +-+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), +-+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), +-+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), +-+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), +-+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), +-+ I(510), I(511) +-+}; +-+#undef I +-+#endif // USE_BY22 ++-static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++- int nPbW, int nPbH, ++- int log2_cb_size, int partIdx, int idx) + + +- /** +- * number of bin by SyntaxElement. +- */ +-@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { +- { 28, 36, 43, 49, 54, 58, 61, 63, }, +- }; +- +++#if RPI_INTER + + +-+typedef struct +++static HEVCRpiInterPredQ * +++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) + +{ +-+ uint16_t coeff; +-+ uint16_t scale; +-+} xy_off_t; +-+ +-+#define XYT_C(x,y,t) ((x) + ((y) << (t))) +-+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t)) +-+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) +-+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) +++ HEVCRpiInterPredQ * yp = ipe->q + ipe->curr; +++ HEVCRpiInterPredQ * ypt = yp + 1; +++ for (unsigned int i = 1; i != ipe->n_grp; ++i, ++ypt) { +++ if (ypt->load < yp->load) +++ yp = ypt; +++ } + + +-+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} +++ yp->load += load_val; +++ ipe->used_grp = 1; +++ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd + + +-+#define OFF_DIAG(t) {\ +-+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ +-+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ +-+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ +-+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ +++ return yp; + +} + + +-+#define OFF_HORIZ(t) {\ +-+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ +-+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ +-+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ +-+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ +-+} + + +-+#define OFF_VERT(t) {\ +-+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ +-+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ +-+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ +-+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ +++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) +++{ +++ for (unsigned int i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i; +++ q->qpu_mc_curr->data[-1] = q->code_sync; +++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1); +++ q->load = 0; +++ } + +} + + +-+static const xy_off_t off_xys[3][4][16] = +++// Returns 0 on success, -1 if Q is dangerously full +++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) + +{ +-+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, +-+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, +-+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} +-+}; +-+ +++ if (!ipe->used_grp) +++ return 0; + + +-+// Helper fns +-+#ifndef hevc_mem_bits32 +-+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) +-+{ +-+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); +-+} +-+#endif +++ if ((ipe->curr += ipe->n_grp) >= ipe->n) +++ { +++ ipe->curr = 0; +++ rpi_inter_pred_sync(ipe); +++ } +++ 
ipe->used = 1; +++ ipe->used_grp = 0; + + +-+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) +-+#define hevc_clz32 hevc_clz32_builtin +-+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) +-+{ +-+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long +-+ return __builtin_clz(x) - (sizeof(int) * 8 - 32); +++ for (unsigned int i = 0; i != ipe->n_grp; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i + ipe->curr; +++ if ((char *)q->qpu_mc_curr - (char *)q->qpu_mc_base > ipe->max_fill) { +++ return -1; +++ } +++ } +++ return 0; + +} +-+#endif + + +-+// It is unlikely that we will ever need this but include for completeness +-+#ifndef hevc_clz32 +-+static inline unsigned int hevc_clz32(unsigned int x) +++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) + +{ +-+ unsigned int n = 1; +-+ if ((x & 0xffff0000) == 0) { +-+ n += 16; +-+ x <<= 16; +-+ } +-+ if ((x & 0xff000000) == 0) { +-+ n += 8; +-+ x <<= 8; +-+ } +-+ if ((x & 0xf0000000) == 0) { +-+ n += 4; +-+ x <<= 4; +-+ } +-+ if ((x & 0xc0000000) == 0) { +-+ n += 2; +-+ x <<= 2; +++ unsigned int i; +++ ipe->curr = 0; +++ ipe->used = 0; +++ ipe->used_grp = 0; +++ for (i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const q = ipe->q + i; +++ q->qpu_mc_curr = q->qpu_mc_base; +++ q->load = 0; +++ q->last_l0 = NULL; +++ q->last_l1 = NULL; + + } +-+ return n - ((x >> 31) & 1); + +} +-+#endif + + +++static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, +++ const unsigned int n_max, const unsigned int n_grp, +++ const unsigned int total_size, const unsigned int min_gap) +++{ +++ memset(ipe, 0, sizeof(*ipe)); +++ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL); +++ ipe->n_grp = n_grp; +++ ipe->min_gap = min_gap; + + +-+#if !USE_BY22 +-+// If no by22 then _by22 functions will revert to normal and so _peek/_flush +-+// will no longer be called but the setup calls will still exist and we want +-+// to null them out +-+#define bypass_start(s) +-+#define bypass_finish(s) +++#if RPI_CACHE_UNIF_MVS +++ gpu_malloc_cached(total_size, &ipe->gptr); + +#else +-+// Use BY22 for residual bypass block +-+ +-+#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc) +-+#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc) +-+ +-+// BY22 notes that bypass is simply a divide into the bitstream and so we +-+// can peek out large quantities of bits at once and treat the result as if +-+// it was VLC. In many cases this will lead to O(1) processing rather than +-+// O(n) though the setup and teardown is sufficiently expensive that it is +-+// only worth using if we expect to be dealing with more than a few bits +-+// The definition of "a few bits" will vary from platform to platform but +-+// tests on ARM show that it probably isn't worth it for a single coded +-+// residual, but is for >1 - it also seems likely that if there are +-+// more residuals then they are likely to be bigger and this will make the +-+// O(1) nature of the code more worthwhile. 
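+-+//
+-+// A minimal sketch of the trick, assuming only the fast-divide case (the
+-+// sketch is for orientation and is not itself part of the patch):
+-+//
+-+//   static inline uint32_t by22_peek_sketch(const CABACContext *const c)
+-+//   {
+-+//       // The top bits of the quotient low/range are the next bypass bits
+-+//       return ((unsigned int)c->low / (unsigned int)c->range) << 9;
+-+//   }
+-+//
+-+// Without a fast divide the same quotient is approximated to 22 bits by
+-+// multiplying by the 1/range reciprocals held in cabac_by22_inv_range[].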
+++ gpu_malloc_uncached(total_size, &ipe->gptr); +++#endif +++} + + + + +-+#if !USE_BY22_DIV +-+// * 1/x @ 32 bits gets us 22 bits of accuracy +-+#define CABAC_BY22_PEEK_BITS 22 +++#if RPI_QPU_EMU_Y +++#define get_mc_address_y(f) ((f)->data[0]) + +#else +-+// A real 32-bit divide gets us another bit +-+// If we have a 64 bit int & a unit time divider then we should get a lot +-+// of bits (55) but that is untested and it is unclear if it would give +-+// us a large advantage +-+#define CABAC_BY22_PEEK_BITS 23 +++#define get_mc_address_y(f) get_vc_address_y(f) + +#endif +-+ +-+// Bypass block start +-+// Must be called before _by22_peek is used as it sets the CABAC environment +-+// into the correct state. _by22_finish must be called to return to 'normal' +-+// (i.e. non-bypass) cabac decoding +-+static inline void get_cabac_by22_start(CABACContext * const c) +-+{ +-+ const unsigned int bits = __builtin_ctz(c->low); +-+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); +-+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); +-+#if !USE_BY22_DIV +-+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; +++#if RPI_QPU_EMU_C +++#define get_mc_address_u(f) ((f)->data[1]) +++#else +++#define get_mc_address_u(f) get_vc_address_u(f) + +#endif + + +-+ c->bytestream -= (CABAC_BITS / 8); +-+ c->by22.bits = bits; +-+#if !USE_BY22_DIV +-+ c->by22.range = c->range; +-+ c->range = inv; +-+#endif +-+ c->low = x; +++static inline int offset_depth_adj(const HEVCContext *const s, const int wt) +++{ +++ return s->ps.sps->high_precision_offsets_enabled_flag ? wt : +++ wt << (s->ps.sps->bit_depth - 8); + +} + + +-+// Bypass block finish +-+// Must be called at the end of the bypass block to return to normal operation +-+static inline void get_cabac_by22_finish(CABACContext * const c) +++static void +++rpi_pred_y(HEVCContext *const s, const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const Mv *const mv, +++ const int weight_mul, +++ const int weight_offset, +++ AVFrame *const src_frame) + +{ +-+ unsigned int used = c->by22.bits; +-+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); +-+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 
15 : 7);
+++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
+++ const unsigned int mx = mv->x & 3;
+++ const unsigned int my = mv->y & 3;
+++ const unsigned int my_mx = (my << 8) | mx;
+++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
+++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
+++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
+++ const uint32_t wo = PACK2(offset_depth_adj(s, weight_offset) * 2 + 1, weight_mul);
+++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip;
+++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
+++
+++ if (my_mx == 0)
+++ {
+++ const int x1 = x0 + (mv->x >> 2);
+++ const int y1 = y0 + (mv->y >> 2);
+++ const int bh = nPbH;
+ +
+++ for (int start_x = 0; start_x < nPbW; start_x += 16)
+++ {
+++ const int bw = FFMIN(nPbW - start_x, 16);
+++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
+++ qpu_mc_src_t *const src1 = yp->last_l0;
+++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
+++
+++#if RPI_TSTATS
+++ {
+++ HEVCRpiStats *const ts = &s->tstats;
+++ ++ts->y_pred1_x0y0;
+++
+++ if (nPbW > 8)
+++ ++ts->y_pred1_wgt8;
+++ else
+++ ++ts->y_pred1_wle8;
+++
+++ if (nPbH > 16)
+++ ++ts->y_pred1_hgt16;
+++ else
+++ ++ts->y_pred1_hle16;
+++ }
+++#endif
+ +
+++ src1->x = x1 + start_x;
+++ src1->y = y1;
+++ src1->base = src_vc_address_y;
+++ cmd_y->w = bw;
+++ cmd_y->h = bh;
+++ cmd_y->wo1 = wo;
+++ cmd_y->dst_addr = dst_addr + (start_x << xshl);
+++ yp->last_l0 = &cmd_y->next_src1;
+++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
+++ }
+++ }
+++ else
+++ {
+++ const int x1_m3 = x0 + (mv->x >> 2) - 3;
+++ const int y1_m3 = y0 + (mv->y >> 2) - 3;
+++ const unsigned int bh = nPbH;
+++ int start_x = 0;
+ +
+++#if 1
+++ // As Y-pred operates on two independent 8-wide src blocks we can merge
+++ // this pred with the previous one if the previous one is 8 pel wide,
+++ // the same height as the current block, immediately to the left of our
+++ // current dest block and mono-pred.
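+++ // (Example, assuming two neighbouring 8x16 mono-pred PUs of equal
+++ // height: the first pass records its command in last_y8_p/last_y8_l1
+++ // and the second pass widens that command to 16 and fills in its
+++ // second source instead of issuing a new command.)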
+ + +-+ return x << 1; +++ qpu_mc_pred_y_p_t *const last_y8_p = s->last_y8_p; +++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) +++ { +++ const int bw = FFMIN(nPbW, 8); +++ qpu_mc_src_t *const last_y8_src2 = s->last_y8_l1; +++ +++ last_y8_src2->x = x1_m3; +++ last_y8_src2->y = y1_m3; +++ last_y8_src2->base = src_vc_address_y; +++ last_y8_p->w += bw; +++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); +++ last_y8_p->wo2 = wo; +++ +++ s->last_y8_p = NULL; +++ s->last_y8_l1 = NULL; +++ start_x = bw; +++#if RPI_TSTATS +++ ++s->tstats.y_pred1_y8_merge; + +#endif +-+} +++ } + +#endif + + +-+// Flush bypass bits peeked by _by22_peek +-+// Flush n bypass bits. n must be >= 1 to guarantee correct operation +-+// val is an unmodified copy of whatever _by22_peek returned +-+#ifndef get_cabac_by22_flush +-+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) +-+{ +-+ // Subtract the bits used & reshift up to the top of the word +-+#if USE_BY22_DIV +-+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); +++ for (; start_x < nPbW; start_x += 16) +++ { +++ const int bw = FFMIN(nPbW - start_x, 16); +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ if (mx == 0 && my == 0) +++ ++ts->y_pred1_x0y0; +++ else if (mx == 0) +++ ++ts->y_pred1_x0; +++ else if (my == 0) +++ ++ts->y_pred1_y0; +++ else +++ ++ts->y_pred1_xy; +++ +++ if (nPbW > 8) +++ ++ts->y_pred1_wgt8; +++ else +++ ++ts->y_pred1_wle8; +++ +++ if (nPbH > 16) +++ ++ts->y_pred1_hgt16; +++ else +++ ++ts->y_pred1_hle16; +++ } +++#endif +++ src1->x = x1_m3 + start_x; +++ src1->y = y1_m3; +++ src1->base = src_vc_address_y; +++ if (bw <= 8) +++ { +++ src2->x = MC_DUMMY_X; +++ src2->y = MC_DUMMY_Y; +++#if RPI_QPU_EMU_Y +++ src2->base = s->qpu_dummy_frame_emu; + +#else +-+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); +++ src2->base = s->qpu_dummy_frame_qpu; + +#endif +-+ +-+ // and refill lower bits +-+ // We will probably OR over some existing bits but that doesn't matter +-+ c->by22.bits += n; +-+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); +++ } +++ else +++ { +++ src2->x = x1_m3 + start_x + 8; +++ src2->y = y1_m3; +++ src2->base = src_vc_address_y; +++ } +++ cmd_y->w = bw; +++ cmd_y->h = bh; +++ cmd_y->mymx21 = my2_mx2_my_mx; +++ cmd_y->wo1 = wo; +++ cmd_y->wo2 = wo; +++ cmd_y->dst_addr = dst_addr + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ +++ if (bw == 8) { +++ s->last_y8_l1 = src2; +++ s->last_y8_p = cmd_y; +++ } +++ } +++ } + +} +-+#endif + + +-+#endif // USE_BY22 +++static void +++rpi_pred_y_b(HEVCContext * const s, +++ const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const struct MvField *const mv_field, +++ AVFrame *const src_frame, +++ AVFrame *const src_frame2) +++{ +++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); +++ const Mv * const mv = mv_field->mv + 0; +++ const Mv * const mv2 = mv_field->mv + 1; + + +++ const unsigned int mx = mv->x & 3; +++ const unsigned int my = mv->y & 3; +++ const unsigned int my_mx = (my<<8) | mx; 
+++ const unsigned int mx2 = mv2->x & 3; +++ const unsigned int my2 = mv2->y & 3; +++ const unsigned int my2_mx2 = (my2<<8) | mx2; +++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; +++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; +++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; +++ const uint32_t wt_offset = +++ offset_depth_adj(s, s->sh.luma_offset_l0[ref_idx0] + s->sh.luma_offset_l1[ref_idx1]) + 1; +++ const uint32_t wo1 = PACK2(wt_offset, s->sh.luma_weight_l0[ref_idx0]); +++ const uint32_t wo2 = PACK2(wt_offset, s->sh.luma_weight_l1[ref_idx1]); +++ +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); +++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; +++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); +++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->luma_ip; +++ +++ if (my2_mx2_my_mx == 0) +++ { +++ const int x1 = x0 + (mv->x >> 2); +++ const int y1 = y0 + (mv->y >> 2); +++ const int x2 = x0 + (mv2->x >> 2); +++ const int y2 = y0 + (mv2->y >> 2); +++ const int bh = nPbH; +++ +++ // Can do chunks a full 16 wide if we don't want the H filter +++ for (int start_x=0; start_x < nPbW; start_x += 16) +++ { +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ ++ts->y_pred2_x0y0; + + +- void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) +- { +- if (s->ps.pps->entropy_coding_sync_enabled_flag && +-@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) +- return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); +- } +- +--static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) +-+static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); +- } +- +--static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx) +-+static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); +- } +- +--static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx) +-+static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz) +- { +-- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); +-+ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); +- } +- +- int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { +-@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { +- return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); +- } +- +--static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx, +-+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz, +- int log2_size, int *last_scx_prefix, int *last_scy_prefix) +- { +- int i = 0; +- int max = (log2_size << 1) - 1; +- int ctx_offset, ctx_shift; +- +-- if (!c_idx) { +-+ if (!c_idx_nz) { +- ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); +- ctx_shift = (log2_size + 1) >> 2; +- } else { +-@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, +- return 
value; +- } +- +--static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg) +-+static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg) +- { +- int inc; +- +-- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 2 : 0); +-+ inc = (ctx_cg != 0) + (c_idx_nz << 1); +- +- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); +- } +--static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, +-- int offset, const uint8_t *ctx_idx_map) +--{ +-- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; +-- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc); +--} +- +--static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) +-+static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset) +- { +- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); +- } +-@@ -966,90 +1227,378 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, +- return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); +- } +- +--static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) +++ if (nPbH > 16) +++ ++ts->y_pred2_hgt16; +++ else +++ ++ts->y_pred2_hle16; +++ } +++#endif +++ src1->x = x1 + start_x; +++ src1->y = y1; +++ src1->base = src1_base; +++ src2->x = x2 + start_x; +++ src2->y = y2; +++ src2->base = src2_base; +++ cmd_y->w = FFMIN(nPbW - start_x, 16); +++ cmd_y->h = bh; +++ cmd_y->mymx21 = 0; +++ cmd_y->wo1 = wo1; +++ cmd_y->wo2 = wo2; +++ cmd_y->dst_addr = dst + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ } +++ } +++ else +++ { +++ // Filter requires a run-up of 3 +++ const int x1 = x0 + (mv->x >> 2) - 3; +++ const int y1 = y0 + (mv->y >> 2) - 3; +++ const int x2 = x0 + (mv2->x >> 2) - 3; +++ const int y2 = y0 + (mv2->y >> 2) - 3; +++ const int bh = nPbH; +++ +++ for (int start_x=0; start_x < nPbW; start_x += 8) +++ { // B blocks work 8 at a time +++ // B weights aren't doubled as the QPU code does the same +++ // amount of work as it does for P +++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); +++ qpu_mc_src_t *const src1 = yp->last_l0; +++ qpu_mc_src_t *const src2 = yp->last_l1; +++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; +++ const unsigned int mmx = mx | mx2; +++ const unsigned int mmy = my | my2; +++ if (mmx == 0 && mmy == 0) +++ ++ts->y_pred2_x0y0; +++ else if (mmx == 0) +++ ++ts->y_pred2_x0; +++ else if (mmy == 0) +++ ++ts->y_pred2_y0; +++ else +++ ++ts->y_pred2_xy; + + +-+#if !USE_BY22 +-+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +++ if (nPbH > 16) +++ ++ts->y_pred2_hgt16; +++ else +++ ++ts->y_pred2_hle16; +++ } + +#endif +++ src1->x = x1 + start_x; +++ src1->y = y1; +++ src1->base = src1_base; +++ src2->x = x2 + start_x; +++ src2->y = y2; +++ src2->base = src2_base; +++ cmd_y->w = FFMIN(nPbW - start_x, 8); +++ cmd_y->h = bh; +++ cmd_y->mymx21 = my2_mx2_my_mx; +++ cmd_y->wo1 = wo1; +++ cmd_y->wo2 = wo2; +++ cmd_y->dst_addr = dst + (start_x << xshl); +++ yp->last_l0 = &cmd_y->next_src1; +++ yp->last_l1 = &cmd_y->next_src2; +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); +++ } +++ } +++} + + +++// h/v shifts fixed at one as that is all the qasm copes with 
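+++// (i.e. the chroma helpers below hard-code the 4:2:0 case, hence the
+++// fixed "const int hshift = 1" rather than a read of s->ps.sps->hshift[1].)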
+++static void +++rpi_pred_c(HEVCContext * const s, const unsigned int lx, const int x0_c, const int y0_c, +++ const int nPbW_c, const int nPbH_c, +++ const Mv * const mv, +++ const int16_t * const c_weights, +++ const int16_t * const c_offsets, +++ AVFrame * const src_frame) +++{ +++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); +++ const int hshift = 1; // = s->ps.sps->hshift[1]; +++ const int vshift = 1; // = s->ps.sps->vshift[1]; +++ +++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; +++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); +++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->x, 2 + hshift) << (1 - hshift)]; +++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(mv->y, 2 + vshift) << (1 - vshift)]; +++ const uint32_t wo_u = PACK2(offset_depth_adj(s, c_offsets[0]) * 2 + 1, c_weights[0]); +++ const uint32_t wo_v = PACK2(offset_depth_adj(s, c_offsets[1]) * 2 + 1, c_weights[1]); +++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; +++ const unsigned int bh = nPbH_c; +++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; +++ +++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) +++ { +++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); +++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; +++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; +++ qpu_mc_src_t * const last_lx = *plast_lx; +++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +++ +++ last_lx->x = x1_c + start_x; +++ last_lx->y = y1_c; +++ last_lx->base = src_base_u; +++ cmd_c->h = bh; +++ cmd_c->w = bw; +++ cmd_c->coeffs_x = x_coeffs; +++ cmd_c->coeffs_y = y_coeffs; +++ cmd_c->wo_u = wo_u; +++ cmd_c->wo_v = wo_v; +++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); +++ *plast_lx = &cmd_c->next_src; +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); +++ } +++ return; +++} + + +-+#ifndef coeff_abs_level_remaining_decode_bypass +-+static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) +++// h/v shifts fixed at one as that is all the qasm copes with +++static void +++rpi_pred_c_b(HEVCContext * const s, const int x0_c, const int y0_c, +++ const int nPbW_c, const int nPbH_c, +++ const struct MvField * const mv_field, +++ const int16_t * const c_weights, +++ const int16_t * const c_offsets, +++ const int16_t * const c_weights2, +++ const int16_t * const c_offsets2, +++ AVFrame * const src_frame, +++ AVFrame * const src_frame2) + +{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint32_t y; +-+ unsigned int prefix; +-+ unsigned int last_coeff_abs_level_remaining; +-+ unsigned int n; +++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); +++ const int hshift = 1; // s->ps.sps->hshift[1]; +++ const int vshift = 1; // s->ps.sps->vshift[1]; +++ const Mv * const mv = mv_field->mv + 0; +++ const Mv * const mv2 = mv_field->mv + 1; + + +-+ y = get_cabac_by22_peek(c); +-+ prefix = hevc_clz32(~y); +-+ // y << prefix will always have top bit 0 +++ const unsigned int mx = av_mod_uintp2(mv->x, 2 + hshift); +++ const unsigned int my = av_mod_uintp2(mv->y, 2 + vshift); +++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; +++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; 
// Fractional part of motion vector +++ const int x1_c = x0_c + (mv->x >> (2 + hshift)) - 1; +++ const int y1_c = y0_c + (mv->y >> (2 + hshift)) - 1; + + +-+ if (prefix < 3) { +-+ const unsigned int suffix = (y << prefix) >> (31 - rice_param); +-+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; +-+ n = prefix + 1 + rice_param; +-+ } +-+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) +-+ { +-+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); +++ const unsigned int mx2 = av_mod_uintp2(mv2->x, 2 + hshift); +++ const unsigned int my2 = av_mod_uintp2(mv2->y, 2 + vshift); +++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; +++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector + + +-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +-+ n = prefix * 2 + rice_param - 2; +-+ } +-+ else { +-+ unsigned int suffix; +++ const int x2_c = x0_c + (mv2->x >> (2 + hshift)) - 1; +++ const int y2_c = y0_c + (mv2->y >> (2 + hshift)) - 1; + + +-+ get_cabac_by22_flush(c, prefix, y); +-+ y = get_cabac_by22_peek(c); +++ const uint32_t wo_u2 = PACK2(offset_depth_adj(s, c_offsets[0] + c_offsets2[0]) + 1, c_weights2[0]); +++ const uint32_t wo_v2 = PACK2(offset_depth_adj(s, c_offsets[1] + c_offsets2[1]) + 1, c_weights2[1]); + + +-+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); +-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +-+ n = prefix + rice_param - 2; +++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; +++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); +++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); +++ HEVCRpiInterPredEnv * const ipe = &s->jb0->chroma_ip; +++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; +++ const unsigned int bh = nPbH_c; +++ +++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) +++ { +++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); +++ +++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); +++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; +++ qpu_mc_src_t * const src_l0 = cp->last_l0; +++ qpu_mc_src_t * const src_l1 = cp->last_l1; +++ +++ src_l0->x = x1_c + start_x; +++ src_l0->y = y1_c; +++ src_l0->base = src1_base; +++ src_l1->x = x2_c + start_x; +++ src_l1->y = y2_c; +++ src_l1->base = src2_base; +++ +++ u[0].h = bh; +++ u[0].w = bw; +++ u[0].coeffs_x1 = coefs0_x; +++ u[0].coeffs_y1 = coefs0_y; +++ u[0].weight_u1 = c_weights[0]; // Weight L0 U +++ u[0].weight_v1 = c_weights[1]; // Weight L0 V +++ u[0].coeffs_x2 = coefs1_x; +++ u[0].coeffs_y2 = coefs1_y; +++ u[0].wo_u2 = wo_u2; +++ u[0].wo_v2 = wo_v2; +++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); +++ +++ cp->last_l0 = &u[0].next_src1; +++ cp->last_l1 = &u[0].next_src2; +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); + + } +++} + + +-+ get_cabac_by22_flush(c, n, y); + + +-+ return last_coeff_abs_level_remaining; +-+} + +#endif + + +-+static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) +++ +++ +++static void hls_prediction_unit(HEVCContext * const s, const int x0, const int y0, +++ const int nPbW, const int nPbH, +++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) + { +-+ CABACContext * const c = &s->HEVClc->cc; +- int prefix = 0; +- int suffix = 0; +- int last_coeff_abs_level_remaining; +- int i; +- +-- while (prefix < 
CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) +-+ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) +- prefix++; +- if (prefix == CABAC_MAX_BIN) { +- av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); +- return 0; +- } +-+ +- if (prefix < 3) { +- for (i = 0; i < rc_rice_param; i++) +-- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +-+ suffix = (suffix << 1) | get_cabac_bypass(c); +- last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; +- } else { +- int prefix_minus3 = prefix - 3; +- for (i = 0; i < prefix_minus3 + rc_rice_param; i++) +-- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +-+ suffix = (suffix << 1) | get_cabac_bypass(c); +- last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) +- << rc_rice_param) + suffix; +- } +-+ +- return last_coeff_abs_level_remaining; ++ #define POS(c_idx, x, y) \ ++ &s->frame->data[c_idx][((y) >> s->ps.sps->vshift[c_idx]) * s->frame->linesize[c_idx] + \ ++ (((x) >> s->ps.sps->hshift[c_idx]) << s->ps.sps->pixel_shift)] ++- HEVCLocalContext *lc = s->HEVClc; +++ HEVCLocalContext * const lc = s->HEVClc; ++ int merge_idx = 0; ++ struct MvField current_mv = {{{ 0 }}}; ++ ++@@ -1724,8 +2985,7 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int y_cb = y0 >> log2_min_cb_size; ++ int x_pu, y_pu; ++ int i, j; ++- ++- int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); +++ const int skip_flag = SAMPLE_CTB(s->skip_flag, x_cb, y_cb); ++ ++ if (!skip_flag) ++ lc->pu.merge_flag = ff_hevc_merge_flag_decode(s); ++@@ -1769,12 +3029,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 0, +++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], +++ ref0->frame); +++ } else +++#endif +++ { +++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref0->frame, ++ ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.luma_offset_l0[current_mv.ref_idx[0]]); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_c(s, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 0, +++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +++ ref0->frame); +++ return; +++ } +++#endif ++ chroma_mc_uni(s, dst1, s->frame->linesize[1], ref0->frame->data[1], ref0->frame->linesize[1], ++ 0, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]][0], s->sh.chroma_offset_l0[current_mv.ref_idx[0]][0]); ++@@ -1788,12 +3065,29 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y(s, x0, y0, nPbW, nPbH, current_mv.mv + 1, +++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], +++ ref1->frame); +++ } else +++#endif +++ { +++ luma_mc_uni(s, dst0, s->frame->linesize[0], ref1->frame, ++ ¤t_mv.mv[1], x0, y0, nPbW, nPbH, ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.luma_offset_l1[current_mv.ref_idx[1]]); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ 
rpi_pred_c(s, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.mv + 1, +++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +++ ref1->frame); +++ return; +++ } +++#endif ++ chroma_mc_uni(s, dst1, s->frame->linesize[1], ref1->frame->data[1], ref1->frame->linesize[1], ++ 1, x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]][0], s->sh.chroma_offset_l1[current_mv.ref_idx[1]][0]); ++@@ -1808,11 +3102,31 @@ static void hls_prediction_unit(HEVCContext *s, int x0, int y0, ++ int nPbW_c = nPbW >> s->ps.sps->hshift[1]; ++ int nPbH_c = nPbH >> s->ps.sps->vshift[1]; ++ ++- luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_y_b(s, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); +++ } else +++#endif +++ { +++ luma_mc_bi(s, dst0, s->frame->linesize[0], ref0->frame, ++ ¤t_mv.mv[0], x0, y0, nPbW, nPbH, ++ ref1->frame, ¤t_mv.mv[1], ¤t_mv); +++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_INTER +++ if (s->enable_rpi) { +++ rpi_pred_c_b(s, x0_c, y0_c, nPbW_c, nPbH_c, +++ ¤t_mv, +++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], +++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], +++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], +++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], +++ ref0->frame, +++ ref1->frame); +++ return; +++ } +++#endif ++ chroma_mc_bi(s, dst1, s->frame->linesize[1], ref0->frame, ref1->frame, ++ x0_c, y0_c, nPbW_c, nPbH_c, ¤t_mv, 0); ++ ++@@ -2087,7 +3401,9 @@ static int hls_coding_unit(HEVCContext *s, int x0, int y0, int log2_cb_size) ++ intra_prediction_unit_default_value(s, x0, y0, log2_cb_size); ++ ret = hls_pcm_sample(s, x0, y0, log2_cb_size); ++ if (s->ps.sps->pcm.loop_filter_disable_flag) +++ { ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); +++ } ++ ++ if (ret < 0) ++ return ret; ++@@ -2310,6 +3626,524 @@ static void hls_decode_neighbour(HEVCContext *s, int x_ctb, int y_ctb, ++ lc->ctb_up_left_flag = ((x_ctb > 0) && (y_ctb > 0) && (ctb_addr_in_slice-1 >= s->ps.sps->ctb_width) && (s->ps.pps->tile_id[ctb_addr_ts] == s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1 - s->ps.sps->ctb_width]])); + } + +--static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) +-+#if !USE_BY22 +-+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +-+static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb) +- { +-- int i; +-- int ret = 0; +-+ CABACContext * const c = &s->HEVClc->cc; +++#ifdef RPI +++static void rpi_execute_dblk_cmds(HEVCContext *s) +++{ +++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; +++ HEVCRpiDeblkEnv *const de = &s->jb1->deblk; + + unsigned int i; +-+ uint32_t ret = 0; +- +- for (i = 0; i < nb; i++) +-- ret = (ret << 1) | get_cabac_bypass(&s->HEVClc->cc); +-- return ret; +-+ ret = (ret << 1) | get_cabac_bypass(c); + + +-+ return ret << (32 - nb); +++ for (i = 0; i != de->n; ++i) +++ { +++ ff_hevc_hls_filters(s, de->blks[i].x_ctb, de->blks[i].y_ctb, ctb_size); +++ } +++ de->n = 0; + +} +-+#endif + + +-+#ifndef coeff_sign_flag_decode_bypass +-+static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb) +++#if 0 +++static void rpi_execute_transform(HEVCContext *s) + +{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint32_t y; +-+ y = get_cabac_by22_peek(c); +-+ get_cabac_by22_flush(c, nb, y); +-+ return y & ~(0xffffffffU >> nb); +-+} +-+#endif +++ int i=2; +++ int job = s->pass1_job; +++ /*int j; +++ 
int16_t *coeffs = s->coeffs_buf_arm[job][i]; +++ for(j=s->num_coeffs[job][i]; j > 0; j-= 16*16, coeffs+=16*16) { +++ s->hevcdsp.idct[4-2](coeffs, 16); +++ } +++ i=3; +++ coeffs = s->coeffs_buf_arm[job][i] - s->num_coeffs[job][i]; +++ for(j=s->num_coeffs[job][i]; j > 0; j-= 32*32, coeffs+=32*32) { +++ s->hevcdsp.idct[5-2](coeffs, 32); +++ }*/ + + +++ rpi_cache_flush_one_gm_ptr(&s->coeffs_buf_accelerated[job], RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +++ s->vpu_id = vpu_post_code2( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[job][2], +++ s->num_coeffs[job][2] >> 8, s->coeffs_buf_vc[job][3] - sizeof(int16_t) * s->num_coeffs[job][3], +++ s->num_coeffs[job][3] >> 10, 0, &s->coeffs_buf_accelerated[job]); +++ //vpu_execute_code( vpu_get_fn(), vpu_get_constants(), s->coeffs_buf_vc[2], s->num_coeffs[2] >> 8, s->coeffs_buf_vc[3], s->num_coeffs[3] >> 10, 0); +++ //gpu_cache_flush(&s->coeffs_buf_accelerated); +++ //vpu_wait(s->vpu_id); + + +-+#ifndef get_cabac_greater1_bits +-+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, +-+ uint8_t * const state0) +-+{ +-+ unsigned int i; +-+ unsigned int rv = 0; +-+ for (i = 0; i != n; ++i) { +-+ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; +-+ const unsigned int b = get_cabac(c, state0 + idx); +-+ rv = (rv << 1) | b; +-+ } +-+ return rv; +++ for(i=0;i<4;i++) +++ s->num_coeffs[job][i] = 0; + +} + +#endif + + + + +-+// N.B. levels returned are the values assuming coeff_abs_level_remaining +-+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +-+// this version of events. +-+static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels, +-+ int * const pprev_subset_coded, int * const psum, +-+ const unsigned int idx0_gt1, const unsigned int idx_gt2) +-+{ +-+ CABACContext * const c = &s->HEVClc->cc; +-+ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1; +-+ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2; +-+ unsigned int rv; +-+ unsigned int i; +-+ const unsigned int n = FFMIN(n_end, 8); +-+ +-+ // Really this is i != n but the simple unconditional loop is cheaper +-+ // and faster +-+ for (i = 0; i != 8; ++i) +-+ levels[i] = 1; +++#define RPI_OPT_SEP_PRED 0 + + +-+ rv = get_cabac_greater1_bits(c, n, state0); + + +-+ *pprev_subset_coded = 0; +-+ *psum = n; +++// I-pred, transform_and_add for all blocks types done here +++// All ARM +++#if RPI_OPT_SEP_PRED +++static void rpi_execute_pred_cmds(HEVCContext * const s, const int do_luma, const int do_chroma) +++#else +++static void rpi_execute_pred_cmds(HEVCContext * const s) +++#endif +++{ +++ int i; +++ HEVCRpiIntraPredEnv * iap = &s->jb1->intra; +++ const HEVCPredCmd *cmd = iap->cmds; +++#ifdef RPI +++ HEVCLocalContextIntra *lc = &s->HEVClcIntra; +++#else +++ HEVCLocalContext *lc = s->HEVClc; +++#endif + + +-+ rv <<= (32 - n); +-+ if (rv != 0) +-+ { +-+ *pprev_subset_coded = 1; +-+ *psum = n + 1; +-+ i = hevc_clz32(rv); +-+ levels[i] = 2; +-+ if (get_cabac(c, state_gt2) == 0) +-+ { +-+ // Unset first coded bit +-+ rv &= ~(0x80000000U >> i); +-+ } +-+ } +++ for(i = iap->n; i > 0; i--, cmd++) { +++// printf("i=%d cmd=%p job1=%d job0=%d\n",i,cmd,s->pass1_job,s->pass0_job); +++#if RPI_OPT_SEP_PRED +++ if (!(cmd->c_idx == 0 ? 
do_luma : do_chroma)) { +++ continue; +++ } +++#endif + + +-+ if (n_end > 8) { +-+ const unsigned int g8 = n_end - 8; +-+ rv |= ((1 << g8) - 1) << (24 - g8); +-+ for (i = 0; i != g8; ++i) { +-+ levels[i + 8] = 0; +-+ } +-+ } +++ switch (cmd->type) +++ { +++ case RPI_PRED_INTRA: +++ lc->tu.intra_pred_mode_c = lc->tu.intra_pred_mode = cmd->i_pred.mode; +++ lc->na.cand_bottom_left = (cmd->na >> 4) & 1; +++ lc->na.cand_left = (cmd->na >> 3) & 1; +++ lc->na.cand_up_left = (cmd->na >> 2) & 1; +++ lc->na.cand_up = (cmd->na >> 1) & 1; +++ lc->na.cand_up_right = (cmd->na >> 0) & 1; +++ if (!av_rpi_is_sand_frame(s->frame) || cmd->c_idx == 0) +++ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +++ else +++ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.x, cmd->i_pred.y, cmd->c_idx); +++ break; + + +-+ return rv; +-+} +++ case RPI_PRED_ADD_RESIDUAL: +++ s->hevcdsp.transform_add[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +++ break; +++ case RPI_PRED_ADD_DC: +++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); +++ break; +++#if RPI_HEVC_SAND +++ case RPI_PRED_ADD_RESIDUAL_U: +++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); +++ break; +++ case RPI_PRED_ADD_RESIDUAL_V: +++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); +++ break; +++ case RPI_PRED_ADD_RESIDUAL_C: +++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); +++ break; +++ case RPI_PRED_ADD_DC_U: +++ case RPI_PRED_ADD_DC_V: +++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); +++ break; +++#endif + + +-+// extended_precision_processing_flag must be false given we are +-+// putting the result into a 16-bit array +-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) +-+// scale_m is uint8_t +-+// +-+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) +-+// or it can be 2 (if we have transquant_bypass) +-+// shift is set to one less than we really want but would normally be +-+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
+-+// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6 +-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) +-+// to achieve it +++ case RPI_PRED_I_PCM: +++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); +++ break; + + +-+#ifndef trans_scale_sat +-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) +-+{ +-+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); +++ default: +++ av_log(NULL, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); +++ abort(); +++ } +++ } +++#if RPI_OPT_SEP_PRED +++ if (do_luma) +++#endif +++ { +++ iap->n = 0; +++ } + +} +++ +++ + +#endif + + +++#ifdef RPI + + +-+#ifndef update_rice +-+static inline void update_rice(uint8_t * const stat_coeff, +-+ const unsigned int last_coeff_abs_level_remaining, +-+ const unsigned int c_rice_param) +++// Set initial uniform job values & zero ctu_count +++static void rpi_begin(HEVCContext *s) + +{ +-+ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; +-+ if (x >= 6) +-+ (*stat_coeff)++; +-+ else if (x == 0 && *stat_coeff > 0) +-+ (*stat_coeff)--; +++#if RPI_INTER +++ unsigned int i; +++ HEVCRpiJob * const jb = s->jb0; +++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; +++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; +++ +++ const uint16_t pic_width_y = s->ps.sps->width; +++ const uint16_t pic_height_y = s->ps.sps->height; +++ +++ const uint16_t pic_width_c = s->ps.sps->width >> s->ps.sps->hshift[1]; +++ const uint16_t pic_height_c = s->ps.sps->height >> s->ps.sps->vshift[1]; +++ +++ rpi_inter_pred_reset(cipe); +++ for (i = 0; i < cipe->n; i++) { +++ HEVCRpiInterPredQ * const cp = cipe->q + i; +++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; +++ +++ u->next_src1.x = 0; +++ u->next_src1.y = 0; +++ u->next_src1.base = 0; +++ u->pic_cw = pic_width_c; +++ u->pic_ch = pic_height_c; +++ u->stride2 = av_rpi_sand_frame_stride2(s->frame); +++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); +++ u->wdenom = s->sh.chroma_log2_weight_denom; +++ cp->last_l0 = &u->next_src1; +++ +++ u->next_fn = 0; +++ u->next_src2.x = 0; +++ u->next_src2.y = 0; +++ u->next_src2.base = 0; +++ cp->last_l1 = &u->next_src2; +++ +++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); +++ } +++ +++ rpi_inter_pred_reset(yipe); +++ for (i = 0; i < yipe->n; i++) { +++ HEVCRpiInterPredQ * const yp = yipe->q + i; +++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; +++ +++ y->next_src1.x = 0; +++ y->next_src1.y = 0; +++ y->next_src1.base = 0; +++ y->next_src2.x = 0; +++ y->next_src2.y = 0; +++ y->next_src2.base = 0; +++ y->pic_h = pic_height_y; +++ y->pic_w = pic_width_y; +++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); +++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); +++ y->wdenom = s->sh.luma_log2_weight_denom; +++ y->next_fn = 0; +++ yp->last_l0 = &y->next_src1; +++ yp->last_l1 = &y->next_src2; +++ +++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); +++ } +++ +++ s->last_y8_p = NULL; +++ s->last_y8_l1 = NULL; +++ +++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { +++ jb->progress[i] = -1; +++ } +++ +++#endif +++ s->ctu_count = 0; + +} + +#endif + + + + +-+// n must be > 0 on entry +-+#ifndef get_cabac_sig_coeff_flag_idxs +-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, +-+ unsigned int n, +-+ const 
uint8_t const * ctx_map, +-+ uint8_t * p) +++#if RPI_INTER +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++static unsigned int mc_terminate_add_qpu(HEVCContext * const s, +++ const vpu_qpu_job_h vqj, +++ rpi_cache_flush_env_t * const rfe, +++ HEVCRpiInterPredEnv * const ipe) + +{ +-+ do { +-+ if (get_cabac(c, state0 + ctx_map[n])) +-+ *p++ = n; +-+ } while (--n != 0); +-+ return p; +-+} +++ unsigned int i; +++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; +++ unsigned int max_block = 0; +++ +++ if (!ipe->used) { +++ return 0; +++ } +++ +++ if (ipe->curr != 0) { +++ rpi_inter_pred_sync(ipe); +++ } +++ +++ // Add final commands to Q +++ for(i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const yp = ipe->q + i; +++ qpu_mc_src_t *const p0 = yp->last_l0; +++ qpu_mc_src_t *const p1 = yp->last_l1; +++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; +++ +++ if (block_size > max_block) +++ max_block = block_size; +++ +++ yp->qpu_mc_curr->data[-1] = yp->code_exit; +++ +++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +++ p0->x = MC_DUMMY_X; +++ p0->y = MC_DUMMY_Y; +++ p0->base = s->qpu_dummy_frame_qpu; +++ p1->x = MC_DUMMY_X; +++ p1->y = MC_DUMMY_Y; +++ p1->base = s->qpu_dummy_frame_qpu; +++ +++ yp->last_l0 = NULL; +++ yp->last_l1 = NULL; +++ +++ // Add to mailbox list +++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); +++ mail[i][1] = yp->code_setup; +++ } +++ +++#if RPI_CACHE_UNIF_MVS +++ // We don't need invalidate here as the uniforms aren't changed by the QPU +++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing +++ // new values which seems to give us a small performance advantage +++ // +++ // In most cases we will not have a completely packed set of uniforms and as +++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the +++ // fullest +++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, +++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, +++ ipe->n, ipe->max_fill + ipe->min_gap); + +#endif +++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); + + +++ return 1; +++} +++#endif + + +-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, +-+ unsigned int n, +-+ const uint8_t const * ctx_map, +-+ uint8_t * const flag_idx) +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++static unsigned int mc_terminate_add_emu(HEVCContext * const s, +++ const vpu_qpu_job_h vqj, +++ rpi_cache_flush_env_t * const rfe, +++ HEVCRpiInterPredEnv * const ipe) + +{ +-+ int rv; +++ unsigned int i; +++ if (!ipe->used) { +++ return 0; +++ } + + +-+ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; +++ if (ipe->curr != 0) { +++ rpi_inter_pred_sync(ipe); +++ } + + +-+ return rv; +++ // Add final commands to Q +++ for(i = 0; i != ipe->n; ++i) { +++ HEVCRpiInterPredQ * const yp = ipe->q + i; +++ qpu_mc_src_t *const p0 = yp->last_l0; +++ qpu_mc_src_t *const p1 = yp->last_l1; +++ +++ yp->qpu_mc_curr->data[-1] = yp->code_exit; +++ +++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched +++ p0->x = MC_DUMMY_X; +++ p0->y = MC_DUMMY_Y; +++ p0->base = s->qpu_dummy_frame_emu; +++ p1->x = MC_DUMMY_X; +++ p1->y = MC_DUMMY_Y; +++ p1->base = s->qpu_dummy_frame_emu; +++ +++ yp->last_l0 = NULL; +++ yp->last_l1 = NULL; +++ } +++ +++ return 1; + +} +++#endif + + +-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x1, x2, x3,\ 
+-+ x4, x5, x6, x7,\ +-+ x8, x9, x10, x11,\ +-+ x12, x13, x14, x15} + + +-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x4, x8, x12,\ +-+ x1, x5, x9, x13,\ +-+ x2, x6, x10, x14,\ +-+ x3, x7, x11, x15} +++#if RPI_QPU_EMU_Y +++#define mc_terminate_add_y mc_terminate_add_emu +++#else +++#define mc_terminate_add_y mc_terminate_add_qpu +++#endif +++#if RPI_QPU_EMU_C +++#define mc_terminate_add_c mc_terminate_add_emu +++#else +++#define mc_terminate_add_c mc_terminate_add_qpu +++#endif +++#endif + + +-+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ +-+ x0, x4, x1, x8,\ +-+ x5, x2, x12, x9,\ +-+ x6, x3, x13, x10,\ +-+ x7, x14, x11, x15} +++#ifdef RPI + + + + +-+static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz, +-+ uint8_t * const significant_coeff_group_flag, +-+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, +-+ int * const pPrev_sig) +++static void flush_frame(HEVCContext *s,AVFrame *frame) + +{ +-+ while (--i >= 0) { +-+ unsigned int x_cg = scan_x_cg[i]; +-+ unsigned int y_cg = scan_y_cg[i]; +++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(); +++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); +++ rpi_cache_flush_finish(rfe); +++} + + +-+ // For the flag decode we only care about Z/NZ but +-+ // we use the full Right + Down * 2 when calculating +-+ // significant coeff flags so we obtain it here +-+ //. +-+ // The group flag array is one longer than it needs to +-+ // be so we don't need to check for y_cg limits +-+ unsigned int prev_sig = ((significant_coeff_group_flag[y_cg] >> (x_cg + 1)) & 1) | +-+ (((significant_coeff_group_flag[y_cg + 1] >> x_cg) & 1) << 1); + + +-+ if (i == 0 || +-+ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) +++// Core execution tasks +++static void worker_core(HEVCContext * const s) +++{ +++#if RPI_OPT_SEP_PRED +++ vpu_qpu_wait_h sync_c; +++#endif +++ vpu_qpu_wait_h sync_y; +++ +++ HEVCRpiJob * const jb = s->jb1; +++ int pred_y, pred_c; +++ +++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++ +++ { +++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; +++ if (cf->s[3].n + cf->s[2].n != 0) + + { +-+ significant_coeff_group_flag[y_cg] |= (1 << x_cg); +-+ *pPrev_sig = prev_sig; +-+ break; +++ const unsigned int csize = sizeof(cf->s[3].buf[0]); +++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; +++ vpu_qpu_job_add_vpu(vqj, +++ vpu_get_fn(s->ps.sps->bit_depth), +++ vpu_get_constants(), +++ cf->gptr.vc, +++ cf->s[2].n >> 8, +++ cf->gptr.vc + offset32, +++ cf->s[3].n >> 10, +++ 0); +++ +++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); +++ rpi_cache_flush_add_gm_range(rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); + + } + + } + + +-+ return i; +-+} +++ pred_c = mc_terminate_add_c(s, vqj, rfe, &jb->chroma_ip); + + +-+#ifdef RPI +-+static void rpi_add_residual(HEVCContext * const s, +-+ const unsigned int log2_trafo_size, const unsigned int c_idx, +-+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) +-+{ +-+ const AVFrame * const frame = s->frame; +-+ unsigned int stride = frame->linesize[c_idx]; +-+ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +-+ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +-+ const int is_sliced = rpi_sliced_frame(frame); +-+ uint8_t * dst = 
!is_sliced ?
+-+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
+-+ c_idx == 0 ?
+-+ rpi_sliced_frame_pos_y(frame, x, y) :
+-+ rpi_sliced_frame_pos_c(frame, x, y);
+++// We can take a sync here and try to locally overlap QPU processing with ARM
+++// but testing showed a slightly negative benefit with noticeable extra complexity
+++#if RPI_OPT_SEP_PRED
+++ vpu_qpu_job_add_sync_this(vqj, &sync_c);
+++#endif
+
+-+// if (c_idx != 0) {
+-+// return;
+-+// }
+-+ if (s->enable_rpi) {
+-+ HEVCPredCmd * const cmd = s->univ_pred_cmds[s->pass0_job] + s->num_pred_cmds[s->pass0_job]++;
+-+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
+-+ cmd->size = log2_trafo_size;
+-+ cmd->c_idx = c_idx;
+-+ cmd->ta.buf = coeffs;
+-+ cmd->ta.dst = dst;
+-+ cmd->ta.stride = stride;
+-+ }
+-+ else if (!is_sliced || c_idx == 0) {
+-+ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+-+ }
+-+ else if (c_idx == 1) {
+-+ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride);
+++ pred_y = mc_terminate_add_y(s, vqj, rfe, &jb->luma_ip);
+++
+++ vpu_qpu_job_add_sync_this(vqj, &sync_y);
+++
+++
+++ // We are expecting a contiguous Z-shaped set of blocks
+++ // So generate up to 3 blocks:
+++ // 1st line
+++ // body
+++ // last line
+++ // This will work even if we don't have the expected geometry
+++ if (pred_y || pred_c)
+++ {
+++ const HEVCRpiDeblkEnv *const de = &jb->deblk;
+++ const HEVCRpiDeblkBlk * db = de->blks + 0;
+++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
+++ unsigned int x0 = db->x_ctb;
+++ unsigned int xx = x0 + ctb_size;
+++ unsigned int y0 = db->y_ctb;
+++
+++ unsigned int blks_tlbr[3][4] = {{~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}, {~0U, ~0U, 0, 0}};
+++ unsigned int b = 0;
+++ unsigned int i;
+++
+++ for (i = 1, ++db; i < de->n; ++i, ++db)
+++ {
+++ if (db->x_ctb == xx && db->y_ctb == y0) {
+++ xx += ctb_size;
+++ }
+++ else
+++ {
+++ unsigned int * const tlbr = blks_tlbr[b];
+++ if (tlbr[0] > y0)
+++ tlbr[0] = y0;
+++ if (tlbr[1] > x0)
+++ tlbr[1] = x0;
+++ if (tlbr[2] < y0 + ctb_size)
+++ tlbr[2] = y0 + ctb_size;
+++ if (tlbr[3] < xx)
+++ tlbr[3] = xx;
+++ x0 = db->x_ctb;
+++ xx = x0 + ctb_size;
+++ y0 = db->y_ctb;
+++ b = 1;
+++ }
+++ }
+++
+++ if (blks_tlbr[b][0] != ~0U)
+++ ++b;
+++
+++ {
+++ unsigned int * const tlbr = blks_tlbr[b];
+++ tlbr[0] = y0;
+++ tlbr[1] = x0;
+++ tlbr[2] = y0 + ctb_size;
+++ tlbr[3] = xx;
+++ }
+++
+++ // ??? Coalesce blocks ??? 
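+++ // (Worked example, added for illustration - not part of the original
+++ // patch: with 16-pel CTBs, a Z-shaped job covering a first-line
+++ // remnant at (x,y) = (48,0)-(64,16), two full rows (0,16)-(64,48)
+++ // and a last-line remnant (0,48)-(16,64) leaves
+++ // blks_tlbr[0] = {0,48,16,64} first line
+++ // blks_tlbr[1] = {16,0,48,64} body
+++ // blks_tlbr[2] = {48,0,64,16} last line
+++ // where entries are {top,left,bottom,right}, so at most three
+++ // rectangles are invalidated below however many CTBs were queued.)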
+++ for (i = 0; i <= b; ++i) { +++ const unsigned int * const tlbr = blks_tlbr[i]; +++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, +++ tlbr[1], tlbr[0], tlbr[3] - tlbr[1], tlbr[2] - tlbr[0], s->ps.sps->vshift[1], pred_y, pred_c); +++ } + + } +-+ else { +-+ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); +++ +++ +++ // Having accumulated some commands - do them +++ rpi_cache_flush_finish(rfe); +++ +++ // Await progress as required +++ { +++ unsigned int i; +++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress); ++i) { +++ if (jb->progress[i] >= 0) { +++ ff_hevc_progress_wait_recon(s, jb, s->DPB + i, jb->progress[i]); +++ } +++ } + + } +- } +++ +++ vpu_qpu_job_finish(vqj); +++ +++ worker_pic_reset(&jb->coeffs); +++ +++ // If we have emulated VPU ops - do it here +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ if (av_rpi_is_sand8_frame(s->frame)) +++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C +++ rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); +++#elif RPI_QPU_EMU_Y +++ rpi_shader_c8(s, &jb->luma_ip, NULL); +++#else +++ rpi_shader_c8(s, NULL, &jb->chroma_ip); + +#endif +- +- void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int log2_trafo_size, enum ScanType scan_idx, +- int c_idx) +++ else +++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C +++ rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); +++#elif RPI_QPU_EMU_Y +++ rpi_shader_c16(s, &jb->luma_ip, NULL); +++#else +++ rpi_shader_c16(s, NULL, &jb->chroma_ip); +++#endif +++#endif +++ +++#if RPI_OPT_SEP_PRED +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_c); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s, 0, 1); +++ +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_y); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s, 1, 0); +++#else +++ // Wait for transform completion +++ vpu_qpu_wait(&sync_y); +++ +++ // Perform intra prediction and residual reconstruction +++ rpi_execute_pred_cmds(s); +++#endif +++ +++ // Perform deblocking for CTBs in this row +++ rpi_execute_dblk_cmds(s); +++} +++ +++static void rpi_do_all_passes(HEVCContext *s) +++{ +++ // Called from main thread - must be no pending background jobs +++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); +++ +++ // Do the various passes - common with the worker code +++ worker_core(s); +++ // Prepare next batch +++ rpi_begin(s); +++} +++ +++ +++#endif +++ ++ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) + { +--#define GET_COORD(offset, n) \ +-- do { \ +-- x_c = (x_cg << 2) + scan_x_off[n]; \ +-- y_c = (y_cg << 2) + scan_y_off[n]; \ +-- } while (0) +-- HEVCLocalContext *lc = s->HEVClc; +-- int transform_skip_flag = 0; +-+ HEVCLocalContext * const lc = s->HEVClc; +-+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; +- +- int last_significant_coeff_x, last_significant_coeff_y; +-- int last_scan_pos; +-- int n_end; +- int num_coeff = 0; +-- int greater1_ctx = 1; +-+ int prev_subset_coded = 0; +- +- int num_last_subset; +- int x_cg_last_sig, y_cg_last_sig; +- +-- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off; +-+ const uint8_t *scan_x_cg, *scan_y_cg; +-+ const xy_off_t * scan_xy_off; ++ HEVCContext *s = avctxt->priv_data; ++@@ -2319,6 +4153,17 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ int y_ctb = 0; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; + +-+#ifndef RPI +- ptrdiff_t stride 
= s->frame->linesize[c_idx]; +- int hshift = s->ps.sps->hshift[c_idx]; +- int vshift = s->ps.sps->vshift[c_idx]; +-- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +-+ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +- ((x0 >> hshift) << s->ps.sps->pixel_shift)]; +-- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +-- uint8_t significant_coeff_group_flag[8][8] = {{0}}; +++#ifdef RPI +++ // * We don't support cross_component_prediction_enabled_flag but as that +++ // must be 0 unless we have 4:4:4 there is no point testing for it as we +++ // only deal with sand which is never 4:4:4 +++ // [support wouldn't be hard] +++ s->enable_rpi = +++ ((s->ps.sps->bit_depth == 8 && s->frame->format == AV_PIX_FMT_SAND128) || +++ (s->ps.sps->bit_depth == 10 && s->frame->format == AV_PIX_FMT_SAND64_10)); + +#endif +++ //printf("L0=%d L1=%d\n",s->sh.nb_refs[L1],s->sh.nb_refs[L1]); +++ ++ if (!ctb_addr_ts && s->sh.dependent_slice_segment_flag) { ++ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n"); ++ return AVERROR_INVALIDDATA; ++@@ -2332,8 +4177,14 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ } ++ } ++ + +#ifdef RPI +-+ int use_vpu; +++ // Worker must be idle at start +++ av_assert0(s->pass0_job == s->pass1_job && s->jb0 == s->jb1 && !s->jb0->pending); +++ rpi_begin(s); + +#endif +-+ int16_t *coeffs; +-+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero +- int explicit_rdpcm_flag = 0; +- int explicit_rdpcm_dir_flag; +++ ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) { ++- int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; +++ const int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; + +- int trafo_size = 1 << log2_trafo_size; +- int i; +-- int qp,shift,add,scale,scale_m; +-+ int qp,shift,scale; +- static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 }; +- const uint8_t *scale_matrix = NULL; +- uint8_t dc_scale; +- int pred_mode_intra = (c_idx == 0) ? 
lc->tu.intra_pred_mode : +- lc->tu.intra_pred_mode_c; ++ x_ctb = (ctb_addr_rs % ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; ++ y_ctb = (ctb_addr_rs / ((s->ps.sps->width + ctb_size - 1) >> s->ps.sps->log2_ctb_size)) << s->ps.sps->log2_ctb_size; ++@@ -2348,6 +4199,52 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; + +-- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); +-+ int prev_sig = 0; +-+ const int c_idx_nz = (c_idx != 0); ++ more_data = hls_coding_quadtree(s, x_ctb, y_ctb, s->ps.sps->log2_ctb_size, 0); + + +-+ int may_hide_sign; +++#ifdef RPI +++ // Report progress so we can use our MVs in other frames +++ // If we are tiled then this isn't really optimal but given that tiling +++ // can change on a per pic basis (described in PPS) other schemes are +++ // quite a lot harder +++ if (s->threads_type == FF_THREAD_FRAME && x_ctb + ctb_size >= s->ps.sps->width) { +++ ff_hevc_progress_signal_mv(s, y_ctb + ctb_size - 1); +++ } + + +- +- // Derive QP for dequant +- if (!lc->cu.cu_transquant_bypass_flag) { +-- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +-+ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +- static const uint8_t rem6[51 + 4 * 6 + 1] = { +- 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, +- 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, +-@@ -1065,9 +1614,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- }; +- int qp_y = lc->qp_y; +- +-+ may_hide_sign = s->ps.pps->sign_data_hiding_flag; +++ if (s->enable_rpi) { +++ int q_full = (++s->ctu_count >= s->max_ctu_count); + + +- if (s->ps.pps->transform_skip_enabled_flag && +- log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { +-- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx); +-+ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz); +-+ if (transform_skip_flag) { +-+ trans_skip_or_bypass = 1; +-+ if (lc->cu.pred_mode == MODE_INTRA && +-+ s->ps.sps->implicit_rdpcm_enabled_flag && +-+ (pred_mode_intra == 10 || pred_mode_intra == 26)) { +-+ may_hide_sign = 0; +++ if (rpi_inter_pred_next_ctu(&s->jb0->luma_ip) != 0) +++ q_full = 1; +++ if (rpi_inter_pred_next_ctu(&s->jb0->chroma_ip) != 0) +++ q_full = 1; +++ +++ s->jb0->deblk.blks[s->jb0->deblk.n].x_ctb = x_ctb; +++ s->jb0->deblk.blks[s->jb0->deblk.n++].y_ctb = y_ctb; +++ +++ if (q_full) { +++ if (s->used_for_ref) +++ { +++// printf("%d %d/%d job=%d, x,y=%d,%d\n",s->ctu_count,s->num_dblk_cmds[s->pass0_job],RPI_MAX_DEBLOCK_CMDS,s->pass0_job, x_ctb, y_ctb); +++ +++// worker_wait(s); +++ // Split work load onto separate threads so we make as rapid progress as possible with this frame +++ // Pass on this job to worker thread +++ worker_submit_job(s); +++ +++ // Make sure we have space to prepare the next job +++ worker_pass0_ready(s); +++ +++ // Prepare the next batch of commands +++ rpi_begin(s); +++ } else { +++ // Non-ref frame so do it all on this thread +++ rpi_do_all_passes(s); + + } + + } +- } +- +- if (c_idx == 0) { +-@@ -1100,39 +1659,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- qp += s->ps.sps->qp_bd_offset; +- } +- +-- shift = s->ps.sps->bit_depth + log2_trafo_size - 5; +-- add = 1 << (shift-1); +-- scale = level_scale[rem6[qp]] << (div6[qp]); +-- scale_m = 16; // default when no 
custom scaling lists. +-- dc_scale = 16; +-+ // Shift is set to one less than will actually occur as the scale +-+ // and saturate step adds 1 and then shifts right again +-+ shift = s->ps.sps->bit_depth + log2_trafo_size - 6; +-+ scale = level_scale[rem6[qp]]; +-+ if (div6[qp] >= shift) { +-+ scale <<= (div6[qp] - shift); +-+ shift = 0; +-+ } else { +-+ shift -= div6[qp]; +-+ } +- +-- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +-+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { +- const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? +-- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +-+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +- int matrix_id = lc->cu.pred_mode != MODE_INTRA; +- +- matrix_id = 3 * matrix_id + c_idx; +- +- scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; +-+ dc_scale = scale_matrix[0]; +- if (log2_trafo_size >= 4) +- dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; +- } +-+ else +-+ { +-+ static const uint8_t sixteen_scale[64] = { +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16, +-+ 16, 16, 16, 16, 16, 16, 16, 16 +-+ }; +-+ scale_matrix = sixteen_scale; +-+ dc_scale = 16; +++ + + } +- } else { +-+ static const uint8_t unit_scale[64] = { +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ 1, 1, 1, 1, 1, 1, 1, 1, +-+ }; +-+ scale_matrix = unit_scale; +- shift = 0; +-- add = 0; +-- scale = 0; +-- dc_scale = 0; +-+ scale = 2; // We will shift right to kill this +-+ dc_scale = 1; +++#endif + + +-+ may_hide_sign = 0; +++ ++ if (more_data < 0) { ++ s->tab_slice_address[ctb_addr_rs] = -1; ++ return more_data; ++@@ -2356,9 +4253,40 @@ static int hls_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++ ++ ctb_addr_ts++; ++ ff_hevc_save_states(s, ctb_addr_ts); +++#ifdef RPI +++ if (s->enable_rpi) +++ continue; +++#endif ++ ff_hevc_hls_filters(s, x_ctb, y_ctb, ctb_size); + } + +++#ifdef RPI + + +++ // Wait for the worker to finish all its jobs +++ if (s->enable_rpi) { +++ worker_wait(s); +++ } + + +++ // Finish off any half-completed rows +++ if (s->enable_rpi && s->ctu_count) { +++ rpi_do_all_passes(s); +++ } + + +- if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); +-+ trans_skip_or_bypass) { +-+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx_nz); +- if (explicit_rdpcm_flag) { +-- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx); +-+ may_hide_sign = 0; +-+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz); +- } +- } +- +-- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size, +-+ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size, +- &last_significant_coeff_x, &last_significant_coeff_y); +- +- if (last_significant_coeff_x > 3) { +-@@ -1160,119 +1756,134 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- int last_x_c = last_significant_coeff_x & 3; +- int last_y_c = last_significant_coeff_y & 3; +- +-- scan_x_off = ff_hevc_diag_scan4x4_x; 
+-- scan_y_off = ff_hevc_diag_scan4x4_y; +- num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; +-- if (trafo_size == 4) { +-+ +-+ switch (log2_trafo_size) { +-+ case 2: +- scan_x_cg = scan_1x1; +- scan_y_cg = scan_1x1; +-- } else if (trafo_size == 8) { +-+ break; +-+ case 3: +- num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = diag_scan2x2_x; +- scan_y_cg = diag_scan2x2_y; +-- } else if (trafo_size == 16) { +-+ break; +-+ case 4: +- num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = ff_hevc_diag_scan4x4_x; +- scan_y_cg = ff_hevc_diag_scan4x4_y; +-- } else { // trafo_size == 32 +-+ break; +-+ case 5: +-+ default: +- num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; +- scan_x_cg = ff_hevc_diag_scan8x8_x; +- scan_y_cg = ff_hevc_diag_scan8x8_y; +-+ break; +- } +- break; +- } +- case SCAN_HORIZ: +- scan_x_cg = horiz_scan2x2_x; +- scan_y_cg = horiz_scan2x2_y; +-- scan_x_off = horiz_scan4x4_x; +-- scan_y_off = horiz_scan4x4_y; +- num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; +- break; +- default: //SCAN_VERT +- scan_x_cg = horiz_scan2x2_y; +- scan_y_cg = horiz_scan2x2_x; +-- scan_x_off = horiz_scan4x4_y; +-- scan_y_off = horiz_scan4x4_x; +- num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; +- break; +- } +- num_coeff++; +- num_last_subset = (num_coeff - 1) >> 4; +- +-- for (i = num_last_subset; i >= 0; i--) { +-- int n, m; +-- int x_cg, y_cg, x_c, y_c, pos; +-+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant +++#if RPI_TSTATS +++ { +++ HEVCRpiStats *const ts = &s->tstats; + + +-+ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; +++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", +++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, +++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, +++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, +++ ts->y_pred2_hgt16, ts->y_pred2_hle16); +++ memset(ts, 0, sizeof(*ts)); +++ } +++#endif + + +-+ { +-+ const unsigned int ccount = 1 << (log2_trafo_size * 2); +++#endif +++ ++ if (x_ctb + ctb_size >= s->ps.sps->width && ++ y_ctb + ctb_size >= s->ps.sps->height) ++ ff_hevc_hls_filter(s, x_ctb, y_ctb, ctb_size); ++@@ -2393,6 +4321,11 @@ static int hls_decode_entry_wpp(AVCodecContext *avctxt, void *input_ctb_row, int ++ s = s1->sList[self_id]; ++ lc = s->HEVClc; ++ + +#ifdef RPI +-+ use_vpu = 0; +-+ if (s->enable_rpi) { +-+ use_vpu = !trans_skip_or_bypass && !lc->tu.cross_pf && log2_trafo_size>=4; +-+ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount); +-+#if HAVE_NEON +-+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); +-+#else +-+ memset(coeffs, 0, ccount * sizeof(int16_t)); +++ s->enable_rpi = 0; +++ //printf("Wavefront\n"); + +#endif +++ ++ if(ctb_row) { ++ ret = init_get_bits8(&lc->gb, s->data + s->sh.offset[ctb_row - 1], s->sh.size[ctb_row - 1]); ++ ++@@ -2773,9 +4706,47 @@ static int decode_nal_unit(HEVCContext *s, const H2645NAL *nal) ++ if (ret < 0) ++ return ret; ++ ++- if (s->max_ra == INT_MAX) { ++- if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { ++- s->max_ra = s->poc; +++ // The definition of _N unit types is "non-reference for other frames +++ // with the same temporal_id" so they may/will be ref frames for pics +++ // with a higher temporal_id. 
+++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || +++ !(s->nal_unit_type == NAL_TRAIL_N || +++ s->nal_unit_type == NAL_TSA_N || +++ s->nal_unit_type == NAL_STSA_N || +++ s->nal_unit_type == NAL_RADL_N || +++ s->nal_unit_type == NAL_RASL_N); +++ +++#if DEBUG_DECODE_N +++ { +++ static int z = 0; +++ if (IS_IDR(s)) { +++ z = 1; +++ } +++ if (z != 0 && z++ > DEBUG_DECODE_N) { +++ s->is_decoded = 0; +++ break; +++ } + + } +-+ else + +#endif +-+ { +-+ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); +-+ memset(coeffs, 0, ccount * sizeof(int16_t)); +++ if (!s->used_for_ref && s->avctx->skip_frame >= AVDISCARD_NONREF) { +++ s->is_decoded = 0; +++ break; + + } +-+ } + + +-+ i = num_last_subset; +-+ do { +- int implicit_non_zero_coeff = 0; +-- int64_t trans_coeff_level; +-- int prev_sig = 0; +-- int offset = i << 4; +-- int rice_init = 0; +-+ int n_end; +++ if (s->sh.first_slice_in_pic_flag) { +++ if (s->max_ra == INT_MAX) { +++ if (s->nal_unit_type == NAL_CRA_NUT || IS_BLA(s)) { +++ s->max_ra = s->poc; +++ } else { +++ if (IS_IDR(s)) +++ s->max_ra = INT_MIN; +++ } +++ } +++ +++ if ((s->nal_unit_type == NAL_RASL_R || s->nal_unit_type == NAL_RASL_N) && +++ s->poc <= s->max_ra) { +++ s->is_decoded = 0; +++ break; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++@@ -2896,10 +4867,25 @@ static int decode_nal_units(HEVCContext *s, const uint8_t *buf, int length) ++ } ++ } + +- uint8_t significant_coeff_flag_idx[16]; +-- uint8_t nb_significant_coeff_flag = 0; +-- +-- x_cg = scan_x_cg[i]; +-- y_cg = scan_y_cg[i]; +-- +-- if ((i < num_last_subset) && (i > 0)) { +-- int ctx_cg = 0; +-- if (x_cg < (1 << (log2_trafo_size - 2)) - 1) +-- ctx_cg += significant_coeff_group_flag[x_cg + 1][y_cg]; +-- if (y_cg < (1 << (log2_trafo_size - 2)) - 1) +-- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1]; +-- +-- significant_coeff_group_flag[x_cg][y_cg] = +-- significant_coeff_group_flag_decode(s, c_idx, ctx_cg); +-- implicit_non_zero_coeff = 1; +-- } else { +-- significant_coeff_group_flag[x_cg][y_cg] = +-- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) || +-- (x_cg == 0 && y_cg == 0)); +-- } ++-fail: ++- if (s->ref && s->threads_type == FF_THREAD_FRAME) ++- ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); + - +-- last_scan_pos = num_coeff - offset - 1; +-+ unsigned int nb_significant_coeff_flag = 0; +++fail: // Also success path +++ if (s->ref != NULL) { +++ if (s->used_for_ref && s->threads_type == FF_THREAD_FRAME) { +++#ifdef RPI +++ rpi_flush_ref_frame_progress(s, &s->ref->tf, s->ps.sps->height); +++#endif +++ ff_hevc_progress_signal_all_done(s); +++ } +++#ifdef RPI +++ // * Flush frame will become confused if we pass it something +++ // that doesn't have an expected number of planes (e.g. 400) +++ // So only flush if we are sure we can. 
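+++ // (Illustrative note, editor's addition rather than patch code: the
+++ // flush_frame() helper defined earlier does the handover as
+++ // rpi_cache_flush_env_t *rfe = rpi_cache_flush_init();
+++ // rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
+++ // rpi_cache_flush_finish(rfe);
+++ // i.e. a writeback+invalidate of the ARM-side cache, since mmal/the
+++ // VideoCore reads the buffer without snooping that cache.)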
+++ else if (s->enable_rpi) { +++ // Flush frame to real memory as we expect to be able to pass +++ // it straight on to mmal +++ flush_frame(s, s->frame); +++ } +++#endif +++ } ++ return ret; ++ } + +- if (i == num_last_subset) { +-+ // First time through +-+ int last_scan_pos = num_coeff - (i << 4) - 1; +- n_end = last_scan_pos - 1; +- significant_coeff_flag_idx[0] = last_scan_pos; +- nb_significant_coeff_flag = 1; +- } else { +- n_end = 15; +-+ implicit_non_zero_coeff = (i != 0); +- } ++@@ -3070,6 +5056,83 @@ fail: ++ return AVERROR(ENOMEM); ++ } + +-- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2) +-- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg]; +-- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2) +-- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1); +-- +-- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) { +-- static const uint8_t ctx_idx_map[] = { +-- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2 +-- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0 +-- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1 +-- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2 +-- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default +-+ if (n_end >= 0) { +-+ static const uint8_t ctx_idx_maps_ts2[3][16] = { +-+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 +-+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 +-+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 +-+ }; +-+ static const uint8_t ctx_idx_maps[3][4][16] = { +-+ { +-+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ }, +-+ { +-+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ }, +-+ { +-+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +-+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 1 +-+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 2 +-+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +-+ } +- }; +- const uint8_t *ctx_idx_map_p; +- int scf_offset = 0; +-- if (s->ps.sps->transform_skip_context_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16]; +-- if (c_idx == 0) { +-- scf_offset = 40; +-- } else { +-- scf_offset = 14 + 27; +-- } +++#ifdef RPI +++static av_cold void hevc_init_worker(HEVCContext * const s) +++{ +++ int err; + + +-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +-+ ctx_idx_map_p = ctx_idx_maps[0][3]; +-+ scf_offset = 40 + c_idx_nz; +- } else { +-- if (c_idx != 0) +-+ if (c_idx_nz != 0) +- scf_offset = 27; +++ memset(s->jobs, 0, sizeof(s->jobs)); + + +- if (log2_trafo_size == 2) { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0]; +-+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; +- } else { +-- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4]; +-- if (c_idx == 0) { +-- if ((x_cg > 0 || y_cg > 0)) +-+ 
ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; +-+ if (!c_idx_nz) { +-+ if (i != 0) +- scf_offset += 3; +++ for (unsigned int job = 0; job < RPI_MAX_JOBS; job++) { +++ HEVCRpiJob * const jb = s->jobs + job; + + +- if (log2_trafo_size == 3) { +- scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15; +- } else { +-@@ -1286,34 +1897,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- } +- } +- } +-- for (n = n_end; n > 0; n--) { +-- x_c = scan_x_off[n]; +-- y_c = scan_y_off[n]; +-- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) { +-- significant_coeff_flag_idx[nb_significant_coeff_flag] = n; +-- nb_significant_coeff_flag++; +++ sem_init(&jb->sem_in, 0, 0); +++ sem_init(&jb->sem_out, 0, 0); +++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); + + +-+ if (n_end > 0) { +-+ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc, +-+ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, +-+ n_end, ctx_idx_map_p, +-+ significant_coeff_flag_idx + nb_significant_coeff_flag); +++ jb->intra.n = 0; +++ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS); + + +-+ nb_significant_coeff_flag += cnt; +-+ if (cnt != 0) { +- implicit_non_zero_coeff = 0; +- } +- } +++ // ** Sizeof the union structure might be overkill but at the moment it +++ // is correct (it certainly isn't going to be too small) + + +- if (implicit_non_zero_coeff == 0) { +-- if (s->ps.sps->transform_skip_context_enabled_flag && +-- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { +-- if (c_idx == 0) { +-- scf_offset = 42; +-- } else { +-- scf_offset = 16 + 27; +-- } +-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +-+ scf_offset = 42 + c_idx_nz; +- } else { +- if (i == 0) { +-- if (c_idx == 0) +-- scf_offset = 0; +-- else +-- scf_offset = 27; +-+ scf_offset = c_idx_nz ? 27 : 0; +- } else { +- scf_offset = 2 + scf_offset; +- } +- } +-- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) { +-+ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) { +- significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; +- nb_significant_coeff_flag++; +- } +-@@ -1323,141 +1930,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- } +- } +- +-- n_end = nb_significant_coeff_flag; +-+ if (nb_significant_coeff_flag != 0) { +-+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | +-+ ((i != 0 && !c_idx_nz) ? 
2 : 0) | +-+ prev_subset_coded; +-+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + +-+ (gt1_idx_delta << 2); +-+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + +-+ gt1_idx_delta; +++ rpi_inter_pred_alloc(&jb->chroma_ip, +++ QPU_N_MAX, QPU_N_GRP, +++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t), +++ QPU_C_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_c_t)); +++ rpi_inter_pred_alloc(&jb->luma_ip, +++ QPU_N_MAX, QPU_N_GRP, +++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t), +++ QPU_Y_CMD_PER_CTU_MAX * sizeof(qpu_mc_pred_y_t)); + + +-+ const unsigned int x_cg = scan_x_cg[i]; +-+ const unsigned int y_cg = scan_y_cg[i]; +-+ int16_t * const blk_coeffs = coeffs + +-+ ((x_cg + (y_cg << log2_trafo_size)) << 2); +-+ // This calculation is 'wrong' for log2_traffo_size == 2 +-+ // but that doesn't mattor as in this case x_cg & y_cg +-+ // are always 0 so result is correct (0) anyway +-+ const uint8_t * const blk_scale = scale_matrix + +-+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); +++ jb->deblk.n = 0; +++ jb->deblk.blks = av_malloc(sizeof(jb->deblk.blks[0]) * RPI_MAX_DEBLOCK_CMDS); +++ } +++ s->pass0_job = 0; +++ s->pass1_job = 0; +++ s->jb0 = s->jobs + 0; +++ s->jb1 = s->jobs + 0; + + +-+ // * The following code block doesn't deal with these flags: +-+ // (nor did the one it replaces) +-+ // +-+ // cabac_bypass_alignment_enabled_flag +-+ // This should be easy but I can't find a test case +-+ // extended_precision_processing_flag +-+ // This can extend the required precision past 16bits +-+ // so is probably tricky - also no example found yet +++ err = pthread_create(&s->worker_thread, NULL, worker_start, s); +++ if (err) { +++ printf("Failed to create worker thread\n"); +++ exit(-1); +++ } +++} + + +-+#if USE_N_END_1 +-+ if (nb_significant_coeff_flag == 1) { +-+ // There is a small gain to be had from special casing the single +-+ // transform coefficient case. The reduction in complexity +-+ // makes up for the code duplicatioon. 
+++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) +++{ +++ av_freep(&ipe->q); +++ gpu_free(&ipe->gptr); +++} + + +-+ int trans_coeff_level = 1; +-+ int coeff_sign_flag; +-+ int coded_val = 0; +++static av_cold void hevc_exit_worker(HEVCContext *s) +++{ +++ void *res; +++ unsigned int i; + + +-+ // initialize first elem of coeff_bas_level_greater1_flag +-+ prev_subset_coded = 0; +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ s->jobs[i].terminate = 1; +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ sem_post(&s->jobs[i].sem_in); +++ pthread_join(s->worker_thread, &res); + + +-+ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) { +-+ trans_coeff_level = 2; +-+ prev_subset_coded = 1; +-+ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); +-+ } +- +-+ // Probably not worth the overhead of starting by22 for just one value +-+ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); +- +-- if (n_end) { +-- int first_nz_pos_in_cg; +-- int last_nz_pos_in_cg; +-- int c_rice_param = 0; +-- int first_greater1_coeff_idx = -1; +-- uint8_t coeff_abs_level_greater1_flag[8]; +-- uint16_t coeff_sign_flag; +-- int sum_abs = 0; +-- int sign_hidden; +-- int sb_type; +-+ if (coded_val) +-+ { +-+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { +-+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0); +-+ } else { +-+ uint8_t * const stat_coeff = +-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +-+ const unsigned int c_rice_param = *stat_coeff >> 2; +-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +- +-+ trans_coeff_level = 3 + last_coeff_abs_level_remaining; +-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +-+ } +-+ } +++ for(i = 0; i < RPI_MAX_JOBS; i++) +++ { +++ HEVCRpiJob * const jb = s->jobs + i; +++ +++ sem_destroy(&jb->sem_in); +++ sem_destroy(&jb->sem_out); +++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); +++ av_freep(&jb->intra.cmds); +++ av_freep(&jb->deblk.blks); +++ rpi_free_inter_pred(&jb->chroma_ip); +++ rpi_free_inter_pred(&jb->luma_ip); +++ } +++} +++ +++#endif +++ ++ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ { ++ HEVCContext *s = avctx->priv_data; ++@@ -3081,10 +5144,19 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) + +-- // initialize first elem of coeff_bas_level_greater1_flag +-- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; +-+ { +-+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; +-+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; +-+ const unsigned int scale_m = blk_scale[xy_off->scale]; ++ av_freep(&s->cabac_state); + +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { +-- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) +-- sb_type = 2 * (c_idx == 0 ? 1 : 0); +-- else +-- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; +-- c_rice_param = lc->stat_coeff[sb_type] / 4; +-- } +-- +-- if (!(i == num_last_subset) && greater1_ctx == 0) +-- ctx_set++; +-- greater1_ctx = 1; +-- last_nz_pos_in_cg = significant_coeff_flag_idx[0]; +-- +-- for (m = 0; m < (n_end > 8 ? 
8 : n_end); m++) { +-- int inc = (ctx_set << 2) + greater1_ctx; +-- coeff_abs_level_greater1_flag[m] = +-- coeff_abs_level_greater1_flag_decode(s, c_idx, inc); +-- if (coeff_abs_level_greater1_flag[m]) { +-- greater1_ctx = 0; +-- if (first_greater1_coeff_idx == -1) +-- first_greater1_coeff_idx = m; +-- } else if (greater1_ctx > 0 && greater1_ctx < 3) { +-- greater1_ctx++; +-+ blk_coeffs[xy_off->coeff] = trans_scale_sat( +-+ (trans_coeff_level ^ k) - k, // Apply sign +-+ scale, +-+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m, +-+ shift); +- } +- } +-- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1]; +-- +-- if (lc->cu.cu_transquant_bypass_flag || +-- (lc->cu.pred_mode == MODE_INTRA && +-- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag && +-- (pred_mode_intra == 10 || pred_mode_intra == 26 )) || +-- explicit_rdpcm_flag) +-- sign_hidden = 0; +- else +-- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4); ++- for (i = 0; i < 3; i++) { ++- av_freep(&s->sao_pixel_buffer_h[i]); ++- av_freep(&s->sao_pixel_buffer_v[i]); +++#ifdef RPI +++ +++ hevc_exit_worker(s); +++ vpu_qpu_term(); +++ for (i = 0; i != 2; ++i) { +++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); ++ } +++ +++ av_rpi_zc_uninit(avctx); + +#endif +-+ { +-+ int sign_hidden = may_hide_sign; +-+ int levels[16]; // Should be able to get away with int16_t but that fails some tests +-+ uint32_t coeff_sign_flags; +-+ uint32_t coded_vals = 0; +-+ // Sum(abs(level[])) +-+ // In fact we only need the bottom bit and in some future +-+ // version that may be all we calculate +-+ unsigned int sum_abs; + + +-+ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels, +-+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); +++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] +++ av_freep(&s->sao_pixel_buffer_v[0]); ++ av_frame_free(&s->output_frame); ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++@@ -3122,6 +5194,7 @@ static av_cold int hevc_decode_free(AVCodecContext *avctx) ++ return 0; ++ } ++ + + +-+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) +-+ sign_hidden = 0; ++ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ { ++ HEVCContext *s = avctx->priv_data; ++@@ -3135,6 +5208,37 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ s->HEVClcList[0] = s->HEVClc; ++ s->sList[0] = s; ++ +++#ifdef RPI +++ // Whilst FFmpegs init fn is only called once the close fn is called as +++ // many times as we have threads (init_thread_copy is called for the +++ // threads). So to match init & term put the init here where it will be +++ // called by both init & copy +++ av_rpi_zc_init(avctx); + + +-+ // -- Start bypass block +++ if (vpu_qpu_init() != 0) +++ goto fail; + + +-+ bypass_start(s); +++#if RPI_INTER +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ { +++ static const uint32_t dframe[1] = {0x80808080}; +++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; +++ } +++#endif +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++ s->qpu_dummy_frame_qpu = qpu_fn(mc_start); // Use our code as a dummy frame +++#endif +++#endif +++ //gpu_malloc_uncached(2048*64,&s->dummy); + + +-+ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden); +++ s->enable_rpi = 0; + + +-+ if (coded_vals != 0) +-+ { +-+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; +-+ uint8_t * stat_coeff = !rice_adaptation_enabled ? 
NULL : +-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +-+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; +-+ int * level = levels - 1; +++ for (i = 0; i != 2; ++i) { +++ ff_hevc_rpi_progress_init_state(s->progress_states + i); +++ } +++ hevc_init_worker(s); +++#endif + + +-+ do { +-+ { +-+ const unsigned int z = hevc_clz32(coded_vals) + 1; +-+ level += z; +-+ coded_vals <<= z; +-+ } ++ s->cabac_state = av_malloc(HEVC_CONTEXTS); ++ if (!s->cabac_state) ++ goto fail; ++@@ -3148,6 +5252,7 @@ static av_cold int hevc_init_context(AVCodecContext *avctx) ++ if (!s->DPB[i].frame) ++ goto fail; ++ s->DPB[i].tf.f = s->DPB[i].frame; +++ s->DPB[i].dpb_no = i; ++ } + +-- if (first_greater1_coeff_idx != -1) { +-- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set); +-- } +-- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) { +-- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag); +-- } else { +-- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1)); +-- } +-+ { +-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); +-+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; +-+ +-+ sum_abs += last_coeff_abs_level_remaining + 1; +-+ *level = trans_coeff_level; +- +-- for (m = 0; m < n_end; m++) { +-- n = significant_coeff_flag_idx[m]; +-- GET_COORD(offset, n); +-- if (m < 8) { +-- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m]; +-- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) { +-- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +-- +-- trans_coeff_level += last_coeff_abs_level_remaining; +-- if (trans_coeff_level > (3 << c_rice_param)) +-- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { +-- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; +-- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) +-- lc->stat_coeff[sb_type]++; +-- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) +-- if (lc->stat_coeff[sb_type] > 0) +-- lc->stat_coeff[sb_type]--; +-- rice_init = 1; +-+ if (stat_coeff != NULL) +-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +-+ stat_coeff = NULL; +-+ +-+ if (trans_coeff_level > (3 << c_rice_param) && +-+ (c_rice_param < 4 || rice_adaptation_enabled)) +-+ ++c_rice_param; +- } +-- } +-- } else { +-- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); +-- +-- trans_coeff_level = 1 + last_coeff_abs_level_remaining; +-- if (trans_coeff_level > (3 << c_rice_param)) +-- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? 
c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); +-- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { +-- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; +-- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) +-- lc->stat_coeff[sb_type]++; +-- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) +-- if (lc->stat_coeff[sb_type] > 0) +-- lc->stat_coeff[sb_type]--; +-- rice_init = 1; +-- } +-+ } while (coded_vals != 0); +- } +-- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) { +-- sum_abs += trans_coeff_level; +-- if (n == first_nz_pos_in_cg && (sum_abs&1)) +-- trans_coeff_level = -trans_coeff_level; +-+ +-+ // sign_hidden = 0 or 1 so we can combine the tests +-+ if ((sign_hidden & sum_abs) != 0) { +-+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; +- } +-- if (coeff_sign_flag >> 15) +-- trans_coeff_level = -trans_coeff_level; +-- coeff_sign_flag <<= 1; +-- if(!lc->cu.cu_transquant_bypass_flag) { +-- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +-- if(y_c || x_c || log2_trafo_size < 4) { +-- switch(log2_trafo_size) { +-- case 3: pos = (y_c << 3) + x_c; break; +-- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break; +-- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break; +-- default: pos = (y_c << 2) + x_c; break; +-- } +-- scale_m = scale_matrix[pos]; +-- } else { +-- scale_m = dc_scale; +-- } +-+ +-+ bypass_finish(s); +-+ +-+ // -- Finish bypass block +-+ +-+ // Scale loop +-+ { +-+ int m = nb_significant_coeff_flag - 1; +-+ +-+ // Deal with DC component (if any) first +-+ if (i == 0 && significant_coeff_flag_idx[m] == 0) +-+ { +-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +-+ blk_coeffs[0] = trans_scale_sat( +-+ (levels[m] ^ k) - k, scale, dc_scale, shift); +-+ --m; +- } +-- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift; +-- if(trans_coeff_level < 0) { +-- if((~trans_coeff_level) & 0xFffffffffff8000) +-- trans_coeff_level = -32768; +-- } else { +-- if(trans_coeff_level & 0xffffffffffff8000) +-- trans_coeff_level = 32767; +-+ +-+#if !USE_N_END_1 +-+ // If N_END_1 set then m was at least 1 initially +-+ if (m >= 0) +-+#endif +-+ { +-+ do { +-+ const xy_off_t * const xy_off = scan_xy_off + +-+ significant_coeff_flag_idx[m]; +-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +-+ +-+ blk_coeffs[xy_off->coeff] = trans_scale_sat( +-+ (levels[m] ^ k) - k, +-+ scale, +-+ blk_scale[xy_off->scale], +-+ shift); +-+ } while (--m >= 0); +- } +- } +-- coeffs[y_c * trafo_size + x_c] = trans_coeff_level; +-+ +- } +- } +-- } +-+ } while ((i = next_subset(s, i, c_idx_nz, +-+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); ++ s->max_ra = INT_MAX; ++@@ -3349,9 +5454,9 @@ static av_cold int hevc_decode_init(AVCodecContext *avctx) ++ } + +- if (lc->cu.cu_transquant_bypass_flag) { +- if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && +-@@ -1467,7 +2118,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +- } +- } else { +-- if (transform_skip_flag) { +-+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass +- int rot = s->ps.sps->transform_skip_rotation_enabled_flag && +- log2_trafo_size == 2 && +- lc->cu.pred_mode == MODE_INTRA; +-@@ -1475,7 +2126,6 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- for (i = 0; i < 8; i++) +- FFSWAP(int16_t, 
coeffs[i], coeffs[16 - i - 1]); +- } +-- +- s->hevcdsp.transform_skip(coeffs, log2_trafo_size); ++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) ++- s->threads_type = FF_THREAD_FRAME; ++- else ++- s->threads_type = FF_THREAD_SLICE; +++ s->threads_type = FF_THREAD_FRAME; +++ else +++ s->threads_type = FF_THREAD_SLICE; + +- if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && +-@@ -1486,8 +2136,26 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); +- } +- } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { +-- s->hevcdsp.idct_4x4_luma(coeffs); +-+ s->hevcdsp.idct_4x4_luma(coeffs); +- } else { +-+#ifdef RPI +-+ if (!use_vpu) { +-+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); +-+ if (max_xy == 0) { +-+ s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +-+ } else { +-+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; +-+ if (max_xy < 4) +-+ col_limit = FFMIN(4, col_limit); +-+ else if (max_xy < 8) +-+ col_limit = FFMIN(8, col_limit); +-+ else if (max_xy < 12) +-+ col_limit = FFMIN(24, col_limit); +-+ +-+ s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); +-+ } +-+ } +-+#else +- int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); +- if (max_xy == 0) +- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +-@@ -1501,6 +2169,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- col_limit = FFMIN(24, col_limit); +- s->hevcdsp.idct[log2_trafo_size-2](coeffs, col_limit); +- } +-+#endif +- } +- } +- if (lc->tu.cross_pf) { +-@@ -1510,7 +2179,11 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, +- coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); +- } +- } +-+#ifdef RPI +-+ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); +-+#else +- s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); +-+#endif ++ return 0; + } ++@@ -3410,6 +5515,8 @@ AVCodec ff_hevc_decoder = { ++ .update_thread_context = hevc_update_thread_context, ++ .init_thread_copy = hevc_init_thread_copy, ++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | +++// 0, +++// AV_CODEC_CAP_FRAME_THREADS, ++ AV_CODEC_CAP_SLICE_THREADS | AV_CODEC_CAP_FRAME_THREADS, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++ }; ++diff --git a/libavcodec/hevc.h b/libavcodec/hevc.h ++index 162ca0e582..d647232638 100644 ++--- a/libavcodec/hevc.h +++++ b/libavcodec/hevc.h ++@@ -23,6 +23,7 @@ ++ #ifndef AVCODEC_HEVC_H ++ #define AVCODEC_HEVC_H + +- void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) +-diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c +-index 1f33b0c..3143b4f 100644 +---- a/libavcodec/hevc_filter.c +-+++ b/libavcodec/hevc_filter.c +-@@ -22,6 +22,12 @@ +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +++#include "rpi_opts.h" ++ #include "libavutil/buffer.h" ++ #include "libavutil/md5.h" + +-+//#define DISABLE_SAO +-+//#define DISABLE_DEBLOCK +-+//#define DISABLE_STRENGTHS +-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +-+//#define DISABLE_DEBLOCK_NONREF ++@@ -37,6 +38,10 @@ ++ #include "thread.h" ++ #include "videodsp.h" ++ +++#ifdef RPI +++#include "rpi_qpu.h" +++#endif + + +- #include "libavutil/common.h" +- #include "libavutil/internal.h" ++ #define MAX_DPB_SIZE 16 // A.4.1 ++ #define MAX_REFS 16 + +-@@ 
-31,6 +37,11 @@ ++@@ -463,6 +468,7 @@ typedef struct HEVCSPS { ++ int implicit_rdpcm_enabled_flag; ++ int explicit_rdpcm_enabled_flag; ++ int intra_smoothing_disabled_flag; +++ int high_precision_offsets_enabled_flag; ++ int persistent_rice_adaptation_enabled_flag; + +- #include "bit_depth_template.c" ++ ///< coded frame dimension in various units ++@@ -660,6 +666,7 @@ typedef struct CodingUnit { ++ uint8_t cu_transquant_bypass_flag; ++ } CodingUnit; + +-+#ifdef RPI +-+#include "rpi_qpu.h" +-+#include "rpi_zc.h" +++#if 0 ++ typedef struct Mv { ++ int16_t x; ///< horizontal component of motion vector ++ int16_t y; ///< vertical component of motion vector ++@@ -670,6 +677,7 @@ typedef struct MvField { ++ int8_t ref_idx[2]; ++ int8_t pred_flag; ++ } MvField; + +#endif ++ ++ typedef struct NeighbourAvailable { ++ int cand_bottom_left; ++@@ -745,9 +753,23 @@ typedef struct HEVCFrame { ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; + + +- #define LUMA 0 +- #define CB 1 +- #define CR 2 +-@@ -139,6 +150,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) +- return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; +- } +++ // Entry no in DPB - can be used as a small unique +++ // frame identifier (within the current thread) +++ uint8_t dpb_no; ++ } HEVCFrame; + +-+static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) +-+{ + +#ifdef RPI +-+ return c_idx != 0 && rpi_sliced_frame(s->frame) ? 1 : s->ps.sps->pixel_shift; +-+#else +-+ return s->ps.sps->pixel_shift; +++typedef struct HEVCLocalContextIntra { +++ TransformUnit tu; +++ NeighbourAvailable na; +++} HEVCLocalContextIntra; + +#endif +-+} + + +- static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, +- intptr_t stride_dst, intptr_t stride_src) +- { +-@@ -193,7 +213,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, +- int stride_src, int x, int y, int width, int height, +- int c_idx, int x_ctb, int y_ctb) +- { +-- int sh = s->ps.sps->pixel_shift; +-+ const unsigned int sh = pixel_shift(s, c_idx); +- int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; +- int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; ++ typedef struct HEVCLocalContext { +++ TransformUnit tu; // Moved to start to match HEVCLocalContextIntra (yuk!) 
+++ NeighbourAvailable na; +++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; + +-@@ -224,13 +244,14 @@ static void restore_tqb_pixels(HEVCContext *s, +- int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); +- int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); +- int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); +-- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; +-+ const unsigned int sh = pixel_shift(s, c_idx); +-+ int len = (min_pu_size >> hshift) << sh; +- for (y = y_min; y < y_max; y++) { +- for (x = x_min; x < x_max; x++) { +- if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { +- int n; +-- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +-- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +-+ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +-+ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +- for (n = 0; n < (min_pu_size >> vshift); n++) { +- memcpy(src, dst, len); +- src += stride_src; +-@@ -246,7 +267,7 @@ static void restore_tqb_pixels(HEVCContext *s, +- +- static void sao_filter_CTB(HEVCContext *s, int x, int y) +- { +-- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; +-+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +- HEVCLocalContext *lc = s->HEVClc; +- int c_idx; +- int edges[4]; // 0 left 1 top 2 right 3 bottom +-@@ -267,12 +288,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- uint8_t right_tile_edge = 0; +- uint8_t up_tile_edge = 0; +- uint8_t bottom_tile_edge = 0; +-+#ifdef RPI +-+ const int sliced = rpi_sliced_frame(s->frame); +-+ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +-+#else +-+ const int plane_count = (s->ps.sps->chroma_format_idc ? 3 : 1); +-+#endif ++ uint8_t stat_coeff[4]; ++@@ -762,8 +784,6 @@ typedef struct HEVCLocalContext { + +- edges[0] = x_ctb == 0; +- edges[1] = y_ctb == 0; +- edges[2] = x_ctb == s->ps.sps->ctb_width - 1; +- edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ int qPy_pred; + +-+#ifdef DISABLE_SAO +-+ return; +-+#endif +-+ +- if (restore) { +- if (!edges[0]) { +- left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; +-@@ -304,7 +335,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- } +- } ++- TransformUnit tu; ++- ++ uint8_t ctb_left_flag; ++ uint8_t ctb_up_flag; ++ uint8_t ctb_up_right_flag; ++@@ -779,7 +799,6 @@ typedef struct HEVCLocalContext { ++ int ct_depth; ++ CodingUnit cu; ++ PredictionUnit pu; ++- NeighbourAvailable na; + +-- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 
3 : 1); c_idx++) { +-+ for (c_idx = 0; c_idx < plane_count; c_idx++) { +- int x0 = x >> s->ps.sps->hshift[c_idx]; +- int y0 = y >> s->ps.sps->vshift[c_idx]; +- int stride_src = s->frame->linesize[c_idx]; +-@@ -313,28 +344,82 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); +- int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); +- int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; +-- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +-- int stride_dst; +-+ ptrdiff_t stride_dst; +- uint8_t *dst; ++ #define BOUNDARY_LEFT_SLICE (1 << 0) ++ #define BOUNDARY_LEFT_TILE (1 << 1) ++@@ -790,6 +809,207 @@ typedef struct HEVCLocalContext { ++ int boundary_flags; ++ } HEVCLocalContext; + + +#ifdef RPI +-+ const unsigned int sh = (sliced && c_idx != 0) ? 1 : s->ps.sps->pixel_shift; +-+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +-+ uint8_t * const src = !sliced ? +-+ &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)] : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0, y0); +-+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : +-+ !sliced ? src - (1 << sh) : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0 - 1, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0 - 1, y0); +-+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : +-+ !sliced ? src + (width << sh) : +-+ c_idx == 0 ? +-+ rpi_sliced_frame_pos_y(s->frame, x0 + width, y0) : +-+ rpi_sliced_frame_pos_c(s->frame, x0 + width, y0); + + +++// The processing is done in chunks +++// Increasing RPI_NUM_CHUNKS will reduce time spent activating QPUs and cache flushing, +++// but allocate more memory and increase the latency before data in the next frame can be processed +++#define RPI_NUM_CHUNKS 4 +++#define RPI_CHUNK_SIZE 12 +++#define RPI_ROUND_TO_LINES 0 + + +-+ if (sliced && c_idx > 1) { +-+ break; +-+ } +-+#else +-+ const unsigned int sh = s->ps.sps->pixel_shift; +-+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +-+ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +-+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); +-+ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); +-+#endif +++// RPI_MAX_WIDTH is maximum width in pixels supported by the accelerated code +++#define RPI_MAX_WIDTH (RPI_NUM_CHUNKS*64*RPI_CHUNK_SIZE) +++ +++// Worst case is for 4:4:4 4x4 blocks with 64 high coding tree blocks, so 16 MV cmds per 4 pixels across for each colour plane, * 2 for bi +++#define RPI_MAX_MV_CMDS_Y (2*16*1*(RPI_MAX_WIDTH/4)) +++#define RPI_MAX_MV_CMDS_C (2*16*2*(RPI_MAX_WIDTH/4)) +++// Each block can have an intra prediction and a transform_add command +++#define RPI_MAX_PRED_CMDS (2*16*3*(RPI_MAX_WIDTH/4)) +++// Worst case is 16x16 CTUs +++#define RPI_MAX_DEBLOCK_CMDS (RPI_MAX_WIDTH*4/16) +++ +++#define RPI_CMD_LUMA_UNI 0 +++#define RPI_CMD_CHROMA_UNI 1 +++#define RPI_CMD_LUMA_BI 2 +++#define RPI_CMD_CHROMA_BI 3 +++#define RPI_CMD_V_BI 4 +++ +++// Command for inter prediction +++typedef struct HEVCMvCmd { +++ uint8_t cmd; +++ uint8_t block_w; +++ uint8_t block_h; +++ int8_t ref_idx[2]; +++ uint16_t dststride; +++ uint16_t srcstride; +++ uint16_t srcstride1; +++ int16_t weight; +++ int16_t offset; +++ int16_t x_off; +++ int16_t y_off; +++ uint8_t *src; +++ uint8_t *src1; +++ uint8_t *dst; +++ Mv mv; +++ Mv mv1; +++} HEVCMvCmd; +++ +++ +++// Command for intra prediction and transform_add of predictions to coefficients +++enum rpi_pred_cmd_e +++{ +++ RPI_PRED_ADD_RESIDUAL, +++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx +++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx +++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V +++ RPI_PRED_ADD_DC, +++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C +++ RPI_PRED_ADD_DC_V, +++ RPI_PRED_INTRA, +++ RPI_PRED_I_PCM, +++ RPI_PRED_CMD_MAX +++}; +++ +++typedef struct HEVCPredCmd { +++ uint8_t type; +++ uint8_t size; // log2 "size" used by all variants +++ uint8_t na; // i_pred - but left here as they pack well +++ uint8_t c_idx; // i_pred +++ union { +++ struct { // TRANSFORM_ADD +++ uint8_t * dst; +++ const int16_t * buf; +++ uint16_t stride; // Should be good enough for all pic fmts we use +++ int16_t dc; +++ } ta; +++ struct { +++ uint8_t * dst; +++ uint32_t stride; +++ int dc; +++ } dc; +++ struct { // INTRA +++ uint16_t x; +++ uint16_t y; +++ enum IntraPredMode mode; +++ } i_pred; +++ struct { // I_PCM +++ uint16_t x; +++ uint16_t y; +++ const void * src; +++ uint32_t src_len; +++ } i_pcm; +++ }; +++} HEVCPredCmd; + + +- switch (sao->type_idx[c_idx]) { +- case SAO_BAND: +- copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, +- x_ctb, y_ctb); +- if (s->ps.pps->transquant_bypass_enable_flag || +- (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { +-- dst = lc->edge_emu_buffer; +-- stride_dst = 2*MAX_PB_SIZE; +-- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); +-- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +-- sao->offset_val[c_idx], sao->band_position[c_idx], +-- width, height); +-- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +-- x, y, width, height, c_idx); +-+ dst = lc->edge_emu_buffer; +-+ stride_dst = 2*MAX_PB_SIZE; +-+ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); +-+#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, +-+ sao->offset_val[1], sao->band_position[1], +-+ sao->offset_val[2], sao->band_position[2], +-+ width, height); +-+ } +-+ else + +#endif +-+ { +-+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +-+ sao->offset_val[c_idx], 
sao->band_position[c_idx], +-+ width, height); +-+ } +-+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +-+ x, y, width, height, c_idx); +- } else { +-- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +-- sao->offset_val[c_idx], sao->band_position[c_idx], +-- width, height); +++ + +#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, +-+ sao->offset_val[1], sao->band_position[1], +-+ sao->offset_val[2], sao->band_position[2], +-+ width, height); +-+ } +-+ else +-+#endif +-+ { +-+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +-+ sao->offset_val[c_idx], sao->band_position[c_idx], +-+ width, height); +-+ } +- } +- sao->type_idx[c_idx] = SAO_APPLIED; +- break; +-@@ -342,108 +427,117 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- { +- int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; +- int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; +-- int left_edge = edges[0]; +- int top_edge = edges[1]; +-- int right_edge = edges[2]; +- int bottom_edge = edges[3]; +-- int sh = s->ps.sps->pixel_shift; +-- int left_pixels, right_pixels; +- +- stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; +- dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; +- +- if (!top_edge) { +-- int left = 1 - left_edge; +-- int right = 1 - right_edge; +-- const uint8_t *src1[2]; +- uint8_t *dst1; +-- int src_idx, pos; +-+ int src_idx; +-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); +- +-- dst1 = dst - stride_dst - (left << sh); +-- src1[0] = src - stride_src - (left << sh); +-- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); +-- pos = 0; +-- if (left) { +-+ dst1 = dst - stride_dst; +++#include + + +-+ if (src_l != NULL) { +- src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1, src1[src_idx], sh); +-- pos += (1 << sh); +-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); +- } +++union qpu_mc_pred_cmd_s; +++struct qpu_mc_pred_y_p_s; +++struct qpu_mc_src_s; + + +- src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); +-- if (right) { +-- pos += width << sh; +-+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); +++typedef struct HEVCRpiInterPredQ +++{ +++ union qpu_mc_pred_cmd_u *qpu_mc_base; +++ union qpu_mc_pred_cmd_u *qpu_mc_curr; +++ struct qpu_mc_src_s *last_l0; +++ struct qpu_mc_src_s *last_l1; +++ unsigned int load; +++ uint32_t code_setup; +++ uint32_t code_sync; +++ uint32_t code_exit; +++} HEVCRpiInterPredQ; + + +-+ if (src_r != NULL) { +- src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == +- SAO_APPLIED); +-- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +-+ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r - stride_src, sh);
+- }
+- }
+- if (!bottom_edge) {
+-- int left = 1 - left_edge;
+-- int right = 1 - right_edge;
+-- const uint8_t *src1[2];
+-- uint8_t *dst1;
+-- int src_idx, pos;
+-+ uint8_t * const dst1 = dst + height * stride_dst;
+-+ int src_idx;
+-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
+-+ const unsigned int hoff = height * stride_src;
+-
+-- dst1 = dst + height * stride_dst - (left << sh);
+-- src1[0] = src + height * stride_src - (left << sh);
+-- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh);
+-- pos = 0;
+-- if (left) {
+-+ if (src_l != NULL) {
+- src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
+- SAO_APPLIED);
+-- copy_pixel(dst1, src1[src_idx], sh);
+-- pos += (1 << sh);
+-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
+- }
+++typedef struct HEVCRpiInterPredEnv
+++{
+++ HEVCRpiInterPredQ * q;
+++ unsigned int n; // Number of Qs
+++ unsigned int n_grp; // Number of Q in a group
+++ unsigned int curr; // Current Q number (0..n-1)
+++ int used; // 0 if nothing in any Q, 1 otherwise
+++ int used_grp; // 0 if nothing in any Q in the current group
+++ unsigned int max_fill;
+++ unsigned int min_gap;
+++ GPU_MEM_PTR_T gptr;
+++} HEVCRpiInterPredEnv;
+++
+++typedef struct HEVCRpiIntraPredEnv {
+++ unsigned int n; // Number of commands
+++ HEVCPredCmd * cmds;
+++} HEVCRpiIntraPredEnv;
+++
+++typedef struct HEVCRpiCoeffEnv {
+++ unsigned int n;
+++ uint16_t * buf;
+++} HEVCRpiCoeffEnv;
+
+- src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
+- SAO_APPLIED);
+-- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh);
+-- if (right) {
+-- pos += width << sh;
+-+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
+++typedef struct HEVCRpiCoeffsEnv {
+++ HEVCRpiCoeffEnv s[4];
+++ GPU_MEM_PTR_T gptr;
+++ void * mptr;
+++} HEVCRpiCoeffsEnv;
+
+-+ if (src_r != NULL) {
+- src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
+- SAO_APPLIED);
+-- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh);
+-+ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); +- } +- } +-- left_pixels = 0; +-- if (!left_edge) { +-+ if (src_l != NULL) { +- if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { +- copy_vert(dst - (1 << sh), +- s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), +- sh, height, stride_dst, 1 << sh); +- } else { +-- left_pixels = 1; +-+ copy_vert(dst - (1 << sh), +-+ src_l, +-+ sh, height, stride_dst, stride_src); +- } +- } +-- right_pixels = 0; +-- if (!right_edge) { +-+ if (src_r != NULL) { +- if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { +- copy_vert(dst + (width << sh), +- s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), +- sh, height, stride_dst, 1 << sh); +- } else { +-- right_pixels = 1; +-+ copy_vert(dst + (width << sh), +-+ src_r, +-+ sh, height, stride_dst, stride_src); +- } +- } +- +-- copy_CTB(dst - (left_pixels << sh), +-- src - (left_pixels << sh), +-- (width + left_pixels + right_pixels) << sh, +-+ copy_CTB(dst, +-+ src, +-+ width << sh, +- height, stride_dst, stride_src); +- +- copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, +- x_ctb, y_ctb); +-- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +-- sao->eo_class[c_idx], width, height); +-- s->hevcdsp.sao_edge_restore[restore](src, dst, +-- stride_src, stride_dst, +-- sao, +-- edges, width, +-- height, c_idx, +-- vert_edge, +-- horiz_edge, +-- diag_edge); +-+#ifdef RPI +-+ if (sliced && c_idx != 0) +-+ { +-+ // Class always the same for both U & V (which is just as well :-)) +-+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, +-+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], +-+ width, height); +-+ s->hevcdsp.sao_edge_restore_c[restore](src, dst, +-+ stride_src, stride_dst, +-+ sao, +-+ edges, width, +-+ height, c_idx, +-+ vert_edge, +-+ horiz_edge, +-+ diag_edge); +-+ } +-+ else +-+#endif +-+ { +-+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +-+ sao->eo_class[c_idx], width, height); +-+ s->hevcdsp.sao_edge_restore[restore](src, dst, +-+ stride_src, stride_dst, +-+ sao, +-+ edges, width, +-+ height, c_idx, +-+ vert_edge, +-+ horiz_edge, +-+ diag_edge); +-+ } +- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +- x, y, width, height, c_idx); +- sao->type_idx[c_idx] = SAO_APPLIED; +-@@ -453,6 +547,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) +- } +- } +- +-+// Returns 2 or 0. 
+- static int get_pcm(HEVCContext *s, int x, int y) +- { +- int log2_min_pu_size = s->ps.sps->log2_min_pu_size; +-@@ -479,7 +574,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- uint8_t *src; +- int x, y; +- int chroma, beta; +-- int32_t c_tc[2], tc[2]; +-+ int32_t c_tc[4], tc[2]; +- uint8_t no_p[2] = { 0 }; +- uint8_t no_q[2] = { 0 }; +- +-@@ -496,6 +591,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->ps.sps->pcm.loop_filter_disable_flag) || +- s->ps.pps->transquant_bypass_enable_flag; +- +-+#ifdef DISABLE_DEBLOCK_NONREF +-+ if (!s->used_for_ref) +-+ return; // Don't deblock non-reference frames +++typedef struct HEVCRpiDeblkBlk { +++ uint16_t x_ctb; +++ uint16_t y_ctb; +++} HEVCRpiDeblkBlk; +++ +++typedef struct HEVCRpiDeblkEnv { +++ unsigned int n; +++ HEVCRpiDeblkBlk * blks; +++} HEVCRpiDeblkEnv; +++ +++typedef struct HEVCRPiFrameProgressWait { +++ int req; +++ struct HEVCRPiFrameProgressWait * next; +++ sem_t sem; +++} HEVCRPiFrameProgressWait; +++ +++typedef struct HEVCRPiFrameProgressState { +++ struct HEVCRPiFrameProgressWait * first; +++ struct HEVCRPiFrameProgressWait * last; +++ pthread_mutex_t lock; +++} HEVCRPiFrameProgressState; +++ +++typedef struct HEVCRpiJob { +++ volatile int terminate; +++ int pending; +++ sem_t sem_in; // set by main +++ sem_t sem_out; // set by worker +++ HEVCRpiInterPredEnv chroma_ip; +++ HEVCRpiInterPredEnv luma_ip; +++ int16_t progress[32]; // index by dpb_no +++ HEVCRpiIntraPredEnv intra; +++ HEVCRpiCoeffsEnv coeffs; +++ HEVCRpiDeblkEnv deblk; +++ HEVCRPiFrameProgressWait progress_wait; +++} HEVCRpiJob; +++ +++#if RPI_TSTATS +++typedef struct HEVCRpiStats { +++ int y_pred1_y8_merge; +++ int y_pred1_xy; +++ int y_pred1_x0; +++ int y_pred1_y0; +++ int y_pred1_x0y0; +++ int y_pred1_wle8; +++ int y_pred1_wgt8; +++ int y_pred1_hle16; +++ int y_pred1_hgt16; +++ int y_pred2_xy; +++ int y_pred2_x0; +++ int y_pred2_y0; +++ int y_pred2_x0y0; +++ int y_pred2_hle16; +++ int y_pred2_hgt16; +++} HEVCRpiStats; + +#endif +-+#ifdef DISABLE_DEBLOCK +-+ return; +++ + +#endif +-+ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF) +-+ return; +- if (x0) { +- left_tc_offset = s->deblock[ctb - 1].tc_offset; +- left_beta_offset = s->deblock[ctb - 1].beta_offset; +-@@ -529,19 +633,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +++ ++ typedef struct HEVCContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++@@ -805,6 +1025,69 @@ typedef struct HEVCContext { ++ int width; ++ int height; + +- tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; +- tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; +-- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x - 1, y); +- no_p[1] = get_pcm(s, x - 1, y + 4); +- no_q[0] = get_pcm(s, x, y); +- no_q[1] = get_pcm(s, x, y + 4); +-- s->hevcdsp.hevc_v_loop_filter_luma_c(src, +-- s->frame->linesize[LUMA], +-- beta, tc, no_p, no_q); +-- } else +-- s->hevcdsp.hevc_v_loop_filter_luma(src, +-- s->frame->linesize[LUMA], +-- beta, tc, no_p, no_q); +-+ } +++ int used_for_ref; // rpi + +#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +++ int enable_rpi; +++ unsigned int pass0_job; // Pass0 does coefficient decode +++ unsigned int pass1_job; // Pass1 does pixel processing +++ int ctu_count; // Number of CTUs done in pass0 so far +++ int max_ctu_count; // Number of CTUs when we trigger a round of processing + + +-+ // This copes properly with no_p/no_q +-+ s->hevcdsp.hevc_v_loop_filter_luma2(rpi_sliced_frame_pos_y(s->frame, x, y), +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q, +-+ rpi_sliced_frame_pos_y(s->frame, x - 4, y)); +-+ } +-+ else +++ HEVCRpiJob * jb0; +++ HEVCRpiJob * jb1; +++ HEVCRpiJob jobs[RPI_MAX_JOBS]; +++#if RPI_TSTATS +++ HEVCRpiStats tstats; + +#endif +-+ { +-+ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +-+ if (pcmf) { +-+ // Standard DSP code is broken if no_p / no_q is set +-+ s->hevcdsp.hevc_v_loop_filter_luma_c(src, +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q); +-+ } +-+ else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int num16 = (y>>4)*s->setup_width + (x>>4); +-+ int a = ((y>>3) & 1) << 1; +-+ int b = (x>>3) & 1; +-+ setup = s->dvq->y_setup_arm[num16]; +-+ setup[0][b][0][a] = beta; +-+ setup[0][b][0][a + 1] = beta; +-+ setup[0][b][1][a] = tc[0]; +-+ setup[0][b][1][a + 1] = tc[1]; +-+ } else +++#if RPI_INTER +++ struct qpu_mc_pred_y_p_s * last_y8_p; +++ struct qpu_mc_src_s * last_y8_l1; +++ +++ // Function pointers +++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C +++ const uint8_t * qpu_dummy_frame_emu; + +#endif +-+ { +-+ s->hevcdsp.hevc_v_loop_filter_luma(src, +-+ s->frame->linesize[LUMA], +-+ beta, tc, no_p, no_q); +-+ } +-+ } +- } +- } +- +-@@ -561,7 +697,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; +- tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; +- tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; +-- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +-+ src = +-+#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? 
+-+ rpi_sliced_frame_pos_y(s->frame, x, y) : +++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C +++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory + +#endif +-+ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x, y - 1); +- no_p[1] = get_pcm(s, x + 4, y - 1); +-@@ -571,6 +712,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +- } else +++ HEVCRpiQpu qpu; +++#endif +++ +++ pthread_t worker_thread; +++ + +#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int num16 = (y>>4)*s->setup_width + (x>>4); +-+ int a = ((x>>3) & 1) << 1; +-+ int b = (y>>3) & 1; +-+ setup = s->dvq->y_setup_arm[num16]; +-+ setup[1][b][0][a] = beta; +-+ setup[1][b][0][a + 1] = beta; +-+ setup[1][b][1][a] = tc[0]; +-+ setup[1][b][1][a + 1] = tc[1]; +-+ } else +++#define RPI_DEBLOCK_VPU_Q_COUNT 2 +++ int enable_rpi_deblock; +++ +++ int uv_setup_width; +++ int uv_setup_height; +++ int setup_width; // Number of 16x16 blocks across the image +++ int setup_height; // Number of 16x16 blocks down the image +++ +++ struct dblk_vpu_q_s +++ { +++ GPU_MEM_PTR_T deblock_vpu_gmem; +++ +++ uint8_t (*y_setup_arm)[2][2][2][4]; +++ uint8_t (*y_setup_vc)[2][2][2][4]; +++ +++ uint8_t (*uv_setup_arm)[2][2][2][4]; // Half of this is unused [][][1][], but easier for the VPU as it allows us to store with zeros and addresses are aligned +++ uint8_t (*uv_setup_vc)[2][2][2][4]; +++ +++ int (*vpu_cmds_arm)[6]; // r0-r5 for each command +++ int vpu_cmds_vc; +++ +++ vpu_qpu_wait_h cmd_id; +++ } dvq_ents[RPI_DEBLOCK_VPU_Q_COUNT]; +++ +++ struct dblk_vpu_q_s * dvq; +++ unsigned int dvq_n; +++ + +#endif +- s->hevcdsp.hevc_h_loop_filter_luma(src, +- s->frame->linesize[LUMA], +- beta, tc, no_p, no_q); +-@@ -579,6 +733,91 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- } +++ HEVCLocalContextIntra HEVClcIntra; +++ HEVCRPiFrameProgressState progress_states[2]; +++#endif +++ ++ uint8_t *cabac_state; + +- if (s->ps.sps->chroma_format_idc) { +-+#ifdef RPI +-+ if (rpi_sliced_frame(s->frame)) { +-+ const int v = 2; +-+ const int h = 2; ++ /** 1 if the independent slice segment header was successfully parsed */ ++@@ -1053,6 +1336,10 @@ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size); ++ ++ int ff_hevc_encode_nal_vps(HEVCVPS *vps, unsigned int id, ++ uint8_t *buf, int buf_size); +++#if RPI_INTER +++extern void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n); +++#endif + + +-+ // vertical filtering chroma +-+ for (y = y0; y < y_end; y += 8 * v) { +-+ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { +-+ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; +-+ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; ++ ++ /** ++ * Reset SEI values that are stored on the Context. 
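// ---- Editorial note (illustration only, not part of the patch) ----
// The hevc.h hunk that follows declares the RPi frame-progress helpers:
// progress on each frame is tracked in two independent fields (field 0 =
// reconstructed rows, field 1 = rows whose motion vectors are resolved),
// waiters block on a per-waiter semaphore chained into a mutex-guarded
// list (HEVCRPiFrameProgressWait/State earlier in this diff), and INT_MAX
// marks a frame as entirely done. A minimal, self-contained sketch of that
// wait/signal shape - every name below (progress_state_t, prog_wait_t,
// ...) is invented for illustration and is not the patch's own API:

#include <limits.h>
#include <pthread.h>
#include <semaphore.h>

typedef struct prog_wait_s {
    int req;                      // row this waiter needs
    struct prog_wait_s *next;     // next pending waiter
    sem_t sem;                    // posted once req is satisfied
} prog_wait_t;

typedef struct {
    int row;                      // highest completed row; INT_MAX = all done
    prog_wait_t *first;           // list of blocked waiters
    pthread_mutex_t lock;
} progress_state_t;               // one per field, cf. progress_states[2]

// Consumer: block until at least 'row' rows of the reference are ready.
static void progress_wait(progress_state_t *ps, prog_wait_t *w, int row)
{
    pthread_mutex_lock(&ps->lock);
    if (ps->row >= row) {                  // already far enough - no wait
        pthread_mutex_unlock(&ps->lock);
        return;
    }
    w->req = row;                          // enqueue ourselves, then sleep
    w->next = ps->first;
    ps->first = w;
    pthread_mutex_unlock(&ps->lock);
    sem_wait(&w->sem);
}

// Producer: publish completion of rows up to 'row' and wake waiters.
static void progress_signal(progress_state_t *ps, int row)
{
    pthread_mutex_lock(&ps->lock);
    if (row > ps->row) {
        prog_wait_t **pp = &ps->first;
        ps->row = row;
        while (*pp != NULL) {              // release every satisfied waiter
            prog_wait_t *const w = *pp;
            if (w->req <= row) {
                *pp = w->next;
                sem_post(&w->sem);
            } else {
                pp = &w->next;
            }
        }
    }
    pthread_mutex_unlock(&ps->lock);
}
// ---- End editorial note ----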
++@@ -1072,4 +1359,89 @@ extern const uint8_t ff_hevc_diag_scan4x4_y[16]; ++ extern const uint8_t ff_hevc_diag_scan8x8_x[64]; ++ extern const uint8_t ff_hevc_diag_scan8x8_y[64]; ++ +++#ifdef RPI +++int16_t * rpi_alloc_coeff_buf(HEVCContext * const s, const int buf_no, const int n); + + +-+ if ((bs0 == 2) || (bs1 == 2)) { +-+ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; +-+ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; +-+ unsigned int no_f = 0; +++// arm/hevc_misc_neon.S +++// Neon coeff zap fn +++#if HAVE_NEON +++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); +++#endif + + +-+ // tc_offset here should be set to cur_tc_offset I think +-+ const uint32_t tc4 = +-+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | +-+ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +++void ff_hevc_rpi_progress_wait_field(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int val, const int field); + + +-+ if (tc4 == 0) +-+ continue; +++void ff_hevc_rpi_progress_signal_field(HEVCContext * const s, const int val, const int field); + + +-+ if (pcmf) { +-+ no_f = +-+ (get_pcm(s, x - 1, y) ? 1 : 0) | +-+ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | +-+ (get_pcm(s, x, y) ? 4 : 0) | +-+ (get_pcm(s, x, y + 4 * v) ? 8 : 0); +-+ if (no_f == 0xf) +-+ continue; +-+ } +++// All of these expect that s->threads_type == FF_THREAD_FRAME + + +-+ s->hevcdsp.hevc_v_loop_filter_uv2(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), +-+ s->frame->linesize[1], +-+ tc4, +-+ rpi_sliced_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), +-+ no_f); +-+ } +-+ } +++static inline void ff_hevc_progress_wait_mv(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int y) +++{ +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); +++ else +++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); +++} + + +-+ if (y == 0) +-+ continue; +++static inline void ff_hevc_progress_signal_mv(HEVCContext * const s, const int y) +++{ +++ if (s->enable_rpi && s->used_for_ref) +++ ff_hevc_rpi_progress_signal_field(s, y, 1); +++} + + +-+ // horizontal filtering chroma +-+ tc_offset = x0 ? left_tc_offset : cur_tc_offset; +-+ x_end2 = x_end; +-+ if (x_end != s->ps.sps->width) +-+ x_end2 = x_end - 8 * h; +++static inline void ff_hevc_progress_wait_recon(HEVCContext * const s, HEVCRpiJob * const jb, +++ const HEVCFrame * const ref, const int y) +++{ +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); +++ else +++ ff_thread_await_progress((ThreadFrame*)&ref->tf, y, 0); +++} + + +-+ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { +-+ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; +-+ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; +-+ if ((bs0 == 2) || (bs1 == 2)) { +-+ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; +-+ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; +-+ const uint32_t tc4 = +-+ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | +-+ ((bs1 != 2) ? 
0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +-+ unsigned int no_f = 0; +++static inline void ff_hevc_progress_signal_recon(HEVCContext * const s, const int y) +++{ +++ if (s->used_for_ref) +++ { +++ if (s->enable_rpi) +++ ff_hevc_rpi_progress_signal_field(s, y, 0); +++ else +++ ff_thread_report_progress(&s->ref->tf, y, 0); +++ } +++} + + +-+ if (tc4 == 0) +-+ continue; +++static inline void ff_hevc_progress_signal_all_done(HEVCContext * const s) +++{ +++ if (s->enable_rpi) +++ { +++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); +++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); +++ } +++ else +++ ff_thread_report_progress(&s->ref->tf, INT_MAX, 0); +++} + + +-+ if (pcmf) { +-+ no_f = +-+ (get_pcm(s, x, y - 1) ? 1 : 0) | +-+ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | +-+ (get_pcm(s, x, y) ? 4 : 0) | +-+ (get_pcm(s, x + 4 * h, y) ? 8 : 0); +++#else + + +-+ if (no_f == 0xf) +-+ continue; +-+ } +++// Use #define as that allows us to discard "jb" which won't exist in non-RPI world +++#define ff_hevc_progress_wait_mv(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) +++#define ff_hevc_progress_wait_recon(s, jb, ref, y) ff_thread_await_progress((ThreadFrame *)&ref->tf, y, 0) +++#define ff_hevc_progress_signal_mv(s, y) +++#define ff_hevc_progress_signal_recon(s, y) ff_thread_report_progress(&s->ref->tf, y, 0) +++#define ff_hevc_progress_signal_all_done(s) ff_thread_report_progress(&s->ref->tf, INT_MAX, 0) + + +-+ s->hevcdsp.hevc_h_loop_filter_uv(rpi_sliced_frame_pos_c(s->frame, x >> 1, y >> 1), +-+ s->frame->linesize[1], +-+ tc4, no_f); +-+ } +-+ } +-+ } +-+ } +-+ else + +#endif +- for (chroma = 1; chroma <= 2; chroma++) { +- int h = 1 << s->ps.sps->hshift[chroma]; +- int v = 1 << s->ps.sps->vshift[chroma]; +-@@ -595,7 +834,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +++ +++// Set all done - signal nothing (used in missing refs) +++// Works for both rpi & non-rpi +++static inline void ff_hevc_progress_set_all_done(HEVCFrame * const ref) +++{ +++ if (ref->tf.progress != NULL) +++ { +++ int * const p = (int *)&ref->tf.progress->data; +++ p[0] = INT_MAX; +++ p[1] = INT_MAX; +++ } +++} +++ ++ #endif /* AVCODEC_HEVC_H */ ++diff --git a/libavcodec/hevc_cabac.c b/libavcodec/hevc_cabac.c ++index 05b2821840..c84886817d 100644 ++--- a/libavcodec/hevc_cabac.c +++++ b/libavcodec/hevc_cabac.c ++@@ -21,14 +21,76 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + +- c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; +- c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; +-- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; +-+ src = +-+#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? 
+-+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +-+#endif +-+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x - 1, y); +- no_p[1] = get_pcm(s, x - 1, y + (4 * v)); +-@@ -605,9 +849,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +- } else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int xc = x>>s->ps.sps->hshift[chroma]; +-+ int yc = y>>s->ps.sps->vshift[chroma]; +-+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +-+ int a = ((yc>>3) & 1) << 1; +-+ int b = (xc>>3) & 1; +-+ setup = s->dvq->uv_setup_arm[num16]; +-+ setup[0][b][0][a] = c_tc[0]; +-+ setup[0][b][0][a + 1] = c_tc[1]; +-+ } else +-+#endif +- s->hevcdsp.hevc_v_loop_filter_chroma(src, +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +++#define UNCHECKED_BITSTREAM_READER 1 + + +- } +- } ++ #include "libavutil/attributes.h" ++ #include "libavutil/common.h" + +-@@ -628,7 +886,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ #include "cabac_functions.h" ++ #include "hevc.h" + +- c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; +- c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; +-- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +-+ src = + +#ifdef RPI +-+ rpi_sliced_frame(s->frame) ? +-+ rpi_sliced_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +++#include "libavutil/rpi_sand_fns.h" + +#endif +-+ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +- if (pcmf) { +- no_p[0] = get_pcm(s, x, y - 1); +- no_p[1] = get_pcm(s, x + (4 * h), y - 1); +-@@ -638,6 +901,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +- } else +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ uint8_t (*setup)[2][2][4]; +-+ int xc = x>>s->ps.sps->hshift[chroma]; +-+ int yc = y>>s->ps.sps->vshift[chroma]; +-+ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +-+ int a = ((xc>>3) & 1) << 1; +-+ int b = (yc>>3) & 1; +-+ setup = s->dvq->uv_setup_arm[num16]; +-+ setup[1][b][0][a] = c_tc[0]; +-+ setup[1][b][0][a + 1] = c_tc[1]; +-+ } else +++ +++// BY22 is probably faster than simple bypass if the processor has +++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction +++// x86 has fast int divide +++// Arm doesn't have divide or general fast 64 bit, but does have the multiply +++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used +++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) +++// Use native divide if we have a fast one - otherwise use mpy 1/x +++// x86 has a fast integer divide - arm doesn't - unsure about other +++// architectures +++#define USE_BY22_DIV ARCH_X86 +++ +++// Special case blocks with a single significant ceoff +++// Decreases the complexity of the code for a common case but increases the +++// code size. 
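// ---- Editorial note (illustration only, not part of the patch) ----
// Why the BY22 scheme described in the comments above works: CABAC bypass
// decoding recovers one bit per step as
//     low = 2*low + next_bitstream_bit;
//     bit = (low >= range);  if (bit) low -= range;
// which is exactly restoring long division, so n consecutive bypass bits
// are simply the top n bits of low/range - one divide (or one multiply by
// the precomputed reciprocal in cabac_by22_inv_range) peeks them all at
// once. A standalone demonstration with toy numbers, not ffmpeg API:

#include <stdint.h>
#include <stdio.h>

// Bit-at-a-time bypass over n steps; the next n bitstream bits are assumed
// already shifted into 'low', so (low >> n) must be < range as in CABAC.
static unsigned int bypass_bits_loop(uint32_t low, uint32_t range, int n)
{
    unsigned int out = 0;
    for (int i = n - 1; i >= 0; i--) {
        uint32_t r = range << i;      // range aligned with the current bit
        unsigned int bit = low >= r;
        if (bit)
            low -= r;                 // restoring-division step
        out = (out << 1) | bit;
    }
    return out;
}

int main(void)
{
    uint32_t range = 350;             // CABAC range always lies in 256..510
    uint32_t low = (273 << 8) | 0x5a; // 273 < range, plus 8 fresh bypass bits
    int n = 8;

    printf("bit-by-bit: %u\n", bypass_bits_loop(low, range, n)); // prints 199
    printf("one divide: %u\n", low / range);                     // prints 199
    return 0;
}
// ---- End editorial note ----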
+++#define USE_N_END_1 1 +++ +++#if ARCH_ARM +++#include "arm/hevc_cabac.h" + +#endif +- s->hevcdsp.hevc_h_loop_filter_chroma(src, +- s->frame->linesize[chroma], +- c_tc, no_p, no_q); +-@@ -648,69 +924,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) +- } +- } +++ ++ #define CABAC_MAX_BIN 31 + +--static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh, +-- RefPicList *neigh_refPicList) +--{ +-- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { +-- // same L0 and L1 +-- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] && +-- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] && +-- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) { +-- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && +-- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) +-- return 1; +-- else +-- return 0; +-- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && +-- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { +-- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && +-- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { +-- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else { +-- return 1; +-- } +-- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV +-- Mv A, B; +-- int ref_A, ref_B; +-- +-- if (curr->pred_flag & 1) { +-- A = curr->mv[0]; +-- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]]; +-- } else { +-- A = curr->mv[1]; +-- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]]; +-- } +-- +-- if (neigh->pred_flag & 1) { +-- B = neigh->mv[0]; +-- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]]; +-- } else { +-- B = neigh->mv[1]; +-- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]]; +-- } +-- +-- if (ref_A == ref_B) { +-- if (FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4) +-- return 1; +-- else +-- return 0; +-- } else +-- return 1; +-- } +-- +-- return 1; +--} +- +- void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- int log2_trafo_size) +-@@ -721,10 +934,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- int log2_min_tu_size = s->ps.sps->log2_min_tb_size; +- int min_pu_width = s->ps.sps->min_pu_width; +- int min_tu_width = s->ps.sps->min_tb_width; +-- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width + +-- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA; +- int boundary_upper, boundary_left; +-- int i, j, bs; +-+ int i, j; +-+ RefPicList *rpl = s->ref->refPicList; +-+ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); +-+ 
const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup +-+ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep +-+ int y_pu = y0 >> log2_min_pu_size; +-+ int x_pu = x0 >> log2_min_pu_size; +-+ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +-+ int is_intra = curr->pred_flag == PF_INTRA; +-+ int inc = log2_min_pu_size == 2 ? 2 : 1; +-+ uint8_t *bs; + + +-+#ifdef DISABLE_STRENGTHS +-+ return; +-+#endif +- +- boundary_upper = y0 > 0 && !(y0 & 7); +- if (boundary_upper && +-@@ -736,34 +961,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) +- boundary_upper = 0; +++#if USE_BY22 && !USE_BY22_DIV +++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) +++ +++static const uint32_t cabac_by22_inv_range[256] = { +++ 0, I(257), I(258), I(259), +++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), +++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), +++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), +++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), +++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), +++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), +++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), +++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), +++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), +++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), +++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), +++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), +++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), +++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), +++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), +++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), +++ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429), +++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), +++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), +++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), +++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), +++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), +++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), +++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), +++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), +++ I(510), I(511) +++}; +++#undef I +++#endif // USE_BY22 +++ ++ /** ++ * number of bin by SyntaxElement. ++ */ ++@@ -445,6 +507,211 @@ static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++ }; + +-+ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2]; + + +- if (boundary_upper) { +- RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? 
+- ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) : +-- s->ref->refPicList; +-- int yp_pu = (y0 - 1) >> log2_min_pu_size; +-- int yq_pu = y0 >> log2_min_pu_size; +-- int yp_tu = (y0 - 1) >> log2_min_tu_size; +-- int yq_tu = y0 >> log2_min_tu_size; +-+ rpl; +-+ MvField *top = curr - min_pu_width; +++typedef struct +++{ +++ uint16_t coeff; +++ uint16_t scale; +++} xy_off_t; + + +-+ if (is_intra) { +-+ for (i = 0; i < (1 << log2_trafo_size); i += 4) +-+ bs[i >> 2] = 2; +++#define XYT_C(x,y,t) ((x) + ((y) << (t))) +++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t)) +++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) +++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) + + +-+ } else { +-+ int y_tu = y0 >> log2_min_tu_size; +-+ int x_tu = x0 >> log2_min_tu_size; +-+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +-+ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width; +++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} + + +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +-+ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list, +-+ curr, top, bs); +- +- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int x_pu = (x0 + i) >> log2_min_pu_size; +-- int x_tu = (x0 + i) >> log2_min_tu_size; +-- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; +-- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; +-- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu]; +-- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu]; +-- +-- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA) +-- bs = 2; +-- else if (curr_cbf_luma || top_cbf_luma) +-- bs = 1; +-- else +-- bs = boundary_strength(s, curr, top, rpl_top); +-- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs; +-+ int i_pu = i >> log2_min_pu_size; +-+ int i_tu = i >> log2_min_tu_size; +++#define OFF_DIAG(t) {\ +++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ +++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ +++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ +++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ +++} + + +-+ if (top[i_pu].pred_flag == PF_INTRA) +-+ bs[i >> 2] = 2; +-+ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu]) +-+ bs[i >> 2] = 1; +- } +-+ } +-+ } +++#define OFF_HORIZ(t) {\ +++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ +++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ +++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ +++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ +++} + + +-+ if (!is_intra) { +-+ for (j = inc; j < trafo_in_min_pus; j += inc) { +-+ MvField *top; +++#define OFF_VERT(t) {\ +++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ +++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ +++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ +++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ +++} + + +-+ curr += min_pu_width * inc; +-+ top = curr - min_pu_width; +-+ bs += s->bs_width * inc << log2_min_pu_size >> 2; +++static const xy_off_t off_xys[3][4][16] = +++{ +++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, +++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, +++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} +++}; + + +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +-+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +-+ curr, top, bs); +-+ } +- } +- +-- // bs for vertical TU boundaries +- boundary_left = x0 > 0 && !(x0 & 7); +- if (boundary_left 
&& +- ((!s->sh.slice_loop_filter_across_slices_enabled_flag && +-@@ -774,64 +1021,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) +- boundary_left = 0; +- +-+ curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +-+ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2]; + + +- if (boundary_left) { +- RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? +- ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) : +-- s->ref->refPicList; +-- int xp_pu = (x0 - 1) >> log2_min_pu_size; +-- int xq_pu = x0 >> log2_min_pu_size; +-- int xp_tu = (x0 - 1) >> log2_min_tu_size; +-- int xq_tu = x0 >> log2_min_tu_size; +-- +-- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int y_pu = (y0 + i) >> log2_min_pu_size; +-- int y_tu = (y0 + i) >> log2_min_tu_size; +-- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; +-- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; +-- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu]; +-- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu]; +-- +-- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA) +-- bs = 2; +-- else if (curr_cbf_luma || left_cbf_luma) +-- bs = 1; +-- else +-- bs = boundary_strength(s, curr, left, rpl_left); +-- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; +-- } +-- } +-- +-- if (log2_trafo_size > log2_min_pu_size && !is_intra) { +-- RefPicList *rpl = s->ref->refPicList; +-+ rpl; +-+ MvField *left = curr - 1; +- +-- // bs for TU internal horizontal PU boundaries +-- for (j = 8; j < (1 << log2_trafo_size); j += 8) { +-- int yp_pu = (y0 + j - 1) >> log2_min_pu_size; +-- int yq_pu = (y0 + j) >> log2_min_pu_size; +-+ if (is_intra) { +-+ for (j = 0; j < (1 << log2_trafo_size); j += 4) +-+ bs[j * s->bs_width >> 2] = 2; +- +-- for (i = 0; i < (1 << log2_trafo_size); i += 4) { +-- int x_pu = (x0 + i) >> log2_min_pu_size; +-- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; +-- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; +-- +-- bs = boundary_strength(s, curr, top, rpl); +-- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; +-+ } else { +-+ int y_tu = y0 >> log2_min_tu_size; +-+ int x_tu = x0 >> log2_min_tu_size; +-+ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +-+ uint8_t *left_cbf_luma = curr_cbf_luma - 1; +-+ +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +-+ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list, +-+ curr, left, bs); +-+ +-+ for (j = 0; j < (1 << log2_trafo_size); j += 4) { +-+ int j_pu = j >> log2_min_pu_size; +-+ int j_tu = j >> log2_min_tu_size; +-+ +-+ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA) +-+ bs[j * s->bs_width >> 2] = 2; +-+ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width]) +-+ bs[j * s->bs_width >> 2] = 1; +- } +- } +-+ } +- +-- // bs for TU internal vertical PU boundaries +-- for (j = 0; j < (1 << log2_trafo_size); j += 4) { +-- int y_pu = (y0 + j) >> log2_min_pu_size; +-+ if (!is_intra) { +-+ for (i = inc; i < trafo_in_min_pus; i += inc) { +-+ MvField *left; +- +-- for (i = 8; i < (1 << log2_trafo_size); i += 8) { +-- int xp_pu = (x0 + i - 1) >> log2_min_pu_size; +-- int xq_pu = (x0 + i) >> log2_min_pu_size; +-- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; +-- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; +-+ curr += inc; +-+ left = curr - 1; +-+ bs += inc << 
log2_min_pu_size >> 2; +- +-- bs = boundary_strength(s, curr, left, rpl); +-- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; +-- } +-+ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +-+ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +-+ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +-+ curr, left, bs); +- } +- } +- } +-@@ -840,11 +1077,104 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, +- #undef CB +- #undef CR +- +-+#ifdef RPI_DEBLOCK_VPU +-+// ff_hevc_flush_buffer_lines +-+// flushes and invalidates all pixel rows in [start,end-1] +-+static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +++// Helper fns +++#ifndef hevc_mem_bits32 +++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) + +{ +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ start, end - start, s->ps.sps->vshift[1], flush_luma, flush_chroma); +-+ rpi_cache_flush_finish(rfe); +++ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); + +} + +#endif + + +-+#if RPI_INTER +-+ +-+// Flush some lines of a reference frames +-+void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n) +++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) +++#define hevc_clz32 hevc_clz32_builtin +++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) + +{ +-+ if (s->enable_rpi && s->used_for_ref) { +-+ const int d0 = ((int *)f->progress->data)[0]; +-+ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 +++ // __builtin_clz says it works on ints - so adjust if int is >32 bits long +++ return __builtin_clz(x) - (sizeof(int) * 8 - 32); +++} +++#endif + + +-+ if (curr_y < (unsigned int)f->f->height) { +-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +-+ rpi_cache_flush_add_frame_lines(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +-+ curr_y, FFMIN(n, (unsigned int)f->f->height) - curr_y, s->ps.sps->vshift[1], 1, 1); +-+ rpi_cache_flush_finish(rfe); +-+ } +++// It is unlikely that we will ever need this but include for completeness +++#ifndef hevc_clz32 +++static inline unsigned int hevc_clz32(unsigned int x) +++{ +++ unsigned int n = 1; +++ if ((x & 0xffff0000) == 0) { +++ n += 16; +++ x <<= 16; +++ } +++ if ((x & 0xff000000) == 0) { +++ n += 8; +++ x <<= 8; +++ } +++ if ((x & 0xf0000000) == 0) { +++ n += 4; +++ x <<= 4; +++ } +++ if ((x & 0xc0000000) == 0) { +++ n += 2; +++ x <<= 2; + + } +++ return n - ((x >> 31) & 1); + +} + +#endif + + +-+#ifdef RPI_DEBLOCK_VPU +-+/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +-+static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +-+{ +-+ // Flush image, 4 lines above to bottom of ctb stripe +-+ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1); +-+ // TODO flush buffer of beta/tc setup when it becomes cached + + +-+ // Prepare three commands at once to avoid calling overhead +-+ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y; +-+ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0]; +-+ s->dvq->vpu_cmds_arm[0][2] = s->setup_width; +-+ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) ); +-+ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4; +-+ s->dvq->vpu_cmds_arm[0][5] = 2; +++#if !USE_BY22 +++// 
If no by22 then _by22 functions will revert to normal and so _peek/_flush +++// will no longer be called but the setup calls will still exist and we want +++// to null them out +++#define bypass_start(s) +++#define bypass_finish(s) +++#else +++// Use BY22 for residual bypass block + + +-+ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]); +-+ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1]; +-+ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width; +-+ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +-+ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +-+ s->dvq->vpu_cmds_arm[1][5] = 3; +++#define bypass_start(s) get_cabac_by22_start(&s->HEVClc->cc) +++#define bypass_finish(s) get_cabac_by22_finish(&s->HEVClc->cc) + + +-+ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]); +-+ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2]; +-+ s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width; +-+ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +-+ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +-+ s->dvq->vpu_cmds_arm[2][5] = 4; +++// BY22 notes that bypass is simply a divide into the bitstream and so we +++// can peek out large quantities of bits at once and treat the result as if +++// it was VLC. In many cases this will lead to O(1) processing rather than +++// O(n) though the setup and teardown is sufficiently expensive that it is +++// only worth using if we expect to be dealing with more than a few bits +++// The definition of "a few bits" will vary from platform to platform but +++// tests on ARM show that it probably isn't worth it for a single coded +++// residual, but is for >1 - it also seems likely that if there are +++// more residuals then they are likely to be bigger and this will make the +++// O(1) nature of the code more worthwhile. + + +-+ // Call VPU +-+ { +-+ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +-+ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands +-+ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); +-+ vpu_qpu_job_finish(vqj); +-+ } + + +-+ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); +-+ s->dvq = s->dvq_ents + s->dvq_n; +++#if !USE_BY22_DIV +++// * 1/x @ 32 bits gets us 22 bits of accuracy +++#define CABAC_BY22_PEEK_BITS 22 +++#else +++// A real 32-bit divide gets us another bit +++// If we have a 64 bit int & a unit time divider then we should get a lot +++// of bits (55) but that is untested and it is unclear if it would give +++// us a large advantage +++#define CABAC_BY22_PEEK_BITS 23 +++#endif + + +-+ vpu_qpu_wait(&s->dvq->cmd_id); +++// Bypass block start +++// Must be called before _by22_peek is used as it sets the CABAC environment +++// into the correct state. _by22_finish must be called to return to 'normal' +++// (i.e. 
non-bypass) cabac decoding +++static inline void get_cabac_by22_start(CABACContext * const c) +++{ +++ const unsigned int bits = __builtin_ctz(c->low); +++ const uint32_t m = hevc_mem_bits32(c->bytestream, 0); +++ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits)); +++#if !USE_BY22_DIV +++ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff]; +++#endif +++ +++ c->bytestream -= (CABAC_BITS / 8); +++ c->by22.bits = bits; +++#if !USE_BY22_DIV +++ c->by22.range = c->range; +++ c->range = inv; +++#endif +++ c->low = x; + +} + + +++// Bypass block finish +++// Must be called at the end of the bypass block to return to normal operation +++static inline void get_cabac_by22_finish(CABACContext * const c) +++{ +++ unsigned int used = c->by22.bits; +++ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8); +++ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7); +++ +++ c->bytestream += bytes_used + (CABAC_BITS / 8); +++ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used; +++#if !USE_BY22_DIV +++ c->range = c->by22.range; + +#endif +++} + + +- void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +- { +- int x_end = x >= s->ps.sps->width - ctb_size; +-+#ifdef RPI_DEBLOCK_VPU +-+ int done_deblock = 0; +++// Peek bypass bits +++// _by22_start must be called before _by22_peek is called and _by22_flush +++// must be called afterwards to flush any used bits +++// The actual number of valid bits returned is +++// min(, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS +++// will be at least 22 which should be long enough for any prefix or suffix +++// though probably not long enough for the worst case combination +++#ifndef get_cabac_by22_peek +++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) +++{ +++#if USE_BY22_DIV +++ return ((unsigned int)c->low / (unsigned int)c->range) << 9; +++#else +++ uint32_t x = c->low & ~1U; +++ const uint32_t inv = c->range; +++ +++ if (inv != 0) +++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); +++ +++ return x << 1; + +#endif +- if (s->avctx->skip_loop_filter < AVDISCARD_ALL) +- deblocking_filter_CTB(s, x, y); +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock && x_end) +-+ { +-+ int y_at_end = y >= s->ps.sps->height - ctb_size; +-+ int height = 64; // Deblock in units 64 high to avoid too many VPU calls +-+ int y_start = y&~63; +-+ if (y_at_end) height = s->ps.sps->height - y_start; +-+ if ((((y+ctb_size)&63)==0) || y_at_end) { +-+ done_deblock = 1; +-+ rpi_deblock(s, y_start, height); +-+ } +-+ } +-+#endif +- if (s->ps.sps->sao_enabled) { +- int y_end = y >= s->ps.sps->height - ctb_size; +- if (y && x) +-@@ -853,16 +1183,46 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) +- sao_filter_CTB(s, x - ctb_size, y); +- if (y && x_end) { +- sao_filter_CTB(s, x, y - ctb_size); +-- if (s->threads_type & FF_THREAD_FRAME ) +-+ if (s->threads_type == FF_THREAD_FRAME ) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +-+#endif +- ff_thread_report_progress(&s->ref->tf, y, 0); +-+ } +- } +- if (x_end && y_end) { +- sao_filter_CTB(s, x , y); +-- if (s->threads_type & FF_THREAD_FRAME ) +-+ if (s->threads_type == FF_THREAD_FRAME ) { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +-+#endif +- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); +-+ } +-+ } +-+ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { +-+ //int newh = y + ctb_size - 4; +-+ //int currh = 
s->ref->tf.progress->data[0]; +-+ //if (((y + ctb_size)&63)==0) +-+#ifdef RPI_DEBLOCK_VPU +-+ if (s->enable_rpi_deblock) { +-+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +-+ if (done_deblock) { +-+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +-+ } +-+ } else { +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +++} + +#endif +-+ ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +- } +-- } else if (s->threads_type & FF_THREAD_FRAME && x_end) +++ +++// Flush bypass bits peeked by _by22_peek +++// Flush n bypass bits. n must be >= 1 to guarantee correct operation +++// val is an unmodified copy of whatever _by22_peek returned +++#ifndef get_cabac_by22_flush +++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) +++{ +++ // Subtract the bits used & reshift up to the top of the word +++#if USE_BY22_DIV +++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); + +#else +-+#if RPI_INTER +-+ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +-+ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); + +#endif +- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +++ +++ // and refill lower bits +++ // We will probably OR over some existing bits but that doesn't matter +++ c->by22.bits += n; +++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); +++} + +#endif +-+ } +++ +++#endif // USE_BY22 +++ +++ ++ void ff_hevc_save_states(HEVCContext *s, int ctb_addr_ts) ++ { ++ if (s->ps.pps->entropy_coding_sync_enabled_flag && ++@@ -863,19 +1130,19 @@ int ff_hevc_cbf_luma_decode(HEVCContext *s, int trafo_depth) ++ return GET_CABAC(elem_offset[CBF_LUMA] + !trafo_depth); + } + +- void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) +-diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c +-index 83f2ec2..bcf53dc 100644 +---- a/libavcodec/hevc_ps.c +-+++ b/libavcodec/hevc_ps.c +-@@ -767,7 +767,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) +- switch (sps->bit_depth) { +- case 8: +- if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; +-+#if RPI_HEVC_SAND +-+ // *** Horrid kludge s.t. we start out with sand format +-+ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? 
AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; +-+#else +- if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; +-+#endif +- if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; +- if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; +- break; +-@@ -989,6 +994,8 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, +- sps->amp_enabled_flag = get_bits1(gb); +- sps->sao_enabled = get_bits1(gb); ++-static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx) +++static int hevc_transform_skip_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); ++ } + +-+ av_log(avctx, AV_LOG_INFO, "sao_enabled=%d\n", sps->sao_enabled); +-+ +- sps->pcm_enabled_flag = get_bits1(gb); +- if (sps->pcm_enabled_flag) { +- sps->pcm.bit_depth = get_bits(gb, 4) + 1; +-diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c +-index 9d773d9..c4d7250 100644 +---- a/libavcodec/hevcdsp.c +-+++ b/libavcodec/hevcdsp.c +-@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { +- #include "hevcdsp_template.c" +- #undef BIT_DEPTH ++-static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx) +++static int explicit_rdpcm_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); ++ } + +-+static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs) +-+{ +-+ for (; pus > 0; pus--) { +-+ int strength, out; +-+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; +-+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; +-+ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; +-+ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; ++-static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx) +++static int explicit_rdpcm_dir_flag_decode(HEVCContext *s, int c_idx_nz) ++ { ++- return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + !!c_idx); +++ return GET_CABAC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); ++ } ++ ++ int ff_hevc_log2_res_scale_abs(HEVCContext *s, int idx) { ++@@ -891,14 +1158,14 @@ int ff_hevc_res_scale_sign_flag(HEVCContext *s, int idx) { ++ return GET_CABAC(elem_offset[RES_SCALE_SIGN_FLAG] + idx); ++ } ++ ++-static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx, +++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCContext *s, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++ { ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ ++- if (!c_idx) { +++ if (!c_idx_nz) { ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++@@ -929,22 +1196,16 @@ static av_always_inline int last_significant_coeff_suffix_decode(HEVCContext *s, ++ return value; ++ } ++ ++-static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx, int ctx_cg) +++static av_always_inline int significant_coeff_group_flag_decode(HEVCContext *s, int c_idx_nz, int ctx_cg) ++ { ++ int inc; ++ ++- inc = FFMIN(ctx_cg, 1) + (c_idx>0 ? 
2 : 0); +++ inc = (ctx_cg != 0) + (c_idx_nz << 1); ++ ++ return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); ++ } ++-static av_always_inline int significant_coeff_flag_decode(HEVCContext *s, int x_c, int y_c, ++- int offset, const uint8_t *ctx_idx_map) ++-{ ++- int inc = ctx_idx_map[(y_c << 2) + x_c] + offset; ++- return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + inc); ++-} ++ ++-static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int c_idx, int offset) +++static av_always_inline int significant_coeff_flag_decode_0(HEVCContext *s, int offset) ++ { ++ return GET_CABAC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++ } ++@@ -966,90 +1227,470 @@ static av_always_inline int coeff_abs_level_greater2_flag_decode(HEVCContext *s, ++ return GET_CABAC(elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + inc); ++ } ++ ++-static av_always_inline int coeff_abs_level_remaining_decode(HEVCContext *s, int rc_rice_param) + + +-+#if 1 // This more directly matches the original implementation +-+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { +-+ // same L0 and L1 +-+ if (curr_refL0 == neigh_refL0 && +-+ curr_refL0 == curr_refL1 && +-+ neigh_refL0 == neigh_refL1) { +-+ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && +-+ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else if (neigh_refL0 == curr_refL0 && +-+ neigh_refL1 == curr_refL1) { +-+ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else if (neigh_refL1 == curr_refL0 && +-+ neigh_refL0 == curr_refL1) { +-+ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +-+ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else { +-+ strength = 1; +-+ } +-+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV +-+ Mv curr_mv0, neigh_mv0; +++#if !USE_BY22 +++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) +++#endif + + +-+ if (curr->pred_flag & 1) { +-+ curr_mv0 = curr->mv[0]; +-+ } else { +-+ curr_mv0 = curr->mv[1]; +-+ curr_refL0 = curr_refL1; +-+ } + + +-+ if (neigh->pred_flag & 1) { +-+ neigh_mv0 = neigh->mv[0]; +-+ } else { +-+ neigh_mv0 = neigh->mv[1]; +-+ neigh_refL0 = neigh_refL1; +-+ } +++#ifndef coeff_abs_level_remaining_decode_bypass +++static int coeff_abs_level_remaining_decode_bypass(HEVCContext * const s, const unsigned int rice_param) ++ { +++ CABACContext * const c = &s->HEVClc->cc; +++ uint32_t y; +++ unsigned int prefix; +++ unsigned int last_coeff_abs_level_remaining; +++ unsigned int n; + + +-+ if (curr_refL0 == neigh_refL0) { +-+ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) +-+ strength = 1; +-+ else +-+ strength = 0; +-+ } else +-+ strength = 1; +-+ } else +-+ strength = 1; +-+#else // This has exactly the same effect, but is more suitable for vectorisation +-+ Mv curr_mv[2]; +-+ Mv neigh_mv[2]; +-+ memcpy(curr_mv, curr->mv, sizeof 
curr_mv); +-+ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); +++ y = get_cabac_by22_peek(c); +++ prefix = hevc_clz32(~y); +++ // y << prefix will always have top bit 0 + + +-+ if (!(curr->pred_flag & 2)) { +-+ curr_mv[1] = curr_mv[0]; +-+ curr_refL1 = curr_refL0; +-+ } +-+ if (!(neigh->pred_flag & 2)) { +-+ neigh_mv[1] = neigh_mv[0]; +-+ neigh_refL1 = neigh_refL0; +-+ } +-+ if (!(curr->pred_flag & 1)) { +-+ curr_mv[0] = curr_mv[1]; +-+ curr_refL0 = curr_refL1; +-+ } +-+ if (!(neigh->pred_flag & 1)) { +-+ neigh_mv[0] = neigh_mv[1]; +-+ neigh_refL0 = neigh_refL1; +-+ } +-+ +-+ strength = 1; +++ if (prefix < 3) { +++ const unsigned int suffix = (y << prefix) >> (31 - rice_param); +++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; +++ n = prefix + 1 + rice_param; +++ } +++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) +++ { +++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); + + +-+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | +-+ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | +-+ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); +++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +++ n = prefix * 2 + rice_param - 2; +++ } +++ else { +++ unsigned int suffix; + + +-+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | +-+ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | +-+ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); +++ get_cabac_by22_flush(c, prefix, y); +++ y = get_cabac_by22_peek(c); + + +-+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); +-+#endif +++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); +++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; +++ n = prefix + rice_param - 2; +++ } + + +-+ curr += in_inc / sizeof (MvField); +-+ neigh += in_inc / sizeof (MvField); +++ get_cabac_by22_flush(c, n, y); + + +-+ for (out = dup; out > 0; out--) +-+ { +-+ *bs = strength; +-+ bs += out_inc; +-+ } +-+ } +++ return last_coeff_abs_level_remaining; + +} +-+ +- void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- { +- #undef FUNC +-@@ -193,6 +307,16 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ +- PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) +- +-+#ifndef RPI +-+#define SLICED_LOOP_FILTERS(depth) +-+#else +-+#define SLICED_LOOP_FILTERS(depth)\ +-+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ +-+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ +-+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) + +#endif + + +-+ +- #define HEVC_DSP(depth) \ +- hevcdsp->put_pcm = FUNC(put_pcm, depth); \ +- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ +-@@ -200,6 +324,15 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ +- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ +- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ +-+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth); \ +-+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ +-+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ +-+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); 
\ +-+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ +-+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ +-+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ +-+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ +-+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ +- hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ +- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +- hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ +-@@ -225,6 +358,19 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ +- hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ +- \ +-+ hevcdsp->sao_band_filter_c[0] = \ +-+ hevcdsp->sao_band_filter_c[1] = \ +-+ hevcdsp->sao_band_filter_c[2] = \ +-+ hevcdsp->sao_band_filter_c[3] = \ +-+ hevcdsp->sao_band_filter_c[4] = FUNC(sao_band_filter_c, depth); \ +-+ hevcdsp->sao_edge_filter_c[0] = \ +-+ hevcdsp->sao_edge_filter_c[1] = \ +-+ hevcdsp->sao_edge_filter_c[2] = \ +-+ hevcdsp->sao_edge_filter_c[3] = \ +-+ hevcdsp->sao_edge_filter_c[4] = FUNC(sao_edge_filter_c, depth); \ +-+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ +-+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth); \ +-+ \ +- QPEL_FUNCS(depth); \ +- QPEL_UNI_FUNCS(depth); \ +- QPEL_BI_FUNCS(depth); \ +-@@ -232,6 +378,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) +- EPEL_UNI_FUNCS(depth); \ +- EPEL_BI_FUNCS(depth); \ +- \ +-+ SLICED_LOOP_FILTERS(depth); \ +- hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ +- hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ +- hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ +-@@ -257,6 +404,8 @@ int i = 0; +- break; +- } +- +-+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; +-+ +- if (ARCH_X86) +- ff_hevc_dsp_init_x86(hevcdsp, bit_depth); +- if (ARCH_ARM) +-diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h +-index 9f1f6dd..639ecf1 100644 +---- a/libavcodec/hevcdsp.h +-+++ b/libavcodec/hevcdsp.h +-@@ -42,11 +42,26 @@ typedef struct SAOParams { +- uint8_t type_idx[3]; ///< sao_type_idx +- } SAOParams; +++static int coeff_abs_level_remaining_decode(HEVCContext * const s, int rc_rice_param) +++{ +++ CABACContext * const c = &s->HEVClc->cc; ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; + +-+typedef struct Mv { +-+ int16_t x; ///< horizontal component of motion vector +-+ int16_t y; ///< vertical component of motion vector +-+} Mv; ++- while (prefix < CABAC_MAX_BIN && get_cabac_bypass(&s->HEVClc->cc)) +++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { ++ av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ return 0; ++ } + + +-+typedef struct MvField { +-+ DECLARE_ALIGNED(4, Mv, mv)[2]; +-+ int8_t ref_idx[2]; +-+ int8_t pred_flag; +-+} MvField; ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) ++- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) ++- suffix = (suffix << 1) | get_cabac_bypass(&s->HEVClc->cc); +++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ 
last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } + + +- typedef struct HEVCDSPContext { +- void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, +- struct GetBitContext *gb, int pcm_bit_depth); +-+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, +-+ struct GetBitContext *gb, int pcm_bit_depth); +- +-- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); +-+ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +-+ void (*add_residual_u[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +-+ void (*add_residual_v[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +- +- void (*transform_skip)(int16_t *coeffs, int16_t log2_size); +- +-@@ -60,14 +75,23 @@ typedef struct HEVCDSPContext { +- +- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +- int16_t *sao_offset_val, int sao_left_class, int width, int height); +-+ void (*sao_band_filter_c[5])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height); +- +- /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ +- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +- int16_t *sao_offset_val, int sao_eo_class, int width, int height); +-+ void (*sao_edge_filter_c[5])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); ++ return last_coeff_abs_level_remaining; ++ } + +- void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +- struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, +- uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); +-+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +-+ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, +-+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); ++-static av_always_inline int coeff_sign_flag_decode(HEVCContext *s, uint8_t nb) +++#if !USE_BY22 +++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode +++static inline uint32_t coeff_sign_flag_decode(HEVCContext * const s, const unsigned int nb) ++ { ++- int i; ++- int ret = 0; +++ CABACContext * const c = &s->HEVClc->cc; +++ unsigned int i; +++ uint32_t ret = 0; + +- void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, +- int height, intptr_t mx, intptr_t my, int width); +-@@ -120,6 +144,22 @@ typedef struct HEVCDSPContext { +- void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, +- int32_t *tc, uint8_t *no_p, +- uint8_t *no_q); +-+#ifdef RPI +-+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +-+ unsigned int _stride, unsigned int beta, const int32_t tc[2], +-+ const uint8_t no_p[2], const uint8_t no_q[2], +-+ uint8_t * _pix_l); +-+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, +-+ unsigned int no_f); +-+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, +-+ uint8_t * src_l, +-+ unsigned int no_f); ++ for (i = 0; i < nb; i++) ++- ret = (ret << 1) | 
get_cabac_bypass(&s->HEVClc->cc); ++- return ret; +++ ret = (ret << 1) | get_cabac_bypass(c); + + +++ return ret << (32 - nb); +++} + +#endif + + +-+ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, +-+ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +-+ MvField *curr, MvField *neigh, uint8_t *bs); +- } HEVCDSPContext; +- +- void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); +-diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c +-index b840d17..32b9e47 100644 +---- a/libavcodec/hevcdsp_template.c +-+++ b/libavcodec/hevcdsp_template.c +-@@ -26,6 +26,9 @@ +- #include "bit_depth_template.c" +- #include "hevcdsp.h" +- +-+#ifdef RPI +-+#include "rpi_zc.h" +-+#endif +- +- static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, +- GetBitContext *gb, int pcm_bit_depth) +-@@ -42,6 +45,29 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height +- } +- } +- +-+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, +-+ GetBitContext *gb, int pcm_bit_depth) +++#ifndef coeff_sign_flag_decode_bypass +++static inline uint32_t coeff_sign_flag_decode_bypass(HEVCContext * const s, const unsigned int nb) + +{ +-+ int x, y; +-+ pixel *dst = (pixel *)_dst; +-+ +-+ stride /= sizeof(pixel); +++ CABACContext * const c = &s->HEVClc->cc; +++ uint32_t y; +++ y = get_cabac_by22_peek(c); +++ get_cabac_by22_flush(c, nb, y); +++ return y & ~(0xffffffffU >> nb); +++} +++#endif + + +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) +-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +-+ dst += stride; +-+ } + + +-+ dst = (pixel *)_dst + 1; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x++) +-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +-+ dst += stride; +++#ifndef get_cabac_greater1_bits +++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, +++ uint8_t * const state0) +++{ +++ unsigned int i; +++ unsigned int rv = 0; +++ for (i = 0; i != n; ++i) { +++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; +++ const unsigned int b = get_cabac(c, state0 + idx); +++ rv = (rv << 1) | b; + + } +-+} +++ return rv; ++ } +++#endif + + + + +- static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride, int size) +- { +-@@ -59,6 +85,23 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe +- } +- } +- +-+static av_always_inline void FUNC(add_residual_uv)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride, int size) +-+{ +-+ int x, y; +-+ pixel *dst = (pixel *)_dst; +++// N.B. levels returned are the values assuming coeff_abs_level_remaining +++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects +++// this version of events. 
+++static inline uint32_t get_greaterx_bits(HEVCContext * const s, const unsigned int n_end, int * const levels, +++ int * const pprev_subset_coded, int * const psum, +++ const unsigned int idx0_gt1, const unsigned int idx_gt2) +++{ +++ CABACContext * const c = &s->HEVClc->cc; +++ uint8_t * const state0 = s->HEVClc->cabac_state + idx0_gt1; +++ uint8_t * const state_gt2 = s->HEVClc->cabac_state + idx_gt2; +++ unsigned int rv; +++ unsigned int i; +++ const unsigned int n = FFMIN(n_end, 8); + + +-+ stride /= sizeof(pixel); +++ // Really this is i != n but the simple unconditional loop is cheaper +++ // and faster +++ for (i = 0; i != 8; ++i) +++ levels[i] = 1; + + +-+ for (y = 0; y < size; y++) { +-+ for (x = 0; x < size * 2; x += 2) { +-+ dst[x] = av_clip_pixel(dst[x] + *res); +-+ res++; +++ rv = get_cabac_greater1_bits(c, n, state0); +++ +++ *pprev_subset_coded = 0; +++ *psum = n; +++ +++ rv <<= (32 - n); +++ if (rv != 0) +++ { +++ *pprev_subset_coded = 1; +++ *psum = n + 1; +++ i = hevc_clz32(rv); +++ levels[i] = 2; +++ if (get_cabac(c, state_gt2) == 0) +++ { +++ // Unset first coded bit +++ rv &= ~(0x80000000U >> i); + + } +-+ dst += stride; + + } +-+} + + +- static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, +- ptrdiff_t stride) +- { +-@@ -83,6 +126,58 @@ static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, +- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +- } +- +-+// -- U -- (plaited) +++ if (n_end > 8) { +++ const unsigned int g8 = n_end - 8; +++ rv |= ((1 << g8) - 1) << (24 - g8); +++ for (i = 0; i != g8; ++i) { +++ levels[i + 8] = 0; +++ } +++ } + + +-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +-+{ +-+ FUNC(add_residual_uv)(_dst, res, stride, 4); +++ return rv; + +} + + +-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++// extended_precision_processing_flag must be false given we are +++// putting the result into a 16-bit array +++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) +++// scale_m is uint8_t +++// +++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) +++// or it can be 2 (if we have transquant_bypass) +++// shift is set to one less than we really want but would normally be +++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
+++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
+++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
+++// to achieve it
+++
+++#ifndef trans_scale_sat
+++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 8);
+++ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
+ +}
+++#endif
+ +
+-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++
+++#ifndef update_rice
+++static inline void update_rice(uint8_t * const stat_coeff,
+++ const unsigned int last_coeff_abs_level_remaining,
+++ const unsigned int c_rice_param)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 16);
+++ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;
+++ if (x >= 6)
+++ (*stat_coeff)++;
+++ else if (x == 0 && *stat_coeff > 0)
+++ (*stat_coeff)--;
+ +}
+++#endif
+ +
+-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++
+++// n must be > 0 on entry
+++#ifndef get_cabac_sig_coeff_flag_idxs
+++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++ unsigned int n,
+++ const uint8_t const * ctx_map,
+++ uint8_t * p)
+ +{
+-+ FUNC(add_residual_uv)(_dst, res, stride, 32);
+++ do {
+++ if (get_cabac(c, state0 + ctx_map[n]))
+++ *p++ = n;
+++ } while (--n != 0);
+++ return p;
+ +}
+++#endif
+ +
+-+// -- V -- (plaited)
+ +
+-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
+++ unsigned int n,
+++ const uint8_t const * ctx_map,
+++ uint8_t * const flag_idx)
+ +{
+-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 4);
+++ int rv;
+++
+++ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx;
+++
+++ return rv;
+ +}
+ +
+-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, int16_t *res,
+-+ ptrdiff_t stride)
+++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x1, x2, x3,\
+++ x4, x5, x6, x7,\
+++ x8, x9, x10, x11,\
+++ x12, x13, x14, x15}
+++
+++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x4, x8, x12,\
+++ x1, x5, x9, x13,\
+++ x2, x6, x10, x14,\
+++ x3, x7, x11, x15}
+++
+++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
+++ x0, x4, x1, x8,\
+++ x5, x2, x12, x9,\
+++ x6, x3, x13, x10,\
+++ x7, x14, x11, x15}
+++
+++
+++static inline int next_subset(HEVCContext * const s, int i, const int c_idx_nz,
+++ uint8_t * const significant_coeff_group_flag,
+++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
+++ int * const pPrev_sig)
+ +{
+-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 8);
+++ while (--i >= 0) {
+++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
+++ const unsigned int x_cg = scan_x_cg[i];
+++
+++ // For the flag decode we only care about Z/NZ but
+++ // we use the full Right * 2 + Down when calculating
+++ // significant coeff flags so we obtain it here. 
+++ // +++ // The group flag array is one longer than it needs to +++ // be so we don't need to check for y_cg limits +++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); +++ +++ if (i == 0 || +++ significant_coeff_group_flag_decode(s, c_idx_nz, prev_sig)) +++ { +++ gf_y[0] |= (1 << x_cg); +++ *pPrev_sig = prev_sig; +++ break; +++ } +++ } +++ +++ return i; + +} + + +-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++#ifdef RPI +++static void rpi_add_residual(HEVCContext * const s, +++ const unsigned int log2_trafo_size, const unsigned int c_idx, +++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) + +{ +-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 16); +++ const AVFrame * const frame = s->frame; +++ unsigned int stride = frame->linesize[c_idx]; +++ unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +++ unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +++ const int is_sliced = av_rpi_is_sand_frame(frame); +++ uint8_t * dst = !is_sliced ? +++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(frame, x, y) : +++ av_rpi_sand_frame_pos_c(frame, x, y); +++ +++ if (s->enable_rpi) { +++ const unsigned int i = s->jb0->intra.n; +++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; +++ +++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && +++ pc->ta.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->ta.stride == stride); +++ +++ pc->type = RPI_PRED_ADD_RESIDUAL_C; +++ } +++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && +++ pc->dc.dst == dst) +++ { +++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->dc.stride == stride); +++ +++ // Rewrite as add residual - must rewrite all fields as different union member +++ pc->type = RPI_PRED_ADD_RESIDUAL_V; +++ pc->c_idx = c_idx; +++ pc->ta.buf = coeffs; +++ pc->ta.dst = dst; +++ pc->ta.stride = stride; +++ pc->ta.dc = dc; +++ } +++ else +++ { +++ HEVCPredCmd * const cmd = pc + 1; +++ s->jb0->intra.n = i + 1; +++ +++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); +++ cmd->size = log2_trafo_size; +++ cmd->c_idx = c_idx; +++ cmd->ta.buf = coeffs; +++ cmd->ta.dst = dst; +++ cmd->ta.stride = stride; +++ cmd->ta.dc = 0; +++ } +++ } +++ else if (!is_sliced || c_idx == 0) { +++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, (int16_t *)coeffs, stride); +++ } +++#if RPI_HEVC_SAND +++ // * These should probably never happen +++ else if (c_idx == 1) { +++ s->hevcdsp.add_residual_u[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); +++ } +++ else { +++ s->hevcdsp.add_residual_v[log2_trafo_size-2](dst, (int16_t *)coeffs, stride, 0); +++ } +++#endif + +} + + +-+static void FUNC(add_residual32x32_v)(uint8_t *_dst, int16_t *res, +-+ ptrdiff_t stride) +++ +++static void rpi_add_dc(HEVCContext * const s, +++ const unsigned int log2_trafo_size, const unsigned int c_idx, +++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) + +{ +-+ FUNC(add_residual_uv)(_dst + 1, res, stride, 32); +++ const AVFrame * const frame = s->frame; +++ const unsigned int stride = frame->linesize[c_idx]; +++ const unsigned int x = x0 >> s->ps.sps->hshift[c_idx]; +++ const unsigned int y = y0 >> s->ps.sps->vshift[c_idx]; +++ const int is_sliced = av_rpi_is_sand_frame(frame); +++ uint8_t * const dst = !is_sliced ? 
+++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(frame, x, y) : +++ av_rpi_sand_frame_pos_c(frame, x, y); +++ +++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); +++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); +++ +++ if (s->enable_rpi) { +++ const unsigned int i = s->jb0->intra.n; +++ HEVCPredCmd *const pc = s->jb0->intra.cmds + i - 1; +++ +++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && +++ pc->ta.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->ta.stride == stride); +++ +++ pc->ta.dc = (int16_t)coeff; +++ } +++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && +++ pc->dc.dst == dst) +++ { +++ av_assert1(pc->size == log2_trafo_size && +++ pc->c_idx == 1 && +++ pc->dc.stride == stride && +++ (pc->dc.dc & ~0xffff) == 0); +++ +++ pc->dc.dc |= (coeff << 16); +++ } +++ else +++ { +++ HEVCPredCmd * const cmd = pc + 1; +++ s->jb0->intra.n = i + 1; +++ +++ cmd->type = RPI_PRED_ADD_DC + c_idx; +++ cmd->size = log2_trafo_size; +++ cmd->c_idx = c_idx; +++ cmd->dc.dst = dst; +++ cmd->dc.stride = stride; +++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff; +++ } +++ } + +} + + +++ +++#endif + +- static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ int log2_trafo_size, enum ScanType scan_idx, ++ int c_idx) + { +-@@ -367,7 +462,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, +- int x, y; +- pixel *dst = (pixel *)_dst; +- pixel *src = (pixel *)_src; +-- int16_t *sao_offset_val = sao->offset_val[c_idx]; +- int sao_eo_class = sao->eo_class[c_idx]; +- int init_x = 0, width = _width, height = _height; ++-#define GET_COORD(offset, n) \ ++- do { \ ++- x_c = (x_cg << 2) + scan_x_off[n]; \ ++- y_c = (y_cg << 2) + scan_y_off[n]; \ ++- } while (0) ++- HEVCLocalContext *lc = s->HEVClc; ++- int transform_skip_flag = 0; +++ HEVCLocalContext * const lc = s->HEVClc; +++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; + +-@@ -376,33 +470,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ int last_significant_coeff_x, last_significant_coeff_y; ++- int last_scan_pos; ++- int n_end; ++ int num_coeff = 0; ++- int greater1_ctx = 1; +++ int prev_subset_coded = 0; + +- if (sao_eo_class != SAO_EO_VERT) { +- if (borders[0]) { +-- int offset_val = sao_offset_val[0]; +- for (y = 0; y < height; y++) { +-- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +-+ dst[y * stride_dst] = src[y * stride_src]; +- } +- init_x = 1; +- } +- if (borders[2]) { +-- int offset_val = sao_offset_val[0]; +- int offset = width - 1; +- for (x = 0; x < height; x++) { +-- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +-+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; +- } +- width--; +- } +- } +- if (sao_eo_class != SAO_EO_HORIZ) { +- if (borders[1]) { +-- int offset_val = sao_offset_val[0]; +- for (x = init_x; x < width; x++) +-- dst[x] = av_clip_pixel(src[x] + offset_val); +-+ dst[x] = src[x]; +- } +- if (borders[3]) { +-- int offset_val = sao_offset_val[0]; +-- int y_stride_dst = stride_dst * (height - 1); +-- int y_stride_src = stride_src * (height - 1); +-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +-+ ptrdiff_t y_stride_src = stride_src * (height - 1); +- for (x = init_x; x < width; x++) 
+-- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +-+ dst[x + y_stride_dst] = src[x + y_stride_src]; +- height--; +- } +- } +-@@ -417,7 +507,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +- int x, y; +- pixel *dst = (pixel *)_dst; +- pixel *src = (pixel *)_src; +-- int16_t *sao_offset_val = sao->offset_val[c_idx]; +- int sao_eo_class = sao->eo_class[c_idx]; +- int init_x = 0, init_y = 0, width = _width, height = _height; ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; + +-@@ -426,34 +515,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++- const uint8_t *scan_x_cg, *scan_y_cg, *scan_x_off, *scan_y_off; +++ const uint8_t *scan_x_cg, *scan_y_cg; +++ const xy_off_t * scan_xy_off; + +- if (sao_eo_class != SAO_EO_VERT) { +- if (borders[0]) { +-- int offset_val = sao_offset_val[0]; +- for (y = 0; y < height; y++) { +-- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +-+ dst[y * stride_dst] = src[y * stride_src]; +- } +- init_x = 1; +- } +- if (borders[2]) { +-- int offset_val = sao_offset_val[0]; +- int offset = width - 1; +- for (x = 0; x < height; x++) { +-- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +-+ dst[x * stride_dst + offset] = src[x * stride_src + offset]; +- } +- width--; +- } +- } +- if (sao_eo_class != SAO_EO_HORIZ) { +- if (borders[1]) { +-- int offset_val = sao_offset_val[0]; +- for (x = init_x; x < width; x++) +-- dst[x] = av_clip_pixel(src[x] + offset_val); +-+ dst[x] = src[x]; +- init_y = 1; +- } +- if (borders[3]) { +-- int offset_val = sao_offset_val[0]; +-- int y_stride_dst = stride_dst * (height - 1); +-- int y_stride_src = stride_src * (height - 1); +-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +-+ ptrdiff_t y_stride_src = stride_src * (height - 1); +- for (x = init_x; x < width; x++) +-- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +-+ dst[x + y_stride_dst] = src[x + y_stride_src]; +- height--; +- } +- } +-@@ -494,6 +579,127 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, +- } +- } +++#ifndef RPI ++ ptrdiff_t stride = s->frame->linesize[c_idx]; ++ int hshift = s->ps.sps->hshift[c_idx]; ++ int vshift = s->ps.sps->vshift[c_idx]; ++- uint8_t *dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + +++ uint8_t * const dst = &s->frame->data[c_idx][(y0 >> vshift) * stride + ++ ((x0 >> hshift) << s->ps.sps->pixel_shift)]; ++- int16_t *coeffs = (int16_t*)(c_idx ? lc->edge_emu_buffer2 : lc->edge_emu_buffer); ++- uint8_t significant_coeff_group_flag[8][8] = {{0}}; +++#endif +++#ifdef RPI +++ int use_vpu; +++ int use_dc = 0; +++#endif +++ int16_t *coeffs; +++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; + ++ int trafo_size = 1 << log2_trafo_size; ++ int i; ++- int qp,shift,add,scale,scale_m; +++ int qp,shift,scale; ++ static const uint8_t level_scale[] = { 40, 45, 51, 57, 64, 72 }; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; ++ int pred_mode_intra = (c_idx == 0) ? 
lc->tu.intra_pred_mode : ++ lc->tu.intra_pred_mode_c; ++ ++- memset(coeffs, 0, trafo_size * trafo_size * sizeof(int16_t)); +++ int prev_sig = 0; +++ const int c_idx_nz = (c_idx != 0); + + +-+// --- Plaited chroma versions +-+ +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#else +-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, +-+ const int16_t *sao_offset_val_u, int sao_left_class_u, +-+ const int16_t *sao_offset_val_v, int sao_left_class_v, +-+ int width, int height) +-+{ +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int offset_table_u[32] = { 0 }; +-+ int offset_table_v[32] = { 0 }; +-+ int k, y, x; +-+ int shift = BIT_DEPTH - 5; +-+ +-+ stride_dst /= sizeof(pixel); +-+ stride_src /= sizeof(pixel); +-+ width *= 2; +++ int may_hide_sign; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { ++- static const int qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; +++ static const uint8_t qp_c[] = { 29, 30, 31, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37 }; ++ static const uint8_t rem6[51 + 4 * 6 + 1] = { ++ 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, ++ 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, ++@@ -1065,9 +1706,19 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ }; ++ int qp_y = lc->qp_y; ++ +++ may_hide_sign = s->ps.pps->sign_data_hiding_flag; + + +-+ for (k = 0; k < 4; k++) +-+ { +-+ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; +-+ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; +-+ } +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x += 2) ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { ++- transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx); +++ int transform_skip_flag = hevc_transform_skip_flag_decode(s, c_idx_nz); +++ if (transform_skip_flag) { +++ trans_skip_or_bypass = 1; +++ if (lc->cu.pred_mode == MODE_INTRA && +++ s->ps.sps->implicit_rdpcm_enabled_flag && +++ (pred_mode_intra == 10 || pred_mode_intra == 26)) { +++ may_hide_sign = 0; +++ } +++ } ++ } ++ ++ if (c_idx == 0) { ++@@ -1100,39 +1751,76 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ qp += s->ps.sps->qp_bd_offset; ++ } ++ ++- shift = s->ps.sps->bit_depth + log2_trafo_size - 5; ++- add = 1 << (shift-1); ++- scale = level_scale[rem6[qp]] << (div6[qp]); ++- scale_m = 16; // default when no custom scaling lists. ++- dc_scale = 16; +++ // Shift is set to one less than will actually occur as the scale +++ // and saturate step adds 1 and then shifts right again +++ shift = s->ps.sps->bit_depth + log2_trafo_size - 6; +++ scale = level_scale[rem6[qp]]; +++ if (div6[qp] >= shift) { +++ scale <<= (div6[qp] - shift); +++ shift = 0; +++ } else { +++ shift -= div6[qp]; +++ } ++ ++- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { +++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { ++ const ScalingList *sl = s->ps.pps->scaling_list_data_present_flag ? 
++- &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; +++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ int matrix_id = lc->cu.pred_mode != MODE_INTRA; ++ ++ matrix_id = 3 * matrix_id + c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; +++ dc_scale = scale_matrix[0]; ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } +++ else + + { +-+ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[src[x + 0] >> shift]); +-+ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[src[x + 1] >> shift]); +++ static const uint8_t sixteen_scale[64] = { +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16, +++ 16, 16, 16, 16, 16, 16, 16, 16 +++ }; +++ scale_matrix = sixteen_scale; +++ dc_scale = 16; + + } +-+ dst += stride_dst; +-+ src += stride_src; +-+ } +-+} +-+#endif ++ } else { +++ static const uint8_t unit_scale[64] = { +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ 1, 1, 1, 1, 1, 1, 1, 1, +++ }; +++ scale_matrix = unit_scale; ++ shift = 0; ++- add = 0; ++- scale = 0; ++- dc_scale = 0; +++ scale = 2; // We will shift right to kill this +++ dc_scale = 1; + + +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, +-+ int eo, int width, int height) { +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#else +++ may_hide_sign = 0; ++ } ++ + + +-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, +-+ int eo, int width, int height) { + + +-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +-+ static const int8_t pos[4][2][2] = { +-+ { { -1, 0 }, { 1, 0 } }, // horizontal +-+ { { 0, -1 }, { 0, 1 } }, // vertical +-+ { { -1, -1 }, { 1, 1 } }, // 45 degree +-+ { { 1, -1 }, { -1, 1 } }, // 135 degree +-+ }; +-+ pixel *dst = (pixel *)_dst; +-+ pixel *src = (pixel *)_src; +-+ int a_stride, b_stride; +-+ int x, y; +-+ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); +-+ stride_dst /= sizeof(pixel); +-+ width *= 2; + + +-+ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +-+ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +-+ for (y = 0; y < height; y++) { +-+ for (x = 0; x < width; x += 2) { +-+ int diff0u = CMP(src[x], src[x + a_stride]); +-+ int diff1u = CMP(src[x], src[x + b_stride]); +-+ int offset_valu = edge_idx[2 + diff0u + diff1u]; +-+ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +-+ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +-+ int offset_valv = edge_idx[2 + diff0v + diff1v]; +-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); +-+ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); +-+ } +-+ src += stride_src; +-+ dst += stride_dst; +-+ } +-+} +-+#endif ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- explicit_rdpcm_flag = explicit_rdpcm_flag_decode(s, c_idx); +++ trans_skip_or_bypass) { +++ explicit_rdpcm_flag = 
explicit_rdpcm_flag_decode(s, c_idx_nz); ++ if (explicit_rdpcm_flag) { ++- explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx); +++ may_hide_sign = 0; +++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(s, c_idx_nz); ++ } ++ } ++ ++- last_significant_coeff_xy_prefix_decode(s, c_idx, log2_trafo_size, +++ last_significant_coeff_xy_prefix_decode(s, c_idx_nz, log2_trafo_size, ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { ++@@ -1160,119 +1848,147 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++- scan_x_off = ff_hevc_diag_scan4x4_x; ++- scan_y_off = ff_hevc_diag_scan4x4_y; ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; ++- if (trafo_size == 4) { + + +-+#if BIT_DEPTH != 8 +-+static void FUNC(sao_edge_restore_c_0)(uint8_t *_dst, uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, +-+ int *borders, int _width, int _height, +-+ int c_idx, uint8_t *vert_edge, +-+ uint8_t *horiz_edge, uint8_t *diag_edge) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+static void FUNC(sao_edge_restore_c_1)(uint8_t *_dst, uint8_t *_src, +-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, +-+ int *borders, int _width, int _height, +-+ int c_idx, uint8_t *vert_edge, +-+ uint8_t *horiz_edge, uint8_t *diag_edge) +-+{ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +++ switch (log2_trafo_size) { +++ case 2: ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; ++- } else if (trafo_size == 8) { +++ break; +++ case 3: ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; ++- } else if (trafo_size == 16) { +++ break; +++ case 4: ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_diag_scan4x4_y; ++- } else { // trafo_size == 32 +++ break; +++ case 5: +++ default: ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_diag_scan8x8_y; +++ break; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++- scan_x_off = horiz_scan4x4_x; ++- scan_y_off = horiz_scan4x4_y; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++- scan_x_off = horiz_scan4x4_y; ++- scan_y_off = horiz_scan4x4_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ ++- for (i = num_last_subset; i >= 0; i--) { ++- int n, m; ++- int x_cg, y_cg, x_c, y_c, pos; ++- int implicit_non_zero_coeff = 0; ++- int64_t trans_coeff_level; ++- int prev_sig = 0; ++- int offset = i << 4; ++- int rice_init = 0; ++- ++- uint8_t significant_coeff_flag_idx[16]; ++- uint8_t nb_significant_coeff_flag = 0; +++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++- x_cg = scan_x_cg[i]; ++- y_cg = scan_y_cg[i]; +++ scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++- if ((i < num_last_subset) && (i > 0)) { ++- int ctx_cg = 0; ++- if (x_cg < (1 << (log2_trafo_size - 2)) - 1) ++- ctx_cg += 
significant_coeff_group_flag[x_cg + 1][y_cg];
++- if (y_cg < (1 << (log2_trafo_size - 2)) - 1)
++- ctx_cg += significant_coeff_group_flag[x_cg][y_cg + 1];
+++ {
+++ const unsigned int ccount = 1 << (log2_trafo_size * 2);
+++#ifdef RPI
+++ use_vpu = 0;
+++ if (s->enable_rpi) {
+++ const int special = trans_skip_or_bypass || lc->tu.cross_pf; // These need special processing
+++ use_dc = (num_coeff == 1) && !special &&
+++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
++
++- significant_coeff_group_flag[x_cg][y_cg] =
++- significant_coeff_group_flag_decode(s, c_idx, ctx_cg);
++- implicit_non_zero_coeff = 1;
++- } else {
++- significant_coeff_group_flag[x_cg][y_cg] =
++- ((x_cg == x_cg_last_sig && y_cg == y_cg_last_sig) ||
++- (x_cg == 0 && y_cg == 0));
+++ if (use_dc) {
+++ // Just need a little empty space
+++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++ // No need to clear
+++ }
+++ else
+++ {
+++ use_vpu = !special && log2_trafo_size >= 4;
+++ coeffs = rpi_alloc_coeff_buf(s, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
+++#if HAVE_NEON
+++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
+ +#else
+++ memset(coeffs, 0, ccount * sizeof(int16_t));
+ +#endif
+++ }
++ }
+++ else
+++#endif
+++ {
+++ coeffs = (int16_t*)(c_idx_nz ? lc->edge_emu_buffer2 : lc->edge_emu_buffer);
+++ memset(coeffs, 0, ccount * sizeof(int16_t));
+++ }
+++ }
++
++- last_scan_pos = num_coeff - offset - 1;
+++ i = num_last_subset;
+++ do {
+++ int implicit_non_zero_coeff = 0;
+++ int n_end;
+ +
+++ uint8_t significant_coeff_flag_idx[16];
+++ unsigned int nb_significant_coeff_flag = 0;
++ if (i == num_last_subset) {
+++ // First time through
+++ int last_scan_pos = num_coeff - (i << 4) - 1;
++ n_end = last_scan_pos - 1;
++ significant_coeff_flag_idx[0] = last_scan_pos;
++ nb_significant_coeff_flag = 1;
++ } else {
++ n_end = 15;
+++ implicit_non_zero_coeff = (i != 0);
++ }
++
++- if (x_cg < ((1 << log2_trafo_size) - 1) >> 2)
++- prev_sig = !!significant_coeff_group_flag[x_cg + 1][y_cg];
++- if (y_cg < ((1 << log2_trafo_size) - 1) >> 2)
++- prev_sig += (!!significant_coeff_group_flag[x_cg][y_cg + 1] << 1);
++-
++- if (significant_coeff_group_flag[x_cg][y_cg] && n_end >= 0) {
++- static const uint8_t ctx_idx_map[] = {
++- 0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8, // log2_trafo_size == 2
++- 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 0
++- 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, // prev_sig == 1
++- 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, // prev_sig == 2
++- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 // default
+++ if (n_end >= 0) {
+++ static const uint8_t ctx_idx_maps_ts2[3][16] = {
+++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
+++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
+++ };
+++ // N.B. 
prev_sig = Right * 2 + Down +++ static const uint8_t ctx_idx_maps[3][4][16] = { +++ { +++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ }, +++ { +++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ }, +++ { +++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 +++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 +++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 +++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default +++ } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; ++- if (s->ps.sps->transform_skip_context_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[4 * 16]; ++- if (c_idx == 0) { ++- scf_offset = 40; ++- } else { ++- scf_offset = 14 + 27; ++- } + + +-+#ifdef RPI +++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +++ ctx_idx_map_p = ctx_idx_maps[0][3]; +++ scf_offset = 40 + c_idx_nz; ++ } else { ++- if (c_idx != 0) +++ if (c_idx_nz != 0) ++ scf_offset = 27; + + +-+// line zero +-+#define P3 pix_l[0 * xstride] +-+#define P2 pix_l[1 * xstride] +-+#define P1 pix_l[2 * xstride] +-+#define P0 pix_l[3 * xstride] +-+#define Q0 pix_r[0 * xstride] +-+#define Q1 pix_r[1 * xstride] +-+#define Q2 pix_r[2 * xstride] +-+#define Q3 pix_r[3 * xstride] ++ if (log2_trafo_size == 2) { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[0]; +++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; ++ } else { ++- ctx_idx_map_p = (uint8_t*) &ctx_idx_map[(prev_sig + 1) << 4]; ++- if (c_idx == 0) { ++- if ((x_cg > 0 || y_cg > 0)) +++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; +++ if (!c_idx_nz) { +++ if (i != 0) ++ scf_offset += 3; + + +-+// line three. used only for deblocking decision +-+#define TP3 pix_l[0 * xstride + 3 * ystride] +-+#define TP2 pix_l[1 * xstride + 3 * ystride] +-+#define TP1 pix_l[2 * xstride + 3 * ystride] +-+#define TP0 pix_l[3 * xstride + 3 * ystride] +-+#define TQ0 pix_r[0 * xstride + 3 * ystride] +-+#define TQ1 pix_r[1 * xstride + 3 * ystride] +-+#define TQ2 pix_r[2 * xstride + 3 * ystride] +-+#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; ++ } else { ++@@ -1286,34 +2002,30 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } ++ } ++- for (n = n_end; n > 0; n--) { ++- x_c = scan_x_off[n]; ++- y_c = scan_y_off[n]; ++- if (significant_coeff_flag_decode(s, x_c, y_c, scf_offset, ctx_idx_map_p)) { ++- significant_coeff_flag_idx[nb_significant_coeff_flag] = n; ++- nb_significant_coeff_flag++; + + +-+// This is identical to hevc_loop_filter_luma except that the P/Q +-+// components are on separate pointers +-+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +-+ unsigned int _stride, unsigned int beta, const int32_t _tc[2], +-+ const uint8_t _no_p[2], const uint8_t _no_q[2], +-+ uint8_t * _pix_l) +-+{ +-+ int d, j; +-+ pixel *pix_l = (pixel *)_pix_l; +-+ pixel *pix_r = (pixel *)_pix_r; +-+ const ptrdiff_t xstride = 1; +-+ const ptrdiff_t ystride = _stride / sizeof(pixel); +-+ +-+ beta <<= BIT_DEPTH - 8; +-+ +-+ for (j = 0; j < 2; j++) { +-+ const int dp0 = abs(P2 - 2 * P1 + P0); +-+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); +-+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); +-+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); +-+ const int d0 = dp0 + dq0; +-+ const int d3 = dp3 + dq3; +-+ const int tc = _tc[j] << (BIT_DEPTH - 8); +-+ const int no_p = _no_p[j]; +-+ const int no_q = _no_q[j]; +-+ +-+ if (d0 + d3 >= beta) { +-+ pix_l += 4 * ystride; +-+ pix_r += 4 * ystride; +-+ continue; +-+ } else { +-+ const int beta_3 = beta >> 3; +-+ const int beta_2 = beta >> 2; +-+ const int tc25 = ((tc * 5 + 1) >> 1); +-+ +-+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && +-+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && +-+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { +-+ // strong filtering +-+ const int tc2 = tc << 1; +-+ for (d = 0; d < 4; d++) { +-+ const int p3 = P3; +-+ const int p2 = P2; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ const int q2 = Q2; +-+ const int q3 = Q3; +-+ if (!no_p) { +-+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); +-+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); +-+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); +-+ } +-+ if (!no_q) { +-+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); +-+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); +-+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); +-+ } +-+ pix_l += ystride; +-+ pix_r += ystride; +-+ } +-+ } else { // normal filtering +-+ int nd_p = 1; +-+ int nd_q = 1; +-+ const int tc_2 = tc >> 1; +-+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) +-+ nd_p = 2; +-+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) +-+ nd_q = 2; +-+ +-+ for (d = 0; d < 4; d++) { +-+ const int p2 = P2; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ const int q2 = Q2; +-+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; +-+ if (abs(delta0) < 10 * tc) { +-+ delta0 = av_clip(delta0, -tc, tc); +-+ if (!no_p) +-+ P0 = av_clip_pixel(p0 + delta0); +-+ if (!no_q) +-+ Q0 = av_clip_pixel(q0 - delta0); +-+ if (!no_p && nd_p > 1) { +-+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); +-+ P1 = av_clip_pixel(p1 + deltap1); +-+ } +-+ if (!no_q && nd_q > 1) { +-+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); +-+ Q1 = av_clip_pixel(q1 + deltaq1); +-+ } +-+ } +-+ pix_l += ystride; +-+ 
pix_r += ystride; +-+ } +-+ } +-+ } +-+ } +-+} +-+ +-+#undef TP3 +-+#undef TP2 +-+#undef TP1 +-+#undef TP0 +-+#undef TQ0 +-+#undef TQ1 +-+#undef TQ2 +-+#undef TQ3 +-+ +-+#undef P3 +-+#undef P2 +-+#undef P1 +-+#undef P0 +-+#undef Q0 +-+#undef Q1 +-+#undef Q2 +-+#undef Q3 +-+ +-+#define P1 pix_l[0 * xstride] +-+#define P0 pix_l[1 * xstride] +-+#define Q0 pix_r[0 * xstride] +-+#define Q1 pix_r[1 * xstride] +-+ +-+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, +-+ ptrdiff_t _ystride, const int32_t *_tc, +-+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) +-+{ +-+ int d, j, no_p, no_q; +-+ pixel *pix_l = (pixel *)_pix_l; +-+ pixel *pix_r = (pixel *)_pix_r; +-+ ptrdiff_t xstride = _xstride / sizeof(pixel); +-+ ptrdiff_t ystride = _ystride / sizeof(pixel); +++ if (n_end > 0) { +++ int cnt = get_sig_coeff_flag_idxs(&s->HEVClc->cc, +++ s->HEVClc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, +++ n_end, ctx_idx_map_p, +++ significant_coeff_flag_idx + nb_significant_coeff_flag); + + +-+ for (j = 0; j < 2; j++) { +-+ const int tc = _tc[j] << (BIT_DEPTH - 8); +-+ if (tc <= 0) { +-+ pix_l += 4 * ystride; +-+ pix_r += 4 * ystride; +-+ continue; +-+ } +-+ no_p = _no_p[j]; +-+ no_q = _no_q[j]; +++ nb_significant_coeff_flag += cnt; +++ if (cnt != 0) { ++ implicit_non_zero_coeff = 0; ++ } ++ } + + +-+ for (d = 0; d < 4; d++) { +-+ int delta0; +-+ const int p1 = P1; +-+ const int p0 = P0; +-+ const int q0 = Q0; +-+ const int q1 = Q1; +-+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); +-+ if (!no_p) +-+ P0 = av_clip_pixel(p0 + delta0); +-+ if (!no_q) +-+ Q0 = av_clip_pixel(q0 - delta0); +-+ pix_l += ystride; +-+ pix_r += ystride; +-+ } +-+ } +-+} ++ if (implicit_non_zero_coeff == 0) { ++- if (s->ps.sps->transform_skip_context_enabled_flag && ++- (transform_skip_flag || lc->cu.cu_transquant_bypass_flag)) { ++- if (c_idx == 0) { ++- scf_offset = 42; ++- } else { ++- scf_offset = 16 + 27; ++- } +++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { +++ scf_offset = 42 + c_idx_nz; ++ } else { ++ if (i == 0) { ++- if (c_idx == 0) ++- scf_offset = 0; ++- else ++- scf_offset = 27; +++ scf_offset = c_idx_nz ? 27 : 0; ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } ++- if (significant_coeff_flag_decode_0(s, c_idx, scf_offset) == 1) { +++ if (significant_coeff_flag_decode_0(s, scf_offset) == 1) { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++@@ -1323,141 +2035,185 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } ++ ++- n_end = nb_significant_coeff_flag; ++- +++ if (nb_significant_coeff_flag != 0) { +++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | +++ ((i != 0 && !c_idx_nz) ? 
2 : 0) |
+++ prev_subset_coded;
+++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
+++ (gt1_idx_delta << 2);
+++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
+++ gt1_idx_delta;
+ +
+-+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
+-+ unsigned int no_f)
+-+{
+-+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+-+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+-+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);
+-+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);
+-+}
+++ const unsigned int x_cg = scan_x_cg[i];
+++ const unsigned int y_cg = scan_y_cg[i];
+++ int16_t * const blk_coeffs = coeffs +
+++ ((x_cg + (y_cg << log2_trafo_size)) << 2);
+++ // This calculation is 'wrong' for log2_trafo_size == 2
+++ // but that doesn't matter as in this case x_cg & y_cg
+++ // are always 0 so the result is correct (0) anyway
+++ const uint8_t * const blk_scale = scale_matrix +
+++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
+ +
+-+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
+-+ uint8_t * src_l,
+-+ unsigned int no_f)
+-+{
+-+ uint8_t no_p[2] = {no_f & 1, no_f & 2};
+-+ uint8_t no_q[2] = {no_f & 4, no_f & 8};
+-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
+-+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);
+-+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));
+-+}
+++ // * The following code block doesn't deal with these flags:
+++ // (nor did the one it replaces)
+++ //
+++ // cabac_bypass_alignment_enabled_flag
+++ // This should be easy but I can't find a test case
+++ // extended_precision_processing_flag
+++ // This can extend the required precision past 16 bits
+++ // so is probably tricky - also no example found yet
+ +
+-+#undef P1
+-+#undef P0
+-+#undef Q0
+-+#undef Q1
+++#if USE_N_END_1
+++ if (nb_significant_coeff_flag == 1) {
+++ // There is a small gain to be had from special-casing the single
+++ // transform coefficient case. The reduction in complexity
+++ // makes up for the code duplication. 
+ + +++ int trans_coeff_level = 1; +++ int coeff_sign_flag; +++ int coded_val = 0; + + +-+#endif +++ // initialize first elem of coeff_bas_level_greater1_flag +++ prev_subset_coded = 0; + + +-diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c +-index 02c1766..cea16ea 100644 +---- a/libavcodec/hevcpred.c +-+++ b/libavcodec/hevcpred.c +-@@ -24,6 +24,7 @@ +- +- #include "hevcpred.h" +- +-+#define PRED_C 0 +- #define BIT_DEPTH 8 +- #include "hevcpred_template.c" +- #undef BIT_DEPTH +-@@ -39,13 +40,37 @@ +- #define BIT_DEPTH 12 +- #include "hevcpred_template.c" +- #undef BIT_DEPTH +-+#undef PRED_C +-+ +-+#ifdef RPI +-+#define PRED_C 1 +-+#define BIT_DEPTH 8 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 9 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 10 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+ +-+#define BIT_DEPTH 12 +-+#include "hevcpred_template.c" +-+#undef BIT_DEPTH +-+#undef PRED_C +-+#endif +- +- void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) +- { +- #undef FUNC +- #define FUNC(a, depth) a ## _ ## depth +- +--#define HEVC_PRED(depth) \ +-+#undef FUNCC +-+#define FUNCC(a, depth) a ## _ ## depth ## _c +-+ +-+#define HEVC_PRED_Y(depth) \ +- hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ +- hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ +- hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ +-@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) +- hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ +- hpc->pred_angular[3] = FUNC(pred_angular_3, depth); +++ if (get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx0_gt1 + 1)) { +++ trans_coeff_level = 2; +++ prev_subset_coded = 1; +++ coded_val = get_cabac(&s->HEVClc->cc, s->HEVClc->cabac_state + idx_gt2); +++ } + +-+#define HEVC_PRED_C(depth) \ +-+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ +-+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ +-+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ +-+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ +-+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ +-+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ +-+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ +-+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ +-+ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ +-+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ +-+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ +-+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ +-+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); +-+ +-+#ifdef RPI +-+#define HEVC_PRED(depth) \ +-+ HEVC_PRED_Y(depth); \ +-+ HEVC_PRED_C(depth); +-+#else +-+#define HEVC_PRED(depth) \ +-+ HEVC_PRED_Y(depth); +-+#endif +-+ +- switch (bit_depth) { +- case 9: +- HEVC_PRED(9); +-diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h +-index eb17663..00ba3f9 100644 +---- a/libavcodec/hevcpred.h +-+++ b/libavcodec/hevcpred.h +-@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { +- void (*pred_angular[4])(uint8_t *src, const uint8_t *top, +- const uint8_t *left, ptrdiff_t stride, +- int c_idx, int mode); +-+#ifdef RPI +-+ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); +-+ +-+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, +-+ const uint8_t *left, ptrdiff_t stride); +-+ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, +-+ ptrdiff_t stride, int log2_size, int c_idx); +-+ void (*pred_angular_c[4])(uint8_t *src, const 
uint8_t *top, +-+ const uint8_t *left, ptrdiff_t stride, +-+ int c_idx, int mode); +-+#endif +- } HEVCPredContext; ++- if (n_end) { ++- int first_nz_pos_in_cg; ++- int last_nz_pos_in_cg; ++- int c_rice_param = 0; ++- int first_greater1_coeff_idx = -1; ++- uint8_t coeff_abs_level_greater1_flag[8]; ++- uint16_t coeff_sign_flag; ++- int sum_abs = 0; ++- int sign_hidden; ++- int sb_type; +++ // Probably not worth the overhead of starting by22 for just one value +++ coeff_sign_flag = get_cabac_bypass(&s->HEVClc->cc); + +- void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); +-diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c +-index 6ae87cc..c14dddd 100644 +---- a/libavcodec/hevcpred_template.c +-+++ b/libavcodec/hevcpred_template.c +-@@ -20,13 +20,55 @@ +- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +- */ +++ if (coded_val) +++ { +++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { +++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(s, 0); +++ } else { +++ uint8_t * const stat_coeff = +++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +++ const unsigned int c_rice_param = *stat_coeff >> 2; +++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); + +-+//#define DISABLE_INTRA +-+ +- #include "libavutil/pixdesc.h" ++- // initialize first elem of coeff_bas_level_greater1_flag ++- int ctx_set = (i > 0 && c_idx == 0) ? 2 : 0; +++ trans_coeff_level = 3 + last_coeff_abs_level_remaining; +++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +++ } +++ } + +- #include "bit_depth_template.c" +- #include "hevcpred.h" ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag) { ++- if (!transform_skip_flag && !lc->cu.cu_transquant_bypass_flag) ++- sb_type = 2 * (c_idx == 0 ? 1 : 0); ++- else ++- sb_type = 2 * (c_idx == 0 ? 1 : 0) + 1; ++- c_rice_param = lc->stat_coeff[sb_type] / 4; ++- } +++ { +++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; +++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; +++ const unsigned int scale_m = blk_scale[xy_off->scale]; + +-+#ifdef RPI +-+#include "rpi_zc.h" ++- if (!(i == num_last_subset) && greater1_ctx == 0) ++- ctx_set++; ++- greater1_ctx = 1; ++- last_nz_pos_in_cg = significant_coeff_flag_idx[0]; ++- ++- for (m = 0; m < (n_end > 8 ? 8 : n_end); m++) { ++- int inc = (ctx_set << 2) + greater1_ctx; ++- coeff_abs_level_greater1_flag[m] = ++- coeff_abs_level_greater1_flag_decode(s, c_idx, inc); ++- if (coeff_abs_level_greater1_flag[m]) { ++- greater1_ctx = 0; ++- if (first_greater1_coeff_idx == -1) ++- first_greater1_coeff_idx = m; ++- } else if (greater1_ctx > 0 && greater1_ctx < 3) { ++- greater1_ctx++; +++ blk_coeffs[xy_off->coeff] = trans_scale_sat( +++ (trans_coeff_level ^ k) - k, // Apply sign +++ scale, +++ i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, +++ shift); ++ } ++ } ++- first_nz_pos_in_cg = significant_coeff_flag_idx[n_end - 1]; ++- ++- if (lc->cu.cu_transquant_bypass_flag || ++- (lc->cu.pred_mode == MODE_INTRA && ++- s->ps.sps->implicit_rdpcm_enabled_flag && transform_skip_flag && ++- (pred_mode_intra == 10 || pred_mode_intra == 26 )) || ++- explicit_rdpcm_flag) ++- sign_hidden = 0; ++ else ++- sign_hidden = (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4); + +#endif +++ { +++ int sign_hidden = may_hide_sign; +++ int levels[16]; // Should be able to get away with int16_t but that fails some tests +++ uint32_t coeff_sign_flags; +++ uint32_t coded_vals = 0; +++ // Sum(abs(level[])) +++ // In fact we only need the bottom bit and in some future +++ // version that may be all we calculate +++ unsigned int sum_abs; + + +-+#define DUMP_PRED 0 +++ coded_vals = get_greaterx_bits(s, nb_significant_coeff_flag, levels, +++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); + + +- #define POS(x, y) src[(x) + stride * (y)] +- +-+#if PRED_C +++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) +++ sign_hidden = 0; + + +-+typedef uint8_t (* c8_dst_ptr_t)[2]; +-+typedef const uint8_t (* c8_src_ptr_t)[2]; +++ // -- Start bypass block + + +-+#if BIT_DEPTH == 8 +-+#undef BIT_DEPTH +-+#define BIT_DEPTH 16 +-+#include "bit_depth_template.c" +-+#undef FUNC +-+#define FUNC(a) FUNC3(a, 8, _c) +-+#else +-+#undef FUNC +-+#define FUNC FUNCC +-+#endif +++ bypass_start(s); + + +-+#endif +++ coeff_sign_flags = coeff_sign_flag_decode_bypass(s, nb_significant_coeff_flag - sign_hidden); + + +-+#if DUMP_PRED +-+#ifndef DEBUG_ONCE +-+#define DEBUG_ONCE +-+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +-+{ +-+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { +-+ for (unsigned int x = 0; x != size; x++) { +-+ printf("%4d", data[x * 2]); +-+ } +-+ printf("\n"); +-+ } +-+ printf("\n"); +-+} +-+#endif +-+#endif +++ if (coded_vals != 0) +++ { +++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; +++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : +++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); +++ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2; +++ int * level = levels - 1; + + +- static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, +- int log2_size, int c_idx) +- { +-@@ -69,8 +111,11 @@ do { \ +- AV_WN4P(&ptr[i], a); \ +- else \ +- a = PIXEL_SPLAT_X4(ptr[i + 3]) +-- +-+#ifdef RPI_WORKER +-+ HEVCLocalContextIntra *lc = (s->enable_rpi) ? 
&s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +-+#else +- HEVCLocalContext *lc = s->HEVClc; +-+#endif +- int i; +- int hshift = s->ps.sps->hshift[c_idx]; +- int vshift = s->ps.sps->vshift[c_idx]; +-@@ -79,15 +124,23 @@ do { \ +- int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; +- int size_in_luma_v = size << vshift; +- int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; +-- int x = x0 >> hshift; +-- int y = y0 >> vshift; +-+ const int x = x0 >> hshift; +-+ const int y = y0 >> vshift; +- int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; +- int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; +- +- int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); +- +-- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +-+ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +-+#if defined(RPI) +-+ pixel *const src = s->frame->format != AV_PIX_FMT_SAND128 ? +-+ (pixel*)s->frame->data[c_idx] + x + y * stride : +-+ c_idx == 0 ? +-+ (pixel *)rpi_sliced_frame_pos_y(s->frame, x, y) : +-+ (pixel *)rpi_sliced_frame_pos_c(s->frame, x, y); +-+#else +- pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +-+#endif +- +- int min_pu_width = s->ps.sps->min_pu_width; +- +-@@ -95,14 +148,20 @@ do { \ +- lc->tu.intra_pred_mode; +- pixel4 a; +- pixel left_array[2 * MAX_TB_SIZE + 1]; +-+#if !PRED_C +- pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; +-+#endif +- pixel top_array[2 * MAX_TB_SIZE + 1]; +-+#if !PRED_C +- pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; +-+#endif +++ do { +++ { +++ const unsigned int z = hevc_clz32(coded_vals) + 1; +++ level += z; +++ coded_vals <<= z; +++ } + +- pixel *left = left_array + 1; +- pixel *top = top_array + 1; +-+#if !PRED_C +- pixel *filtered_left = filtered_left_array + 1; +- pixel *filtered_top = filtered_top_array + 1; +-+#endif +- int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); +- int cand_left = lc->na.cand_left; +- int cand_up_left = lc->na.cand_up_left; +-@@ -114,6 +173,26 @@ do { \ +- int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - +- (x0 + size_in_luma_h)) >> hshift; ++- if (first_greater1_coeff_idx != -1) { ++- coeff_abs_level_greater1_flag[first_greater1_coeff_idx] += coeff_abs_level_greater2_flag_decode(s, c_idx, ctx_set); ++- } ++- if (!s->ps.pps->sign_data_hiding_flag || !sign_hidden ) { ++- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag) << (16 - nb_significant_coeff_flag); ++- } else { ++- coeff_sign_flag = coeff_sign_flag_decode(s, nb_significant_coeff_flag - 1) << (16 - (nb_significant_coeff_flag - 1)); ++- } +++ { +++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(s, c_rice_param); +++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; + +-+ pixel * src_l = src - 1; +-+ pixel * src_u = src - stride; +-+ pixel * src_ur = src_u + size; ++- for (m = 0; m < n_end; m++) { ++- n = significant_coeff_flag_idx[m]; ++- GET_COORD(offset, n); ++- if (m < 8) { ++- trans_coeff_level = 1 + coeff_abs_level_greater1_flag[m]; ++- if (trans_coeff_level == ((m == first_greater1_coeff_idx) ? 3 : 2)) { ++- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); ++- ++- trans_coeff_level += last_coeff_abs_level_remaining; ++- if (trans_coeff_level > (3 << c_rice_param)) ++- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? 
c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { ++- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; ++- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) ++- lc->stat_coeff[sb_type]++; ++- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) ++- if (lc->stat_coeff[sb_type] > 0) ++- lc->stat_coeff[sb_type]--; ++- rice_init = 1; +++ sum_abs += last_coeff_abs_level_remaining + 1; +++ *level = trans_coeff_level; + + +-+#ifdef DISABLE_INTRA +-+ return; +-+#endif +++ if (stat_coeff != NULL) +++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); +++ stat_coeff = NULL; + + +-+#if defined(RPI) +-+ if (s->frame->format == AV_PIX_FMT_SAND128) { +-+ const AVFrame * const frame = s->frame; +-+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 +-+ const unsigned int stripe_adj = (frame->linesize[3] - 1) * stride; +-+ if ((x & mask) == 0) +-+ src_l -= stripe_adj; +-+ if (((x + size) & mask) == 0) +-+ src_ur += stripe_adj; +-+ } +++ if (trans_coeff_level > (3 << c_rice_param) && +++ (c_rice_param < 4 || rice_adaptation_enabled)) +++ ++c_rice_param; ++ } ++- } ++- } else { ++- int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(s, c_rice_param); ++- ++- trans_coeff_level = 1 + last_coeff_abs_level_remaining; ++- if (trans_coeff_level > (3 << c_rice_param)) ++- c_rice_param = s->ps.sps->persistent_rice_adaptation_enabled_flag ? c_rice_param + 1 : FFMIN(c_rice_param + 1, 4); ++- if (s->ps.sps->persistent_rice_adaptation_enabled_flag && !rice_init) { ++- int c_rice_p_init = lc->stat_coeff[sb_type] / 4; ++- if (last_coeff_abs_level_remaining >= (3 << c_rice_p_init)) ++- lc->stat_coeff[sb_type]++; ++- else if (2 * last_coeff_abs_level_remaining < (1 << c_rice_p_init)) ++- if (lc->stat_coeff[sb_type] > 0) ++- lc->stat_coeff[sb_type]--; ++- rice_init = 1; ++- } +++ } while (coded_vals != 0); ++ } ++- if (s->ps.pps->sign_data_hiding_flag && sign_hidden) { ++- sum_abs += trans_coeff_level; ++- if (n == first_nz_pos_in_cg && (sum_abs&1)) ++- trans_coeff_level = -trans_coeff_level; +++ +++ // sign_hidden = 0 or 1 so we can combine the tests +++ if ((sign_hidden & sum_abs) != 0) { +++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; ++ } ++- if (coeff_sign_flag >> 15) ++- trans_coeff_level = -trans_coeff_level; ++- coeff_sign_flag <<= 1; ++- if(!lc->cu.cu_transquant_bypass_flag) { ++- if (s->ps.sps->scaling_list_enable_flag && !(transform_skip_flag && log2_trafo_size > 2)) { ++- if(y_c || x_c || log2_trafo_size < 4) { ++- switch(log2_trafo_size) { ++- case 3: pos = (y_c << 3) + x_c; break; ++- case 4: pos = ((y_c >> 1) << 3) + (x_c >> 1); break; ++- case 5: pos = ((y_c >> 2) << 3) + (x_c >> 2); break; ++- default: pos = (y_c << 2) + x_c; break; ++- } ++- scale_m = scale_matrix[pos]; ++- } else { ++- scale_m = dc_scale; ++- } +++ +++ bypass_finish(s); +++ +++ // -- Finish bypass block +++ +++ // Scale loop +++ { +++ int m = nb_significant_coeff_flag - 1; +++ +++ // Deal with DC component (if any) first +++ if (i == 0 && significant_coeff_flag_idx[m] == 0) +++ { +++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; +++ blk_coeffs[0] = trans_scale_sat( +++ (levels[m] ^ k) - k, scale, dc_scale, shift); +++ --m; ++ } ++- trans_coeff_level = (trans_coeff_level * (int64_t)scale * (int64_t)scale_m + add) >> shift; ++- if(trans_coeff_level < 0) { ++- if((~trans_coeff_level) & 0xFffffffffff8000) ++- 
trans_coeff_level = -32768; ++- } else { ++- if(trans_coeff_level & 0xffffffffffff8000) ++- trans_coeff_level = 32767; +++ +++#if !USE_N_END_1 +++ // If N_END_1 set then m was at least 1 initially +++ if (m >= 0) + +#endif +++ { +++ do { +++ const xy_off_t * const xy_off = scan_xy_off + +++ significant_coeff_flag_idx[m]; +++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; + + +- if (s->ps.pps->constrained_intra_pred_flag == 1) { +- int size_in_luma_pu_v = PU(size_in_luma_v); +- int size_in_luma_pu_h = PU(size_in_luma_h); +-@@ -163,23 +242,24 @@ do { \ +- top[-1] = 128; +- } +- if (cand_up_left) { +-- left[-1] = POS(-1, -1); +-+ left[-1] = src_l[-stride]; +- top[-1] = left[-1]; +- } +- if (cand_up) +-- memcpy(top, src - stride, size * sizeof(pixel)); +-+ // Always good - even with sand +-+ memcpy(top, src_u, size * sizeof(pixel)); +- if (cand_up_right) { +-- memcpy(top + size, src - stride + size, size * sizeof(pixel)); +-- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), +-+ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); +-+ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], +- size - top_right_size); +- } +- if (cand_left) +- for (i = 0; i < size; i++) +-- left[i] = POS(-1, i); +-+ left[i] = src_l[stride * i]; +- if (cand_bottom_left) { +- for (i = size; i < size + bottom_left_size; i++) +-- left[i] = POS(-1, i); +-- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), +-+ left[i] = src_l[stride * i]; +-+ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], +- size - bottom_left_size); +- } +++ blk_coeffs[xy_off->coeff] = trans_scale_sat( +++ (levels[m] ^ k) - k, +++ scale, +++ blk_scale[xy_off->scale], +++ shift); +++ } while (--m >= 0); ++ } ++ } ++- coeffs[y_c * trafo_size + x_c] = trans_coeff_level; +++ ++ } ++ } ++- } +++ } while ((i = next_subset(s, i, c_idx_nz, +++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0); + +-@@ -268,7 +348,11 @@ do { \ +- cand_up_left = 1; +- cand_left = 1; +- } else { // No samples available +-+#if PRED_C && BIT_DEPTH == 16 +-+ left[-1] = 0x8080; ++ if (lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++@@ -1467,7 +2223,7 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { ++- if (transform_skip_flag) { +++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++@@ -1487,10 +2243,23 @@ void ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.idct_4x4_luma(coeffs); ++- } else { +++ } +++#ifdef RPI +++ else if (!use_vpu) + +#else +- left[-1] = (1 << (BIT_DEPTH - 1)); +++ else + +#endif +- EXTEND(top, left[-1], 2 * size); +- EXTEND(left, left[-1], 2 * size); +++ { ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) ++- s->hevcdsp.idct_dc[log2_trafo_size-2](coeffs); +++ { +++#ifdef RPI +++ if (use_dc) +++ rpi_add_dc(s, log2_trafo_size, c_idx, x0, y0, coeffs); +++ else +++#endif +++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); +++ } ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++@@ -1510,7 +2279,14 @@ void 
ff_hevc_hls_residual_coding(HEVCContext *s, int x0, int y0, ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); + } +-@@ -287,6 +371,9 @@ do { \ +- top[-1] = left[-1]; +- +- // Filtering process +-+ // Sand128 can only apply to chroma_format_idc == 1 so we don't need to +-+ // worry about chroma smoothing for that case +-+#if !PRED_C +- if (!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { +- if (mode != INTRA_DC && size != 4){ +- int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; +-@@ -342,13 +429,46 @@ do { \ +- mode); +- break; + } +-+#else +-+ switch (mode) { +-+ case INTRA_PLANAR: +-+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride); +-+ break; +-+ case INTRA_DC: +-+ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride, log2_size, c_idx); +-+ break; +-+ default: +-+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +-+ (uint8_t *)left, stride, c_idx, +-+ mode); +-+ break; +++#ifdef RPI +++ if (!use_dc) +++ { +++ rpi_add_residual(s, log2_trafo_size, c_idx, x0, y0, coeffs); + + } +-+ +-+#if DUMP_PRED +-+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); +-+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); +-+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); +-+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); +-+#endif +++#else ++ s->hevcdsp.transform_add[log2_trafo_size-2](dst, coeffs, stride); + +#endif + } + +-+#if !PRED_C || BIT_DEPTH == 16 +- #define INTRA_PRED(size) \ +- static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ +- { \ +- FUNC(intra_pred)(s, x0, y0, size, c_idx); \ +- } +-+#else +-+#define INTRA_PRED(size) \ +-+static void FUNC(intra_pred_ ## size)(HEVCContext *s, int x0, int y0, int c_idx) \ +-+{ \ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF\n", __func__); \ +-+ abort(); \ +-+} +-+#endif ++ void ff_hevc_hls_mvd_coding(HEVCContext *s, int x0, int y0, int log2_cb_size) ++diff --git a/libavcodec/hevc_filter.c b/libavcodec/hevc_filter.c ++index 9fbcd1d8b8..df129e2e46 100644 ++--- a/libavcodec/hevc_filter.c +++++ b/libavcodec/hevc_filter.c ++@@ -22,6 +22,12 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ + +- INTRA_PRED(2) +- INTRA_PRED(3) +-@@ -357,6 +477,7 @@ INTRA_PRED(5) +++//#define DISABLE_SAO +++//#define DISABLE_DEBLOCK +++//#define DISABLE_STRENGTHS +++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) +++//#define DISABLE_DEBLOCK_NONREF +++ ++ #include "libavutil/common.h" ++ #include "libavutil/internal.h" + +- #undef INTRA_PRED ++@@ -31,6 +37,16 @@ + +-+#if !PRED_C +- static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, +- const uint8_t *_left, ptrdiff_t stride, +- int trafo_size) +-@@ -371,13 +492,46 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to +- POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + +- (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); +- } ++ #include "bit_depth_template.c" ++ +++#ifdef RPI +++#include "rpi_qpu.h" +++#endif +++#if RPI_HEVC_SAND +++#include "rpi_zc.h" +++#include "libavutil/rpi_sand_fns.h" + +#else +-+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, +-+ const uint8_t * _left, ptrdiff_t stride, +-+ int trafo_size) +++#define RPI_ZC_SAND_8_IN_10_BUF 0 +++#endif +++ ++ #define LUMA 0 ++ #define CB 1 ++ #define 
CR 2 ++@@ -139,6 +155,15 @@ static int get_qPy(HEVCContext *s, int xC, int yC) ++ return s->qp_y_tab[x + y * s->ps.sps->min_cb_width]; ++ } ++ +++static inline unsigned int pixel_shift(const HEVCContext * const s, const unsigned int c_idx) + +{ +-+ int x, y; +-+ int size = 1 << trafo_size; +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; +++#if RPI_HEVC_SAND +++ return c_idx != 0 && av_rpi_is_sand_frame(s->frame) ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; +++#else +++ return s->ps.sps->pixel_shift; +++#endif +++} + + +-+ for (y = 0; y < size; y++, src += stride) ++ static void copy_CTB(uint8_t *dst, const uint8_t *src, int width, int height, ++ intptr_t stride_dst, intptr_t stride_src) ++ { ++@@ -161,12 +186,21 @@ int i, j; ++ } ++ } ++ +++// "DSP" these? ++ static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) ++ { ++- if (pixel_shift) ++- *(uint16_t *)dst = *(uint16_t *)src; ++- else ++- *dst = *src; +++ switch (pixel_shift) + + { +-+ for (x = 0; x < size; x++) +-+ { +-+ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + +-+ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); +-+ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + +-+ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); +-+ } +++ case 2: +++ *(uint32_t *)dst = *(uint32_t *)src; +++ break; +++ case 1: +++ *(uint16_t *)dst = *(uint16_t *)src; +++ break; +++ default: +++ *dst = *src; +++ break; + + } +-+} +-+#endif ++ } + +-+#if !PRED_C || BIT_DEPTH == 16 +- #define PRED_PLANAR(size)\ +- static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ +- const uint8_t *left, ptrdiff_t stride) \ +- { \ +- FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++ static void copy_vert(uint8_t *dst, const uint8_t *src, ++@@ -174,18 +208,29 @@ static void copy_vert(uint8_t *dst, const uint8_t *src, ++ int stride_dst, int stride_src) ++ { ++ int i; ++- if (pixel_shift == 0) { ++- for (i = 0; i < height; i++) { ++- *dst = *src; ++- dst += stride_dst; ++- src += stride_src; ++- } ++- } else { ++- for (i = 0; i < height; i++) { ++- *(uint16_t *)dst = *(uint16_t *)src; ++- dst += stride_dst; ++- src += stride_src; ++- } +++ switch (pixel_shift) +++ { +++ case 2: +++ for (i = 0; i < height; i++) { +++ *(uint32_t *)dst = *(uint32_t *)src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; +++ case 1: +++ for (i = 0; i < height; i++) { +++ *(uint16_t *)dst = *(uint16_t *)src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; +++ default: +++ for (i = 0; i < height; i++) { +++ *dst = *src; +++ dst += stride_dst; +++ src += stride_src; +++ } +++ break; ++ } + } +-+#else +-+#define PRED_PLANAR(size)\ +-+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ +-+ const uint8_t *left, ptrdiff_t stride) \ +-+{ \ +-+ av_log(NULL, AV_LOG_PANIC, "%s: NIF", __func__); \ +-+ abort(); \ +-+} +-+#endif + +- PRED_PLANAR(0) +- PRED_PLANAR(1) +-@@ -386,6 +540,7 @@ PRED_PLANAR(3) ++@@ -193,7 +238,7 @@ static void copy_CTB_to_hv(HEVCContext *s, const uint8_t *src, ++ int stride_src, int x, int y, int width, int height, ++ int c_idx, int x_ctb, int y_ctb) ++ { ++- int sh = s->ps.sps->pixel_shift; +++ const unsigned int sh = pixel_shift(s, c_idx); ++ int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; ++ int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; + +- #undef PRED_PLANAR ++@@ 
-224,13 +269,14 @@ static void restore_tqb_pixels(HEVCContext *s, ++ int y_min = ((y0 ) >> s->ps.sps->log2_min_pu_size); ++ int x_max = ((x0 + width ) >> s->ps.sps->log2_min_pu_size); ++ int y_max = ((y0 + height) >> s->ps.sps->log2_min_pu_size); ++- int len = (min_pu_size >> hshift) << s->ps.sps->pixel_shift; +++ const unsigned int sh = pixel_shift(s, c_idx); +++ int len = (min_pu_size >> hshift) << sh; ++ for (y = y_min; y < y_max; y++) { ++ for (x = x_min; x < x_max; x++) { ++ if (s->is_pcm[y * s->ps.sps->min_pu_width + x]) { ++ int n; ++- uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); ++- const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << s->ps.sps->pixel_shift); +++ uint8_t *src = src1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_src + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); +++ const uint8_t *dst = dst1 + (((y << s->ps.sps->log2_min_pu_size) - y0) >> vshift) * stride_dst + ((((x << s->ps.sps->log2_min_pu_size) - x0) >> hshift) << sh); ++ for (n = 0; n < (min_pu_size >> vshift); n++) { ++ memcpy(src, dst, len); ++ src += stride_src; ++@@ -246,7 +292,13 @@ static void restore_tqb_pixels(HEVCContext *s, + +-+#if !PRED_C +- static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +- const uint8_t *_left, +- ptrdiff_t stride, int log2_size, int c_idx) +-@@ -416,7 +571,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +- POS(0, y) = (left[y] + 3 * dc + 2) >> 2; +- } +- } ++ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ { ++- static const uint8_t sao_tab[8] = { 0, 1, 2, 2, 3, 3, 4, 4 }; +++#if SAO_FILTER_N == 5 +++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; +++#elif SAO_FILTER_N == 6 +++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; + +#else +-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +-+ const uint8_t *_left, +-+ ptrdiff_t stride, int log2_size, int c_idx) +-+{ +-+ unsigned int i, j; +-+ const unsigned int size = (1 << log2_size); +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ const c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ const c8_src_ptr_t left = (c8_src_ptr_t)_left; +-+ unsigned int dc0 = size; +-+ unsigned int dc1 = size; +++#error Confused by size of sao fn array +++#endif ++ HEVCLocalContext *lc = s->HEVClc; ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++@@ -267,12 +319,22 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ uint8_t bottom_tile_edge = 0; +++#if RPI_HEVC_SAND +++ const int sliced = av_rpi_is_sand_frame(s->frame); +++ const int plane_count = sliced ? 2 : (s->ps.sps->chroma_format_idc ? 3 : 1); +++#else +++ const int plane_count = (s->ps.sps->chroma_format_idc ? 
3 : 1); +++#endif ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ +++#ifdef DISABLE_SAO +++ return; +++#endif + + +-+ for (i = 0; i < size; i++) +-+ { +-+ dc0 += left[i][0] + top[i][0]; +-+ dc1 += left[i][1] + top[i][1]; +-+ } ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++@@ -304,7 +366,7 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ } ++ } ++ ++- for (c_idx = 0; c_idx < (s->ps.sps->chroma_format_idc ? 3 : 1); c_idx++) { +++ for (c_idx = 0; c_idx < plane_count; c_idx++) { ++ int x0 = x >> s->ps.sps->hshift[c_idx]; ++ int y0 = y >> s->ps.sps->vshift[c_idx]; ++ int stride_src = s->frame->linesize[c_idx]; ++@@ -313,28 +375,84 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ int width = FFMIN(ctb_size_h, (s->ps.sps->width >> s->ps.sps->hshift[c_idx]) - x0); ++ int height = FFMIN(ctb_size_v, (s->ps.sps->height >> s->ps.sps->vshift[c_idx]) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++- uint8_t *src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; ++- int stride_dst; +++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ +++#if RPI_HEVC_SAND +++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); +++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +++ uint8_t * const src = !sliced ? +++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); +++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : +++ !sliced ? src - (1 << sh) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); +++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : +++ !sliced ? src + (width << sh) : +++ c_idx == 0 ? +++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : +++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); + + +-+ dc0 >>= log2_size + 1; +-+ dc1 >>= log2_size + 1; + + +-+ for (i = 0; i < size; i++, src += stride) +-+ { +-+ for (j = 0; j < size; ++j) +-+ { +-+ src[j][0] = dc0; +-+ src[j][1] = dc1; +- +++ if (sliced && c_idx > 1) { +++ break; + + } +-+ } +-+} +-+#endif +-+ +-+#ifndef ANGLE_CONSTS +-+#define ANGLE_CONSTS +-+static const int intra_pred_angle[] = { +-+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +-+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +-+}; +-+static const int inv_angle[] = { +-+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +-+ -630, -910, -1638, -4096 +-+}; +++#else +++ const unsigned int sh = s->ps.sps->pixel_shift; +++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; +++ uint8_t * const src = &s->frame->data[c_idx][y0 * stride_src + (x0 << s->ps.sps->pixel_shift)]; +++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : src - (1 << sh); +++ const uint8_t * const src_r = edges[2] || !wants_lr ? 
NULL : src + (width << sh); + +#endif + + +-+#if !PRED_C +- static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- const uint8_t *_top, +- const uint8_t *_left, +-@@ -428,15 +629,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- const pixel *top = (const pixel *)_top; +- const pixel *left = (const pixel *)_left; +- +-- static const int intra_pred_angle[] = { +-- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +-- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +-- }; +-- static const int inv_angle[] = { +-- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +-- -630, -910, -1638, -4096 +-- }; +-- +- int angle = intra_pred_angle[mode - 2]; +- pixel ref_array[3 * MAX_TB_SIZE + 4]; +- pixel *ref_tmp = ref_array + size; +-@@ -509,6 +701,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +- } +- } +- } +-+#else +-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +-+ const uint8_t *_top, +-+ const uint8_t *_left, +-+ ptrdiff_t stride, int c_idx, +-+ int mode, int size) +-+{ +-+ int x, y; +-+ c8_dst_ptr_t src = (c8_dst_ptr_t)_src; +-+ c8_src_ptr_t top = (c8_src_ptr_t)_top; +-+ c8_src_ptr_t left = (c8_src_ptr_t)_left; +-+ +-+ const int angle = intra_pred_angle[mode - 2]; +-+ uint8_t ref_array[3 * MAX_TB_SIZE + 4][2]; +-+ c8_dst_ptr_t ref_tmp = ref_array + size; +-+ c8_src_ptr_t ref; +-+ const int last = (size * angle) >> 5; +-+ +-+ if (mode >= 18) { +-+ ref = top - 1; +-+ if (angle < 0 && last < -1) { +-+ memcpy(ref_tmp, top - 1, (size + 1) * 2); +-+ for (x = last; x <= -1; x++) +-+ { +-+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +-+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +-+ } +-+ ref = (c8_src_ptr_t)ref_tmp; +-+ } +-+ +-+ for (y = 0; y < size; y++, src += stride) { +-+ const int idx = ((y + 1) * angle) >> 5; +-+ const int fact = ((y + 1) * angle) & 31; +-+ if (fact) { +-+ for (x = 0; x < size; ++x) { +-+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + +-+ fact * ref[x + idx + 2][0] + 16) >> 5; +-+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + +-+ fact * ref[x + idx + 2][1] + 16) >> 5; ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ (s->ps.sps->pcm.loop_filter_disable_flag && s->ps.sps->pcm_enabled_flag)) { ++- dst = lc->edge_emu_buffer; ++- stride_dst = 2*MAX_PB_SIZE; ++- copy_CTB(dst, src, width << s->ps.sps->pixel_shift, height, stride_dst, stride_src); ++- s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++- sao->offset_val[c_idx], sao->band_position[c_idx], ++- width, height); ++- restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++- x, y, width, height, c_idx); +++ dst = lc->edge_emu_buffer; +++ stride_dst = 2*MAX_PB_SIZE; +++ copy_CTB(dst, src, width << sh, height, stride_dst, stride_src); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) +++ { +++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, +++ sao->offset_val[1], sao->band_position[1], +++ sao->offset_val[2], sao->band_position[2], +++ width, height); + + } +-+ } else { +-+ memcpy(src, ref + idx + 1, size * 2); +-+ } +-+ } +-+ } else { +-+ ref = left - 1; +-+ if (angle < 0 && last < -1) { +-+ memcpy(ref_tmp, left - 1, (size + 1) * 2); +-+ for (x = last; x <= -1; x++) +-+ { +-+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +-+ ref_tmp[x][1] = 
top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +-+ } +-+ ref = (c8_src_ptr_t)ref_tmp; +-+ } +-+ +-+ for (x = 0; x < size; x++, src++) { +-+ const int idx = ((x + 1) * angle) >> 5; +-+ const int fact = ((x + 1) * angle) & 31; +-+ if (fact) { +-+ for (y = 0; y < size; y++) { +-+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + +-+ fact * ref[y + idx + 2][0] + 16) >> 5; +-+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + +-+ fact * ref[y + idx + 2][1] + 16) >> 5; +++ else +++#endif +++ { +++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, +++ sao->offset_val[c_idx], sao->band_position[c_idx], +++ width, height); + + } +-+ } else { +-+ for (y = 0; y < size; y++) +++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, +++ x, y, width, height, c_idx); ++ } else { ++- s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++- sao->offset_val[c_idx], sao->band_position[c_idx], ++- width, height); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) + + { +-+ src[y * stride][0] = ref[y + idx + 1][0]; +-+ src[y * stride][1] = ref[y + idx + 1][1]; +++// printf("x,y=%d,%d data[1]=%p, src=%p\n", x0, y0, s->frame->data[1], src); +++ +++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, +++ sao->offset_val[1], sao->band_position[1], +++ sao->offset_val[2], sao->band_position[2], +++ width, height); + + } +-+ } +-+ } +-+ } +-+} +++ else + +#endif +++ { +++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, +++ sao->offset_val[c_idx], sao->band_position[c_idx], +++ width, height); +++ } ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++@@ -342,108 +460,118 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ { ++ int w = s->ps.sps->width >> s->ps.sps->hshift[c_idx]; ++ int h = s->ps.sps->height >> s->ps.sps->vshift[c_idx]; ++- int left_edge = edges[0]; ++ int top_edge = edges[1]; ++- int right_edge = edges[2]; ++ int bottom_edge = edges[3]; ++- int sh = s->ps.sps->pixel_shift; ++- int left_pixels, right_pixels; + +- static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, +- const uint8_t *left, +-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +-index 099a8c5..bdff2d2 100644 +---- a/libavcodec/mmaldec.c +-+++ b/libavcodec/mmaldec.c +-@@ -24,6 +24,9 @@ +- * MMAL Video Decoder +- */ +- +-+#pragma GCC diagnostic push +-+// Many many redundant decls in the header files +-+#pragma GCC diagnostic ignored "-Wredundant-decls" +- #include +- #include +- #include +-@@ -31,6 +34,7 @@ +- #include +- #include +- #include +-+#pragma GCC diagnostic pop +- +- #include "avcodec.h" +- #include "internal.h" +-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c +-index 3adf28d..2f9195f 100644 +---- a/libavcodec/mpeg4videodec.c +-+++ b/libavcodec/mpeg4videodec.c +-@@ -2205,6 +2205,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +- +- if (ctx->divx_version >= 0) +- s->workaround_bugs |= FF_BUG_HPEL_CHROMA; +-+ +-+ if (ctx->num_sprite_warping_points > 1) +-+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED; +- } +- +- if (s->workaround_bugs & FF_BUG_STD_QPEL) { +-@@ -2229,6 +2232,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx) +- s->workaround_bugs, ctx->lavc_build, ctx->xvid_build, +- ctx->divx_version, ctx->divx_build, s->divx_packed ? 
"p" : ""); +- +-+ avctx->workaround_bugs = s->workaround_bugs; +- if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 && +- s->codec_id == AV_CODEC_ID_MPEG4 && +- avctx->idct_algo == FF_IDCT_AUTO) { +-diff --git a/libavcodec/raw.c b/libavcodec/raw.c +-index bfa2537..1bca89e 100644 +---- a/libavcodec/raw.c +-+++ b/libavcodec/raw.c +-@@ -259,6 +259,11 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { +- { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, +- { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, ++ stride_dst = 2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE; ++ dst = lc->edge_emu_buffer + stride_dst + AV_INPUT_BUFFER_PADDING_SIZE; + +-+ /* RPI */ +-+#ifdef RPI +-+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, +-+#endif +-+ +- /* special */ +- { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ +- { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ +-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +-index d837056..81256b5 100644 +---- a/libavcodec/rawenc.c +-+++ b/libavcodec/rawenc.c +-@@ -47,6 +47,47 @@ FF_ENABLE_DEPRECATION_WARNINGS +- return 0; +- } ++ if (!top_edge) { ++- int left = 1 - left_edge; ++- int right = 1 - right_edge; ++- const uint8_t *src1[2]; ++ uint8_t *dst1; ++- int src_idx, pos; +++ int src_idx; +++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); + +-+static uint8_t * cpy_sand_c(uint8_t * dst, const AVFrame * const frame, const int c_off) +-+{ +-+ for (int y = 0; y != frame->height / 2; ++y) { +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ const uint8_t * p = frame->data[1] + x * frame->linesize[3] + y * frame->linesize[0] + c_off; +-+ const int w = FFMIN(frame->linesize[0], frame->width - x) / 2; +-+ for (int i = 0; i < w; ++i) +-+ *dst++ = p[i * 2]; +-+ } +-+ } +-+ return dst; +-+} +-+ +-+static int raw_sand_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +-+ const AVFrame *frame) +-+{ +-+ int size = frame->width * frame->height * 3 / 2; +-+ uint8_t * dst; +-+ int ret; ++- dst1 = dst - stride_dst - (left << sh); ++- src1[0] = src - stride_src - (left << sh); ++- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0 - left) << sh); ++- pos = 0; ++- if (left) { +++ dst1 = dst - stride_dst; + + +-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +-+ return ret; +++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1, src1[src_idx], sh); ++- pos += (1 << sh); +++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); ++ } + + +-+ dst = pkt->data; ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); ++- if (right) { ++- pos += width << sh; +++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); + + +-+ // Luma is "easy" +-+ for (int y = 0; y != frame->height; ++y) { +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ const int w = FFMIN(frame->linesize[0], frame->width - x); +-+ memcpy(dst, +-+ frame->data[0] + x * frame->linesize[3] + y * frame->linesize[0], w); +-+ dst += w; +-+ } +-+ } +-+ // Chroma is dull +-+ dst = cpy_sand_c(dst, frame, 0); +-+ dst = cpy_sand_c(dst, frame, 1); +++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +++ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r - stride_src, sh); ++ } ++ } ++ if (!bottom_edge) { ++- int left = 1 - left_edge; ++- int right = 1 - right_edge; ++- const uint8_t *src1[2]; ++- uint8_t *dst1; ++- int src_idx, pos; +++ uint8_t * const dst1 = dst + height * stride_dst; +++ int src_idx; +++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); +++ const unsigned int hoff = height * stride_src; ++ ++- dst1 = dst + height * stride_dst - (left << sh); ++- src1[0] = src + height * stride_src - (left << sh); ++- src1[1] = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0 - left) << sh); ++- pos = 0; ++- if (left) { +++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1, src1[src_idx], sh); ++- pos += (1 << sh); +++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); ++ } + + +-+ return 0; +-+} ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- memcpy(dst1 + pos, src1[src_idx] + pos, width << sh); ++- if (right) { ++- pos += width << sh; +++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); + + +- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- const AVFrame *frame, int *got_packet) +- { +-@@ -56,6 +97,12 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, +- if (ret < 0) +- return ret; +++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++- copy_pixel(dst1 + pos, src1[src_idx] + pos, sh); +++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh); ++ } ++ } ++- left_pixels = 0; ++- if (!left_edge) { +++ if (src_l != NULL) { ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++- left_pixels = 1; +++ copy_vert(dst - (1 << sh), +++ src_l, +++ sh, height, stride_dst, stride_src); ++ } ++ } ++- right_pixels = 0; ++- if (!right_edge) { +++ if (src_r != NULL) { ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++- right_pixels = 1; +++ copy_vert(dst + (width << sh), +++ src_r, +++ sh, height, stride_dst, stride_src); ++ } ++ } + +-+ if (frame->format == AV_PIX_FMT_SAND128) { +-+ ret = raw_sand_as_yuv420(avctx, pkt, frame); +-+ *got_packet = (ret == 0); +-+ return ret; ++- copy_CTB(dst - (left_pixels << sh), ++- src - (left_pixels << sh), ++- (width + left_pixels + right_pixels) << sh, +++ copy_CTB(dst, +++ src, +++ width << sh, ++ height, stride_dst, stride_src); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++- s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++- sao->eo_class[c_idx], width, height); ++- s->hevcdsp.sao_edge_restore[restore](src, dst, ++- stride_src, stride_dst, ++- sao, ++- edges, width, ++- height, c_idx, ++- vert_edge, ++- horiz_edge, ++- diag_edge); +++#if RPI_HEVC_SAND +++ if (sliced && c_idx != 0) +++ { +++ // Class always the same for both U & V (which is just as well :-)) +++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, +++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], +++ width, height); +++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, +++ stride_src, 
stride_dst, +++ sao, +++ edges, width, +++ height, c_idx, +++ vert_edge, +++ horiz_edge, +++ diag_edge); +++ } +++ else +++#endif +++ { +++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], +++ sao->eo_class[c_idx], width, height); +++ s->hevcdsp.sao_edge_restore[restore](src, dst, +++ stride_src, stride_dst, +++ sao, +++ edges, width, +++ height, c_idx, +++ vert_edge, +++ horiz_edge, +++ diag_edge); +++ } +++ // ??? Does this actually work for chroma ??? ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++@@ -451,8 +579,30 @@ static void sao_filter_CTB(HEVCContext *s, int x, int y) ++ } ++ } ++ } +++ +++#if RPI_ZC_SAND_8_IN_10_BUF +++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && +++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2])) +++ { +++ const unsigned int stride1 = s->frame->linesize[0]; +++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); +++ const unsigned int xoff = (x >> 8) * stride2 * stride1; +++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); +++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; +++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; +++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; +++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; +++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); +++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; +++ +++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); +++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); +++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); + + } +++#endif ++ } ++ +++// Returns 2 or 0. ++ static int get_pcm(HEVCContext *s, int x, int y) ++ { ++ int log2_min_pu_size = s->ps.sps->log2_min_pu_size; ++@@ -479,7 +629,7 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ uint8_t *src; ++ int x, y; ++ int chroma, beta; ++- int32_t c_tc[2], tc[2]; +++ int32_t c_tc[4], tc[2]; ++ uint8_t no_p[2] = { 0 }; ++ uint8_t no_q[2] = { 0 }; ++ ++@@ -496,6 +646,15 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ s->ps.sps->pcm.loop_filter_disable_flag) || ++ s->ps.pps->transquant_bypass_enable_flag; ++ +++#ifdef DISABLE_DEBLOCK_NONREF +++ if (!s->used_for_ref) +++ return; // Don't deblock non-reference frames +++#endif +++#ifdef DISABLE_DEBLOCK +++ return; +++#endif +++ if (!s->used_for_ref && s->avctx->skip_loop_filter >= AVDISCARD_NONREF) +++ return; ++ if (x0) { ++ left_tc_offset = s->deblock[ctb - 1].tc_offset; ++ left_beta_offset = s->deblock[ctb - 1].beta_offset; ++@@ -529,19 +688,51 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ ++ tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; ++ tc[1] = bs1 ? 
TC_CALC(qp, bs1) : 0; ++- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x - 1, y); ++ no_p[1] = get_pcm(s, x - 1, y + 4); ++ no_q[0] = get_pcm(s, x, y); ++ no_q[1] = get_pcm(s, x, y + 4); ++- s->hevcdsp.hevc_v_loop_filter_luma_c(src, ++- s->frame->linesize[LUMA], ++- beta, tc, no_p, no_q); ++- } else ++- s->hevcdsp.hevc_v_loop_filter_luma(src, ++- s->frame->linesize[LUMA], ++- beta, tc, no_p, no_q); +++ } +++#if RPI_HEVC_SAND +++ if (av_rpi_is_sand_frame(s->frame)) { + + +- if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) +- return ret; +- if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, +-diff --git a/libavcodec/rpi_hevc_transform.h b/libavcodec/rpi_hevc_transform.h +-new file mode 100644 +-index 0000000..4309f1c +---- /dev/null +-+++ b/libavcodec/rpi_hevc_transform.h +-@@ -0,0 +1,3070 @@ +-+unsigned char rpi_hevc_transform [] = { +-+21, +-+106, +-+0, +-+144, +-+47, +-+1, +-+37, +-+106, +-+0, +-+144, +-+66, +-+1, +-+53, +-+106, +-+0, +-+144, +-+192, +-+4, +-+69, +-+106, +-+0, +-+144, +-+192, +-+4, +-+85, +-+106, +-+0, +-+144, +-+220, +-+5, +-+169, +-+3, +-+62, +-+64, +-+79, +-+64, +-+3, +-+232, +-+32, +-+0, +-+0, +-+0, +-+12, +-+248, +-+0, +-+136, +-+0, +-+0, +-+192, +-+248, +-+0, +-+0, +-+64, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+248, +-+0, +-+168, +-+0, +-+0, +-+192, +-+248, +-+0, +-+0, +-+0, +-+96, +-+3, +-+232, +-+32, +-+0, +-+0, +-+0, +-+7, +-+232, +-+0, +-+2, +-+0, +-+0, +-+8, +-+232, +-+0, +-+4, +-+0, +-+0, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+8, +-+4, +-+0, +-+4, +-+232, +-+64, +-+0, +-+0, +-+0, +-+5, +-+232, +-+0, +-+8, +-+0, +-+0, +-+128, +-+69, +-+113, +-+66, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+8, +-+4, +-+0, +-+128, +-+69, +-+113, +-+70, +-+128, +-+144, +-+40, +-+0, +-+4, +-+255, +-+48, +-+192, +-+128, +-+3, +-+32, +-+8, +-+16, +-+0, +-+76, +-+254, +-+48, +-+192, +-+9, +-+4, +-+32, +-+8, +-+0, +-+0, +-+4, +-+254, +-+0, +-+144, +-+128, +-+2, +-+0, +-+8, +-+2, +-+0, +-+128, +-+144, +-+23, +-+0, +-+4, +-+255, +-+48, +-+192, +-+128, +-+3, +-+32, +-+8, +-+20, +-+0, +-+76, +-+254, +-+48, +-+192, +-+4, +-+4, +-+32, +-+8, +-+0, +-+0, +-+140, +-+248, +-+44, +-+0, +-+0, +-+0, +-+32, +-+48, +-+4, +-+0, +-+128, +-+69, +-+113, +-+66, +-+242, +-+140, +-+211, +-+192, +-+34, +-+31, +-+41, +-+3, +-+70, +-+192, +-+80, +-+7, +-+164, +-+255, +-+36, +-+204, +-+96, +-+2, +-+0, +-+248, +-+62, +-+0, +-+3, +-+255, +-+55, +-+208, +-+120, +-+3, +-+224, +-+3, +-+190, +-+11, +-+16, +-+139, +-+246, +-+91, +-+0, +-+103, +-+90, +-+0, +-+70, +-+192, +-+80, +-+7, +-+164, +-+255, +-+36, +-+204, +-+224, +-+2, +-+0, +-+248, +-+62, +-+0, +-+3, +-+255, +-+55, +-+208, +-+120, +-+3, +-+224, +-+3, +-+190, +-+11, +-+16, +-+139, +-+246, +-+91, +-+0, +-+103, +-+90, +-+0, +-+225, +-+64, +-+242, +-+64, +-+3, +-+232, +-+128, +-+0, +-+0, +-+0, +-+7, +-+232, +-+0, +-+2, +-+0, +-+0, +-+57, +-+239, +-+224, +-+247, +-+255, +-+255, +-+72, +-+192, +-+95, +-+207, +-+88, +-+122, +-+88, +-+124, +-+137, +-+64, +-+26, +-+64, +-+4, +-+232, +-+64, +-+0, +-+0, +-+0, +-+149, +-+96, +-+161, +-+64, +-+152, +-+64, +-+128, +-+144, +-+35, +-+0, +-+72, +-+232, +-+0, +-+4, +-+0, +-+0, +-+65, +-+232, +-+32, +-+0, +-+0, +-+0, +-+128, +-+144, +-+27, +-+0, +-+4, +-+232, +-+0, +-+8, +-+0, +-+0, +-+69, +-+96, +-+145, +-+64, +-+168, +-+64, +-+128, +-+144, +-+19, +-+0, +-+72, +-+232, +-+0, +-+4, +-+0, +-+0, +-+65, +-+232, +-+32, +-+0, +-+0, +-+0, +-+128, +-+144, +-+11, +-+0, +-+74, +-+232, +-+0, +-+8, +-+0, +-+0, +-+242, 
+-+140, +-+221, +-+192, +-+57, +-+239, +-+32, +-+8, +-+0, +-+0, +-+41, +-+3, +-+239, +-+3, +-+12, +-+248, +-+0, +-+128, +-+0, +-+0, +-+192, +-+248, +-+4, +-+0, +-+12, +-+248, +-+0, +-+132, +-+64, +-+0, +-+192, +-+248, +-+4, +-+0, +-+0, +-+96, +-+255, +-+159, +-+154, +-+255, +-+0, +-+232, +-+0, +-+4, +-+0, +-+0, +-+255, +-+159, +-+165, +-+255, +-+4, +-+255, +-+48, +-+204, +-+16, +-+3, +-+224, +-+251, +-+62, +-+0, +-+4, +-+255, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+16, +-+0, +-+76, +-+254, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+20, +-+0, +-+128, +-+64, +-+6, +-+232, +-+64, +-+0, +-+0, +-+0, +-+140, +-+248, +-+47, +-+0, +-+0, +-+0, +-+224, +-+99, +-+0, +-+0, +-+32, +-+247, +-+240, +-+207, +-+16, +-+3, +-+32, +-+247, +-+176, +-+207, +-+17, +-+19, +-+32, +-+247, +-+112, +-+207, +-+18, +-+35, +-+32, +-+247, +-+48, +-+207, +-+19, +-+51, +-+32, +-+247, +-+240, +-+206, +-+20, +-+67, +-+32, +-+247, +-+176, +-+206, +-+21, +-+83, +-+32, +-+247, +-+112, +-+206, +-+22, +-+99, +-+32, +-+247, +-+48, +-+206, +-+23, +-+115, +-+32, +-+247, +-+240, +-+205, +-+24, +-+131, +-+32, +-+247, +-+176, +-+205, +-+25, +-+147, +-+32, +-+247, +-+112, +-+205, +-+26, +-+163, +-+32, +-+247, +-+48, +-+205, +-+27, +-+179, +-+32, +-+247, +-+240, +-+204, +-+28, +-+195, +-+32, +-+247, +-+176, +-+204, +-+29, +-+211, +-+32, +-+247, +-+112, +-+204, +-+30, +-+227, +-+32, +-+247, +-+48, +-+204, +-+31, +-+243, +-+4, +-+255, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+16, +-+0, +-+76, +-+254, +-+51, +-+204, +-+128, +-+3, +-+224, +-+251, +-+20, +-+0, +-+0, +-+237, +-+32, +-+0, +-+0, +-+0, +-+140, +-+248, +-+47, +-+0, +-+0, +-+0, +-+224, +-+99, +-+0, +-+0, +-+111, +-+3, +-+4, +-+254, +-+0, +-+128, +-+0, +-+4, +-+0, +-+248, +-+0, +-+0, +-+2, +-+232, +-+32, +-+0, +-+0, +-+0, +-+140, +-+248, +-+32, +-+0, +-+0, +-+0, +-+224, +-+35, +-+0, +-+0, +-+64, +-+232, +-+0, +-+2, +-+0, +-+0, +-+193, +-+232, +-+0, +-+1, +-+0, +-+0, +-+1, +-+106, +-+116, +-+30, +-+90, +-+0, +-+169, +-+3, +-+73, +-+64, +-+52, +-+64, +-+45, +-+64, +-+2, +-+64, +-+10, +-+64, +-+64, +-+198, +-+1, +-+7, +-+8, +-+232, +-+63, +-+0, +-+0, +-+0, +-+6, +-+232, +-+253, +-+255, +-+255, +-+255, +-+0, +-+246, +-+0, +-+0, +-+0, +-+4, +-+215, +-+64, +-+3, +-+96, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+137, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+129, +-+0, +-+131, +-+102, +-+0, +-+158, +-+67, +-+0, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+108, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+100, +-+0, +-+131, +-+102, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+161, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+150, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+3, +-+99, 
+-+131, +-+71, +-+68, +-+232, +-+32, +-+0, +-+0, +-+0, +-+0, +-+99, +-+2, +-+99, +-+23, +-+102, +-+7, +-+106, +-+127, +-+156, +-+182, +-+255, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+112, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+101, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+25, +-+102, +-+9, +-+106, +-+2, +-+30, +-+41, +-+3, +-+26, +-+87, +-+162, +-+64, +-+64, +-+198, +-+1, +-+23, +-+127, +-+158, +-+103, +-+255, +-+239, +-+3, +-+0, +-+254, +-+0, +-+143, +-+92, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+143, +-+93, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+143, +-+94, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+95, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+208, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+209, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+142, +-+210, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+0, +-+142, +-+211, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+107, +-+0, +-+8, +-+255, +-+99, +-+23, +-+0, +-+212, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+23, +-+0, +-+228, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+227, +-+23, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+52, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+99, +-+52, +-+0, +-+164, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+52, +-+0, +-+148, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+239, +-+3, +-+0, +-+254, +-+0, +-+143, +-+12, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+143, +-+13, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+143, +-+14, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+15, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+16, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+17, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+64, +-+142, +-+18, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+0, +-+142, +-+19, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+33, +-+0, +-+8, +-+255, +-+99, +-+3, +-+0, +-+212, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+3, +-+0, +-+228, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+227, +-+3, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+4, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+99, +-+4, +-+0, +-+164, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+163, +-+4, +-+0, +-+148, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+32, +-+246, +-+192, +-+11, +-+1, +-+16, +-+32, +-+246, +-+2, +-+137, +-+47, +-+240, +-+40, +-+246, +-+2, +-+140, +-+47, +-+240, +-+128, +-+245, +-+99, +-+140, +-+5, +-+4, +-+0, +-+247, +-+99, +-+140, +-+1, +-+20, +-+88, +-+246, +-+99, +-+140, +-+1, +-+20, +-+0, +-+247, +-+35, +-+136, +-+62, +-+226, +-+32, +-+247, +-+35, +-+136, +-+32, +-+210, +-+0, +-+247, +-+34, +-+136, +-+63, +-+2, +-+208, +-+246, +-+34, +-+136, +-+0, +-+4, +-+0, +-+247, +-+99, +-+136, +-+58, +-+162, +-+32, +-+247, +-+99, +-+136, +-+33, +-+146, +-+0, +-+247, +-+98, +-+136, +-+59, +-+18, +-+208, +-+246, +-+98, +-+136, +-+0, +-+20, +-+0, +-+247, +-+162, +-+136, +-+33, +-+2, +-+88, +-+246, 
+-+98, +-+137, +-+2, +-+68, +-+88, +-+246, +-+162, +-+137, +-+3, +-+68, +-+208, +-+254, +-+227, +-+136, +-+60, +-+242, +-+192, +-+243, +-+188, +-+11, +-+208, +-+254, +-+227, +-+136, +-+56, +-+178, +-+192, +-+243, +-+188, +-+10, +-+32, +-+255, +-+226, +-+136, +-+38, +-+58, +-+192, +-+243, +-+60, +-+0, +-+208, +-+254, +-+227, +-+136, +-+59, +-+242, +-+192, +-+243, +-+60, +-+128, +-+32, +-+255, +-+226, +-+136, +-+49, +-+58, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+226, +-+136, +-+34, +-+34, +-+192, +-+243, +-+60, +-+128, +-+32, +-+255, +-+226, +-+136, +-+37, +-+58, +-+192, +-+243, +-+60, +-+128, +-+0, +-+254, +-+192, +-+136, +-+1, +-+4, +-+0, +-+240, +-+0, +-+160, +-+0, +-+255, +-+194, +-+8, +-+0, +-+52, +-+195, +-+243, +-+0, +-+128, +-+0, +-+255, +-+202, +-+40, +-+0, +-+52, +-+195, +-+243, +-+0, +-+128, +-+0, +-+254, +-+0, +-+240, +-+35, +-+10, +-+0, +-+240, +-+60, +-+0, +-+0, +-+254, +-+192, +-+136, +-+1, +-+4, +-+0, +-+240, +-+0, +-+160, +-+0, +-+255, +-+226, +-+140, +-+34, +-+34, +-+195, +-+243, +-+60, +-+0, +-+32, +-+255, +-+227, +-+140, +-+36, +-+58, +-+192, +-+243, +-+60, +-+0, +-+0, +-+254, +-+192, +-+136, +-+0, +-+4, +-+0, +-+240, +-+0, +-+160, +-+16, +-+246, +-+226, +-+136, +-+35, +-+50, +-+16, +-+246, +-+226, +-+136, +-+35, +-+50, +-+32, +-+246, +-+226, +-+136, +-+35, +-+50, +-+32, +-+254, +-+226, +-+136, +-+35, +-+58, +-+192, +-+243, +-+60, +-+0, +-+11, +-+96, +-+0, +-+254, +-+0, +-+240, +-+1, +-+4, +-+0, +-+240, +-+64, +-+115, +-+5, +-+106, +-+0, +-+144, +-+173, +-+1, +-+27, +-+96, +-+0, +-+254, +-+0, +-+240, +-+1, +-+4, +-+0, +-+240, +-+64, +-+147, +-+5, +-+106, +-+0, +-+144, +-+227, +-+0, +-+64, +-+246, +-+163, +-+140, +-+1, +-+4, +-+0, +-+246, +-+192, +-+175, +-+63, +-+2, +-+0, +-+246, +-+192, +-+174, +-+59, +-+2, +-+0, +-+246, +-+128, +-+175, +-+62, +-+2, +-+0, +-+246, +-+128, +-+174, +-+58, +-+2, +-+0, +-+246, +-+64, +-+175, +-+61, +-+2, +-+0, +-+246, +-+64, +-+174, +-+57, +-+2, +-+0, +-+255, +-+43, +-+240, +-+4, +-+212, +-+192, +-+243, +-+128, +-+11, +-+64, +-+254, +-+43, +-+240, +-+1, +-+228, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+244, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+180, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+164, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+191, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+235, +-+143, +-+52, +-+242, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+2, +-+212, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+191, +-+226, +-+192, +-+243, +-+188, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+180, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+2, +-+68, +-+32, +-+247, +-+35, +-+141, +-+190, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+171, +-+143, +-+52, +-+226, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+4, +-+180, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+191, +-+226, +-+192, +-+243, +-+188, +-+10, +-+128, +-+253, +-+43, +-+240, +-+3, +-+212, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+35, +-+141, +-+1, +-+196, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+189, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+107, +-+143, +-+52, +-+210, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+4, +-+148, +-+192, +-+243, +-+128, +-+11, +-+64, +-+254, 
+-+43, +-+240, +-+1, +-+164, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+180, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+240, +-+1, +-+244, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+228, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+187, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+235, +-+142, +-+52, +-+178, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+2, +-+148, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+187, +-+162, +-+192, +-+243, +-+188, +-+10, +-+64, +-+254, +-+43, +-+141, +-+0, +-+244, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+2, +-+68, +-+32, +-+247, +-+35, +-+141, +-+186, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+171, +-+142, +-+52, +-+162, +-+192, +-+243, +-+60, +-+128, +-+0, +-+255, +-+43, +-+240, +-+4, +-+244, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+43, +-+240, +-+187, +-+162, +-+192, +-+243, +-+188, +-+10, +-+128, +-+253, +-+43, +-+240, +-+3, +-+148, +-+192, +-+243, +-+128, +-+10, +-+64, +-+254, +-+35, +-+141, +-+1, +-+132, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+32, +-+247, +-+35, +-+141, +-+185, +-+66, +-+240, +-+246, +-+35, +-+141, +-+50, +-+66, +-+0, +-+255, +-+107, +-+142, +-+52, +-+146, +-+192, +-+243, +-+60, +-+128, +-+64, +-+255, +-+98, +-+141, +-+0, +-+52, +-+192, +-+243, +-+0, +-+0, +-+0, +-+254, +-+0, +-+240, +-+53, +-+10, +-+0, +-+240, +-+60, +-+0, +-+0, +-+254, +-+0, +-+240, +-+1, +-+4, +-+0, +-+240, +-+64, +-+147, +-+5, +-+106, +-+0, +-+144, +-+177, +-+0, +-+88, +-+246, +-+163, +-+140, +-+1, +-+4, +-+128, +-+245, +-+99, +-+141, +-+10, +-+4, +-+88, +-+246, +-+162, +-+138, +-+1, +-+68, +-+0, +-+247, +-+162, +-+138, +-+36, +-+162, +-+88, +-+254, +-+162, +-+138, +-+3, +-+164, +-+192, +-+243, +-+128, +-+11, +-+0, +-+255, +-+226, +-+137, +-+32, +-+2, +-+195, +-+243, +-+60, +-+0, +-+32, +-+247, +-+226, +-+137, +-+42, +-+114, +-+0, +-+255, +-+34, +-+138, +-+33, +-+18, +-+195, +-+243, +-+60, +-+0, +-+32, +-+247, +-+34, +-+138, +-+42, +-+130, +-+16, +-+246, +-+98, +-+138, +-+40, +-+114, +-+16, +-+246, +-+98, +-+138, +-+41, +-+146, +-+32, +-+246, +-+98, +-+138, +-+41, +-+146, +-+32, +-+246, +-+226, +-+137, +-+41, +-+146, +-+40, +-+246, +-+34, +-+138, +-+41, +-+146, +-+32, +-+247, +-+163, +-+141, +-+63, +-+178, +-+32, +-+247, +-+227, +-+141, +-+62, +-+162, +-+0, +-+254, +-+0, +-+240, +-+8, +-+4, +-+0, +-+240, +-+128, +-+11, +-+128, +-+253, +-+35, +-+240, +-+9, +-+100, +-+192, +-+243, +-+128, +-+10, +-+128, +-+253, +-+163, +-+141, +-+128, +-+115, +-+192, +-+243, +-+152, +-+10, +-+88, +-+246, +-+163, +-+141, +-+4, +-+100, +-+208, +-+246, +-+35, +-+139, +-+0, +-+100, +-+32, +-+255, +-+34, +-+139, +-+53, +-+202, +-+192, +-+243, +-+60, +-+128, +-+0, +-+254, +-+0, +-+139, +-+0, +-+4, +-+0, +-+240, +-+0, +-+160, +-+240, +-+246, +-+163, +-+141, +-+48, +-+98, +-+0, +-+247, +-+99, +-+139, +-+63, +-+210, +-+0, +-+247, +-+98, +-+139, +-+1, +-+212, +-+88, +-+254, +-+98, +-+139, +-+1, +-+212, +-+192, +-+243, +-+128, +-+11, +-+32, +-+255, +-+99, +-+139, +-+62, +-+98, +-+192, +-+243, +-+188, +-+10, +-+88, +-+246, +-+98, +-+139, +-+1, +-+212, +-+240, +-+246, +-+98, +-+139, +-+50, +-+210, +-+0, +-+247, +-+163, +-+128, +-+59, +-+146, +-+0, +-+247, +-+160, +-+128, +-+1, +-+36, +-+88, +-+254, +-+160, +-+128, +-+1, +-+36, +-+192, +-+243, +-+128, +-+11, +-+0, +-+247, +-+163, +-+128, +-+58, +-+98, +-+64, +-+255, +-+35, 
+-+240, +-+0, +-+100, +-+192, +-+243, +-+128, +-+10, +-+64, +-+255, +-+163, +-+128, +-+0, +-+164, +-+192, +-+243, +-+128, +-+10, +-+88, +-+246, +-+160, +-+128, +-+1, +-+36, +-+240, +-+246, +-+160, +-+128, +-+50, +-+34, +-+8, +-+255, +-+227, +-+143, +-+54, +-+242, +-+192, +-+243, +-+60, +-+128, +-+40, +-+255, +-+227, +-+142, +-+54, +-+178, +-+192, +-+243, +-+60, +-+128, +-+0, +-+254, +-+0, +-+240, +-+39, +-+10, +-+0, +-+240, +-+60, +-+128, +-+8, +-+255, +-+163, +-+143, +-+45, +-+226, +-+192, +-+243, +-+60, +-+128, +-+0, +-+254, +-+0, +-+240, +-+44, +-+10, +-+0, +-+240, +-+60, +-+0, +-+0, +-+254, +-+0, +-+240, +-+40, +-+10, +-+0, +-+240, +-+60, +-+128, +-+8, +-+255, +-+163, +-+142, +-+2, +-+162, +-+192, +-+243, +-+60, +-+128, +-+90, +-+0, +-+169, +-+3, +-+14, +-+96, +-+4, +-+31, +-+169, +-+3, +-+30, +-+96, +-+1, +-+31, +-+73, +-+64, +-+52, +-+64, +-+45, +-+64, +-+2, +-+64, +-+10, +-+64, +-+64, +-+198, +-+1, +-+7, +-+8, +-+232, +-+63, +-+0, +-+0, +-+0, +-+6, +-+232, +-+253, +-+255, +-+255, +-+255, +-+0, +-+246, +-+0, +-+0, +-+0, +-+4, +-+215, +-+64, +-+3, +-+96, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+30, +-+106, +-+132, +-+24, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+143, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+135, +-+0, +-+131, +-+102, +-+0, +-+158, +-+71, +-+0, +-+2, +-+248, +-+0, +-+35, +-+0, +-+0, +-+64, +-+56, +-+0, +-+0, +-+4, +-+248, +-+0, +-+36, +-+0, +-+0, +-+64, +-+56, +-+8, +-+0, +-+0, +-+240, +-+64, +-+0, +-+132, +-+3, +-+30, +-+106, +-+132, +-+24, +-+128, +-+240, +-+0, +-+0, +-+132, +-+3, +-+128, +-+144, +-+112, +-+0, +-+131, +-+98, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+104, +-+0, +-+131, +-+102, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+30, +-+106, +-+134, +-+24, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+123, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+112, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+3, +-+99, +-+131, +-+71, +-+68, +-+232, +-+32, +-+0, +-+0, +-+0, +-+0, +-+99, +-+2, +-+99, +-+23, +-+102, +-+7, +-+106, +-+127, +-+156, +-+178, +-+255, +-+0, +-+248, +-+64, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+30, +-+106, +-+134, +-+24, +-+128, +-+248, +-+0, +-+0, +-+112, +-+0, +-+192, +-+243, +-+211, +-+31, +-+128, +-+144, +-+72, +-+0, +-+188, +-+64, +-+67, +-+232, +-+0, +-+2, +-+0, +-+0, +-+0, +-+255, +-+64, +-+0, +-+0, +-+20, +-+200, +-+243, +-+0, +-+0, +-+128, +-+144, +-+61, +-+0, +-+195, +-+232, +-+0, +-+2, +-+0, +-+0, +-+12, +-+128, +-+7, +-+192, +-+130, +-+248, +-+0, +-+0, +-+112, +-+192, +-+224, +-+16, +-+195, +-+31, +-+132, +-+248, +-+1, +-+0, +-+112, +-+0, +-+224, +-+16, +-+203, +-+31, +-+25, +-+102, +-+9, +-+106, +-+2, +-+30, +-+41, +-+3, +-+26, +-+87, +-+162, +-+64, +-+64, +-+198, +-+1, +-+23, +-+127, +-+158, +-+95, +-+255, +-+239, +-+3, +-+0, +-+254, +-+128, +-+143, +-+94, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+95, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+208, +-+0, +-+0, 
+-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+209, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+47, +-+0, +-+8, +-+255, +-+227, +-+23, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+52, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+239, +-+3, +-+0, +-+254, +-+128, +-+143, +-+14, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+143, +-+15, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+192, +-+142, +-+16, +-+0, +-+0, +-+240, +-+12, +-+0, +-+0, +-+254, +-+128, +-+142, +-+17, +-+0, +-+0, +-+240, +-+12, +-+0, +-+128, +-+144, +-+13, +-+0, +-+8, +-+255, +-+227, +-+3, +-+0, +-+244, +-+192, +-+51, +-+0, +-+0, +-+8, +-+255, +-+35, +-+4, +-+0, +-+180, +-+192, +-+51, +-+0, +-+0, +-+111, +-+3, +-+32, +-+246, +-+192, +-+11, +-+1, +-+16, +-+32, +-+246, +-+2, +-+140, +-+47, +-+240, +-+32, +-+247, +-+35, +-+141, +-+63, +-+178, +-+64, +-+254, +-+35, +-+141, +-+2, +-+68, +-+192, +-+243, +-+128, +-+11, +-+32, +-+255, +-+35, +-+240, +-+58, +-+226, +-+192, +-+243, +-+188, +-+10, +-+0, +-+254, +-+0, +-+141, +-+4, +-+4, +-+0, +-+240, +-+128, +-+10, +-+88, +-+246, +-+35, +-+141, +-+3, +-+68, +-+240, +-+246, +-+35, +-+141, +-+48, +-+66, +-+0, +-+247, +-+227, +-+143, +-+52, +-+242, +-+32, +-+247, +-+227, +-+142, +-+52, +-+178, +-+90, +-+0, +-+161, +-+3, +-+6, +-+64, +-+23, +-+64, +-+96, +-+8, +-+70, +-+98, +-+97, +-+8, +-+70, +-+98, +-+98, +-+8, +-+70, +-+98, +-+99, +-+8, +-+70, +-+98, +-+100, +-+8, +-+70, +-+98, +-+101, +-+8, +-+70, +-+98, +-+255, +-+159, +-+8, +-+250, +-+23, +-+102, +-+7, +-+106, +-+112, +-+30, +-+33, +-+3, +++ // This copes properly with no_p/no_q +++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), +++ s->frame->linesize[LUMA], +++ beta, tc, no_p, no_q, +++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); +++ } +++ else +++#endif +++ { +++ src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +++ if (pcmf) { +++ // Standard DSP code is broken if no_p / no_q is set +++ s->hevcdsp.hevc_v_loop_filter_luma_c(src, +++ s->frame->linesize[LUMA], +++ beta, tc, no_p, no_q); +++ } +++ else +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock) { +++ uint8_t (*setup)[2][2][4]; +++ int num16 = (y>>4)*s->setup_width + (x>>4); +++ int a = ((y>>3) & 1) << 1; +++ int b = (x>>3) & 1; +++ setup = s->dvq->y_setup_arm[num16]; +++ setup[0][b][0][a] = beta; +++ setup[0][b][0][a + 1] = beta; +++ setup[0][b][1][a] = tc[0]; +++ setup[0][b][1][a + 1] = tc[1]; +++ } else +++#endif +++ { +++ s->hevcdsp.hevc_v_loop_filter_luma(src, +++ s->frame->linesize[LUMA], +++ beta, tc, no_p, no_q); +++ } +++ } ++ } ++ } ++ ++@@ -561,7 +752,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ beta = betatable[av_clip(qp + beta_offset, 0, MAX_QP)]; ++ tc[0] = bs0 ? TC_CALC(qp, bs0) : 0; ++ tc[1] = bs1 ? TC_CALC(qp, bs1) : 0; ++- src = &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; +++ src = +++#if RPI_HEVC_SAND +++ av_rpi_is_sand_frame(s->frame) ? 
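The RPI_DEBLOCK_VPU branches above never filter on the ARM side: they only record beta/tc for each 4-pixel edge segment into a per-16x16-block setup grid that rpi_deblock() later hands to the VPU. A minimal sketch of the vertical-luma indexing, keeping the patch's index names; the typedef and helper are illustrative, not code from the patch:

#include <stdint.h>

// [dir: 0=vertical, 1=horizontal][8-pel half][0=beta, 1=tc][4 slots]
typedef uint8_t deblock_setup_t[2][2][2][4];

static void record_luma_v_edge(deblock_setup_t *grid, int setup_width,
                               int x, int y, int beta, const int tc[2])
{
    deblock_setup_t *const setup = &grid[(y >> 4) * setup_width + (x >> 4)];
    const int a = ((y >> 3) & 1) << 1;  // upper/lower 8 rows of the 16x16 block
    const int b = (x >> 3) & 1;         // left/right 8 columns

    (*setup)[0][b][0][a]     = beta;    // index 0: vertical-edge parameters
    (*setup)[0][b][0][a + 1] = beta;
    (*setup)[0][b][1][a]     = tc[0];
    (*setup)[0][b][1][a + 1] = tc[1];
}

The horizontal variant in the same hunk is identical except that a derives from x, b from y, and it writes setup[1][...].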
+++ av_rpi_sand_frame_pos_y(s->frame, x, y) : +++#endif +++ &s->frame->data[LUMA][y * s->frame->linesize[LUMA] + (x << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x, y - 1); ++ no_p[1] = get_pcm(s, x + 4, y - 1); ++@@ -571,6 +767,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++ } else +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock) { +++ uint8_t (*setup)[2][2][4]; +++ int num16 = (y>>4)*s->setup_width + (x>>4); +++ int a = ((x>>3) & 1) << 1; +++ int b = (y>>3) & 1; +++ setup = s->dvq->y_setup_arm[num16]; +++ setup[1][b][0][a] = beta; +++ setup[1][b][0][a + 1] = beta; +++ setup[1][b][1][a] = tc[0]; +++ setup[1][b][1][a + 1] = tc[1]; +++ } else +++#endif ++ s->hevcdsp.hevc_h_loop_filter_luma(src, ++ s->frame->linesize[LUMA], ++ beta, tc, no_p, no_q); ++@@ -579,6 +788,96 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ } ++ ++ if (s->ps.sps->chroma_format_idc) { +++#if RPI_HEVC_SAND +++ if (av_rpi_is_sand_frame(s->frame)) { +++ const int v = 2; +++ const int h = 2; +++ +++ // vertical filtering chroma +++ for (y = y0; y < y_end; y += 8 * v) { +++// const int demi_y = y + 4 * v >= s->ps.sps->height; +++ const int demi_y = 0; +++ for (x = x0 ? x0 : 8 * h; x < x_end; x += 8 * h) { +++ const int bs0 = s->vertical_bs[(x + y * s->bs_width) >> 2]; +++ const int bs1 = s->vertical_bs[(x + (y + 4 * v) * s->bs_width) >> 2]; +++ +++ if ((bs0 == 2) || (bs1 == 2)) { +++ const int qp0 = (get_qPy(s, x - 1, y) + get_qPy(s, x, y) + 1) >> 1; +++ const int qp1 = (get_qPy(s, x - 1, y + 4 * v) + get_qPy(s, x, y + 4 * v) + 1) >> 1; +++ unsigned int no_f = !demi_y ? 0 : 2 | 8; +++ +++ // tc_offset here should be set to cur_tc_offset I think +++ const uint32_t tc4 = +++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, cur_tc_offset) | (chroma_tc(s, qp0, 2, cur_tc_offset) << 16)) | +++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +++ +++ if (tc4 == 0) +++ continue; +++ +++ if (pcmf) { +++ no_f = +++ (get_pcm(s, x - 1, y) ? 1 : 0) | +++ (get_pcm(s, x - 1, y + 4 * v) ? 2 : 0) | +++ (get_pcm(s, x, y) ? 4 : 0) | +++ (get_pcm(s, x, y + 4 * v) ? 8 : 0); +++ if (no_f == 0xf) +++ continue; +++ } +++ +++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), +++ s->frame->linesize[1], +++ tc4, +++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), +++ no_f); +++ } +++ } +++ +++ if (y == 0) +++ continue; +++ +++ // horizontal filtering chroma +++ tc_offset = x0 ? left_tc_offset : cur_tc_offset; +++ x_end2 = x_end; +++ if (x_end != s->ps.sps->width) +++ x_end2 = x_end - 8 * h; +++ +++ for (x = x0 ? x0 - 8 * h: 0; x < x_end2; x += 8 * h) { +++// const int demi_x = x + 4 * v >= s->ps.sps->width; +++ const int demi_x = 0; +++ +++ const int bs0 = s->horizontal_bs[( x + y * s->bs_width) >> 2]; +++ const int bs1 = s->horizontal_bs[((x + 4 * h) + y * s->bs_width) >> 2]; +++ if ((bs0 == 2) || (bs1 == 2)) { +++ const int qp0 = bs0 == 2 ? (get_qPy(s, x, y - 1) + get_qPy(s, x, y) + 1) >> 1 : 0; +++ const int qp1 = bs1 == 2 ? (get_qPy(s, x + 4 * h, y - 1) + get_qPy(s, x + 4 * h, y) + 1) >> 1 : 0; +++ const uint32_t tc4 = +++ ((bs0 != 2) ? 0 : chroma_tc(s, qp0, 1, tc_offset) | (chroma_tc(s, qp0, 2, tc_offset) << 16)) | +++ ((bs1 != 2) ? 0 : ((chroma_tc(s, qp1, 1, cur_tc_offset) | (chroma_tc(s, qp1, 2, cur_tc_offset) << 16)) << 8)); +++ unsigned int no_f = !demi_x ? 
0 : 2 | 8; +++ +++ if (tc4 == 0) +++ continue; +++ +++ if (pcmf) { +++ no_f = +++ (get_pcm(s, x, y - 1) ? 1 : 0) | +++ (get_pcm(s, x + 4 * h, y - 1) ? 2 : 0) | +++ (get_pcm(s, x, y) ? 4 : 0) | +++ (get_pcm(s, x + 4 * h, y) ? 8 : 0); +++ +++ if (no_f == 0xf) +++ continue; +++ } +++ +++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), +++ s->frame->linesize[1], +++ tc4, no_f); +++ } +++ } +++ } +++ } +++ else +++#endif ++ for (chroma = 1; chroma <= 2; chroma++) { ++ int h = 1 << s->ps.sps->hshift[chroma]; ++ int v = 1 << s->ps.sps->vshift[chroma]; ++@@ -595,7 +894,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ ++ c_tc[0] = (bs0 == 2) ? chroma_tc(s, qp0, chroma, tc_offset) : 0; ++ c_tc[1] = (bs1 == 2) ? chroma_tc(s, qp1, chroma, tc_offset) : 0; ++- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; +++ src = +++#if RPI_HEVC_SAND +++ av_rpi_is_sand_frame(s->frame) ? +++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +++#endif +++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[chroma]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[chroma]) << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x - 1, y); ++ no_p[1] = get_pcm(s, x - 1, y + (4 * v)); ++@@ -605,9 +909,23 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ s->frame->linesize[chroma], ++ c_tc, no_p, no_q); ++ } else +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock) { +++ uint8_t (*setup)[2][2][4]; +++ int xc = x>>s->ps.sps->hshift[chroma]; +++ int yc = y>>s->ps.sps->vshift[chroma]; +++ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +++ int a = ((yc>>3) & 1) << 1; +++ int b = (xc>>3) & 1; +++ setup = s->dvq->uv_setup_arm[num16]; +++ setup[0][b][0][a] = c_tc[0]; +++ setup[0][b][0][a + 1] = c_tc[1]; +++ } else +++#endif ++ s->hevcdsp.hevc_v_loop_filter_chroma(src, ++ s->frame->linesize[chroma], ++ c_tc, no_p, no_q); +++ ++ } ++ } ++ ++@@ -628,7 +946,12 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ ++ c_tc[0] = bs0 == 2 ? chroma_tc(s, qp0, chroma, tc_offset) : 0; ++ c_tc[1] = bs1 == 2 ? chroma_tc(s, qp1, chroma, cur_tc_offset) : 0; ++- src = &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; +++ src = +++#if RPI_HEVC_SAND +++ av_rpi_is_sand_frame(s->frame) ? 
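In the sand chroma paths above, the four per-edge tc values (Cb and Cr for each of the two sub-edges) travel as a single 32-bit word, with the 4-bit no_f mask disabling the P/Q half of each sub-edge individually. The byte layout below is read off the shifts in the patch; the helper is illustrative and assumes each tc fits in a byte, which holds at the supported bit depths:

#include <stdint.h>

// bits 0-7:   Cb tc, sub-edge 0;  bits 8-15:  Cb tc, sub-edge 1;
// bits 16-23: Cr tc, sub-edge 0;  bits 24-31: Cr tc, sub-edge 1.
static inline uint32_t pack_tc4(int tc_u0, int tc_v0, int tc_u1, int tc_v1)
{
    return (uint32_t)tc_u0         | ((uint32_t)tc_u1 << 8) |
           ((uint32_t)tc_v0 << 16) | ((uint32_t)tc_v1 << 24);
}

A tc4 of zero means neither sub-edge has any work to do, which is why both loops early-out on it before evaluating the PCM mask.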
+++ av_rpi_sand_frame_pos_c(s->frame, x >> s->ps.sps->hshift[chroma], y >> s->ps.sps->vshift[chroma]) : +++#endif +++ &s->frame->data[chroma][(y >> s->ps.sps->vshift[1]) * s->frame->linesize[chroma] + ((x >> s->ps.sps->hshift[1]) << s->ps.sps->pixel_shift)]; ++ if (pcmf) { ++ no_p[0] = get_pcm(s, x, y - 1); ++ no_p[1] = get_pcm(s, x + (4 * h), y - 1); ++@@ -638,6 +961,19 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ s->frame->linesize[chroma], ++ c_tc, no_p, no_q); ++ } else +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock) { +++ uint8_t (*setup)[2][2][4]; +++ int xc = x>>s->ps.sps->hshift[chroma]; +++ int yc = y>>s->ps.sps->vshift[chroma]; +++ int num16 = (yc>>4)*s->uv_setup_width + (xc>>4); +++ int a = ((xc>>3) & 1) << 1; +++ int b = (yc>>3) & 1; +++ setup = s->dvq->uv_setup_arm[num16]; +++ setup[1][b][0][a] = c_tc[0]; +++ setup[1][b][0][a + 1] = c_tc[1]; +++ } else +++#endif ++ s->hevcdsp.hevc_h_loop_filter_chroma(src, ++ s->frame->linesize[chroma], ++ c_tc, no_p, no_q); ++@@ -648,69 +984,6 @@ static void deblocking_filter_CTB(HEVCContext *s, int x0, int y0) ++ } ++ } ++ ++-static int boundary_strength(HEVCContext *s, MvField *curr, MvField *neigh, ++- RefPicList *neigh_refPicList) ++-{ ++- if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { ++- // same L0 and L1 ++- if (s->ref->refPicList[0].list[curr->ref_idx[0]] == neigh_refPicList[0].list[neigh->ref_idx[0]] && ++- s->ref->refPicList[0].list[curr->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]] && ++- neigh_refPicList[0].list[neigh->ref_idx[0]] == neigh_refPicList[1].list[neigh->ref_idx[1]]) { ++- if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || ++- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && ++- (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || ++- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) ++- return 1; ++- else ++- return 0; ++- } else if (neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && ++- neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { ++- if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || ++- FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) ++- return 1; ++- else ++- return 0; ++- } else if (neigh_refPicList[1].list[neigh->ref_idx[1]] == s->ref->refPicList[0].list[curr->ref_idx[0]] && ++- neigh_refPicList[0].list[neigh->ref_idx[0]] == s->ref->refPicList[1].list[curr->ref_idx[1]]) { ++- if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || ++- FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) ++- return 1; ++- else ++- return 0; ++- } else { ++- return 1; ++- } ++- } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV ++- Mv A, B; ++- int ref_A, ref_B; ++- ++- if (curr->pred_flag & 1) { ++- A = curr->mv[0]; ++- ref_A = s->ref->refPicList[0].list[curr->ref_idx[0]]; ++- } else { ++- A = curr->mv[1]; ++- ref_A = s->ref->refPicList[1].list[curr->ref_idx[1]]; ++- } ++- ++- if (neigh->pred_flag & 1) { ++- B = neigh->mv[0]; ++- ref_B = neigh_refPicList[0].list[neigh->ref_idx[0]]; ++- } else { ++- B = neigh->mv[1]; ++- ref_B = neigh_refPicList[1].list[neigh->ref_idx[1]]; ++- } ++- ++- if (ref_A == ref_B) { ++- if 
(FFABS(A.x - B.x) >= 4 || FFABS(A.y - B.y) >= 4) ++- return 1; ++- else ++- return 0; ++- } else ++- return 1; ++- } ++- ++- return 1; ++-} ++ ++ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, ++ int log2_trafo_size) ++@@ -721,10 +994,22 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, ++ int log2_min_tu_size = s->ps.sps->log2_min_tb_size; ++ int min_pu_width = s->ps.sps->min_pu_width; ++ int min_tu_width = s->ps.sps->min_tb_width; ++- int is_intra = tab_mvf[(y0 >> log2_min_pu_size) * min_pu_width + ++- (x0 >> log2_min_pu_size)].pred_flag == PF_INTRA; ++ int boundary_upper, boundary_left; ++- int i, j, bs; +++ int i, j; +++ RefPicList *rpl = s->ref->refPicList; +++ const unsigned int log2_dup = FFMIN(log2_min_pu_size, log2_trafo_size); +++ const unsigned int min_pu_in_4pix = 1 << (log2_dup - 2); // Dup +++ const unsigned int trafo_in_min_pus = 1 << (log2_trafo_size - log2_dup); // Rep +++ int y_pu = y0 >> log2_min_pu_size; +++ int x_pu = x0 >> log2_min_pu_size; +++ MvField *curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +++ int is_intra = curr->pred_flag == PF_INTRA; +++ int inc = log2_min_pu_size == 2 ? 2 : 1; +++ uint8_t *bs; +++ +++#ifdef DISABLE_STRENGTHS +++ return; +++#endif ++ ++ boundary_upper = y0 > 0 && !(y0 & 7); ++ if (boundary_upper && ++@@ -736,34 +1021,56 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, ++ (y0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) ++ boundary_upper = 0; ++ +++ bs = &s->horizontal_bs[(x0 + y0 * s->bs_width) >> 2]; +++ ++ if (boundary_upper) { ++ RefPicList *rpl_top = (lc->boundary_flags & BOUNDARY_UPPER_SLICE) ? ++ ff_hevc_get_ref_list(s, s->ref, x0, y0 - 1) : ++- s->ref->refPicList; ++- int yp_pu = (y0 - 1) >> log2_min_pu_size; ++- int yq_pu = y0 >> log2_min_pu_size; ++- int yp_tu = (y0 - 1) >> log2_min_tu_size; ++- int yq_tu = y0 >> log2_min_tu_size; +++ rpl; +++ MvField *top = curr - min_pu_width; +++ +++ if (is_intra) { +++ for (i = 0; i < (1 << log2_trafo_size); i += 4) +++ bs[i >> 2] = 2; +++ +++ } else { +++ int y_tu = y0 >> log2_min_tu_size; +++ int x_tu = x0 >> log2_min_tu_size; +++ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +++ uint8_t *top_cbf_luma = curr_cbf_luma - min_tu_width; +++ +++ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +++ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +++ rpl[0].list, rpl[1].list, rpl_top[0].list, rpl_top[1].list, +++ curr, top, bs); ++ ++ for (i = 0; i < (1 << log2_trafo_size); i += 4) { ++- int x_pu = (x0 + i) >> log2_min_pu_size; ++- int x_tu = (x0 + i) >> log2_min_tu_size; ++- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; ++- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; ++- uint8_t top_cbf_luma = s->cbf_luma[yp_tu * min_tu_width + x_tu]; ++- uint8_t curr_cbf_luma = s->cbf_luma[yq_tu * min_tu_width + x_tu]; ++- ++- if (curr->pred_flag == PF_INTRA || top->pred_flag == PF_INTRA) ++- bs = 2; ++- else if (curr_cbf_luma || top_cbf_luma) ++- bs = 1; ++- else ++- bs = boundary_strength(s, curr, top, rpl_top); ++- s->horizontal_bs[((x0 + i) + y0 * s->bs_width) >> 2] = bs; +++ int i_pu = i >> log2_min_pu_size; +++ int i_tu = i >> log2_min_tu_size; +++ +++ if (top[i_pu].pred_flag == PF_INTRA) +++ bs[i >> 2] = 2; +++ else if (curr_cbf_luma[i_tu] || top_cbf_luma[i_tu]) +++ bs[i >> 2] = 1; ++ } +++ } +++ } +++ +++ if (!is_intra) { +++ for (j = inc; j < trafo_in_min_pus; j += inc) { +++ MvField *top; +++ +++ curr += min_pu_width * inc; +++ top = curr - min_pu_width; +++ bs += 
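The Dup/Rep factors introduced above set how much work each call to the new bulk boundary-strength hook does. A worked example with illustrative numbers, not taken from the patch:

/* A 16x16 transform block (log2_trafo_size = 4) with the usual 4x4
 * minimum PU (log2_min_pu_size = 2):
 *   log2_dup         = FFMIN(2, 4)  = 2
 *   min_pu_in_4pix   = 1 << (2 - 2) = 1   // Dup: bs bytes per PU pair
 *   trafo_in_min_pus = 1 << (4 - 2) = 4   // Rep: PU pairs along one edge
 * so each hook call walks 4 MvField pairs and emits 4 strength bytes,
 * one per 4-pixel segment of the 16-pixel edge. The follow-up loops
 * stepping by inc (2 here) then repeat the call for each interior PU
 * boundary on the 8-pixel grid the filter actually runs on. */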
s->bs_width * inc << log2_min_pu_size >> 2; +++ +++ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +++ min_pu_in_4pix, sizeof (MvField), 4 >> 2, +++ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +++ curr, top, bs); +++ } ++ } ++ ++- // bs for vertical TU boundaries ++ boundary_left = x0 > 0 && !(x0 & 7); ++ if (boundary_left && ++ ((!s->sh.slice_loop_filter_across_slices_enabled_flag && ++@@ -774,64 +1081,54 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, ++ (x0 % (1 << s->ps.sps->log2_ctb_size)) == 0))) ++ boundary_left = 0; ++ +++ curr = &tab_mvf[y_pu * min_pu_width + x_pu]; +++ bs = &s->vertical_bs[(x0 + y0 * s->bs_width) >> 2]; +++ ++ if (boundary_left) { ++ RefPicList *rpl_left = (lc->boundary_flags & BOUNDARY_LEFT_SLICE) ? ++ ff_hevc_get_ref_list(s, s->ref, x0 - 1, y0) : ++- s->ref->refPicList; ++- int xp_pu = (x0 - 1) >> log2_min_pu_size; ++- int xq_pu = x0 >> log2_min_pu_size; ++- int xp_tu = (x0 - 1) >> log2_min_tu_size; ++- int xq_tu = x0 >> log2_min_tu_size; ++- ++- for (i = 0; i < (1 << log2_trafo_size); i += 4) { ++- int y_pu = (y0 + i) >> log2_min_pu_size; ++- int y_tu = (y0 + i) >> log2_min_tu_size; ++- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; ++- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; ++- uint8_t left_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xp_tu]; ++- uint8_t curr_cbf_luma = s->cbf_luma[y_tu * min_tu_width + xq_tu]; ++- ++- if (curr->pred_flag == PF_INTRA || left->pred_flag == PF_INTRA) ++- bs = 2; ++- else if (curr_cbf_luma || left_cbf_luma) ++- bs = 1; ++- else ++- bs = boundary_strength(s, curr, left, rpl_left); ++- s->vertical_bs[(x0 + (y0 + i) * s->bs_width) >> 2] = bs; ++- } ++- } ++- ++- if (log2_trafo_size > log2_min_pu_size && !is_intra) { ++- RefPicList *rpl = s->ref->refPicList; +++ rpl; +++ MvField *left = curr - 1; ++ ++- // bs for TU internal horizontal PU boundaries ++- for (j = 8; j < (1 << log2_trafo_size); j += 8) { ++- int yp_pu = (y0 + j - 1) >> log2_min_pu_size; ++- int yq_pu = (y0 + j) >> log2_min_pu_size; +++ if (is_intra) { +++ for (j = 0; j < (1 << log2_trafo_size); j += 4) +++ bs[j * s->bs_width >> 2] = 2; ++ ++- for (i = 0; i < (1 << log2_trafo_size); i += 4) { ++- int x_pu = (x0 + i) >> log2_min_pu_size; ++- MvField *top = &tab_mvf[yp_pu * min_pu_width + x_pu]; ++- MvField *curr = &tab_mvf[yq_pu * min_pu_width + x_pu]; ++- ++- bs = boundary_strength(s, curr, top, rpl); ++- s->horizontal_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; +++ } else { +++ int y_tu = y0 >> log2_min_tu_size; +++ int x_tu = x0 >> log2_min_tu_size; +++ uint8_t *curr_cbf_luma = &s->cbf_luma[y_tu * min_tu_width + x_tu]; +++ uint8_t *left_cbf_luma = curr_cbf_luma - 1; +++ +++ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +++ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +++ rpl[0].list, rpl[1].list, rpl_left[0].list, rpl_left[1].list, +++ curr, left, bs); +++ +++ for (j = 0; j < (1 << log2_trafo_size); j += 4) { +++ int j_pu = j >> log2_min_pu_size; +++ int j_tu = j >> log2_min_tu_size; +++ +++ if (left[j_pu * min_pu_width].pred_flag == PF_INTRA) +++ bs[j * s->bs_width >> 2] = 2; +++ else if (curr_cbf_luma[j_tu * min_tu_width] || left_cbf_luma[j_tu * min_tu_width]) +++ bs[j * s->bs_width >> 2] = 1; ++ } ++ } +++ } ++ ++- // bs for TU internal vertical PU boundaries ++- for (j = 0; j < (1 << log2_trafo_size); j += 4) { ++- int y_pu = (y0 + j) >> log2_min_pu_size; +++ if (!is_intra) { +++ for (i = inc; i < trafo_in_min_pus; i += inc) { +++ 
MvField *left; ++ ++- for (i = 8; i < (1 << log2_trafo_size); i += 8) { ++- int xp_pu = (x0 + i - 1) >> log2_min_pu_size; ++- int xq_pu = (x0 + i) >> log2_min_pu_size; ++- MvField *left = &tab_mvf[y_pu * min_pu_width + xp_pu]; ++- MvField *curr = &tab_mvf[y_pu * min_pu_width + xq_pu]; +++ curr += inc; +++ left = curr - 1; +++ bs += inc << log2_min_pu_size >> 2; ++ ++- bs = boundary_strength(s, curr, left, rpl); ++- s->vertical_bs[((x0 + i) + (y0 + j) * s->bs_width) >> 2] = bs; ++- } +++ s->hevcdsp.hevc_deblocking_boundary_strengths(trafo_in_min_pus, +++ min_pu_in_4pix, min_pu_width * sizeof (MvField), 4 * s->bs_width >> 2, +++ rpl[0].list, rpl[1].list, rpl[0].list, rpl[1].list, +++ curr, left, bs); ++ } ++ } ++ } ++@@ -840,11 +1137,105 @@ void ff_hevc_deblocking_boundary_strengths(HEVCContext *s, int x0, int y0, ++ #undef CB ++ #undef CR ++ +++#ifdef RPI_DEBLOCK_VPU +++// ff_hevc_flush_buffer_lines +++// flushes and invalidates all pixel rows in [start,end-1] +++static void ff_hevc_flush_buffer_lines(HEVCContext *s, int start, int end, int flush_luma, int flush_chroma) +++{ +++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +++ 0, start, s->ps.sps->width, end - start, 0, s->ps.sps->vshift[1], flush_luma, flush_chroma); +++ rpi_cache_flush_finish(rfe); +++} +++#endif +++ +++#if RPI_INTER +++ +++// Flush some lines of a reference frames +++void rpi_flush_ref_frame_progress(HEVCContext * const s, ThreadFrame * const f, const unsigned int n) +++{ +++ if (s->enable_rpi && s->used_for_ref) { +++ const int d0 = ((int *)f->progress->data)[0]; +++ const unsigned int curr_y = d0 == -1 ? 0 : d0; // At start of time progress is -1 +++ +++ if (curr_y < (unsigned int)s->ps.sps->height) { +++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(); +++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, +++ 0, curr_y, s->ps.sps->width, FFMIN(n, (unsigned int)s->ps.sps->height) - curr_y, +++ s->ps.sps->vshift[1], 1, 1); +++ rpi_cache_flush_finish(rfe); +++ } +++ } +++} +++#endif +++ +++#ifdef RPI_DEBLOCK_VPU +++/* rpi_deblock deblocks an entire row of ctbs using the VPU */ +++static void rpi_deblock(HEVCContext *s, int y, int ctb_size) +++{ +++ // Flush image, 4 lines above to bottom of ctb stripe +++ ff_hevc_flush_buffer_lines(s, FFMAX(y-4,0), y+ctb_size, 1, 1); +++ // TODO flush buffer of beta/tc setup when it becomes cached +++ +++ // Prepare three commands at once to avoid calling overhead +++ s->dvq->vpu_cmds_arm[0][0] = get_vc_address_y(s->frame) + s->frame->linesize[0] * y; +++ s->dvq->vpu_cmds_arm[0][1] = s->frame->linesize[0]; +++ s->dvq->vpu_cmds_arm[0][2] = s->setup_width; +++ s->dvq->vpu_cmds_arm[0][3] = (int) ( s->dvq->y_setup_vc + s->setup_width * (y>>4) ); +++ s->dvq->vpu_cmds_arm[0][4] = ctb_size>>4; +++ s->dvq->vpu_cmds_arm[0][5] = 2; +++ +++ s->dvq->vpu_cmds_arm[1][0] = get_vc_address_u(s->frame) + s->frame->linesize[1] * (y>> s->ps.sps->vshift[1]); +++ s->dvq->vpu_cmds_arm[1][1] = s->frame->linesize[1]; +++ s->dvq->vpu_cmds_arm[1][2] = s->uv_setup_width; +++ s->dvq->vpu_cmds_arm[1][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +++ s->dvq->vpu_cmds_arm[1][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +++ s->dvq->vpu_cmds_arm[1][5] = 3; +++ +++ s->dvq->vpu_cmds_arm[2][0] = get_vc_address_v(s->frame) + s->frame->linesize[2] * (y>> s->ps.sps->vshift[2]); +++ s->dvq->vpu_cmds_arm[2][1] = s->frame->linesize[2]; +++ 
s->dvq->vpu_cmds_arm[2][2] = s->uv_setup_width; +++ s->dvq->vpu_cmds_arm[2][3] = (int) ( s->dvq->uv_setup_vc + s->uv_setup_width * ((y>>4)>> s->ps.sps->vshift[1]) ); +++ s->dvq->vpu_cmds_arm[2][4] = (ctb_size>>4)>> s->ps.sps->vshift[1]; +++ s->dvq->vpu_cmds_arm[2][5] = 4; +++ +++ // Call VPU +++ { +++ const vpu_qpu_job_h vqj = vpu_qpu_job_new(); +++ vpu_qpu_job_add_vpu(vqj, vpu_get_fn(s->ps.sps->bit_depth), s->dvq->vpu_cmds_vc, 3, 0, 0, 0, 5); // 5 means to do all the commands +++ vpu_qpu_job_add_sync_this(vqj, &s->dvq->cmd_id); +++ vpu_qpu_job_finish(vqj); +++ } +++ +++ s->dvq_n = (s->dvq_n + 1) & (RPI_DEBLOCK_VPU_Q_COUNT - 1); +++ s->dvq = s->dvq_ents + s->dvq_n; +++ +++ vpu_qpu_wait(&s->dvq->cmd_id); +++} +++ +++#endif +++ ++ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) ++ { ++ int x_end = x >= s->ps.sps->width - ctb_size; +++#ifdef RPI_DEBLOCK_VPU +++ int done_deblock = 0; +++#endif ++ if (s->avctx->skip_loop_filter < AVDISCARD_ALL) ++ deblocking_filter_CTB(s, x, y); +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock && x_end) +++ { +++ int y_at_end = y >= s->ps.sps->height - ctb_size; +++ int height = 64; // Deblock in units 64 high to avoid too many VPU calls +++ int y_start = y&~63; +++ if (y_at_end) height = s->ps.sps->height - y_start; +++ if ((((y+ctb_size)&63)==0) || y_at_end) { +++ done_deblock = 1; +++ rpi_deblock(s, y_start, height); +++ } +++ } +++#endif ++ if (s->ps.sps->sao_enabled) { ++ int y_end = y >= s->ps.sps->height - ctb_size; ++ if (y && x) ++@@ -853,16 +1244,45 @@ void ff_hevc_hls_filter(HEVCContext *s, int x, int y, int ctb_size) ++ sao_filter_CTB(s, x - ctb_size, y); ++ if (y && x_end) { ++ sao_filter_CTB(s, x, y - ctb_size); ++- if (s->threads_type & FF_THREAD_FRAME ) ++- ff_thread_report_progress(&s->ref->tf, y, 0); +++ if (s->threads_type == FF_THREAD_FRAME ) { +++#if RPI_INTER +++ rpi_flush_ref_frame_progress(s,&s->ref->tf, y); +++#endif +++ ff_hevc_progress_signal_recon(s, y); +++ } ++ } ++ if (x_end && y_end) { ++ sao_filter_CTB(s, x , y); ++- if (s->threads_type & FF_THREAD_FRAME ) ++- ff_thread_report_progress(&s->ref->tf, y + ctb_size, 0); +++ if (s->threads_type == FF_THREAD_FRAME ) { +++#if RPI_INTER +++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size); +++#endif +++ ff_hevc_progress_signal_recon(s, y + ctb_size); +++ } ++ } ++- } else if (s->threads_type & FF_THREAD_FRAME && x_end) ++- ff_thread_report_progress(&s->ref->tf, y + ctb_size - 4, 0); +++ } else if (s->threads_type == FF_THREAD_FRAME && x_end) { +++ //int newh = y + ctb_size - 4; +++ //int currh = s->ref->tf.progress->data[0]; +++ //if (((y + ctb_size)&63)==0) +++#ifdef RPI_DEBLOCK_VPU +++ if (s->enable_rpi_deblock) { +++ // we no longer need to flush the luma buffer as it is in GPU memory when using deblocking on the rpi +++ if (done_deblock) { +++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +++ } +++ } else { +++#if RPI_INTER +++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +++#endif +++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +++ } +++#else +++#if RPI_INTER +++ rpi_flush_ref_frame_progress(s, &s->ref->tf, y + ctb_size - 4); +++#endif +++ ff_hevc_progress_signal_recon(s, y + ctb_size - 4); +++#endif +++ } ++ } ++ ++ void ff_hevc_hls_filters(HEVCContext *s, int x_ctb, int y_ctb, int ctb_size) ++diff --git a/libavcodec/hevc_mvs.c b/libavcodec/hevc_mvs.c ++index 4a6dde0f67..8ee37ebfbc 100644 ++--- a/libavcodec/hevc_mvs.c +++++ b/libavcodec/hevc_mvs.c ++@@ -111,7 +111,7 @@ static av_always_inline int 
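The three commands built in rpi_deblock() share one six-word layout, sketched here as a struct for readability. The field names are my reading of the array indices, not names from the patch:

#include <stdint.h>

typedef struct vpu_deblock_cmd {
    uint32_t dst_vc;       // [0] VC address of the strip's first pixel row
    uint32_t stride;       // [1] linesize of the plane
    uint32_t setup_width;  // [2] setup-grid width in 16x16 blocks
    uint32_t setup_vc;     // [3] VC address of the beta/tc rows for the strip
    uint32_t blocks16;     // [4] strip height in 16-pixel units
    uint32_t op;           // [5] 2 = luma, 3 = U plane, 4 = V plane
} vpu_deblock_cmd;

The literal 5 passed to vpu_qpu_job_add_vpu() is unrelated to these opcodes; per the in-line comment it simply tells the VPU to run all queued commands.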
compare_mv_ref_idx(struct MvField A, struct MvField ++ return 0; ++ } ++ ++-static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) +++static av_always_inline void mv_scale(Mv * const dst, const Mv * const src, int td, int tb) ++ { ++ int tx, scale_factor; ++ ++@@ -125,10 +125,10 @@ static av_always_inline void mv_scale(Mv *dst, Mv *src, int td, int tb) ++ (scale_factor * src->y < 0)) >> 8); ++ } ++ ++-static int check_mvset(Mv *mvLXCol, Mv *mvCol, ++- int colPic, int poc, ++- RefPicList *refPicList, int X, int refIdxLx, ++- RefPicList *refPicList_col, int listCol, int refidxCol) +++static int check_mvset(Mv * const mvLXCol, const Mv * const mvCol, +++ const int colPic, const int poc, +++ const RefPicList * const refPicList, const int X, const int refIdxLx, +++ const RefPicList * const refPicList_col, const int listCol, const int refidxCol) ++ { ++ int cur_lt = refPicList[X].isLongTerm[refIdxLx]; ++ int col_lt = refPicList_col[listCol].isLongTerm[refidxCol]; ++@@ -159,11 +159,11 @@ static int check_mvset(Mv *mvLXCol, Mv *mvCol, ++ refPicList_col, L ## l, temp_col.ref_idx[l]) ++ ++ // derive the motion vectors section 8.5.3.1.8 ++-static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, ++- int refIdxLx, Mv *mvLXCol, int X, ++- int colPic, RefPicList *refPicList_col) +++static int derive_temporal_colocated_mvs(const HEVCContext * const s, const MvField temp_col, +++ const int refIdxLx, Mv * const mvLXCol, const int X, +++ const int colPic, const RefPicList * const refPicList_col) ++ { ++- RefPicList *refPicList = s->ref->refPicList; +++ const RefPicList * const refPicList = s->ref->refPicList; ++ ++ if (temp_col.pred_flag == PF_INTRA) ++ return 0; ++@@ -214,20 +214,20 @@ static int derive_temporal_colocated_mvs(HEVCContext *s, MvField temp_col, ++ /* ++ * 8.5.3.1.7 temporal luma motion vector prediction ++ */ ++-static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, ++- int nPbW, int nPbH, int refIdxLx, ++- Mv *mvLXCol, int X) +++static int temporal_luma_motion_vector(HEVCContext * const s, const int x0, const int y0, +++ const int nPbW, const int nPbH, const int refIdxLx, +++ Mv * const mvLXCol, const int X) ++ { ++ MvField *tab_mvf; ++ MvField temp_col; ++ int x, y, x_pu, y_pu; ++- int min_pu_width = s->ps.sps->min_pu_width; +++ const int min_pu_width = s->ps.sps->min_pu_width; ++ int availableFlagLXCol = 0; ++ int colPic; ++ ++- HEVCFrame *ref = s->ref->collocated_ref; +++ HEVCFrame * const ref = s->ref->collocated_ref; ++ ++- if (!ref) { +++ if (ref == NULL || ref->tab_mvf == NULL) { ++ memset(mvLXCol, 0, sizeof(*mvLXCol)); ++ return 0; ++ } ++@@ -239,14 +239,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, ++ x = x0 + nPbW; ++ y = y0 + nPbH; ++ ++- if (tab_mvf && ++- (y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && +++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ y < s->ps.sps->height && ++ x < s->ps.sps->width) { ++ x &= ~15; ++ y &= ~15; ++ if (s->threads_type == FF_THREAD_FRAME) ++- ff_thread_await_progress(&ref->tf, y, 0); +++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ x_pu = x >> s->ps.sps->log2_min_pu_size; ++ y_pu = y >> s->ps.sps->log2_min_pu_size; ++ temp_col = TAB_MVF(x_pu, y_pu); ++@@ -254,13 +253,13 @@ static int temporal_luma_motion_vector(HEVCContext *s, int x0, int y0, ++ } ++ ++ // derive center collocated motion vector ++- if (tab_mvf && !availableFlagLXCol) { +++ if (!availableFlagLXCol) { ++ x = x0 + (nPbW >> 1); ++ y = y0 + (nPbH 
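One small behavioural point in the hunk above: the old code returned early only for ref == NULL and re-tested tab_mvf inside each candidate block, whereas the patch tests both once up front, which is what lets the later "tab_mvf &&" conditions be dropped without changing behaviour. Condensed:

if (ref == NULL || ref->tab_mvf == NULL) {  // was: if (!ref), with tab_mvf
    memset(mvLXCol, 0, sizeof(*mvLXCol));   // re-checked at each use below
    return 0;                               // no collocated MV available
}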
>> 1); ++ x &= ~15; ++ y &= ~15; ++ if (s->threads_type == FF_THREAD_FRAME) ++- ff_thread_await_progress(&ref->tf, y, 0); +++ ff_hevc_progress_wait_mv(s, s->jb0, ref, y); ++ x_pu = x >> s->ps.sps->log2_min_pu_size; ++ y_pu = y >> s->ps.sps->log2_min_pu_size; ++ temp_col = TAB_MVF(x_pu, y_pu); ++diff --git a/libavcodec/hevc_ps.c b/libavcodec/hevc_ps.c ++index c1b69a0199..455cdaea1c 100644 ++--- a/libavcodec/hevc_ps.c +++++ b/libavcodec/hevc_ps.c ++@@ -785,7 +785,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) ++ switch (sps->bit_depth) { ++ case 8: ++ if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY8; +++#if RPI_HEVC_SAND +++ // *** Horrid kludge s.t. we start out with sand format +++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P; +++#else ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P; +++#endif ++ if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P; ++ if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P; ++ break; ++@@ -797,7 +802,12 @@ static int map_pixel_format(AVCodecContext *avctx, HEVCSPS *sps) ++ break; ++ case 10: ++ if (sps->chroma_format_idc == 0) sps->pix_fmt = AV_PIX_FMT_GRAY16; +++#if RPI_HEVC_SAND +++ // *** Horrid kludge s.t. we start out with sand format +++ if (sps->chroma_format_idc == 1) sps->pix_fmt = sps->width <= 2048 && sps->height <= 1088 ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10; +++#else ++ if (sps->chroma_format_idc == 1) sps->pix_fmt = AV_PIX_FMT_YUV420P10; +++#endif ++ if (sps->chroma_format_idc == 2) sps->pix_fmt = AV_PIX_FMT_YUV422P10; ++ if (sps->chroma_format_idc == 3) sps->pix_fmt = AV_PIX_FMT_YUV444P10; ++ break; ++@@ -1064,7 +1074,6 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, ++ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); ++ if (sps_extension_flag[0]) { ++ int extended_precision_processing_flag; ++- int high_precision_offsets_enabled_flag; ++ int cabac_bypass_alignment_enabled_flag; ++ ++ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); ++@@ -1079,10 +1088,10 @@ int ff_hevc_parse_sps(HEVCSPS *sps, GetBitContext *gb, unsigned int *sps_id, ++ "extended_precision_processing_flag not yet implemented\n"); ++ ++ sps->intra_smoothing_disabled_flag = get_bits1(gb); ++- high_precision_offsets_enabled_flag = get_bits1(gb); ++- if (high_precision_offsets_enabled_flag) +++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); +++ if (sps->high_precision_offsets_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++- "high_precision_offsets_enabled_flag not yet implemented\n"); +++ "high_precision_offsets_enabled_flag not fully implemented\n"); ++ ++ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); ++ ++diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c ++index df52e401ad..992e994b1a 100644 ++--- a/libavcodec/hevc_refs.c +++++ b/libavcodec/hevc_refs.c ++@@ -23,7 +23,7 @@ ++ ++ #include "libavutil/avassert.h" ++ #include "libavutil/pixdesc.h" ++- +++#include "libavutil/rpi_sand_fns.h" ++ #include "internal.h" ++ #include "thread.h" ++ #include "hevc.h" ++@@ -205,7 +205,8 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) ++ HEVCFrame *frame = &s->DPB[min_idx]; ++ AVFrame *dst = out; ++ AVFrame *src = frame->frame; ++- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src->format); +++ const int fmt = src->format; +++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt); ++ int 
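The map_pixel_format() kludge above reduces to one rule: sand layouts are selected only for 4:2:0 streams small enough for the Pi's HEVC acceleration; everything else keeps the normal planar mapping. A sketch of the rule in isolation; AV_PIX_FMT_SAND128 and AV_PIX_FMT_SAND64_10 are formats this patch itself adds to libavutil, and the helper is illustrative:

#include "libavutil/pixfmt.h"  // SAND entries come from this patch's libavutil changes

static enum AVPixelFormat sand_pix_fmt(int bit_depth, int chroma_format_idc,
                                       int width, int height)
{
    const int fits = width <= 2048 && height <= 1088;  // acceleration limit

    if (chroma_format_idc != 1)        // only 4:2:0 has a sand variant here
        return AV_PIX_FMT_NONE;
    if (bit_depth == 8)
        return fits ? AV_PIX_FMT_SAND128 : AV_PIX_FMT_YUV420P;
    if (bit_depth == 10)
        return fits ? AV_PIX_FMT_SAND64_10 : AV_PIX_FMT_YUV420P10;
    return AV_PIX_FMT_NONE;
}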
pixel_shift = !!(desc->comp[0].depth > 8); ++ ++ ret = av_frame_ref(out, src); ++@@ -215,13 +216,31 @@ int ff_hevc_output_frame(HEVCContext *s, AVFrame *out, int flush) ++ ff_hevc_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ if (ret < 0) ++ return ret; ++- ++- for (i = 0; i < 3; i++) { ++- int hshift = (i > 0) ? desc->log2_chroma_w : 0; ++- int vshift = (i > 0) ? desc->log2_chroma_h : 0; ++- int off = ((frame->window.left_offset >> hshift) << pixel_shift) + ++- (frame->window.top_offset >> vshift) * dst->linesize[i]; ++- dst->data[i] += off; +++#ifdef RPI +++ if (av_rpi_is_sand_format(fmt)) +++ { +++ // Sand cannot be windowed by offset so add side data if we have an offset +++ const HEVCWindow * const window = &frame->window; +++ if (window->left_offset + window->right_offset + window->top_offset + window->bottom_offset != 0) +++ { +++ AVFrameSideData *const sd = av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO, sizeof(AVPanScan)); +++ AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; +++ si->left_offset = window->left_offset; +++ si->top_offset = window->top_offset; +++ si->pic_width = s->ps.sps->width; +++ si->pic_height = s->ps.sps->height; +++ } +++ } +++ else +++#endif +++ { +++ for (i = 0; i < 3; i++) { +++ int hshift = (i > 0) ? desc->log2_chroma_w : 0; +++ int vshift = (i > 0) ? desc->log2_chroma_h : 0; +++ int off = ((frame->window.left_offset >> hshift) << pixel_shift) + +++ (frame->window.top_offset >> vshift) * dst->linesize[i]; +++ dst->data[i] += off; +++ } ++ } ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "Output frame with POC %d.\n", frame->poc); ++@@ -426,8 +445,7 @@ static HEVCFrame *generate_missing_ref(HEVCContext *s, int poc) ++ frame->sequence = s->seq_decode; ++ frame->flags = 0; ++ ++- if (s->threads_type == FF_THREAD_FRAME) ++- ff_thread_report_progress(&frame->tf, INT_MAX, 0); +++ ff_hevc_progress_set_all_done(frame); ++ ++ return frame; ++ } ++diff --git a/libavcodec/hevcdsp.c b/libavcodec/hevcdsp.c ++index 9d773d960e..c9661c3ab1 100644 ++--- a/libavcodec/hevcdsp.c +++++ b/libavcodec/hevcdsp.c ++@@ -123,6 +123,120 @@ DECLARE_ALIGNED(16, const int8_t, ff_hevc_qpel_filters[3][16]) = { ++ #include "hevcdsp_template.c" ++ #undef BIT_DEPTH ++ +++static void hevc_deblocking_boundary_strengths(int pus, int dup, int in_inc, int out_inc, +++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +++ MvField *curr, MvField *neigh, uint8_t *bs) +++{ +++ for (; pus > 0; pus--) { +++ int strength, out; +++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; +++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; +++ int neigh_refL0 = neigh_rpl0[neigh->ref_idx[0]]; +++ int neigh_refL1 = neigh_rpl1[neigh->ref_idx[1]]; +++ +++#if 1 // This more directly matches the original implementation +++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { +++ // same L0 and L1 +++ if (curr_refL0 == neigh_refL0 && +++ curr_refL0 == curr_refL1 && +++ neigh_refL0 == neigh_refL1) { +++ if ((FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[0].y) >= 4 || +++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) && +++ (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4)) +++ strength = 1; +++ else +++ strength = 0; +++ } else if (neigh_refL0 == curr_refL0 && +++ neigh_refL1 == curr_refL1) { +++ if (FFABS(neigh->mv[0].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[0].y - 
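Two notes on the sand windowing block above. The rationale: planar frames apply a crop window by offsetting each data[] pointer (the retained loop), but sand layouts interleave samples in vertical column stripes, so a horizontal crop cannot be expressed as a pointer offset and instead travels with the frame as AV_FRAME_DATA_SAND_INFO side data. Also, a probable copy-paste slip: the side data is allocated with sizeof(AVPanScan) but written as AVFrameDataSandInfo, and the NULL return of av_frame_new_side_data() goes unchecked. A corrected sketch of the same block, assuming sizeof(AVFrameDataSandInfo) was intended:

AVFrameSideData *const sd =
    av_frame_new_side_data(dst, AV_FRAME_DATA_SAND_INFO,
                           sizeof(AVFrameDataSandInfo));  // not AVPanScan
if (sd != NULL) {                      // side-data allocation can fail
    AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
    si->left_offset = window->left_offset;
    si->top_offset  = window->top_offset;
    si->pic_width   = s->ps.sps->width;
    si->pic_height  = s->ps.sps->height;
}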
curr->mv[0].y) >= 4 || +++ FFABS(neigh->mv[1].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[1].y) >= 4) +++ strength = 1; +++ else +++ strength = 0; +++ } else if (neigh_refL1 == curr_refL0 && +++ neigh_refL0 == curr_refL1) { +++ if (FFABS(neigh->mv[1].x - curr->mv[0].x) >= 4 || FFABS(neigh->mv[1].y - curr->mv[0].y) >= 4 || +++ FFABS(neigh->mv[0].x - curr->mv[1].x) >= 4 || FFABS(neigh->mv[0].y - curr->mv[1].y) >= 4) +++ strength = 1; +++ else +++ strength = 0; +++ } else { +++ strength = 1; +++ } +++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV +++ Mv curr_mv0, neigh_mv0; +++ +++ if (curr->pred_flag & 1) { +++ curr_mv0 = curr->mv[0]; +++ } else { +++ curr_mv0 = curr->mv[1]; +++ curr_refL0 = curr_refL1; +++ } +++ +++ if (neigh->pred_flag & 1) { +++ neigh_mv0 = neigh->mv[0]; +++ } else { +++ neigh_mv0 = neigh->mv[1]; +++ neigh_refL0 = neigh_refL1; +++ } +++ +++ if (curr_refL0 == neigh_refL0) { +++ if (FFABS(curr_mv0.x - neigh_mv0.x) >= 4 || FFABS(curr_mv0.y - neigh_mv0.y) >= 4) +++ strength = 1; +++ else +++ strength = 0; +++ } else +++ strength = 1; +++ } else +++ strength = 1; +++#else // This has exactly the same effect, but is more suitable for vectorisation +++ Mv curr_mv[2]; +++ Mv neigh_mv[2]; +++ memcpy(curr_mv, curr->mv, sizeof curr_mv); +++ memcpy(neigh_mv, neigh->mv, sizeof neigh_mv); +++ +++ if (!(curr->pred_flag & 2)) { +++ curr_mv[1] = curr_mv[0]; +++ curr_refL1 = curr_refL0; +++ } +++ if (!(neigh->pred_flag & 2)) { +++ neigh_mv[1] = neigh_mv[0]; +++ neigh_refL1 = neigh_refL0; +++ } +++ if (!(curr->pred_flag & 1)) { +++ curr_mv[0] = curr_mv[1]; +++ curr_refL0 = curr_refL1; +++ } +++ if (!(neigh->pred_flag & 1)) { +++ neigh_mv[0] = neigh_mv[1]; +++ neigh_refL0 = neigh_refL1; +++ } +++ +++ strength = 1; +++ +++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | +++ (FFABS(neigh_mv[0].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[0].y) >= 4) | +++ (FFABS(neigh_mv[1].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[1].y) >= 4); +++ +++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | +++ (FFABS(neigh_mv[1].x - curr_mv[0].x) >= 4) | (FFABS(neigh_mv[1].y - curr_mv[0].y) >= 4) | +++ (FFABS(neigh_mv[0].x - curr_mv[1].x) >= 4) | (FFABS(neigh_mv[0].y - curr_mv[1].y) >= 4); +++ +++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); +++#endif +++ +++ curr += in_inc / sizeof (MvField); +++ neigh += in_inc / sizeof (MvField); +++ +++ for (out = dup; out > 0; out--) +++ { +++ *bs = strength; +++ bs += out_inc; +++ } +++ } +++} +++ ++ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++ { ++ #undef FUNC ++@@ -193,15 +307,57 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) ++ +++#if !RPI_HEVC_SAND +++#define SLICED_LOOP_FILTERS(depth) +++#define SLICED_ADD_RESIDUAL(depth) +++#define SLICED_SAO(depth) +++#else +++#define SLICED_ADD_RESIDUAL(depth)\ +++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ +++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ +++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ +++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ +++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ +++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ +++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, 
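Both variants of hevc_deblocking_boundary_strengths() above compute identical strengths; the #else branch replaces the branch tree with two range masks plus one fix-up line, and the fix-up merits a derivation:

/* pred_flag is PF_L0 = 1, PF_L1 = 2 or PF_BI = 3 for inter PUs (intra
 * edges are overridden to strength 2 by the callers afterwards), so
 * pred_flag + 1 is 2, 3 or 4, and only BI has bit 2 set. Hence
 * ((curr + 1) ^ (neigh + 1)) >> 2 is 1 exactly when one side is
 * bi-predicted and the other is not, the case that must force
 * strength 1:
 *   L0 vs BI : (1+1) ^ (3+1) = 2 ^ 4 = 6, 6 >> 2 = 1
 *   L0 vs L1 : (1+1) ^ (2+1) = 2 ^ 3 = 1, 1 >> 2 = 0
 *   BI vs BI : (3+1) ^ (3+1) = 0,         0 >> 2 = 0
 * Single-MV PUs are folded in beforehand by duplicating their one valid
 * MV/ref into both slots, so the two masks cover those cases as well. */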
depth); \ +++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ +++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ +++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ +++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ +++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ +++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ +++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ +++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ +++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ +++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) +++#define SLICED_LOOP_FILTERS(depth)\ +++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ +++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ +++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) +++#define SLICED_SAO(depth)\ +++ for (i = 0; i != SAO_FILTER_N; ++i) { \ +++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ +++ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \ +++ } \ +++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ +++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) +++ +++#endif +++ ++ #define HEVC_DSP(depth) \ ++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ ++- hevcdsp->transform_add[0] = FUNC(transform_add4x4, depth); \ ++- hevcdsp->transform_add[1] = FUNC(transform_add8x8, depth); \ ++- hevcdsp->transform_add[2] = FUNC(transform_add16x16, depth); \ ++- hevcdsp->transform_add[3] = FUNC(transform_add32x32, depth); \ ++- hevcdsp->transform_skip = FUNC(transform_skip, depth); \ +++ hevcdsp->transform_add[0] = FUNC(add_residual4x4, depth); \ +++ hevcdsp->transform_add[1] = FUNC(add_residual8x8, depth); \ +++ hevcdsp->transform_add[2] = FUNC(add_residual16x16, depth); \ +++ hevcdsp->transform_add[3] = FUNC(add_residual32x32, depth); \ +++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ +++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ +++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ +++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ +++ SLICED_ADD_RESIDUAL(depth); \ ++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ ++- hevcdsp->idct_4x4_luma = FUNC(transform_4x4_luma, depth); \ +++ hevcdsp->transform_skip = FUNC(transform_skip, depth); \ +++ hevcdsp->idct_4x4_luma = FUNC(idct_4x4_luma, depth); \ ++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ ++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ ++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ ++@@ -212,18 +368,13 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ ++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ ++ \ ++- hevcdsp->sao_band_filter[0] = \ ++- hevcdsp->sao_band_filter[1] = \ ++- hevcdsp->sao_band_filter[2] = \ ++- hevcdsp->sao_band_filter[3] = \ ++- hevcdsp->sao_band_filter[4] = FUNC(sao_band_filter, depth); \ ++- hevcdsp->sao_edge_filter[0] = \ ++- hevcdsp->sao_edge_filter[1] = \ ++- hevcdsp->sao_edge_filter[2] = \ ++- hevcdsp->sao_edge_filter[3] = \ ++- hevcdsp->sao_edge_filter[4] = FUNC(sao_edge_filter, depth); \ +++ for (i = 0; i != SAO_FILTER_N; ++i) { \ +++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ +++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ +++ } \ ++ hevcdsp->sao_edge_restore[0] = 
FUNC(sao_edge_restore_0, depth); \ ++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ +++ SLICED_SAO(depth); \ ++ \ ++ QPEL_FUNCS(depth); \ ++ QPEL_UNI_FUNCS(depth); \ ++@@ -232,6 +383,7 @@ void ff_hevc_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++ EPEL_UNI_FUNCS(depth); \ ++ EPEL_BI_FUNCS(depth); \ ++ \ +++ SLICED_LOOP_FILTERS(depth); \ ++ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ ++@@ -257,6 +409,8 @@ int i = 0; ++ break; ++ } ++ +++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; +++ ++ if (ARCH_X86) ++ ff_hevc_dsp_init_x86(hevcdsp, bit_depth); ++ if (ARCH_ARM) ++diff --git a/libavcodec/hevcdsp.h b/libavcodec/hevcdsp.h ++index 9f1f6dd59f..c4a1b0f09d 100644 ++--- a/libavcodec/hevcdsp.h +++++ b/libavcodec/hevcdsp.h ++@@ -25,6 +25,7 @@ ++ #ifndef AVCODEC_HEVCDSP_H ++ #define AVCODEC_HEVCDSP_H ++ +++#include "rpi_opts.h" ++ #include "get_bits.h" ++ ++ #define MAX_PB_SIZE 64 ++@@ -42,11 +43,40 @@ typedef struct SAOParams { ++ uint8_t type_idx[3]; ///< sao_type_idx ++ } SAOParams; ++ +++typedef struct Mv { +++ int16_t x; ///< horizontal component of motion vector +++ int16_t y; ///< vertical component of motion vector +++} Mv; +++ +++typedef struct MvField { +++ DECLARE_ALIGNED(4, Mv, mv)[2]; +++ int8_t ref_idx[2]; +++ int8_t pred_flag; +++} MvField; +++ +++#ifdef RPI +++#define SAO_FILTER_N 6 +++#else +++#define SAO_FILTER_N 5 +++#endif +++ +++ ++ typedef struct HEVCDSPContext { ++ void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, ++ struct GetBitContext *gb, int pcm_bit_depth); ++ ++- void (*transform_add[4])(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride); +++ // add_residual was transform_add - import 3.3 names +++ void (*transform_add[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride); +++ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc); +++#if RPI_HEVC_SAND +++ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v); +++ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u); +++ +++ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride); +++ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv); +++ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height, +++ struct GetBitContext *gb, int pcm_bit_depth); +++#endif ++ ++ void (*transform_skip)(int16_t *coeffs, int16_t log2_size); ++ ++@@ -58,16 +88,31 @@ typedef struct HEVCDSPContext { ++ ++ void (*idct_dc[4])(int16_t *coeffs); ++ ++- void (*sao_band_filter[5])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +++ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); +++#if RPI_HEVC_SAND +++ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height); +++#endif ++ ++ /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */ ++- void (*sao_edge_filter[5])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t 
stride_dst, +++ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, ++ int16_t *sao_offset_val, int sao_eo_class, int width, int height); +++#if RPI_HEVC_SAND +++ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst, +++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height); +++#endif ++ ++ void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, ++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, ++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); +++#if RPI_HEVC_SAND +++ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, +++ struct SAOParams *sao, int *borders, int _width, int _height, int c_idx, +++ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge); +++#endif ++ ++ void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride, ++ int height, intptr_t mx, intptr_t my, int width); ++@@ -120,6 +165,22 @@ typedef struct HEVCDSPContext { ++ void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q); +++#ifdef RPI +++ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +++ unsigned int _stride, unsigned int beta, const int32_t tc[2], +++ const uint8_t no_p[2], const uint8_t no_q[2], +++ uint8_t * _pix_l); +++ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4, +++ unsigned int no_f); +++ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, +++ uint8_t * src_l, +++ unsigned int no_f); +++ +++#endif +++ +++ void (*hevc_deblocking_boundary_strengths)(int pus, int dup, int in_inc, int out_inc, +++ int *curr_rpl0, int *curr_rpl1, int *neigh_rpl0, int *neigh_rpl1, +++ MvField *curr, MvField *neigh, uint8_t *bs); ++ } HEVCDSPContext; ++ ++ void ff_hevc_dsp_init(HEVCDSPContext *hpc, int bit_depth); ++diff --git a/libavcodec/hevcdsp_template.c b/libavcodec/hevcdsp_template.c ++index 5bca02342d..122fbe8154 100644 ++--- a/libavcodec/hevcdsp_template.c +++++ b/libavcodec/hevcdsp_template.c ++@@ -26,6 +26,7 @@ ++ #include "bit_depth_template.c" ++ #include "hevcdsp.h" ++ +++#include "rpi_shader_template.h" ++ ++ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height, ++ GetBitContext *gb, int pcm_bit_depth) ++@@ -42,8 +43,32 @@ static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height ++ } ++ } ++ ++-static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coeffs, ++- ptrdiff_t stride, int size) +++#if RPI_HEVC_SAND +++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height, +++ GetBitContext *gb, int pcm_bit_depth) +++{ +++ int x, y; +++ pixel *dst = (pixel *)_dst; +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x++) +++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +++ dst += stride; +++ } +++ +++ dst = (pixel *)_dst + 1; +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x++) +++ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth); +++ dst += stride; +++ } +++} +++#endif +++ +++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride, int size) ++ { ++ int x, y; ++ pixel *dst = (pixel *)_dst; 
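Editor's note on the plaited chroma layout (a sketch, not part of the patch): in the sand format the chroma plane interleaves U and V per pixel, which is why every `_c` template variant walks rows in steps of two — `put_pcm_c` above fills the even slots with U and the odd slots with V, and the `add_residual_*_u/_v/_c` family in the next hunk applies its per-channel offsets the same way. The intra-prediction code later in this patch reuses the idea by widening `pixel` so a U/V pair moves as one unit. A minimal standalone illustration of the access pattern, with invented names:

#include <stdint.h>
#include <stdio.h>

/* One row of an interleaved ("plaited") 8-bit chroma plane:
 * row[0]=U0, row[1]=V0, row[2]=U1, row[3]=V1, ...
 * Adding a residual to the U channel only must step by 2, exactly as
 * the add_residual*_u() routines in the patch do. */
static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

static void add_residual_u_row(uint8_t *row, const int16_t *res, int width_c)
{
    for (int x = 0; x < width_c * 2; x += 2)
        row[x] = clip8(row[x] + *res++);   /* even slots = U; odd V slots untouched */
}

int main(void)
{
    uint8_t row[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };  /* U,V,U,V,... */
    const int16_t res[4] = { 5, -5, 100, 250 };
    add_residual_u_row(row, res, 4);
    for (int i = 0; i < 8; i++)
        printf("%d ", row[i]);  /* prints: 15 20 25 40 150 60 255 80 */
    printf("\n");
    return 0;
}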
++@@ -59,30 +84,255 @@ static av_always_inline void FUNC(transquant_bypass)(uint8_t *_dst, int16_t *coe ++ } ++ } ++ ++-static void FUNC(transform_add4x4)(uint8_t *_dst, int16_t *coeffs, ++- ptrdiff_t stride) +++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size) +++{ +++ int x, y; +++ pixel *dst = (pixel *)_dst; +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < size; y++) { +++ for (x = 0; x < size; x++) { +++ dst[x] = av_clip_pixel(dst[x] + dc); +++ } +++ dst += stride; +++ } +++} +++ +++ +++#if RPI_HEVC_SAND +++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res, +++ ptrdiff_t stride, const int dc_v, int size) ++ { ++- FUNC(transquant_bypass)(_dst, coeffs, stride, 4); +++ int x, y; +++ pixel *dst = (pixel *)_dst; +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < size; y++) { +++ for (x = 0; x < size * 2; x += 2) { +++ dst[x] = av_clip_pixel(dst[x] + *res); +++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); +++ res++; +++ } +++ dst += stride; +++ } ++ } ++ ++-static void FUNC(transform_add8x8)(uint8_t *_dst, int16_t *coeffs, +++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res, +++ ptrdiff_t stride, const int dc_u, int size) +++{ +++ int x, y; +++ pixel *dst = (pixel *)_dst; +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < size; y++) { +++ for (x = 0; x < size * 2; x += 2) { +++ dst[x] = av_clip_pixel(dst[x] + dc_u); +++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res); +++ res++; +++ } +++ dst += stride; +++ } +++} +++ +++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res, +++ ptrdiff_t stride, unsigned int size) +++{ +++ unsigned int x, y; +++ pixel *dst = (pixel *)_dst; +++ const int16_t * ru = res; +++ const int16_t * rv = res + size * size; +++ +++// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1); +++// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0); +++// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0); +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < size; y++) { +++ for (x = 0; x < size * 2; x += 2) { +++ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++); +++ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++); +++ } +++ dst += stride; +++ } +++ +++// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1); +++} +++ +++ +++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size) +++{ +++ int x, y; +++ pixel *dst = (pixel *)_dst; +++ const int dc_v = dc >> 16; +++ const int dc_u = (dc << 16) >> 16; +++ +++ stride /= sizeof(pixel); +++ +++ for (y = 0; y < size; y++) { +++ for (x = 0; x < size * 2; x += 2) { +++ dst[x] = av_clip_pixel(dst[x] + dc_u); +++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); +++ } +++ dst += stride; +++ } +++} +++ +++ +++#endif +++ +++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *coeffs, +++ ptrdiff_t stride) +++{ +++ FUNC(add_residual)(_dst, coeffs, stride, 4); +++} +++ +++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++ { ++- FUNC(transquant_bypass)(_dst, coeffs, stride, 8); +++ FUNC(add_residual)(_dst, coeffs, stride, 8); ++ } ++ ++-static void FUNC(transform_add16x16)(uint8_t *_dst, int16_t *coeffs, +++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++ { ++- FUNC(transquant_bypass)(_dst, coeffs, stride, 16); +++ FUNC(add_residual)(_dst, coeffs, stride, 16); ++ } 
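A detail of `add_residual_dc_c` above that is easy to misread: its single `int32_t dc` argument packs both chroma DC offsets, U in the low 16 bits and V in the high 16 bits, and the patch unpacks them with arithmetic shifts (`dc >> 16` and `(dc << 16) >> 16`), which assumes the usual two's-complement behaviour of the Pi targets. A quick self-contained check of the packing convention (the helper name is illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

/* Pack two signed 16-bit DC offsets the way a caller of
 * add_residual_dc_c() would: U in the low half, V in the high half. */
static int32_t pack_dc_uv(int16_t dc_u, int16_t dc_v)
{
    return (int32_t)((uint16_t)dc_u | ((uint32_t)(uint16_t)dc_v << 16));
}

int main(void)
{
    int32_t dc = pack_dc_uv(-3, 7);
    int dc_v = dc >> 16;                /* high half: V (as in the patch) */
    int dc_u = (int16_t)(dc & 0xffff);  /* low half: U; the patch's
                                         * (dc << 16) >> 16 computes the same
                                         * value on these targets */
    assert(dc_u == -3 && dc_v == 7);
    return 0;
}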
++ ++-static void FUNC(transform_add32x32)(uint8_t *_dst, int16_t *coeffs, +++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride) ++ { ++- FUNC(transquant_bypass)(_dst, coeffs, stride, 32); +++ FUNC(add_residual)(_dst, coeffs, stride, 32); ++ } ++ +++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +++{ +++ FUNC(add_residual_dc)(_dst, stride, dc, 4); +++} +++ +++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +++{ +++ FUNC(add_residual_dc)(_dst, stride, dc, 8); +++} +++ +++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +++{ +++ FUNC(add_residual_dc)(_dst, stride, dc, 16); +++} +++ +++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) +++{ +++ FUNC(add_residual_dc)(_dst, stride, dc, 32); +++} +++ +++#if RPI_HEVC_SAND +++// -- U -- (plaited) +++ +++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_u) +++{ +++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); +++} +++ +++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_u) +++{ +++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); +++} +++ +++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_u) +++{ +++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); +++} +++ +++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_u) +++{ +++ // Should never occur for 420, which is all that sand supports +++ av_assert0(0); +++} +++ +++// -- V -- (plaited) +++ +++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_v) +++{ +++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); +++} +++ +++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_v) +++{ +++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); +++} +++ +++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_v) +++{ +++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); +++} +++ +++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride, int dc_v) +++{ +++ // Should never occur for 420, which is all that sand supports +++ av_assert0(0); +++} +++ +++// -- C -- (plaited - both U & V) +++ +++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride) +++{ +++ FUNC(add_residual_c)(_dst, res, stride, 4); +++} +++ +++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride) +++{ +++ FUNC(add_residual_c)(_dst, res, stride, 8); +++} +++ +++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride) +++{ +++ FUNC(add_residual_c)(_dst, res, stride, 16); +++} +++ +++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, +++ ptrdiff_t stride) +++{ +++ // Should never occur for 420, which is all that sand supports +++ av_assert0(0); +++} +++ +++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +++{ +++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); +++} +++ +++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +++{ +++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); +++} +++ +++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +++{ +++ 
FUNC(add_residual_dc_c)(_dst, stride, dc, 16); +++} +++ +++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) +++{ +++ // Should never occur for 420, which is all that sand supports +++ av_assert0(0); +++} +++ +++#endif +++ ++ ++ static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++ { ++@@ -152,7 +402,7 @@ static void FUNC(transform_skip)(int16_t *_coeffs, int16_t log2_size) ++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ ++ } while (0) ++ ++-static void FUNC(transform_4x4_luma)(int16_t *coeffs) +++static void FUNC(idct_4x4_luma)(int16_t *coeffs) ++ { ++ int i; ++ int shift = 7; ++@@ -358,6 +608,32 @@ static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride ++ } ++ } ++ +++ +++#if BIT_DEPTH == 10 +++#if RPI_HEVC_SAND +++// We need a 32 bit variation for the _c restores so hijack bit depth 10 +++#undef pixel +++#undef BIT_DEPTH +++#define pixel uint32_t +++#define BIT_DEPTH 32 +++#endif +++// All 16 bit variations are the same +++#define sao_edge_restore_0_10 sao_edge_restore_0_9 +++#define sao_edge_restore_1_10 sao_edge_restore_1_9 +++#define sao_edge_restore_0_11 sao_edge_restore_0_9 +++#define sao_edge_restore_1_11 sao_edge_restore_1_9 +++#define sao_edge_restore_0_12 sao_edge_restore_0_9 +++#define sao_edge_restore_1_12 sao_edge_restore_1_9 +++#define sao_edge_restore_0_13 sao_edge_restore_0_9 +++#define sao_edge_restore_1_13 sao_edge_restore_1_9 +++#define sao_edge_restore_0_14 sao_edge_restore_0_9 +++#define sao_edge_restore_1_14 sao_edge_restore_1_9 +++#define sao_edge_restore_0_15 sao_edge_restore_0_9 +++#define sao_edge_restore_1_15 sao_edge_restore_1_9 +++#define sao_edge_restore_0_16 sao_edge_restore_0_9 +++#define sao_edge_restore_1_16 sao_edge_restore_1_9 +++#endif +++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 ++ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, SAOParams *sao, ++ int *borders, int _width, int _height, ++@@ -367,7 +643,6 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++- int16_t *sao_offset_val = sao->offset_val[c_idx]; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, width = _width, height = _height; ++ ++@@ -376,33 +651,29 @@ static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++- int offset_val = sao_offset_val[0]; ++ for (y = 0; y < height; y++) { ++- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++- int offset_val = sao_offset_val[0]; ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++- int offset_val = sao_offset_val[0]; ++ for (x = init_x; x < width; x++) ++- dst[x] = av_clip_pixel(src[x] + offset_val); +++ dst[x] = src[x]; ++ } ++ if (borders[3]) { ++- int offset_val = sao_offset_val[0]; ++- int y_stride_dst = stride_dst * (height - 1); ++- int y_stride_src = stride_src * (height - 1); +++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++- dst[x + 
y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++@@ -417,7 +688,6 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++- int16_t *sao_offset_val = sao->offset_val[c_idx]; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++@@ -426,34 +696,30 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++- int offset_val = sao_offset_val[0]; ++ for (y = 0; y < height; y++) { ++- dst[y * stride_dst] = av_clip_pixel(src[y * stride_src] + offset_val); +++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++- int offset_val = sao_offset_val[0]; ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++- dst[x * stride_dst + offset] = av_clip_pixel(src[x * stride_src + offset] + offset_val); +++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++- int offset_val = sao_offset_val[0]; ++ for (x = init_x; x < width; x++) ++- dst[x] = av_clip_pixel(src[x] + offset_val); +++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++- int offset_val = sao_offset_val[0]; ++- int y_stride_dst = stride_dst * (height - 1); ++- int y_stride_src = stride_src * (height - 1); +++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); +++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++- dst[x + y_stride_dst] = av_clip_pixel(src[x + y_stride_src] + offset_val); +++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++@@ -493,6 +759,121 @@ static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ++ } ++ } +++#endif +++#if BIT_DEPTH == 32 +++#undef BIT_DEPTH +++#undef pixel +++#define BIT_DEPTH 10 +++#define pixel uint16_t +++#endif +++ +++// --- Plaited chroma versions +++ +++#if RPI_HEVC_SAND +++ +++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, +++ ptrdiff_t stride_dst, ptrdiff_t stride_src, +++ const int16_t *sao_offset_val_u, int sao_left_class_u, +++ const int16_t *sao_offset_val_v, int sao_left_class_v, +++ int width, int height) +++{ +++ pixel *dst = (pixel *)_dst; +++ pixel *src = (pixel *)_src; +++ int offset_table_u[32] = { 0 }; +++ int offset_table_v[32] = { 0 }; +++ int k, y, x; +++ int shift = BIT_DEPTH - 5; +++ +++ stride_dst /= sizeof(pixel); +++ stride_src /= sizeof(pixel); +++ width *= 2; +++ +++ for (k = 0; k < 4; k++) +++ { +++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; +++ offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1]; +++ } +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x += 2) +++ { +++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); +++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); +++ // *** & 31 shouldn't be wanted but just now we generate broken input that +++ // crashes us in 10-bit world +++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); +++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); +++ } +++ dst += stride_dst; +++ src += stride_src; +++ } +++} +++ +++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, +++ const int16_t 
*sao_offset_val_u, const int16_t *sao_offset_val_v, +++ int eo, int width, int height) { +++ +++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; +++ static const int8_t pos[4][2][2] = { +++ { { -1, 0 }, { 1, 0 } }, // horizontal +++ { { 0, -1 }, { 0, 1 } }, // vertical +++ { { -1, -1 }, { 1, 1 } }, // 45 degree +++ { { 1, -1 }, { -1, 1 } }, // 135 degree +++ }; +++ pixel *dst = (pixel *)_dst; +++ pixel *src = (pixel *)_src; +++ int a_stride, b_stride; +++ int x, y; +++ ptrdiff_t stride_src = (2*MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(pixel); +++ +++ stride_dst /= sizeof(pixel); +++ width *= 2; +++ +++ av_assert0(width <= 64); +++ +++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; +++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; +++ for (y = 0; y < height; y++) { +++ for (x = 0; x < width; x += 2) { +++ int diff0u = CMP(src[x], src[x + a_stride]); +++ int diff1u = CMP(src[x], src[x + b_stride]); +++ int offset_valu = edge_idx[2 + diff0u + diff1u]; +++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); +++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); +++ int offset_valv = edge_idx[2 + diff0v + diff1v]; +++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); +++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); +++ } +++ src += stride_src; +++ dst += stride_dst; +++ } +++} +++ +++// Do once +++#if BIT_DEPTH == 8 +++// Any old 2 byte 'normal' restore will work for these +++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 +++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 +++// We need 32 bit for 9 bit+ +++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 +++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 +++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 +++#endif +++ +++#endif // RPI_HEVC_SAND +++ ++ ++ #undef CMP ++ ++@@ -1694,3 +2075,217 @@ static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ #undef TQ1 ++ #undef TQ2 ++ #undef TQ3 +++ +++#if RPI_HEVC_SAND +++ +++// line zero +++#define P3 pix_l[0 * xstride] +++#define P2 pix_l[1 * xstride] +++#define P1 pix_l[2 * xstride] +++#define P0 pix_l[3 * xstride] +++#define Q0 pix_r[0 * xstride] +++#define Q1 pix_r[1 * xstride] +++#define Q2 pix_r[2 * xstride] +++#define Q3 pix_r[3 * xstride] +++ +++// line three. 
used only for deblocking decision +++#define TP3 pix_l[0 * xstride + 3 * ystride] +++#define TP2 pix_l[1 * xstride + 3 * ystride] +++#define TP1 pix_l[2 * xstride + 3 * ystride] +++#define TP0 pix_l[3 * xstride + 3 * ystride] +++#define TQ0 pix_r[0 * xstride + 3 * ystride] +++#define TQ1 pix_r[1 * xstride + 3 * ystride] +++#define TQ2 pix_r[2 * xstride + 3 * ystride] +++#define TQ3 pix_r[3 * xstride + 3 * ystride] +++ +++// This is identical to hevc_loop_filter_luma except that the P/Q +++// components are on separate pointers +++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, +++ unsigned int _stride, unsigned int beta, const int32_t _tc[2], +++ const uint8_t _no_p[2], const uint8_t _no_q[2], +++ uint8_t * _pix_l) +++{ +++ int d, j; +++ pixel *pix_l = (pixel *)_pix_l; +++ pixel *pix_r = (pixel *)_pix_r; +++ const ptrdiff_t xstride = 1; +++ const ptrdiff_t ystride = _stride / sizeof(pixel); +++ +++ beta <<= BIT_DEPTH - 8; +++ +++ for (j = 0; j < 2; j++) { +++ const int dp0 = abs(P2 - 2 * P1 + P0); +++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); +++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); +++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); +++ const int d0 = dp0 + dq0; +++ const int d3 = dp3 + dq3; +++ const int tc = _tc[j] << (BIT_DEPTH - 8); +++ const int no_p = _no_p[j]; +++ const int no_q = _no_q[j]; +++ +++ if (d0 + d3 >= beta) { +++ pix_l += 4 * ystride; +++ pix_r += 4 * ystride; +++ continue; +++ } else { +++ const int beta_3 = beta >> 3; +++ const int beta_2 = beta >> 2; +++ const int tc25 = ((tc * 5 + 1) >> 1); +++ +++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && +++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && +++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { +++ // strong filtering +++ const int tc2 = tc << 1; +++ for (d = 0; d < 4; d++) { +++ const int p3 = P3; +++ const int p2 = P2; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ const int q2 = Q2; +++ const int q3 = Q3; +++ if (!no_p) { +++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); +++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); +++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); +++ } +++ if (!no_q) { +++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); +++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); +++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); +++ } +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } else { // normal filtering +++ int nd_p = 1; +++ int nd_q = 1; +++ const int tc_2 = tc >> 1; +++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) +++ nd_p = 2; +++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) +++ nd_q = 2; +++ +++ for (d = 0; d < 4; d++) { +++ const int p2 = P2; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ const int q2 = Q2; +++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; +++ if (abs(delta0) < 10 * tc) { +++ delta0 = av_clip(delta0, -tc, tc); +++ if (!no_p) +++ P0 = av_clip_pixel(p0 + delta0); +++ if (!no_q) +++ Q0 = av_clip_pixel(q0 - delta0); +++ if (!no_p && nd_p > 1) { +++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); +++ P1 = av_clip_pixel(p1 + deltap1); +++ } +++ if (!no_q && nd_q > 1) { +++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); +++ Q1 = av_clip_pixel(q1 + deltaq1); +++ } +++ 
} +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } +++ } +++ } +++} +++ +++#undef TP3 +++#undef TP2 +++#undef TP1 +++#undef TP0 +++#undef TQ0 +++#undef TQ1 +++#undef TQ2 +++#undef TQ3 +++ +++#undef P3 +++#undef P2 +++#undef P1 +++#undef P0 +++#undef Q0 +++#undef Q1 +++#undef Q2 +++#undef Q3 +++ +++#define P1 pix_l[0 * xstride] +++#define P0 pix_l[1 * xstride] +++#define Q0 pix_r[0 * xstride] +++#define Q1 pix_r[1 * xstride] +++ +++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, +++ ptrdiff_t _ystride, const int32_t *_tc, +++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) +++{ +++ int d, j, no_p, no_q; +++ pixel *pix_l = (pixel *)_pix_l; +++ pixel *pix_r = (pixel *)_pix_r; +++ ptrdiff_t xstride = _xstride / sizeof(pixel); +++ ptrdiff_t ystride = _ystride / sizeof(pixel); +++ +++ for (j = 0; j < 2; j++) { +++ const int tc = _tc[j] << (BIT_DEPTH - 8); +++ if (tc <= 0) { +++ pix_l += 4 * ystride; +++ pix_r += 4 * ystride; +++ continue; +++ } +++ no_p = _no_p[j]; +++ no_q = _no_q[j]; +++ +++ for (d = 0; d < 4; d++) { +++ int delta0; +++ const int p1 = P1; +++ const int p0 = P0; +++ const int q0 = Q0; +++ const int q1 = Q1; +++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); +++ if (!no_p) +++ P0 = av_clip_pixel(p0 + delta0); +++ if (!no_q) +++ Q0 = av_clip_pixel(q0 - delta0); +++ pix_l += ystride; +++ pix_r += ystride; +++ } +++ } +++} +++ +++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, +++ unsigned int no_f) +++{ +++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; +++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; +++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; +++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); +++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); +++} +++ +++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, +++ uint8_t * src_l, +++ unsigned int no_f) +++{ +++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; +++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; +++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; +++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); +++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); +++} +++ +++#undef P1 +++#undef P0 +++#undef Q0 +++#undef Q1 +++ +++ +++#endif +++ ++diff --git a/libavcodec/hevcpred.c b/libavcodec/hevcpred.c ++index 02c1766059..cea16eade4 100644 ++--- a/libavcodec/hevcpred.c +++++ b/libavcodec/hevcpred.c ++@@ -24,6 +24,7 @@ ++ ++ #include "hevcpred.h" ++ +++#define PRED_C 0 ++ #define BIT_DEPTH 8 ++ #include "hevcpred_template.c" ++ #undef BIT_DEPTH ++@@ -39,13 +40,37 @@ ++ #define BIT_DEPTH 12 ++ #include "hevcpred_template.c" ++ #undef BIT_DEPTH +++#undef PRED_C +++ +++#ifdef RPI +++#define PRED_C 1 +++#define BIT_DEPTH 8 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 9 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 10 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++ +++#define BIT_DEPTH 12 +++#include "hevcpred_template.c" +++#undef BIT_DEPTH +++#undef PRED_C +++#endif ++ ++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) ++ { ++ #undef FUNC ++ #define FUNC(a, depth) a ## _ ## depth ++ ++-#define HEVC_PRED(depth) \ +++#undef FUNCC +++#define FUNCC(a, depth) a ## _ ## 
depth ## _c +++ +++#define HEVC_PRED_Y(depth) \ ++ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \ ++ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \ ++ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \ ++@@ -60,6 +85,30 @@ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth) ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); ++ +++#define HEVC_PRED_C(depth) \ +++ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \ +++ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \ +++ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \ +++ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \ +++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ +++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ +++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ +++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ +++ hpc->pred_dc_c = FUNCC(pred_dc, depth); \ +++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ +++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ +++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ +++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); +++ +++#ifdef RPI +++#define HEVC_PRED(depth) \ +++ HEVC_PRED_Y(depth); \ +++ HEVC_PRED_C(depth); +++#else +++#define HEVC_PRED(depth) \ +++ HEVC_PRED_Y(depth); +++#endif +++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++diff --git a/libavcodec/hevcpred.h b/libavcodec/hevcpred.h ++index eb17663683..00ba3f94c0 100644 ++--- a/libavcodec/hevcpred.h +++++ b/libavcodec/hevcpred.h ++@@ -38,6 +38,17 @@ typedef struct HEVCPredContext { ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int c_idx, int mode); +++#ifdef RPI +++ void (*intra_pred_c[4])(struct HEVCContext *s, int x0, int y0, int c_idx); +++ +++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, +++ const uint8_t *left, ptrdiff_t stride); +++ void (*pred_dc_c)(uint8_t *src, const uint8_t *top, const uint8_t *left, +++ ptrdiff_t stride, int log2_size, int c_idx); +++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, +++ const uint8_t *left, ptrdiff_t stride, +++ int c_idx, int mode); +++#endif ++ } HEVCPredContext; ++ ++ void ff_hevc_pred_init(HEVCPredContext *hpc, int bit_depth); ++diff --git a/libavcodec/hevcpred_template.c b/libavcodec/hevcpred_template.c ++index 6fe33546b1..2f9f5f2798 100644 ++--- a/libavcodec/hevcpred_template.c +++++ b/libavcodec/hevcpred_template.c ++@@ -20,13 +20,110 @@ ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ +++//#define DISABLE_INTRA +++ ++ #include "libavutil/pixdesc.h" ++ ++ #include "bit_depth_template.c" ++ #include "hevcpred.h" ++ +++#ifdef RPI +++#include "libavutil/rpi_sand_fns.h" +++#endif +++ +++#define DUMP_PRED 0 +++ ++ #define POS(x, y) src[(x) + stride * (y)] ++ +++// REPEAT_INCLUDE defined at EOF +++#if defined(RPI) && !defined(INCLUDED_ONCE) +++typedef uint8_t (* c8_dst_ptr_t)[2]; +++typedef const uint8_t (* c8_src_ptr_t)[2]; +++typedef uint16_t (* c16_dst_ptr_t)[2]; +++typedef const uint16_t (* c16_src_ptr_t)[2]; +++ +++// *** On ARM make these NEON registers +++typedef struct pixel4_16 { +++ uint16_t x[4]; +++} pixel4_16; +++typedef struct pixel4_32 { +++ uint32_t x[4]; +++} pixel4_32; +++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) +++{ +++ pixel4_16 t = {{x, x, x, x}}; +++ return t; +++} +++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) +++{ +++ pixel4_32 t = {{x, x, x, 
x}}; +++ return t; +++} +++#endif +++ +++#if PRED_C +++// For chroma we double pixel size so we copy pairs +++#undef pixel +++#undef pixel2 +++#undef pixel4 +++#undef dctcoef +++#undef INIT_CLIP +++#undef no_rnd_avg_pixel4 +++#undef rnd_avg_pixel4 +++#undef AV_RN2P +++#undef AV_RN4P +++#undef AV_RN4PA +++#undef AV_WN2P +++#undef AV_WN4P +++#undef AV_WN4PA +++#undef CLIP +++#undef FUNC +++#undef FUNCC +++#undef av_clip_pixel +++#undef PIXEL_SPLAT_X4 +++ +++#if BIT_DEPTH == 8 +++#define pixel uint16_t +++#define pixel4 pixel4_16 +++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 +++#define cpel uint8_t +++#define c_src_ptr_t c8_src_ptr_t +++#define c_dst_ptr_t c8_dst_ptr_t +++#else +++#define pixel uint32_t +++#define pixel4 pixel4_32 +++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 +++#define cpel uint16_t +++#define c_src_ptr_t c16_dst_ptr_t +++#define c_dst_ptr_t c16_dst_ptr_t +++#endif +++#define AV_RN4P(p) (*(pixel4*)(p)) +++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) +++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) +++#endif +++ +++ +++// Get PW prior to horrid PRED_C trickery +++#if BIT_DEPTH == 8 +++#define PW 1 +++#else +++#define PW 2 +++#endif +++ +++ +++#if DUMP_PRED && !defined(INCLUDE_ONCE) +++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) +++{ +++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { +++ for (unsigned int x = 0; x != size; x++) { +++ printf("%4d", data[x * 2]); +++ } +++ printf("\n"); +++ } +++ printf("\n"); +++} +++#endif +++ ++ static av_always_inline void FUNC(intra_pred)(HEVCContext *s, int x0, int y0, ++ int log2_size, int c_idx) ++ { ++@@ -69,8 +166,11 @@ do { \ ++ AV_WN4P(&ptr[i], a); \ ++ else \ ++ a = PIXEL_SPLAT_X4(ptr[i + 3]) ++- +++#ifdef RPI +++ HEVCLocalContextIntra *lc = (s->enable_rpi) ? &s->HEVClcIntra : (HEVCLocalContextIntra *)s->HEVClc ; +++#else ++ HEVCLocalContext *lc = s->HEVClc; +++#endif ++ int i; ++ int hshift = s->ps.sps->hshift[c_idx]; ++ int vshift = s->ps.sps->vshift[c_idx]; ++@@ -79,15 +179,23 @@ do { \ ++ int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size; ++ int size_in_luma_v = size << vshift; ++ int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size; ++- int x = x0 >> hshift; ++- int y = y0 >> vshift; +++ const int x = x0 >> hshift; +++ const int y = y0 >> vshift; ++ int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; ++ int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask; ++ ++ int cur_tb_addr = MIN_TB_ADDR_ZS(x_tb, y_tb); ++ ++- ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +++ const ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(pixel); +++#if defined(RPI) +++ pixel *const src = !av_rpi_is_sand_frame(s->frame) ? +++ (pixel*)s->frame->data[c_idx] + x + y * stride : +++ c_idx == 0 ? 
+++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : +++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); +++#else ++ pixel *src = (pixel*)s->frame->data[c_idx] + x + y * stride; +++#endif ++ ++ int min_pu_width = s->ps.sps->min_pu_width; ++ ++@@ -95,14 +203,20 @@ do { \ ++ lc->tu.intra_pred_mode; ++ pixel4 a; ++ pixel left_array[2 * MAX_TB_SIZE + 1]; +++#if !PRED_C ++ pixel filtered_left_array[2 * MAX_TB_SIZE + 1]; +++#endif ++ pixel top_array[2 * MAX_TB_SIZE + 1]; +++#if !PRED_C ++ pixel filtered_top_array[2 * MAX_TB_SIZE + 1]; +++#endif ++ ++ pixel *left = left_array + 1; ++ pixel *top = top_array + 1; +++#if !PRED_C ++ pixel *filtered_left = filtered_left_array + 1; ++ pixel *filtered_top = filtered_top_array + 1; +++#endif ++ int cand_bottom_left = lc->na.cand_bottom_left && cur_tb_addr > MIN_TB_ADDR_ZS( x_tb - 1, (y_tb + size_in_tbs_v) & s->ps.sps->tb_mask); ++ int cand_left = lc->na.cand_left; ++ int cand_up_left = lc->na.cand_up_left; ++@@ -114,6 +228,27 @@ do { \ ++ int top_right_size = (FFMIN(x0 + 2 * size_in_luma_h, s->ps.sps->width) - ++ (x0 + size_in_luma_h)) >> hshift; ++ +++ pixel * src_l = src - 1; +++ pixel * src_u = src - stride; +++ pixel * src_ur = src_u + size; +++ +++#ifdef DISABLE_INTRA +++ return; +++#endif +++ +++#if defined(RPI) +++ if (av_rpi_is_sand_frame(s->frame)) { +++ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs +++ const AVFrame * const frame = s->frame; +++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 +++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; +++ if ((x & mask) == 0) +++ src_l -= stripe_adj; +++ if (((x + size) & mask) == 0) +++ src_ur += stripe_adj; +++ } +++#endif +++ ++ if (s->ps.pps->constrained_intra_pred_flag == 1) { ++ int size_in_luma_pu_v = PU(size_in_luma_v); ++ int size_in_luma_pu_h = PU(size_in_luma_h); ++@@ -163,23 +298,24 @@ do { \ ++ top[-1] = 128; ++ } ++ if (cand_up_left) { ++- left[-1] = POS(-1, -1); +++ left[-1] = src_l[-stride]; ++ top[-1] = left[-1]; ++ } ++ if (cand_up) ++- memcpy(top, src - stride, size * sizeof(pixel)); +++ // Always good - even with sand +++ memcpy(top, src_u, size * sizeof(pixel)); ++ if (cand_up_right) { ++- memcpy(top + size, src - stride + size, size * sizeof(pixel)); ++- EXTEND(top + size + top_right_size, POS(size + top_right_size - 1, -1), +++ memcpy(top + size, src_ur, top_right_size * sizeof(pixel)); +++ EXTEND(top + size + top_right_size, top[size + top_right_size - 1], ++ size - top_right_size); ++ } ++ if (cand_left) ++ for (i = 0; i < size; i++) ++- left[i] = POS(-1, i); +++ left[i] = src_l[stride * i]; ++ if (cand_bottom_left) { ++ for (i = size; i < size + bottom_left_size; i++) ++- left[i] = POS(-1, i); ++- EXTEND(left + size + bottom_left_size, POS(-1, size + bottom_left_size - 1), +++ left[i] = src_l[stride * i]; +++ EXTEND(left + size + bottom_left_size, left[size + bottom_left_size - 1], ++ size - bottom_left_size); ++ } ++ ++@@ -268,7 +404,11 @@ do { \ ++ cand_up_left = 1; ++ cand_left = 1; ++ } else { // No samples available +++#if PRED_C +++ left[-1] = (1 << (BIT_DEPTH - 1)) | (1 << (BIT_DEPTH - 1 + PW * 8)); +++#else ++ left[-1] = (1 << (BIT_DEPTH - 1)); +++#endif ++ EXTEND(top, left[-1], 2 * size); ++ EXTEND(left, left[-1], 2 * size); ++ } ++@@ -287,6 +427,9 @@ do { \ ++ top[-1] = left[-1]; ++ ++ // Filtering process +++ // Sand can only apply to chroma_format_idc == 1 so we don't need to +++ // worry about chroma smoothing for that case +++#if !PRED_C ++ if 
(!s->ps.sps->intra_smoothing_disabled_flag && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) { ++ if (mode != INTRA_DC && size != 4){ ++ int intra_hor_ver_dist_thresh[] = { 7, 1, 0 }; ++@@ -342,6 +485,30 @@ do { \ ++ mode); ++ break; ++ } +++#else +++ switch (mode) { +++ case INTRA_PLANAR: +++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride); +++ break; +++ case INTRA_DC: +++ s->hpc.pred_dc_c((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride, log2_size, c_idx); +++ break; +++ default: +++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top, +++ (uint8_t *)left, stride, c_idx, +++ mode); +++ break; +++ } +++ +++#if DUMP_PRED +++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); +++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); +++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); +++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); +++#endif +++#endif ++ } ++ ++ #define INTRA_PRED(size) \ ++@@ -357,6 +524,7 @@ INTRA_PRED(5) ++ ++ #undef INTRA_PRED ++ +++#if !PRED_C ++ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++@@ -371,6 +539,29 @@ static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_to ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++ } +++#else +++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, +++ const uint8_t * _left, ptrdiff_t stride, +++ int trafo_size) +++{ +++ int x, y; +++ int size = 1 << trafo_size; +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ const c_src_ptr_t top = (c_src_ptr_t)_top; +++ const c_src_ptr_t left = (c_src_ptr_t)_left; +++ +++ for (y = 0; y < size; y++, src += stride) +++ { +++ for (x = 0; x < size; x++) +++ { +++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + +++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); +++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + +++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); +++ } +++ } +++} +++#endif ++ ++ #define PRED_PLANAR(size)\ ++ static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++@@ -386,6 +577,7 @@ PRED_PLANAR(3) ++ ++ #undef PRED_PLANAR ++ +++#if !PRED_C ++ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size, int c_idx) ++@@ -416,7 +608,53 @@ static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; ++ } ++ } +++#else +++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, +++ const uint8_t *_left, +++ ptrdiff_t stride, int log2_size, int c_idx) +++{ +++ unsigned int i, j; +++ const unsigned int size = (1 << log2_size); +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ const c_src_ptr_t top = (c_src_ptr_t)_top; +++ const c_src_ptr_t left = (c_src_ptr_t)_left; +++ unsigned int dc0 = size; +++ unsigned int dc1 = size; +++ +++ for (i = 0; i < size; i++) +++ { +++ dc0 += left[i][0] + top[i][0]; +++ dc1 += left[i][1] + top[i][1]; +++ } +++ +++ dc0 >>= log2_size + 1; +++ dc1 >>= log2_size + 1; +++ +++ for (i = 0; i < size; i++, src += stride) +++ { +++ for (j = 0; j < size; ++j) +++ { +++ src[j][0] = dc0; +++ src[j][1] = dc1; ++ +++ } +++ } +++} +++#endif +++ +++#ifndef ANGLE_CONSTS +++#define ANGLE_CONSTS +++static const 
int intra_pred_angle[] = { +++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, +++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 +++}; +++static const int inv_angle[] = { +++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, +++ -630, -910, -1638, -4096 + +}; +++#endif +++ +++#if !PRED_C ++ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++@@ -428,15 +666,6 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ ++- static const int intra_pred_angle[] = { ++- 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++- -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++- }; ++- static const int inv_angle[] = { ++- -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++- -630, -910, -1638, -4096 ++- }; ++- ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++@@ -509,6 +738,83 @@ static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ } ++ } ++ } +++#else +++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, +++ const uint8_t *_top, +++ const uint8_t *_left, +++ ptrdiff_t stride, int c_idx, +++ int mode, int size) +++{ +++ int x, y; +++ c_dst_ptr_t src = (c_dst_ptr_t)_src; +++ c_src_ptr_t top = (c_src_ptr_t)_top; +++ c_src_ptr_t left = (c_src_ptr_t)_left; +++ +++ const int angle = intra_pred_angle[mode - 2]; +++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; +++ c_dst_ptr_t ref_tmp = ref_array + size; +++ c_src_ptr_t ref; +++ const int last = (size * angle) >> 5; +++ +++ if (mode >= 18) { +++ ref = top - 1; +++ if (angle < 0 && last < -1) { +++ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW); +++ for (x = last; x <= -1; x++) +++ { +++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +++ } +++ ref = (c_src_ptr_t)ref_tmp; +++ } +++ +++ for (y = 0; y < size; y++, src += stride) { +++ const int idx = ((y + 1) * angle) >> 5; +++ const int fact = ((y + 1) * angle) & 31; +++ if (fact) { +++ for (x = 0; x < size; ++x) { +++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + +++ fact * ref[x + idx + 2][0] + 16) >> 5; +++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + +++ fact * ref[x + idx + 2][1] + 16) >> 5; +++ } +++ } else { +++ memcpy(src, ref + idx + 1, size * 2 * PW); +++ } +++ } +++ } else { +++ ref = left - 1; +++ if (angle < 0 && last < -1) { +++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); +++ for (x = last; x <= -1; x++) +++ { +++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; +++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; +++ } +++ ref = (c_src_ptr_t)ref_tmp; +++ } +++ +++ for (x = 0; x < size; x++, src++) { +++ const int idx = ((x + 1) * angle) >> 5; +++ const int fact = ((x + 1) * angle) & 31; +++ if (fact) { +++ for (y = 0; y < size; y++) { +++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + +++ fact * ref[y + idx + 2][0] + 16) >> 5; +++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + +++ fact * ref[y + idx + 2][1] + 16) >> 5; +++ } +++ } else { +++ for (y = 0; y < size; y++) +++ { +++ src[y * stride][0] = ref[y + idx + 1][0]; +++ src[y * stride][1] = ref[y + idx + 1][1]; +++ } +++ } +++ } +++ } +++} +++#endif ++ ++ static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t 
*top, ++ const uint8_t *left, ++@@ -538,6 +844,10 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ FUNC(pred_angular)(src, top, left, stride, c_idx, mode, 1 << 5); ++ } ++ +++#undef cpel +++#undef c_src_ptr_t +++#undef c_dst_ptr_t +++ ++ #undef EXTEND_LEFT_CIP ++ #undef EXTEND_RIGHT_CIP ++ #undef EXTEND_UP_CIP ++@@ -549,3 +859,9 @@ static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ #undef EXTEND ++ #undef MIN_TB_ADDR_ZS ++ #undef POS +++#undef PW +++ +++#ifndef INCLUDED_ONCE +++#define INCLUDED_ONCE +++#endif +++ ++diff --git a/libavcodec/raw.c b/libavcodec/raw.c ++index d36b68bfae..b526dc393d 100644 ++--- a/libavcodec/raw.c +++++ b/libavcodec/raw.c ++@@ -260,6 +260,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { ++ { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, ++ { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, ++ +++ /* RPI */ +++#ifdef RPI +++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, +++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, +++#endif +++ ++ /* special */ ++ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ ++ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */ ++diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c ++index d83705645c..4c746786ff 100644 ++--- a/libavcodec/rawenc.c +++++ b/libavcodec/rawenc.c ++@@ -31,6 +31,8 @@ ++ #include "libavutil/intreadwrite.h" ++ #include "libavutil/imgutils.h" ++ #include "libavutil/internal.h" +++#include "libavutil/avassert.h" +++#include "libavutil/rpi_sand_fns.h" ++ ++ static av_cold int raw_encode_init(AVCodecContext *avctx) ++ { ++@@ -47,6 +49,73 @@ FF_ENABLE_DEPRECATION_WARNINGS ++ return 0; ++ } ++ +++#ifdef RPI +++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +++ const AVFrame *frame) +++{ +++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); +++ int size; +++ int width = frame->width; +++ int height = frame->height; +++ int x0 = 0; +++ int y0 = 0; +++ uint8_t * dst; +++ int ret; +++ +++ if (sd != NULL) { +++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; +++ +++ x0 = si->left_offset; +++ y0 = si->top_offset; +++ } +++ +++ size = width * height * 3 / 2; +++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +++ return ret; +++ +++ dst = pkt->data; +++ +++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); +++ dst += width * height; +++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, +++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); +++ return 0; +++} +++ +++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, +++ const AVFrame *frame) +++{ +++ const AVFrameSideData *const sd = av_frame_get_side_data(frame, AV_FRAME_DATA_SAND_INFO); +++ int size; +++ int width = frame->width; +++ int height = frame->height; +++ int x0 = 0; +++ int y0 = 0; +++ uint8_t * dst; +++ int ret; +++ +++ if (sd != NULL) { +++ const AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data; +++ +++ x0 = si->left_offset; +++ y0 = si->top_offset; +++ } +++ +++ size = width * height * 3; +++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) +++ return ret; +++ +++ dst = pkt->data; +++ +++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); +++ dst += width * height * 2; +++ 
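+++ // Layout note: the 3*width*height packet is 2*width*height bytes of
+++ // 16-bit luma (written above), followed by two (width/2)*(height/2)
+++ // 16-bit chroma planes of width*height/2 bytes each, deinterleaved below.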
av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, +++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); +++ return 0; +++} +++#endif +++ +++ ++ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame, int *got_packet) ++ { ++@@ -56,6 +125,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, ++ if (ret < 0) ++ return ret; ++ +++#ifdef RPI +++ if (av_rpi_is_sand_frame(frame)) { +++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame); +++ *got_packet = (ret == 0); +++ return ret; +++ } +++#endif +++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0) ++ return ret; ++ if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size, + diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s + new file mode 100644 +-index 0000000..5543093 ++index 0000000000..391f761df9 + --- /dev/null + +++ b/libavcodec/rpi_hevc_transform.s +-@@ -0,0 +1,917 @@ ++@@ -0,0 +1,923 @@ + +# ****************************************************************************** + +# Argon Design Ltd. + +# (c) Copyright 2015 Argon Design Ltd. All rights reserved. +@@ -13502,7 +14994,7 @@ index 0000000..5543093 + +# ****************************************************************************** + + + +# HEVC VPU Transform +-+# +++# fe + +# Transform matrix can be thought of as + +# output row vector = input row vector * transMatrix2 + +# +@@ -13575,6 +15067,12 @@ index 0000000..5543093 + +# num32: number of 32x32 transforms + +# command 0 for transform, 1 for memclear16(int16_t *dst,num16) + +# +++ +++.equ TRANS_SHIFT, 20 - BIT_DEPTH +++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) +++.equ TRANS_ASL2, 16 - TRANS_SHIFT +++ +++ + +hevc_trans_16x16: + + cmp r5,1 + + beq memclear16 +@@ -13604,7 +15102,7 @@ index 0000000..5543093 + + mov r8,64*16 # Value used to swap from current to next VRF location + + vldh HX(0++,0)+r0,(r1 += r3) REP 16 + + mov r4,64 # Constant used for rounding first pass +-+ mov r5,1<<11 # Constant used for rounding second pass +++ mov r5,TRANS_RND2 # Constant used for rounding second pass + + + + # At start of block r0,r1 point to the current block (that has already been loaded) + +block_loop: +@@ -13625,7 +15123,7 @@ index 0000000..5543093 + + bl col_trans_16 + + vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on rounding, shift down by 7, and saturate + + #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. +-+ vasl HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) +++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? 
(Probably because it ends with ls which is interpreted as a condition flag) + + + + # Save results - note there has been a transposition during the processing so we save columns + + vsth VX(0,32++)+r0, (r1 += r3) REP 16 +@@ -13708,8 +15206,8 @@ index 0000000..5543093 + + bl trans32 + + + + # ROW TRANSFORM +-+ mov r4, 1<<11 # Constant used for rounding second pass +-+ mov r5, 4 # left shift used for rounding second pass +++ mov r4, TRANS_RND2 # Constant used for rounding second pass +++ mov r5, TRANS_ASL2 # left shift used for rounding second pass + + + + mov r1,r9 # Input temporary storage + + mov r8,r10 # Output Coefficient buffer +@@ -14410,9 +15908,6161 @@ index 0000000..5543093 + + bgt loop_cmds + + + + pop r6-r7, pc ++diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h ++new file mode 100644 ++index 0000000000..b0e9902d82 ++--- /dev/null +++++ b/libavcodec/rpi_hevc_transform10.h ++@@ -0,0 +1,3070 @@ +++static const unsigned char rpi_hevc_transform10 [] = { +++21, +++106, +++0, +++144, +++47, +++1, +++37, +++106, +++0, +++144, +++66, +++1, +++53, +++106, +++0, +++144, +++192, +++4, +++69, +++106, +++0, +++144, +++192, +++4, +++85, +++106, +++0, +++144, +++220, +++5, +++169, +++3, +++62, +++64, +++79, +++64, +++3, +++232, +++32, +++0, +++0, +++0, +++12, +++248, +++0, +++136, +++0, +++0, +++192, +++248, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++12, +++248, +++0, +++168, +++0, +++0, +++192, +++248, +++0, +++0, +++0, +++96, +++3, +++232, +++32, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++8, +++232, +++0, +++4, +++0, +++0, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++4, +++232, +++64, +++0, +++0, +++0, +++5, +++232, +++0, +++2, +++0, +++0, +++128, +++69, +++113, +++66, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++128, +++69, +++113, +++70, +++128, +++144, +++40, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++16, +++0, +++76, +++254, +++48, +++192, +++9, +++4, +++32, +++8, +++0, +++0, +++4, +++254, +++0, +++144, +++128, +++2, +++0, +++8, +++2, +++0, +++128, +++144, +++23, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++20, +++0, +++76, +++254, +++48, +++192, +++6, +++4, +++32, +++8, +++0, +++0, +++140, +++248, +++44, +++0, +++0, +++0, +++32, +++48, +++4, +++0, +++128, +++69, +++113, +++66, +++242, +++140, +++211, +++192, +++34, +++31, +++41, +++3, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++96, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++224, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++225, +++64, +++242, +++64, +++3, +++232, +++128, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++57, +++239, +++224, +++247, +++255, +++255, +++72, +++192, +++95, +++207, +++88, +++122, +++88, +++124, +++137, +++64, +++26, +++64, +++4, +++232, +++64, +++0, +++0, +++0, +++149, +++96, +++161, +++64, +++152, +++64, +++128, +++144, +++35, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++27, +++0, +++4, +++232, +++0, +++2, +++0, +++0, +++101, +++96, +++145, +++64, +++168, +++64, +++128, +++144, +++19, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, 
+++144, +++11, +++0, +++74, +++232, +++0, +++8, +++0, +++0, +++242, +++140, +++221, +++192, +++57, +++239, +++32, +++8, +++0, +++0, +++41, +++3, +++239, +++3, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++248, +++4, +++0, +++12, +++248, +++0, +++132, +++64, +++0, +++192, +++248, +++4, +++0, +++0, +++96, +++255, +++159, +++154, +++255, +++0, +++232, +++0, +++4, +++0, +++0, +++255, +++159, +++165, +++255, +++4, +++255, +++48, +++204, +++16, +++3, +++224, +++251, +++62, +++0, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++128, +++64, +++6, +++232, +++64, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++32, +++247, +++240, +++207, +++16, +++3, +++32, +++247, +++176, +++207, +++17, +++19, +++32, +++247, +++112, +++207, +++18, +++35, +++32, +++247, +++48, +++207, +++19, +++51, +++32, +++247, +++240, +++206, +++20, +++67, +++32, +++247, +++176, +++206, +++21, +++83, +++32, +++247, +++112, +++206, +++22, +++99, +++32, +++247, +++48, +++206, +++23, +++115, +++32, +++247, +++240, +++205, +++24, +++131, +++32, +++247, +++176, +++205, +++25, +++147, +++32, +++247, +++112, +++205, +++26, +++163, +++32, +++247, +++48, +++205, +++27, +++179, +++32, +++247, +++240, +++204, +++28, +++195, +++32, +++247, +++176, +++204, +++29, +++211, +++32, +++247, +++112, +++204, +++30, +++227, +++32, +++247, +++48, +++204, +++31, +++243, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++0, +++237, +++32, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++111, +++3, +++4, +++254, +++0, +++128, +++0, +++4, +++0, +++248, +++0, +++0, +++2, +++232, +++32, +++0, +++0, +++0, +++140, +++248, +++32, +++0, +++0, +++0, +++224, +++35, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++193, +++232, +++0, +++1, +++0, +++0, +++1, +++106, +++116, +++30, +++90, +++0, +++169, +++3, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++137, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++129, +++0, +++131, +++102, +++0, +++158, +++67, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++108, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++100, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++161, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++150, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, 
+++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++182, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++112, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++101, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++103, +++255, +++239, +++3, +++0, +++254, +++0, +++143, +++92, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++93, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++210, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++211, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++107, +++0, +++8, +++255, +++99, +++23, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++23, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++52, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++52, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++0, +++143, +++12, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++13, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++18, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++19, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++33, +++0, +++8, +++255, +++99, +++3, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++3, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++4, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++4, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++137, +++47, +++240, +++40, +++246, +++2, +++140, +++47, +++240, +++128, +++245, +++99, +++140, +++5, +++4, +++0, +++247, +++99, +++140, +++1, +++20, +++88, +++246, +++99, +++140, +++1, +++20, +++0, +++247, +++35, +++136, +++62, +++226, +++32, +++247, +++35, +++136, +++32, +++210, +++0, +++247, +++34, +++136, +++63, +++2, +++208, +++246, +++34, +++136, +++0, +++4, +++0, +++247, +++99, +++136, +++58, +++162, +++32, +++247, +++99, +++136, +++33, +++146, +++0, +++247, +++98, +++136, +++59, +++18, +++208, +++246, +++98, +++136, +++0, 
+++20, +++0, +++247, +++162, +++136, +++33, +++2, +++88, +++246, +++98, +++137, +++2, +++68, +++88, +++246, +++162, +++137, +++3, +++68, +++208, +++254, +++227, +++136, +++60, +++242, +++192, +++243, +++188, +++11, +++208, +++254, +++227, +++136, +++56, +++178, +++192, +++243, +++188, +++10, +++32, +++255, +++226, +++136, +++38, +++58, +++192, +++243, +++60, +++0, +++208, +++254, +++227, +++136, +++59, +++242, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++49, +++58, +++192, +++243, +++60, +++128, +++0, +++255, +++226, +++136, +++34, +++34, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++37, +++58, +++192, +++243, +++60, +++128, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++194, +++8, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++255, +++202, +++40, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++254, +++0, +++240, +++35, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++226, +++140, +++34, +++34, +++195, +++243, +++60, +++0, +++32, +++255, +++227, +++140, +++36, +++58, +++192, +++243, +++60, +++0, +++0, +++254, +++192, +++136, +++0, +++4, +++0, +++240, +++0, +++160, +++16, +++246, +++226, +++136, +++35, +++50, +++16, +++246, +++226, +++136, +++35, +++50, +++32, +++246, +++226, +++136, +++35, +++50, +++32, +++254, +++226, +++136, +++35, +++58, +++192, +++243, +++60, +++0, +++11, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++115, +++5, +++106, +++0, +++144, +++173, +++1, +++27, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++227, +++0, +++64, +++246, +++163, +++140, +++1, +++4, +++0, +++246, +++192, +++175, +++63, +++2, +++0, +++246, +++192, +++174, +++59, +++2, +++0, +++246, +++128, +++175, +++62, +++2, +++0, +++246, +++128, +++174, +++58, +++2, +++0, +++246, +++64, +++175, +++61, +++2, +++0, +++246, +++64, +++174, +++57, +++2, +++0, +++255, +++43, +++240, +++4, +++212, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++228, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++191, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++143, +++52, +++242, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++212, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++180, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++190, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++143, +++52, +++226, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++180, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++212, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++196, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++189, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++143, +++52, +++210, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, 
+++4, +++148, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++164, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++228, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++187, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++142, +++52, +++178, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++148, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++244, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++186, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++142, +++52, +++162, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++244, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++148, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++132, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++185, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++142, +++52, +++146, +++192, +++243, +++60, +++128, +++64, +++255, +++98, +++141, +++0, +++52, +++192, +++243, +++0, +++0, +++0, +++254, +++0, +++240, +++53, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++177, +++0, +++88, +++246, +++163, +++140, +++1, +++4, +++128, +++245, +++99, +++141, +++10, +++4, +++88, +++246, +++162, +++138, +++1, +++68, +++0, +++247, +++162, +++138, +++36, +++162, +++88, +++254, +++162, +++138, +++3, +++164, +++192, +++243, +++128, +++11, +++0, +++255, +++226, +++137, +++32, +++2, +++195, +++243, +++60, +++0, +++32, +++247, +++226, +++137, +++42, +++114, +++0, +++255, +++34, +++138, +++33, +++18, +++195, +++243, +++60, +++0, +++32, +++247, +++34, +++138, +++42, +++130, +++16, +++246, +++98, +++138, +++40, +++114, +++16, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++226, +++137, +++41, +++146, +++40, +++246, +++34, +++138, +++41, +++146, +++32, +++247, +++163, +++141, +++63, +++178, +++32, +++247, +++227, +++141, +++62, +++162, +++0, +++254, +++0, +++240, +++8, +++4, +++0, +++240, +++128, +++11, +++128, +++253, +++35, +++240, +++9, +++100, +++192, +++243, +++128, +++10, +++128, +++253, +++163, +++141, +++128, +++115, +++192, +++243, +++152, +++10, +++88, +++246, +++163, +++141, +++4, +++100, +++208, +++246, +++35, +++139, +++0, +++100, +++32, +++255, +++34, +++139, +++53, +++202, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++139, +++0, +++4, +++0, +++240, +++0, +++160, +++240, +++246, +++163, +++141, +++48, +++98, +++0, +++247, +++99, +++139, +++63, +++210, +++0, +++247, +++98, +++139, +++1, +++212, +++88, +++254, +++98, +++139, +++1, +++212, +++192, +++243, +++128, +++11, +++32, +++255, +++99, +++139, +++62, +++98, +++192, +++243, +++188, +++10, +++88, +++246, +++98, +++139, +++1, +++212, +++240, +++246, +++98, +++139, +++50, +++210, +++0, +++247, +++163, +++128, +++59, +++146, +++0, +++247, +++160, +++128, +++1, +++36, +++88, +++254, +++160, +++128, +++1, +++36, +++192, +++243, +++128, +++11, +++0, 
+++247, +++163, +++128, +++58, +++98, +++64, +++255, +++35, +++240, +++0, +++100, +++192, +++243, +++128, +++10, +++64, +++255, +++163, +++128, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++160, +++128, +++1, +++36, +++240, +++246, +++160, +++128, +++50, +++34, +++8, +++255, +++227, +++143, +++54, +++242, +++192, +++243, +++60, +++128, +++40, +++255, +++227, +++142, +++54, +++178, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++39, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++143, +++45, +++226, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++44, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++40, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++142, +++2, +++162, +++192, +++243, +++60, +++128, +++90, +++0, +++169, +++3, +++14, +++96, +++4, +++31, +++169, +++3, +++30, +++96, +++1, +++31, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++143, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++135, +++0, +++131, +++102, +++0, +++158, +++71, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++112, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++104, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++123, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++112, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++178, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++72, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++61, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++95, +++255, +++239, +++3, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, 
+++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++47, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++13, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++140, +++47, +++240, +++32, +++247, +++35, +++141, +++63, +++178, +++64, +++254, +++35, +++141, +++2, +++68, +++192, +++243, +++128, +++11, +++32, +++255, +++35, +++240, +++58, +++226, +++192, +++243, +++188, +++10, +++0, +++254, +++0, +++141, +++4, +++4, +++0, +++240, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++240, +++246, +++35, +++141, +++48, +++66, +++0, +++247, +++227, +++143, +++52, +++242, +++32, +++247, +++227, +++142, +++52, +++178, +++90, +++0, +++161, +++3, +++6, +++64, +++23, +++64, +++96, +++8, +++70, +++98, +++97, +++8, +++70, +++98, +++98, +++8, +++70, +++98, +++99, +++8, +++70, +++98, +++100, +++8, +++70, +++98, +++101, +++8, +++70, +++98, +++255, +++159, +++8, +++250, +++23, +++102, +++7, +++106, +++112, +++30, +++33, +++3, +++}; ++diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h ++new file mode 100644 ++index 0000000000..2901b6568d ++--- /dev/null +++++ b/libavcodec/rpi_hevc_transform8.h ++@@ -0,0 +1,3070 @@ +++static const unsigned char rpi_hevc_transform8 [] = { +++21, +++106, +++0, +++144, +++47, +++1, +++37, +++106, +++0, +++144, +++66, +++1, +++53, +++106, +++0, +++144, +++192, +++4, +++69, +++106, +++0, +++144, +++192, +++4, +++85, +++106, +++0, +++144, +++220, +++5, +++169, +++3, +++62, +++64, +++79, +++64, +++3, +++232, +++32, +++0, +++0, +++0, +++12, +++248, +++0, +++136, +++0, +++0, +++192, +++248, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++12, +++248, +++0, +++168, +++0, +++0, +++192, +++248, +++0, +++0, +++0, +++96, +++3, +++232, +++32, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++8, +++232, +++0, +++4, +++0, +++0, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++4, +++232, +++64, +++0, +++0, +++0, +++5, +++232, +++0, +++8, +++0, +++0, +++128, +++69, +++113, +++66, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++8, +++4, +++0, +++128, +++69, +++113, +++70, +++128, +++144, +++40, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++16, +++0, +++76, +++254, +++48, +++192, +++9, +++4, +++32, +++8, +++0, +++0, +++4, +++254, +++0, +++144, +++128, +++2, +++0, +++8, +++2, +++0, +++128, +++144, +++23, +++0, +++4, +++255, +++48, +++192, +++128, +++3, +++32, +++8, +++20, +++0, +++76, +++254, +++48, +++192, +++4, +++4, +++32, +++8, +++0, +++0, +++140, +++248, +++44, +++0, +++0, +++0, +++32, +++48, +++4, +++0, +++128, +++69, +++113, +++66, +++242, +++140, +++211, +++192, +++34, +++31, +++41, +++3, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++96, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, 
+++90, +++0, +++70, +++192, +++80, +++7, +++164, +++255, +++36, +++204, +++224, +++2, +++0, +++248, +++62, +++0, +++3, +++255, +++55, +++208, +++120, +++3, +++224, +++3, +++190, +++11, +++16, +++139, +++246, +++91, +++0, +++103, +++90, +++0, +++225, +++64, +++242, +++64, +++3, +++232, +++128, +++0, +++0, +++0, +++7, +++232, +++0, +++2, +++0, +++0, +++57, +++239, +++224, +++247, +++255, +++255, +++72, +++192, +++95, +++207, +++88, +++122, +++88, +++124, +++137, +++64, +++26, +++64, +++4, +++232, +++64, +++0, +++0, +++0, +++149, +++96, +++161, +++64, +++152, +++64, +++128, +++144, +++35, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++27, +++0, +++4, +++232, +++0, +++8, +++0, +++0, +++69, +++96, +++145, +++64, +++168, +++64, +++128, +++144, +++19, +++0, +++72, +++232, +++0, +++4, +++0, +++0, +++65, +++232, +++32, +++0, +++0, +++0, +++128, +++144, +++11, +++0, +++74, +++232, +++0, +++8, +++0, +++0, +++242, +++140, +++221, +++192, +++57, +++239, +++32, +++8, +++0, +++0, +++41, +++3, +++239, +++3, +++12, +++248, +++0, +++128, +++0, +++0, +++192, +++248, +++4, +++0, +++12, +++248, +++0, +++132, +++64, +++0, +++192, +++248, +++4, +++0, +++0, +++96, +++255, +++159, +++154, +++255, +++0, +++232, +++0, +++4, +++0, +++0, +++255, +++159, +++165, +++255, +++4, +++255, +++48, +++204, +++16, +++3, +++224, +++251, +++62, +++0, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++128, +++64, +++6, +++232, +++64, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++32, +++247, +++240, +++207, +++16, +++3, +++32, +++247, +++176, +++207, +++17, +++19, +++32, +++247, +++112, +++207, +++18, +++35, +++32, +++247, +++48, +++207, +++19, +++51, +++32, +++247, +++240, +++206, +++20, +++67, +++32, +++247, +++176, +++206, +++21, +++83, +++32, +++247, +++112, +++206, +++22, +++99, +++32, +++247, +++48, +++206, +++23, +++115, +++32, +++247, +++240, +++205, +++24, +++131, +++32, +++247, +++176, +++205, +++25, +++147, +++32, +++247, +++112, +++205, +++26, +++163, +++32, +++247, +++48, +++205, +++27, +++179, +++32, +++247, +++240, +++204, +++28, +++195, +++32, +++247, +++176, +++204, +++29, +++211, +++32, +++247, +++112, +++204, +++30, +++227, +++32, +++247, +++48, +++204, +++31, +++243, +++4, +++255, +++51, +++204, +++128, +++3, +++224, +++251, +++16, +++0, +++76, +++254, +++51, +++204, +++128, +++3, +++224, +++251, +++20, +++0, +++0, +++237, +++32, +++0, +++0, +++0, +++140, +++248, +++47, +++0, +++0, +++0, +++224, +++99, +++0, +++0, +++111, +++3, +++4, +++254, +++0, +++128, +++0, +++4, +++0, +++248, +++0, +++0, +++2, +++232, +++32, +++0, +++0, +++0, +++140, +++248, +++32, +++0, +++0, +++0, +++224, +++35, +++0, +++0, +++64, +++232, +++0, +++2, +++0, +++0, +++193, +++232, +++0, +++1, +++0, +++0, +++1, +++106, +++116, +++30, +++90, +++0, +++169, +++3, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++137, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++129, 
+++0, +++131, +++102, +++0, +++158, +++67, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++108, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++100, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++161, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++150, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, +++102, +++7, +++106, +++127, +++156, +++182, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++112, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++101, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++103, +++255, +++239, +++3, +++0, +++254, +++0, +++143, +++92, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++93, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++210, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++211, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++107, +++0, +++8, +++255, +++99, +++23, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++23, +++0, +++228, +++192, +++51, +++0, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++52, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++52, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++0, +++143, +++12, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++143, +++13, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++64, +++142, +++18, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++0, +++142, +++19, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++33, +++0, +++8, +++255, +++99, +++3, +++0, +++212, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++3, +++0, +++228, +++192, 
+++51, +++0, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++8, +++255, +++99, +++4, +++0, +++164, +++192, +++51, +++0, +++0, +++8, +++255, +++163, +++4, +++0, +++148, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++137, +++47, +++240, +++40, +++246, +++2, +++140, +++47, +++240, +++128, +++245, +++99, +++140, +++5, +++4, +++0, +++247, +++99, +++140, +++1, +++20, +++88, +++246, +++99, +++140, +++1, +++20, +++0, +++247, +++35, +++136, +++62, +++226, +++32, +++247, +++35, +++136, +++32, +++210, +++0, +++247, +++34, +++136, +++63, +++2, +++208, +++246, +++34, +++136, +++0, +++4, +++0, +++247, +++99, +++136, +++58, +++162, +++32, +++247, +++99, +++136, +++33, +++146, +++0, +++247, +++98, +++136, +++59, +++18, +++208, +++246, +++98, +++136, +++0, +++20, +++0, +++247, +++162, +++136, +++33, +++2, +++88, +++246, +++98, +++137, +++2, +++68, +++88, +++246, +++162, +++137, +++3, +++68, +++208, +++254, +++227, +++136, +++60, +++242, +++192, +++243, +++188, +++11, +++208, +++254, +++227, +++136, +++56, +++178, +++192, +++243, +++188, +++10, +++32, +++255, +++226, +++136, +++38, +++58, +++192, +++243, +++60, +++0, +++208, +++254, +++227, +++136, +++59, +++242, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++49, +++58, +++192, +++243, +++60, +++128, +++0, +++255, +++226, +++136, +++34, +++34, +++192, +++243, +++60, +++128, +++32, +++255, +++226, +++136, +++37, +++58, +++192, +++243, +++60, +++128, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++194, +++8, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++255, +++202, +++40, +++0, +++52, +++195, +++243, +++0, +++128, +++0, +++254, +++0, +++240, +++35, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++192, +++136, +++1, +++4, +++0, +++240, +++0, +++160, +++0, +++255, +++226, +++140, +++34, +++34, +++195, +++243, +++60, +++0, +++32, +++255, +++227, +++140, +++36, +++58, +++192, +++243, +++60, +++0, +++0, +++254, +++192, +++136, +++0, +++4, +++0, +++240, +++0, +++160, +++16, +++246, +++226, +++136, +++35, +++50, +++16, +++246, +++226, +++136, +++35, +++50, +++32, +++246, +++226, +++136, +++35, +++50, +++32, +++254, +++226, +++136, +++35, +++58, +++192, +++243, +++60, +++0, +++11, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++115, +++5, +++106, +++0, +++144, +++173, +++1, +++27, +++96, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++227, +++0, +++64, +++246, +++163, +++140, +++1, +++4, +++0, +++246, +++192, +++175, +++63, +++2, +++0, +++246, +++192, +++174, +++59, +++2, +++0, +++246, +++128, +++175, +++62, +++2, +++0, +++246, +++128, +++174, +++58, +++2, +++0, +++246, +++64, +++175, +++61, +++2, +++0, +++246, +++64, +++174, +++57, +++2, +++0, +++255, +++43, +++240, +++4, +++212, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++228, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++191, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++143, +++52, +++242, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++212, +++192, +++243, 
+++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++180, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++190, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++143, +++52, +++226, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++180, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++191, +++226, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++212, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++196, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++189, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++143, +++52, +++210, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++148, +++192, +++243, +++128, +++11, +++64, +++254, +++43, +++240, +++1, +++164, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++180, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++240, +++1, +++244, +++192, +++243, +++128, +++10, +++64, +++254, +++43, +++141, +++0, +++228, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++187, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++235, +++142, +++52, +++178, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++2, +++148, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++64, +++254, +++43, +++141, +++0, +++244, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++2, +++68, +++32, +++247, +++35, +++141, +++186, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++171, +++142, +++52, +++162, +++192, +++243, +++60, +++128, +++0, +++255, +++43, +++240, +++4, +++244, +++192, +++243, +++128, +++11, +++0, +++255, +++43, +++240, +++187, +++162, +++192, +++243, +++188, +++10, +++128, +++253, +++43, +++240, +++3, +++148, +++192, +++243, +++128, +++10, +++64, +++254, +++35, +++141, +++1, +++132, +++192, +++243, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++32, +++247, +++35, +++141, +++185, +++66, +++240, +++246, +++35, +++141, +++50, +++66, +++0, +++255, +++107, +++142, +++52, +++146, +++192, +++243, +++60, +++128, +++64, +++255, +++98, +++141, +++0, +++52, +++192, +++243, +++0, +++0, +++0, +++254, +++0, +++240, +++53, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++1, +++4, +++0, +++240, +++64, +++147, +++5, +++106, +++0, +++144, +++177, +++0, +++88, +++246, +++163, +++140, +++1, +++4, +++128, +++245, +++99, +++141, +++10, +++4, +++88, +++246, +++162, +++138, +++1, +++68, +++0, +++247, +++162, +++138, +++36, +++162, +++88, +++254, +++162, +++138, +++3, +++164, +++192, +++243, +++128, +++11, +++0, +++255, +++226, +++137, +++32, +++2, +++195, +++243, +++60, +++0, +++32, +++247, +++226, +++137, +++42, +++114, +++0, +++255, +++34, +++138, +++33, +++18, +++195, +++243, +++60, +++0, +++32, +++247, +++34, +++138, +++42, +++130, +++16, +++246, +++98, +++138, +++40, +++114, +++16, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++98, +++138, +++41, +++146, +++32, +++246, +++226, +++137, +++41, +++146, +++40, +++246, +++34, +++138, +++41, +++146, +++32, +++247, +++163, +++141, +++63, +++178, +++32, +++247, +++227, +++141, +++62, +++162, +++0, +++254, +++0, +++240, +++8, +++4, +++0, +++240, +++128, +++11, +++128, +++253, 
+++35, +++240, +++9, +++100, +++192, +++243, +++128, +++10, +++128, +++253, +++163, +++141, +++128, +++115, +++192, +++243, +++152, +++10, +++88, +++246, +++163, +++141, +++4, +++100, +++208, +++246, +++35, +++139, +++0, +++100, +++32, +++255, +++34, +++139, +++53, +++202, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++139, +++0, +++4, +++0, +++240, +++0, +++160, +++240, +++246, +++163, +++141, +++48, +++98, +++0, +++247, +++99, +++139, +++63, +++210, +++0, +++247, +++98, +++139, +++1, +++212, +++88, +++254, +++98, +++139, +++1, +++212, +++192, +++243, +++128, +++11, +++32, +++255, +++99, +++139, +++62, +++98, +++192, +++243, +++188, +++10, +++88, +++246, +++98, +++139, +++1, +++212, +++240, +++246, +++98, +++139, +++50, +++210, +++0, +++247, +++163, +++128, +++59, +++146, +++0, +++247, +++160, +++128, +++1, +++36, +++88, +++254, +++160, +++128, +++1, +++36, +++192, +++243, +++128, +++11, +++0, +++247, +++163, +++128, +++58, +++98, +++64, +++255, +++35, +++240, +++0, +++100, +++192, +++243, +++128, +++10, +++64, +++255, +++163, +++128, +++0, +++164, +++192, +++243, +++128, +++10, +++88, +++246, +++160, +++128, +++1, +++36, +++240, +++246, +++160, +++128, +++50, +++34, +++8, +++255, +++227, +++143, +++54, +++242, +++192, +++243, +++60, +++128, +++40, +++255, +++227, +++142, +++54, +++178, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++39, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++143, +++45, +++226, +++192, +++243, +++60, +++128, +++0, +++254, +++0, +++240, +++44, +++10, +++0, +++240, +++60, +++0, +++0, +++254, +++0, +++240, +++40, +++10, +++0, +++240, +++60, +++128, +++8, +++255, +++163, +++142, +++2, +++162, +++192, +++243, +++60, +++128, +++90, +++0, +++169, +++3, +++14, +++96, +++4, +++31, +++169, +++3, +++30, +++96, +++1, +++31, +++73, +++64, +++52, +++64, +++45, +++64, +++2, +++64, +++10, +++64, +++64, +++198, +++1, +++7, +++8, +++232, +++63, +++0, +++0, +++0, +++6, +++232, +++253, +++255, +++255, +++255, +++0, +++246, +++0, +++0, +++0, +++4, +++215, +++64, +++3, +++96, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++143, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++135, +++0, +++131, +++102, +++0, +++158, +++71, +++0, +++2, +++248, +++0, +++35, +++0, +++0, +++64, +++56, +++0, +++0, +++4, +++248, +++0, +++36, +++0, +++0, +++64, +++56, +++8, +++0, +++0, +++240, +++64, +++0, +++132, +++3, +++30, +++106, +++132, +++24, +++128, +++240, +++0, +++0, +++132, +++3, +++128, +++144, +++112, +++0, +++131, +++98, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++104, +++0, +++131, +++102, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++123, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++112, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++3, +++99, +++131, +++71, +++68, +++232, +++32, +++0, +++0, +++0, +++0, +++99, +++2, +++99, +++23, 
+++102, +++7, +++106, +++127, +++156, +++178, +++255, +++0, +++248, +++64, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++30, +++106, +++134, +++24, +++128, +++248, +++0, +++0, +++112, +++0, +++192, +++243, +++211, +++31, +++128, +++144, +++72, +++0, +++188, +++64, +++67, +++232, +++0, +++2, +++0, +++0, +++0, +++255, +++64, +++0, +++0, +++20, +++200, +++243, +++0, +++0, +++128, +++144, +++61, +++0, +++195, +++232, +++0, +++2, +++0, +++0, +++12, +++128, +++7, +++192, +++130, +++248, +++0, +++0, +++112, +++192, +++224, +++16, +++195, +++31, +++132, +++248, +++1, +++0, +++112, +++0, +++224, +++16, +++203, +++31, +++25, +++102, +++9, +++106, +++2, +++30, +++41, +++3, +++26, +++87, +++162, +++64, +++64, +++198, +++1, +++23, +++127, +++158, +++95, +++255, +++239, +++3, +++0, +++254, +++128, +++143, +++94, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++95, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++208, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++209, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++47, +++0, +++8, +++255, +++227, +++23, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++52, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++239, +++3, +++0, +++254, +++128, +++143, +++14, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++143, +++15, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++192, +++142, +++16, +++0, +++0, +++240, +++12, +++0, +++0, +++254, +++128, +++142, +++17, +++0, +++0, +++240, +++12, +++0, +++128, +++144, +++13, +++0, +++8, +++255, +++227, +++3, +++0, +++244, +++192, +++51, +++0, +++0, +++8, +++255, +++35, +++4, +++0, +++180, +++192, +++51, +++0, +++0, +++111, +++3, +++32, +++246, +++192, +++11, +++1, +++16, +++32, +++246, +++2, +++140, +++47, +++240, +++32, +++247, +++35, +++141, +++63, +++178, +++64, +++254, +++35, +++141, +++2, +++68, +++192, +++243, +++128, +++11, +++32, +++255, +++35, +++240, +++58, +++226, +++192, +++243, +++188, +++10, +++0, +++254, +++0, +++141, +++4, +++4, +++0, +++240, +++128, +++10, +++88, +++246, +++35, +++141, +++3, +++68, +++240, +++246, +++35, +++141, +++48, +++66, +++0, +++247, +++227, +++143, +++52, +++242, +++32, +++247, +++227, +++142, +++52, +++178, +++90, +++0, +++161, +++3, +++6, +++64, +++23, +++64, +++96, +++8, +++70, +++98, +++97, +++8, +++70, +++98, +++98, +++8, +++70, +++98, +++99, +++8, +++70, +++98, +++100, +++8, +++70, +++98, +++101, +++8, +++70, +++98, +++255, +++159, +++8, +++250, +++23, +++102, +++7, +++106, +++112, +++30, +++33, +++3, +++}; + diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c + new file mode 100644 +-index 0000000..0255f5d ++index 0000000000..0255f5dd44 + --- /dev/null + +++ b/libavcodec/rpi_mailbox.c + @@ -0,0 +1,149 @@ +@@ -14567,7 +22217,7 @@ index 0000000..0255f5d + + + diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h + new file mode 100644 +-index 0000000..b316878 ++index 0000000000..b3168788d2 + --- /dev/null + +++ b/libavcodec/rpi_mailbox.h + @@ -0,0 +1,58 @@ +@@ -14617,24 +22267,76 @@ index 0000000..b316878 + + uint8_t _dummy[3]; /* pad struct to 64 bytes */ + +} VC_IMAGE_T; + + +-+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; +++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1]; +++ +++ +++extern int mbox_open(void); +++extern void mbox_close(int file_desc); +++ +++extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +++extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); +++ +++int 
mbox_get_image_params(int fd, VC_IMAGE_T * img); +++ +++#endif ++diff --git a/libavcodec/rpi_opts.h b/libavcodec/rpi_opts.h ++new file mode 100644 ++index 0000000000..e6127749ea ++--- /dev/null +++++ b/libavcodec/rpi_opts.h ++@@ -0,0 +1,46 @@ +++#ifndef AVCODEC_RPI_OPTS_H +++#define AVCODEC_RPI_OPTS_H +++ +++// define RPI to split the CABAC/prediction/transform into separate stages +++#ifndef RPI +++ +++ #define RPI_INTER 0 +++ #define RPI_TSTATS 0 +++ #define RPI_HEVC_SAND 0 +++ +++#else +++ #include "config.h" +++ +++ #define RPI_INTER 1 // 0 use ARM for UV inter-pred, 1 use QPU +++ +++ // By passing jobs to a worker thread we hope to be able to catch up during slow frames +++ // This has no effect unless RPI_WORKER is defined +++ // N.B. The extra thread count is effectively RPI_MAX_JOBS - 1 as +++ // RPI_MAX_JOBS defines the number of worker parameter sets and we must have one +++ // free for the foreground to fill in. +++ #define RPI_MAX_JOBS 2 +++ +++ // Define RPI_DEBLOCK_VPU to perform deblocking on the VPUs +++ // As it stands there is something mildly broken in VPU deblock - looks mostly OK +++ // but reliably fails some conformance tests (e.g. DBLK_A/B/C_) +++ // With VPU luma & chroma pred it is much the same speed to deblock on the ARM +++// #define RPI_DEBLOCK_VPU +++ +++ #define RPI_VPU_DEBLOCK_CACHED 1 + + +++ #if HAVE_NEON +++ #define RPI_HEVC_SAND 1 +++ #else +++ // Sand bust on Pi1 currently - reasons unknown +++ #define RPI_HEVC_SAND 0 +++ #endif + + +-+extern int mbox_open(void); +-+extern void mbox_close(int file_desc); + + +-+extern unsigned mbox_mem_lock(int file_desc, unsigned handle); +-+extern unsigned mbox_mem_unlock(int file_desc, unsigned handle); +++ #define RPI_QPU_EMU_Y 0 +++ #define RPI_QPU_EMU_C 0 + + +-+int mbox_get_image_params(int fd, VC_IMAGE_T * img); +++ #define RPI_TSTATS 0 +++#endif + + + +#endif +++ + diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c + new file mode 100644 +-index 0000000..7c0eedd ++index 0000000000..e872b855b7 + --- /dev/null + +++ b/libavcodec/rpi_qpu.c +-@@ -0,0 +1,902 @@ ++@@ -0,0 +1,935 @@ + +#ifdef RPI + +#include + +#include +@@ -14653,8 +22355,9 @@ index 0000000..7c0eedd + +#include "rpi_mailbox.h" + +#include "rpi_qpu.h" + +#include "rpi_shader.h" +-+#include "rpi_hevc_transform.h" +-+#include "rpi_zc.h" +++#include "rpi_hevc_transform8.h" +++#include "rpi_hevc_transform10.h" +++#include "libavutil/rpi_sand_fns.h" + + + +#pragma GCC diagnostic push + +// Many many redundant decls in the header files +@@ -14678,26 +22381,13 @@ index 0000000..7c0eedd + +#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling + +#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) + + +-+// On Pi2 there is no way to access the VPU L2 cache +-+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache) +-+// However, if using VCSM allocated buffers, need to use C at the moment because VCSM does not allocate uncached memory correctly +-+// The QPU crashes if we mix L2 cached and L2 uncached accesses due to a HW bug.
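The RPI_MAX_JOBS comment in the rpi_opts.h hunk above describes a small ring of job parameter sets: the foreground thread fills one set while a worker drains the others, so the usable queue depth is RPI_MAX_JOBS - 1. A minimal sketch of that invariant follows; it is illustrative only, not code from the patch, and every name in it (job_ring, can_queue, MAX_JOBS) is hypothetical.

#define MAX_JOBS 2                                   /* mirrors RPI_MAX_JOBS */

typedef struct job_params { int dummy; } job_params_t;  /* stand-in payload  */

typedef struct job_ring {
    job_params_t slot[MAX_JOBS];    /* one parameter set per potential job   */
    unsigned head;                  /* slot the foreground is filling        */
    unsigned tail;                  /* oldest slot still owned by the worker */
} job_ring_t;

/* The foreground may queue a job only while another free slot remains for it
 * to carry on filling, so at most MAX_JOBS - 1 jobs are ever in flight. */
static int can_queue(const job_ring_t *r)
{
    return (r->head + 1) % MAX_JOBS != r->tail;
}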
+-+#define GPU_MEM_FLG 0x4 +-+// GPU_MEM_MAP is meaningless on the Pi2 and should be left at 0 (On Pi1 it allows ARM to access VPU L2 cache) +-+#define GPU_MEM_MAP 0x0 +-+ + +#define vcos_verify_ge0(x) ((x)>=0) + + +-+/*static const unsigned code[] = +-+{ +-+ #include "rpi_shader.hex" +-+};*/ +-+ + +// Size in 32bit words +-+#define QPU_CODE_SIZE 2048 +++#define QPU_CODE_SIZE 4098 + +#define VPU_CODE_SIZE 2048 + + +-+const short rpi_transMatrix2even[32][16] = { // Even rows first +++static const short rpi_transMatrix2even[32][16] = { // Even rows first + +{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, + +{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, + +{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, +@@ -14737,7 +22427,8 @@ index 0000000..7c0eedd + +struct GPU + +{ + + unsigned int qpu_code[QPU_CODE_SIZE]; +-+ unsigned int vpu_code[VPU_CODE_SIZE]; +++ unsigned int vpu_code8[VPU_CODE_SIZE]; +++ unsigned int vpu_code10[VPU_CODE_SIZE]; + + short transMatrix2even[16*16*2]; + +}; + + +@@ -14749,8 +22440,9 @@ index 0000000..7c0eedd + +#define CFE_A_COUNT (CFE_ENT_COUNT / CFE_ENTS_PER_A) + + + +struct rpi_cache_flush_env_s { +-+ unsigned int n; +-+ struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; +++// unsigned int n; +++// struct vcsm_user_clean_invalid_s a[CFE_A_COUNT]; +++ struct vcsm_user_clean_invalid2_s v; + +}; + + + +#define WAIT_COUNT_MAX 16 +@@ -14774,7 +22466,6 @@ index 0000000..7c0eedd + +typedef struct vq_wait_s + +{ + + sem_t sem; +-+ unsigned int cost; + + struct vq_wait_s * next; + +} vq_wait_t; + + +@@ -14793,7 +22484,7 @@ index 0000000..7c0eedd + + int open_count; + + int init_count; + + int mb; +-+ unsigned int current_load; +++ int vpu_i_cache_flushed; + + GPU_MEM_PTR_T code_gm_ptr; + + vq_wait_pool_t wait_pool; + +#if RPI_TRACE_TIME_VPU_QPU_WAIT +@@ -14866,8 +22557,8 @@ index 0000000..7c0eedd + + + +// GPU_MEM_PTR_T alloc fns + +static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { +-+ p->numbytes = numbytes; +-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" ); +++ p->numbytes = (numbytes + 255) & ~255; // Round up +++ p->vcsm_handle = vcsm_malloc_cache(p->numbytes, VCSM_CACHE_TYPE_HOST | 0x80, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); + + //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" ); +@@ -14878,12 +22569,14 @@ index 0000000..7c0eedd + + av_assert0(p->arm); + + p->vc = mbox_mem_lock(mb, p->vc_handle); + + av_assert0(p->vc); +++// printf("***** %s, %d\n", __func__, numbytes); +++ + + return 0; + +} + + + +static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) { + + p->numbytes = numbytes; +-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" ); +++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE | 0x80, (char *)"Video Frame" ); + + av_assert0(p->vcsm_handle); + + p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle); + + av_assert0(p->vc_handle); +@@ -14891,6 +22584,7 @@ index 0000000..7c0eedd + + av_assert0(p->arm); + + p->vc = mbox_mem_lock(mb, p->vc_handle); + + av_assert0(p->vc); +++// printf("***** %s, %d\n", __func__, numbytes); + + return 0; + +} + + +@@ -14899,6 +22593,7 @@ index 
0000000..7c0eedd + + vcsm_unlock_ptr(p->arm); + + vcsm_free(p->vcsm_handle); + + memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again +++// printf("***** %s\n", __func__); + +} + + + + +@@ -14955,9 +22650,14 @@ index 0000000..7c0eedd + + } + + // And the VPU code + + { +-+ int num_bytes = sizeof(rpi_hevc_transform); +++ int num_bytes = sizeof(rpi_hevc_transform8); +++ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); +++ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes); +++ } +++ { +++ int num_bytes = sizeof(rpi_hevc_transform10); + + av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int)); +-+ memcpy((void*)ptr->vpu_code, rpi_hevc_transform, num_bytes); +++ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes); + + } + + // And the transform coefficients + + memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even)); +@@ -15048,10 +22748,18 @@ index 0000000..7c0eedd + + gpu_unlock_unref(ge); + +} + + +-+unsigned int vpu_get_fn(void) { +++unsigned int vpu_get_fn(const unsigned int bit_depth) { + + // Make sure that the gpu is initialized + + av_assert0(gpu != NULL); +-+ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code); +++ switch (bit_depth){ +++ case 8: +++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); +++ case 10: +++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); +++ default: +++ av_assert0(0); +++ } +++ return 0; + +} + + + +unsigned int vpu_get_constants(void) { +@@ -15081,95 +22789,75 @@ index 0000000..7c0eedd + +// + +// Cache flush functions + + +++#define CACHE_EL_MAX 16 + + + +rpi_cache_flush_env_t * rpi_cache_flush_init() + +{ +-+ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t)); +-+ if (rfe == NULL) +-+ return NULL; +++ rpi_cache_flush_env_t * const rfe = malloc(sizeof(rpi_cache_flush_env_t) + +++ sizeof(struct vcsm_user_clean_invalid2_block_s) * CACHE_EL_MAX); +++ if (rfe == NULL) +++ return NULL; + + +-+ rfe->n = 0; +-+ return rfe; +++ rfe->v.op_count = 0; +++ return rfe; + +} + + + +void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) + +{ +-+ if (rfe != NULL) +-+ free(rfe); +++ if (rfe != NULL) +++ free(rfe); + +} + + + +int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) + +{ +-+ int rc = 0; +-+ unsigned int na; +-+ unsigned int nr; +-+ +-+ // Clear any reamaining ents in the final block +-+ if ((nr = rfe->n % CFE_ENTS_PER_A) != 0) +-+ memset(rfe->a[rfe->n / CFE_ENTS_PER_A].s + nr, 0, (CFE_ENTS_PER_A - nr) * sizeof(rfe->a[0].s[0])); +++ int rc = 0; + + +-+ for (na = 0; na * CFE_ENTS_PER_A < rfe->n; ++na) +-+ { +-+ if (vcsm_clean_invalid(rfe->a + na) != 0) +-+ rc = -1; +-+ } +++ if (vcsm_clean_invalid2(&rfe->v) != 0) +++ rc = -1; + + +-+ free(rfe); +++ free(rfe); + + +-+ if (rc == 0) +-+ return 0; +++ if (rc == 0) +++ return 0; + + +-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); +-+ return rc; +++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", errno); +++ return rc; + +} + + +-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, +++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) + +{ +-+ // Deal with empty pointer trivially +-+ if (gm == NULL || gm->numbytes == 0) +-+ 
return; +-+ +-+ { +-+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); +-+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; + + +-+ av_assert0(rfe->n < CFE_ENT_COUNT); +++ av_assert0(rfe->v.op_count <= CACHE_EL_MAX); + + +-+ a->s[n].cmd = mode; +-+ a->s[n].handle = gm->vcsm_handle; +-+ a->s[n].addr = (unsigned int)gm->arm; +-+ a->s[n].size = gm->numbytes; +-+ ++rfe->n; +-+ } +++ b->invalidate_mode = mode; +++ b->block_count = blocks; +++ b->start_address = gm->arm + offset0; +++ b->block_size = block_size; +++ b->inter_block_stride = block_stride; + +} + + + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, + + const unsigned int offset, const unsigned int size) + +{ +-+ // Deal with empty pointer trivially +-+ if (gm == NULL || size == 0) +-+ return; +-+ +-+// printf("[%d] offset=%d, size=%d, numbytes=%d\n", rfe->n, offset, size, gm->numbytes); +-+ +-+ av_assert0(offset <= gm->numbytes); +-+ av_assert0(size <= gm->numbytes); +-+ av_assert0(offset + size <= gm->numbytes); +++ // Deal with empty pointer trivially +++ if (gm == NULL || size == 0) +++ return; + + +-+ { +-+ struct vcsm_user_clean_invalid_s * const a = rfe->a + (rfe->n / CFE_ENTS_PER_A); +-+ const unsigned int n = rfe->n % CFE_ENTS_PER_A; +++ av_assert0(offset <= gm->numbytes); +++ av_assert0(size <= gm->numbytes); +++ av_assert0(offset + size <= gm->numbytes); + + +-+ av_assert0(rfe->n < CFE_ENT_COUNT); +++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); +++} + + +-+ a->s[n].cmd = mode; +-+ a->s[n].handle = gm->vcsm_handle; +-+ a->s[n].addr = (unsigned int)gm->arm + offset; +-+ a->s[n].size = size; +-+ ++rfe->n; +-+ } +++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) +++{ +++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); + +} + + +++ + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) + +{ + +#if !RPI_ONE_BUF +@@ -15186,21 +22874,27 @@ index 0000000..7c0eedd + + } + +} + + +-+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, +-+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma) +++// Flush an area of a frame +++// Width, height, x0, y0 in luma pels +++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, +++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, +++ const unsigned int uv_shift, const int do_luma, const int do_chroma) + +{ +-+ const unsigned int y_offset = frame->linesize[0] * start_line; +-+ const unsigned int y_size = frame->linesize[0] * n; +++ const unsigned int y_offset = frame->linesize[0] * y0; +++ const unsigned int y_size = frame->linesize[0] * height; + + // Round UV up/down to get everything + + const unsigned int uv_rnd = (1U << uv_shift) >> 1; +-+ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift); +-+ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset; +++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); +++ const unsigned int uv_size = frame->linesize[1] * ((y0 + 
height + uv_rnd) >> uv_shift) - uv_offset; + + +++#if 0 +++ // *** frame->height is cropped height so not good + + // As all unsigned they will also reject -ve + + // Test individually as well as added to reject overflow +-+ av_assert0(start_line <= (unsigned int)frame->height); +++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped + + av_assert0(n <= (unsigned int)frame->height); + + av_assert0(start_line + n <= (unsigned int)frame->height); +++#endif + + + + if (!gpu_is_buf1(frame)) + + { +@@ -15212,7 +22906,7 @@ index 0000000..7c0eedd + + rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); + + } + + } +-+ else if (!rpi_sliced_frame(frame)) +++ else if (!av_rpi_is_sand_frame(frame)) + + { + + const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); + + if (do_luma) { +@@ -15225,16 +22919,30 @@ index 0000000..7c0eedd + + } + + else + + { +-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); +-+// printf("%s: start_line=%d, lines=%d, %c%c\n", __func__, start_line, n, do_luma ? 'l' : ' ', do_chroma ? 'c' : ' '); +-+ for (int x = 0; x < frame->width; x += frame->linesize[0]) { +-+ if (do_luma) { +-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, rpi_sliced_frame_off_y(frame, x, start_line), y_size); +-+ } +-+ if (do_chroma) { +-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, +-+ (frame->data[1] - gm->arm) + rpi_sliced_frame_off_c(frame, x >> 1, start_line >> 1), uv_size); +-+ } +++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); +++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); +++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); +++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); +++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C +++ av_assert0(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); +++ +++ if (do_chroma) +++ { +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; +++ b->invalidate_mode = mode; +++ b->block_count = block_count; +++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); +++ b->block_size = uv_size; +++ b->inter_block_stride = stride1 * stride2; +++ } +++ if (do_luma) +++ { +++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; +++ b->invalidate_mode = mode; +++ b->block_count = block_count; +++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); +++ b->block_size = y_size; +++ b->inter_block_stride = stride1 * stride2; + + } + + } + +} +@@ -15275,13 +22983,11 @@ index 0000000..7c0eedd + + + + + +// If sem_init actually takes time then maybe we want a pool... 
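In the sand-format branch of rpi_cache_flush_add_frame_block above, a horizontal band is flushed as one block per column with inter_block_stride = stride1 * stride2, which implies the layout this sketch assumes: the plane is stored as vertical strips ("sand columns") stride1 bytes wide and stride2 rows tall. The helper below is hypothetical, written only to illustrate that addressing; the patch itself uses av_rpi_sand_frame_pos_y/_c for this.

#include <stdint.h>

/* Byte address of (x, y) within a sand-layout plane, with x in bytes
 * (convert a pel x with the << xshl used above before calling). */
static inline uint8_t *sand_pos(uint8_t *base, unsigned x, unsigned y,
                                unsigned stride1, unsigned stride2)
{
    unsigned col = x / stride1;          /* which vertical strip  */
    return base
         + col * stride1 * stride2       /* start of that strip   */
         + y * stride1                   /* row within the strip  */
         + x % stride1;                  /* byte within the row   */
}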
+-+static vq_wait_t * vq_wait_new(const unsigned int cost)
+++static vq_wait_t * vq_wait_new(void)
+ +{
+ + gpu_env_t * const ge = gpu_lock_ref();
+ + vq_wait_t * const wait = ge->wait_pool.head;
+ + ge->wait_pool.head = wait->next;
+-+ ge->current_load += cost;
+-+ wait->cost = cost;
+ + wait->next = NULL;
+ +
+ +#if RPI_TRACE_TIME_VPU_QPU_WAIT
@@ -15337,17 +23043,13 @@ index 0000000..7c0eedd
+ +
+ +static void vq_wait_post(vq_wait_t * const wait)
+ +{
+-+#if !RPI_TRACE_TIME_VPU_QPU_WAIT
+-+ if (wait->cost != 0)
+-+#endif
+++#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ + {
+ + gpu_env_t *const ge = gpu_lock();
+-+ ge->current_load -= wait->cost;
+-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
+ + tto_end(&ge->ttw.active, ns_time());
+-+#endif
+ + gpu_unlock();
+ + }
+++#endif
+ +
+ + sem_post(&wait->sem);
+ +}
@@ -15363,7 +23065,6 @@ index 0000000..7c0eedd
+ +{
+ + unsigned int n;
+ + unsigned int mask;
+-+ unsigned int cost;
+ + struct gpu_job_s j[VPU_QPU_JOB_MAX];
+ +};
+ +
@@ -15396,23 +23097,26 @@ index 0000000..7c0eedd
+ + vqj->mask |= VPU_QPU_MASK_VPU;
+ +
+ + j->command = EXECUTE_VPU;
+-+ j->u.v.q[0] = vpu_code;
+++ // The bottom two bits of the execute address contain no-flush flags
+++ // b0 will flush the VPU I-cache if unset so we nearly always want that set
+++ // as we never reload code
+++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
+ + j->u.v.q[1] = r0;
+ + j->u.v.q[2] = r1;
+ + j->u.v.q[3] = r2;
+ + j->u.v.q[4] = r3;
+ + j->u.v.q[5] = r4;
+ + j->u.v.q[6] = r5;
+++ gpu->vpu_i_cache_flushed = 1;
+ + }
+ +}
+ +
+ +// flags are QPU_FLAGS_xxx
+-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
+++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
+ +{
+ + if (n != 0) {
+ + struct gpu_job_s *const j = new_job(vqj);
+ + vqj->mask |= VPU_QPU_MASK_QPU;
+-+ vqj->cost += cost;
+ +
+ + j->command = EXECUTE_QPU;
+ + j->u.q.jobs = n;
@@ -15442,7 +23146,7 @@ index 0000000..7c0eedd
+ + }
+ +
+ + // We are going to want a sync object
+-+ wait = vq_wait_new(vqj->cost);
+++ wait = vq_wait_new();
+ +
+ + // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
+ + // If we only posted one thing or only QPU jobs
@@ -15464,7 +23168,6 @@ index 0000000..7c0eedd
+ + j->callback.cookie = wait;
+ + }
+ +
+-+ vqj->cost = 0;
+ + vqj->mask = 0;
+ + *wait_h = wait;
+ +}
@@ -15483,11 +23186,6 @@ index 0000000..7c0eedd
+ + return rv;
+ +}
+ +
+-+unsigned int vpu_qpu_current_load(void)
+-+{
+-+ return gpu_ptr()->current_load;
+-+}
+-+
+ +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
+ +{
+ + if (wait_h != NULL)
@@ -15536,13 +23234,49 @@ index 0000000..7c0eedd
+ + return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+ +}
+ +
+++
+++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
+++{
+++ // Dummy values we can catch with emulation
+++ qf->y_pxx = ~1U;
+++ qf->y_bxx = ~2U;
+++ qf->y_p00 = ~3U;
+++ qf->y_b00 = ~4U;
+++ qf->c_pxx = ~5U;
+++ qf->c_bxx = ~6U;
+++
+++ switch (bit_depth) {
+++ case 8:
+++ qf->y_pxx = qpu_fn(mc_filter_y_pxx);
+++ qf->y_bxx = qpu_fn(mc_filter_y_bxx);
+++ qf->y_p00 = qpu_fn(mc_filter_y_p00);
+++ qf->y_b00 = qpu_fn(mc_filter_y_b00);
+++ qf->c_pxx = qpu_fn(mc_filter_c_p);
+++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
+++ qf->c_bxx = qpu_fn(mc_filter_c_b);
+++ break;
+++ case 10:
+++ qf->c_pxx = qpu_fn(mc_filter_c10_p);
+++ 
qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); +++ qf->c_bxx = qpu_fn(mc_filter_c10_b); +++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); +++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); +++ qf->y_p00 = qpu_fn(mc_filter_y10_p00); +++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); +++ break; +++ default: +++ return -1; +++ } +++ return 0; +++} +++ + +#endif // RPI + diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h + new file mode 100644 +-index 0000000..a95f7d9 ++index 0000000000..485a08f8ba + --- /dev/null + +++ b/libavcodec/rpi_qpu.h +-@@ -0,0 +1,200 @@ ++@@ -0,0 +1,206 @@ + +#ifndef RPI_QPU_H + +#define RPI_QPU_H + + +@@ -15687,21 +23422,35 @@ index 0000000..a95f7d9 + +void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); + +void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, + + const unsigned int offset, const unsigned int size); +++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, +++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); + +void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode); +-+void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, +-+ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma); +++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode, +++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, +++ const unsigned int uv_shift, const int do_luma, const int do_chroma); + + + +// init, add, finish for one gm ptr + +void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); + + + + + +// QPU specific functions +++ +++typedef struct HEVCRpiQpu { +++ uint32_t c_pxx; +++ uint32_t c_pxx_l1; +++ uint32_t c_bxx; +++ uint32_t y_pxx; +++ uint32_t y_bxx; +++ uint32_t y_p00; +++ uint32_t y_b00; +++} HEVCRpiQpu; +++ +++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); +++ + +uint32_t qpu_fn(const int * const mc_fn); + + +-+#define QPU_N_GRP_UV 4 +-+#define QPU_N_UV 8 +-+#define QPU_N_GRP_Y 4 // 4 QPUs per TMU +-+#define QPU_N_Y 12 +++#define QPU_N_GRP 4 +++#define QPU_N_MAX 12 + + + +#define QPU_MAIL_EL_VALS 2 + + +@@ -15717,27 +23466,19 @@ index 0000000..a95f7d9 + +void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); + +void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, + + const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); +-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail); +++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); + +void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); + +int vpu_qpu_job_start(const vpu_qpu_job_h vqj); + +int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); + + +-+ +-+extern unsigned int vpu_get_fn(void); +++extern unsigned int vpu_get_fn(const unsigned int bit_depth); + +extern unsigned int 
vpu_get_constants(void); + + + +// Waits for previous post_codee to complete and Will null out *wait_h after use + +void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); +-+unsigned int vpu_qpu_current_load(void); + +int vpu_qpu_init(void); + +void vpu_qpu_term(void); + + +-+// Simple test of shader code +-+extern int rpi_test_shader(void); +-+ +-+extern void rpi_do_block(const unsigned char *in_buffer_vc, int src_pitch, unsigned char *dst_vc, int dst_pitch, unsigned char *dst); +-+extern void rpi_do_block_arm(const unsigned char *in_buffer, int src_pitch, unsigned char *dst, int dst_pitch); +-+ + +extern int gpu_get_mailbox(void); + +void gpu_ref(void); + +void gpu_unref(void); +@@ -15745,10 +23486,10 @@ index 0000000..a95f7d9 + +#endif + diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c + new file mode 100644 +-index 0000000..0898ecd ++index 0000000000..2c6541a8fb + --- /dev/null + +++ b/libavcodec/rpi_shader.c +-@@ -0,0 +1,670 @@ ++@@ -0,0 +1,1570 @@ + +#include "rpi_shader.h" + + + +#ifdef _MSC_VER +@@ -15772,648 +23513,1548 @@ index 0000000..0898ecd + +__attribute__((aligned(8))) + +#endif + +unsigned int rpi_shader[] = { +-+// ::mc_setup_c +-+/* [0x00000000] */ 0x95801ff6, 0xd0020927, // mov tmurs, 1 ; mov -, unif +-+/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif +-+/* [0x00000010] */ 0x15827d80, 0x10020627, // mov ra_base, unif +-+/* [0x00000018] */ 0x0d801dc0, 0xd0021667, // sub rb_max_x, unif, 1 +-+/* [0x00000020] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +-+/* [0x00000028] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 +-+/* [0x00000030] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 +-+/* [0x00000038] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 +-+/* [0x00000040] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 +-+/* [0x00000048] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +-+/* [0x00000050] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +-+/* [0x00000058] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +-+/* [0x00000060] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +-+/* [0x00000068] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +-+/* [0x00000070] */ 0x95800dbf, 0xd002550c, // mov rb_xpitch, unif ; mov ra12, 0 +-+/* [0x00000078] */ 0x95800dbf, 0xd002540d, // mov rb_pitch, unif ; mov ra13, 0 +-+/* [0x00000080] */ 0x95980dbf, 0xd002580e, // mov r0, elem_num ; mov ra14, 0 +-+/* [0x00000088] */ 0x8c5d03f6, 0x1002560f, // add rb24, r1, rb_pitch ; mov ra15, ra_k0 +-+/* [0x00000090] */ 0x0c027180, 0x14020827, // add r0, r0, ra0.16b +-+/* [0x00000098] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +-+/* [0x000000a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x000000a8] */ 0x149c11c0, 0xd0020867, // and r1, r0, 1 +-+/* [0x000000b0] */ 0x119c43c0, 0xd01204e7, // shl ra_xshift_next, r1, 4 +-+/* [0x000000b8] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 +-+/* [0x000000c0] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 +-+/* [0x000000c8] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch +++// ::mc_setup_c_q0 +++// ::mc_start +++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_c_qn +++/* [0x00000008] */ 0x00000001, 0xe0020927, // mov tmurs, 1 +++/* [0x00000010] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x00000018] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x00000020] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_base, unif +++/* 
[0x00000030] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 +++/* [0x00000038] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift +++/* [0x00000040] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +++/* [0x00000048] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00000050] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00000058] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00000060] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00000078] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch +++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num +++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num +++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x +++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +++/* [0x000000b0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch + +/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 + +/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000000e0] */ 0x8c467076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_y +++/* [0x000000e0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 + +/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 +-+/* [0x000000f0] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x000000f8] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000100] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000108] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 +-+/* [0x00000110] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x00000118] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000120] */ 0x4c510387, 0x10224460, // add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000128] */ 0x0c627c00, 0x10020e27, // add t0s, ra_base, r0 +-+/* [0x00000130] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif +-+/* [0x00000138] */ 0x15827d80, 0x100009e7, // mov -, unif +-+/* [0x00000140] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +-+/* [0x00000148] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +-+/* [0x00000150] */ 0x119c53c0, 0xd0020867, // shl r1, r1, 5 +-+/* [0x00000158] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +-+/* [0x00000160] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +-+/* [0x00000168] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +-+/* [0x00000170] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +-+/* [0x00000178] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +-+/* [0x00000180] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +-+/* [0x00000188] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +-+/* [0x00000190] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +-+/* [0x00000198] */ 0x15827d80, 0x10020027, // mov ra0, unif +-+/* [0x000001a0] */ 0x15827d80, 0x10020667, // mov ra_base2, unif +-+/* [0x000001a8] */ 0x15027d80, 0x12120567, // mov ra_y2, ra0.16a +-+/* 
[0x000001b0] */ 0x15027d80, 0x14020827, // mov r0, ra0.16b +-+/* [0x000001b8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num +-+/* [0x000001c0] */ 0x938001f6, 0xd0020827, // max r0, r0, 0 ; mov -, unif +-+/* [0x000001c8] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif +-+/* [0x000001d0] */ 0x948011f6, 0xd0020867, // and r1, r0, 1 ; mov -, unif +-+/* [0x000001d8] */ 0x119c43c0, 0xd0021067, // shl rb_xshift2_next, r1, 4 +-+/* [0x000001e0] */ 0x149de1c0, 0xd0020827, // and r0, r0, -2 +-+/* [0x000001e8] */ 0xec9e7009, 0x10024821, // add r0, r0, r0 ; v8subs r1, r1, r1 +-+/* [0x000001f0] */ 0x0d9d03c0, 0x10020867, // sub r1, r1, rb_pitch +-+/* [0x000001f8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +-+/* [0x00000200] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000208] */ 0x8c567076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_y2 +-+/* [0x00000210] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 +-+/* [0x00000218] */ 0x139c03c0, 0xd0020827, // max r0, r1, 0 +-+/* [0x00000220] */ 0x129de1c0, 0x10020827, // min r0, r0, rb_max_y +-+/* [0x00000228] */ 0x4c510387, 0x10024860, // add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000230] */ 0x8c660c3f, 0x10020f27, // add t1s, ra_base2, r0 ; mov -, unif +-+/* [0x00000238] */ 0x938003f6, 0xd0020827, // max r0, r1, 0 ; mov -, unif +-+/* [0x00000240] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000248] */ 0x9281e1f6, 0x10020827, // min r0, r0, rb_max_y ; mov -, unif +-+/* [0x00000250] */ 0x4c510387, 0x10124560, // add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+/* [0x00000258] */ 0x0c667c00, 0x10020f27, // add t1s, ra_base2, r0 +-+// ::mc_filter_uv +-+/* [0x00000260] */ 0x9581cdbf, 0x100247b1, // mov ra_link, unif ; mov vw_setup, rb28 +-+/* [0x00000268] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x00000270] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000278] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x00000280] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x00000288] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next +-+/* [0x00000290] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif +-+/* [0x00000298] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 +-+/* [0x000002a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x000002a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+/* [0x000002b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 +-+/* [0x000002b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000002c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a +-+/* [0x000002c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +-+/* [0x000002d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif +-+/* [0x000002d8] */ 0x8c8013f6, 0xd0025441, // add rb17, r1, 1 ; mov ra1, unif +-+/* [0x000002e0] */ 0x8c8033f6, 0xd002d481, // add rb18, r1, 3 ; mov.ifnz ra1, unif +-+/* [0x000002e8] */ 0x8c0e70b6, 0x18024808, // add r0, r0, r2 ; mov rb8, ra3.8a +-+/* [0x000002f0] */ 0x910cf1f6, 0xda024809, // shl r0, r0, 15 ; mov rb9, ra3.8b +-+/* [0x000002f8] */ 0x8c05b1f6, 0x140256a1, // add rb26, r0, rb27 ; mov r1, ra1.16b +-+/* [0x00000300] */ 0x910cd3f6, 0x1c02484a, // shl r1, r1, rb13 ; mov rb10, ra3.8c +-+/* [0x00000308] */ 0x950c0ff6, 
0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d +-+/* [0x00000310] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +-+/* [0x00000318] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1 +-+// :uvloop +-+/* [0x00000320] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 +-+/* [0x00000328] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+/* [0x00000330] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+/* [0x00000338] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+/* [0x00000340] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000348] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000350] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000358] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000360] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x00000368] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000370] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000378] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000380] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000388] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000390] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000398] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000003a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000003a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000003b0] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop +-+/* [0x000003b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000003c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15 +-+/* [0x000003c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x000003d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10 +-+/* [0x000003d8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11 +-+/* [0x000003e0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +-+/* [0x000003e8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x000003f0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x000003f8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 +-+/* [0x00000400] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x00000408] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 +-+/* [0x00000410] */ 0x0f9cd3c0, 0x10c20067, // asr ra1.8as, r1, rb13 +-+/* [0x00000418] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 +-+/* [0x00000420] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:uvloop +-+/* [0x00000428] */ 0x0f9cd3c0, 0x10d20067, // asr ra1.8bs, r1, rb13 +-+/* [0x00000430] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +-+/* [0x00000438] */ 0x15067d80, 0x10020c27, // mov vpm, ra1 +-+/* [0x00000440] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000448] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00000450] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x00000458] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +-+// 
::mc_filter_uv_b0 +-+/* [0x00000460] */ 0x9581cdbf, 0x100049f1, // mov -, unif ; mov vw_setup, rb28 +-+/* [0x00000468] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x00000470] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000478] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x00000480] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x00000488] */ 0x934c01f6, 0xd2024800, // max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next +-+/* [0x00000490] */ 0x928191f6, 0x10025801, // min r0, r0, rb_max_x ; mov ra1, unif +-+/* [0x00000498] */ 0x119c41c0, 0xd01204e7, // shl ra_xshift_next, r0, 4 +-+/* [0x000004a0] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x000004a8] */ 0x8c0a7036, 0x12225813, // add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+/* [0x000004b0] */ 0x54042077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra1.16b, 2 +-+/* [0x000004b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x000004c0] */ 0x8c067076, 0x12024821, // add r0, r0, r1 ; mov r1, ra1.16a +-+/* [0x000004c8] */ 0x4c5a760e, 0x100246a0, // add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +-+/* [0x000004d0] */ 0x8d818eb6, 0x10025743, // sub rb29, rb24, r2 ; mov ra3, unif +-+/* [0x000004d8] */ 0x0c9c13c0, 0xd0021467, // add rb17, r1, 1 +-+/* [0x000004e0] */ 0x8c0c33f6, 0xd80247c8, // add ra31, r1, 3 ; mov rb8, ra3.8a +-+/* [0x000004e8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +-+/* [0x000004f0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, 15 ; mov rb10, ra3.8c +-+/* [0x000004f8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27 +-+/* [0x00000500] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d +-+/* [0x00000508] */ 0x15827d80, 0x100213a7, // mov rb14, unif +-+/* [0x00000510] */ 0x15827d80, 0x100613a7, // mov.ifnz rb14, unif +-+// :uvloop_b0 +-+/* [0x00000518] */ 0xcd5117de, 0xa00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 +-+/* [0x00000520] */ 0x8e4c09f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+/* [0x00000528] */ 0x8e4481f6, 0xd402c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+/* [0x00000530] */ 0x936807f6, 0xd0029898, // max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000540] */ 0x4c510797, 0x10224462, // add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000548] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000550] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000558] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x00000560] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000570] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000578] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000580] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000588] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000590] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* 
[0x00000598] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000005a0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000005a8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b0 +-+/* [0x000005b0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000005b8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+/* [0x000005c0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x000005c8] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 +-+/* [0x000005d0] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+/* [0x000005d8] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6 +-+/* [0x000005e0] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31 +-+/* [0x000005e8] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5 +-+/* [0x000005f0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0 +-+/* [0x000005f8] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4 +-+/* [0x00000600] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6 +-+/* [0x00000608] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8 +-+/* [0x00000610] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3 +-+/* [0x00000618] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin +-+/* [0x00000620] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif +-+/* [0x00000628] */ 0x95810ff6, 0xd002581e, // mov r0, i_shift16 ; mov ra_link, unif +-+/* [0x00000630] */ 0x00010000, 0xe0020867, // mov r1, 0x10000 +-+/* [0x00000638] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12 +-+/* [0x00000640] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 +-+/* [0x00000648] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 +-+/* [0x00000650] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 +-+/* [0x00000658] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 +-+/* [0x00000660] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30 +-+/* [0x00000668] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 +-+/* [0x00000670] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 +-+/* [0x00000678] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin +-+/* [0x00000680] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 +-+/* [0x00000688] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 +-+/* [0x00000690] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 +-+// :uv_b0_post12 +-+/* [0x00000698] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7 +-+/* [0x000006a0] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 +-+/* [0x000006a8] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3 +-+/* [0x000006b0] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5 +-+/* [0x000006b8] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 +-+/* [0x000006c0] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3 +-+// :uv_b0_post_fin +-+/* [0x000006c8] */ 0x959a0ff6, 0x100240a0, // mov ra2, unif ; mov r0, elem_num +-+/* [0x000006d0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x000006d8] */ 0xec0a7c09, 0x14024821, // add r0, ra2.16b, r0 ; v8subs r1, r1, r1 +-+/* [0x000006e0] */ 0x8d8103f6, 0x10024863, // sub r1, r1, rb_pitch ; mov r3, unif +-+/* [0x000006e8] */ 0x935c11bf, 0x10024800, // max r0, r0, ra_k0 ; mov 
rb_xshift2, rb_xshift2_next +-+/* [0x000006f0] */ 0x928191f6, 0x10020827, // min r0, r0, rb_max_x ; mov -, unif +-+/* [0x000006f8] */ 0x119c41c0, 0xd0021067, // shl rb_xshift2_next, r0, 4 +-+/* [0x00000700] */ 0x9481e1f6, 0xd0025800, // and r0, r0, -2 ; mov ra0, unif +-+/* [0x00000708] */ 0x8c0a7036, 0x12225815, // add r0, r0, r0 ; mov ra_y2_next, ra2.16a +-+/* [0x00000710] */ 0x94827076, 0x10025843, // and r1, r0, r1 ; mov ra3, unif +-+/* [0x00000718] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000720] */ 0x8c0e7076, 0x18024808, // add r0, r0, r1 ; mov rb8, ra3.8a +-+/* [0x00000728] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 +-+/* [0x00000730] */ 0x950e0ff6, 0x1a024049, // mov ra1, unif ; mov rb9, ra3.8b +-+/* [0x00000738] */ 0x950e0ff6, 0x1c06404a, // mov.ifnz ra1, unif ; mov rb10, ra3.8c +-+/* [0x00000740] */ 0x800e7036, 0x1e0049cb, // nop ; mov rb11, ra3.8d +-+/* [0x00000748] */ 0xf104dddb, 0x14024863, // shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 +-+/* [0x00000750] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1 +-+// :uvloop_b +-+/* [0x00000758] */ 0xcd5117de, 0xb00269df, // sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 +-+/* [0x00000760] */ 0x8e5409f6, 0x14028823, // shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next +-+/* [0x00000768] */ 0x8e5481f6, 0xd202c863, // shr r1, r0, 8 ; mov.ifnz r3, ra_y2 +-+/* [0x00000770] */ 0x935d37bf, 0x10029899, // max r2, r3, ra_k0 ; mov.ifz ra_base2, rb_base2_next +-+/* [0x00000778] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000780] */ 0x4c510797, 0x10124562, // add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+/* [0x00000788] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000790] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000798] */ 0x540163f0, 0x18024863, // and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+/* [0x000007a0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x000007a8] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x000007b0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x000007b8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x000007c0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x000007c8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x000007d0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000007d8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x000007e0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13 +-+/* [0x000007e8] */ 0xffffff50, 0xf06809e7, // brr.anyn -, r:uvloop_b +-+/* [0x000007f0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+/* [0x000007f8] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+/* [0x00000800] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8 +-+/* [0x00000808] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7 +-+/* [0x00000810] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+/* [0x00000818] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 +-+/* [0x00000820] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256 +-+/* 
[0x00000828] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5 +-+/* [0x00000830] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a +-+/* [0x00000838] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4 +-+/* [0x00000840] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256 +-+/* [0x00000848] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6 +-+/* [0x00000850] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7 +-+/* [0x00000858] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13 +-+/* [0x00000860] */ 0x809f8009, 0xd00049e1, // nop ; mov r1, r1 << 8 +-+/* [0x00000868] */ 0xfffffed0, 0xf06809e7, // brr.anyn -, r:uvloop_b +-+/* [0x00000870] */ 0x0f9cd3c0, 0x10d200e7, // asr ra3.8bs, r1, rb13 +-+/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8 +-+/* [0x00000880] */ 0x150e7d80, 0x10020c27, // mov vpm, ra3 +-+/* [0x00000888] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000890] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00000898] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif +-+// ::mc_interrupt_exit8c +-+/* [0x000008a8] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x000008b0] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x000008b8] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x000008c0] */ 0x159f2fc0, 0xa00009e7, // mov -, vw_wait ; nop ; ldtmu0 +-+/* [0x000008c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000900] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000908] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x00000910] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_exit +-+// ::mc_exit_c +-+/* [0x00000918] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000920] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000928] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000930] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 +-+/* [0x00000938] */ 0x00000000, 0xe80009e7, // mov -,srel(0) +-+/* [0x00000940] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000948] */ 0x009e7000, 0x100009e7, // nop ; nop +-+/* [0x00000950] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_interrupt_exit12 +-+/* [0x00000958] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000960] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000968] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000970] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1 +-+/* [0x00000978] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000980] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000988] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000990] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x00000998] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009c8] */ 
0x00000010, 0xe80009e7, // mov -,sacq(0) +-+/* [0x000009d0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x000009d8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_exit1 +-+/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +-+/* [0x000009f0] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x000009f8] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000a00] */ 0x009e7000, 0xa00009e7, // ldtmu0 +-+/* [0x00000a08] */ 0x009e7000, 0xb00009e7, // ldtmu1 +-+/* [0x00000a10] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +-+/* [0x00000a18] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop +-+/* [0x00000a20] */ 0x009e7000, 0x100009e7, // nop ; nop +-+// ::mc_setup +-+/* [0x00000a28] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif +-+/* [0x00000a30] */ 0x15827d80, 0x10020267, // mov ra9, unif +-+/* [0x00000a38] */ 0x15827d80, 0x100202a7, // mov ra10, unif +-+/* [0x00000a40] */ 0x15827d80, 0x100202e7, // mov ra11, unif +-+/* [0x00000a48] */ 0x15827d80, 0x100200e7, // mov ra3, unif +-+/* [0x00000a50] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +-+/* [0x00000a58] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 +-+/* [0x00000a60] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +-+/* [0x00000a68] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +-+/* [0x00000a70] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +-+/* [0x00000a78] */ 0x159d03c0, 0x10021627, // or rb24, r1, rb_pitch +-+/* [0x00000a80] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num +-+/* [0x00000a88] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3 +-+/* [0x00000a90] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000a98] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000aa0] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 +-+/* [0x00000aa8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +-+/* [0x00000ab0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +-+/* [0x00000ab8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000ac0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000ac8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000ad0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +-+/* [0x00000ad8] */ 0x15227d80, 0x14020867, // mov r1, ra8.16b +-+/* [0x00000ae0] */ 0x0c9c13c0, 0xd0220467, // add ra_y, r1, 1 +-+/* [0x00000ae8] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 +-+/* [0x00000af0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000af8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000b00] */ 0x0c627c40, 0x10020e27, // add t0s, ra_base, r1 +-+/* [0x00000b08] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3 +-+/* [0x00000b10] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000b18] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000b20] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +-+/* [0x00000b28] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +-+/* [0x00000b30] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000b38] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000b40] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000b48] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 +-+/* [0x00000b50] */ 0x152a7d80, 0x14020867, // mov r1, ra10.16b +-+/* [0x00000b58] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1 +-+/* [0x00000b60] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0 +-+/* 
[0x00000b68] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000b70] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000b78] */ 0x0c667c40, 0x10020f27, // add t1s, ra_base2, r1 +-+/* [0x00000b80] */ 0x00000001, 0xe0020527, // mov ra_k1, 1 +-+/* [0x00000b88] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256 +-+/* [0x00000b90] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255 +-+/* [0x00000b98] */ 0x00000000, 0xe00205e7, // mov ra_k0, 0 +-+/* [0x00000ba0] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +-+/* [0x00000ba8] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +-+/* [0x00000bb0] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +-+/* [0x00000bb8] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +-+/* [0x00000bc0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +-+/* [0x00000bc8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +-+/* [0x00000bd0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +-+/* [0x00000bd8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +-+/* [0x00000be0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +-+/* [0x00000be8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +-+/* [0x00000bf0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +-+/* [0x00000bf8] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +-+/* [0x00000c00] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +-+/* [0x00000c08] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +-+/* [0x00000c10] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9 +-+/* [0x00000c18] */ 0x13440dc0, 0xd4020867, // max r1, ra_y, 0 +-+/* [0x00000c20] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000c28] */ 0x0c441dc0, 0xd4220467, // add ra_y, ra_y, 1 +-+/* [0x00000c30] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch +-+/* [0x00000c38] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_base +-+/* [0x00000c40] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0 +-+/* [0x00000c48] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +-+/* [0x00000c50] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1 +-+/* [0x00000c58] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch +-+/* [0x00000c60] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_base2 +-+// :per_block_setup +-+/* [0x00000c68] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000c70] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +-+/* [0x00000c78] */ 0x959a0ff6, 0x10024063, // mov ra1, unif ; mov r3, elem_num +-+/* [0x00000c80] */ 0x154e7d80, 0x12120467, // mov ra_xshift, ra_xshift_next +-+/* [0x00000c88] */ 0x159c1fc0, 0x10021027, // mov rb_xshift2, rb_xshift2_next +-+/* [0x00000c90] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 +-+/* [0x00000c98] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000ca0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000ca8] */ 0x119c31c0, 0xd01204e7, // shl ra_xshift_next, r0, 3 +-+/* [0x00000cb0] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +-+/* [0x00000cb8] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +-+/* [0x00000cc0] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000cc8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000cd0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000cd8] */ 0x0c827c00, 0x100206a7, // add ra_base_next, unif, r0 +-+/* [0x00000ce0] */ 0x15067d80, 0x142204e7, // mov ra_y_next, ra1.16b +-+/* [0x00000ce8] */ 0x15827d80, 0x10020067, // mov ra1, unif +-+/* [0x00000cf0] 
*/ 0x009e7000, 0x100009e7, // nop +-+/* [0x00000cf8] */ 0x0c067cc0, 0x12020827, // add r0, ra1.16a, r3 +-+/* [0x00000d00] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +-+/* [0x00000d08] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +-+/* [0x00000d10] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +-+/* [0x00000d18] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +-+/* [0x00000d20] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +-+/* [0x00000d28] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+/* [0x00000d30] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +-+/* [0x00000d38] */ 0x0c827c00, 0x100214e7, // add rb_base2_next, unif, r0 +-+/* [0x00000d40] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b +-+/* [0x00000d48] */ 0x15827d80, 0x10020427, // mov ra_width_height, unif +-+/* [0x00000d50] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x00000d58] */ 0x0d418f80, 0x14021767, // sub rb29, rb24, ra_width +-+/* [0x00000d60] */ 0x8c405df6, 0xd2025460, // add rb17, ra_height, 5 ; mov r0, ra_height +-+/* [0x00000d68] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x00000d70] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x00000d78] */ 0x0c9c71c0, 0xd00214a7, // add rb18, r0, 7 +-+/* [0x00000d80] */ 0x119c71c0, 0xd0020827, // shl r0, r0, 7 +-+/* [0x00000d88] */ 0x0c427180, 0x14020827, // add r0, r0, ra_width +-+/* [0x00000d90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16 +-+/* [0x00000d98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif +-+/* [0x00000da0] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif +-+/* [0x00000da8] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400 +-+/* [0x00000db0] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a +-+/* [0x00000db8] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +-+/* [0x00000dc0] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d +-+/* [0x00000dc8] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c +-+/* [0x00000dd0] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d +-+/* [0x00000dd8] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c +-+/* [0x00000de0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 +-+/* [0x00000de8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d +-+/* [0x00000df0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c +-+/* [0x00000df8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 +-+/* [0x00000e00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d +-+/* [0x00000e08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c +-+/* [0x00000e10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +-+/* [0x00000e18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d +-+/* [0x00000e20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c +-+/* [0x00000e28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 +-+/* [0x00000e30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d +-+/* [0x00000e38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c +-+/* [0x00000e40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +-+/* [0x00000e48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d +-+/* [0x00000e50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c +-+/* [0x00000e58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 +-+/* [0x00000e60] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d +-+/* [0x00000e68] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c +-+/* [0x00000e70] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif +-+/* [0x00000e78] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b +-+/* 
[0x00000e80] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c +-+/* [0x00000e88] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18 +-+/* [0x00000e90] */ 0x15827d80, 0x100215e7, // mov rb_dest, unif +-+/* [0x00000e98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +-+/* [0x00000ea0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13 +-+/* [0x00000ea8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9 +-+/* [0x00000eb0] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d +-+// ::mc_filter +-+/* [0x00000eb8] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1 +-+// :yloop +-+/* [0x00000ec0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+/* [0x00000ec8] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+/* [0x00000ed0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+/* [0x00000ed8] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+/* [0x00000ee0] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +-+/* [0x00000ee8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +-+/* [0x00000ef0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000ef8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+/* [0x00000f00] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00000f08] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 +-+/* [0x00000f10] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00000f18] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+/* [0x00000f20] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +-+/* [0x00000f28] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00000f30] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 +-+/* [0x00000f38] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00000f40] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x00000f48] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x00000f50] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x00000f58] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x00000f60] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x00000f68] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x00000f70] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+/* [0x00000f78] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+/* [0x00000f80] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+/* [0x00000f88] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+/* [0x00000f90] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+/* [0x00000f98] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+/* [0x00000fa0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+/* [0x00000fa8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+/* 
[0x00000fb0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x00000fb8] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 +-+/* [0x00000fc0] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 +-+/* [0x00000fc8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop +-+/* [0x00000fd0] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 +-+/* [0x00000fd8] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +-+/* [0x00000fe0] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 +-+/* [0x00000fe8] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a +-+/* [0x00000ff0] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b +-+/* [0x00000ff8] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+/* [0x00001000] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+/* [0x00001008] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+/* [0x00001010] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+/* [0x00001018] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+/* [0x00001020] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+/* [0x00001028] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait +-+/* [0x00001030] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x00001038] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x00001040] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14 +-+/* [0x00001048] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12 +-+/* [0x00001050] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x00001058] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop +-+/* [0x00001060] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 +-+/* [0x00001068] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 +-+/* [0x00001070] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 +-+/* [0x00001078] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x00001080] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 +-+/* [0x00001088] */ 0x159e7000, 0x10120427, // mov ra_height, r0 +-+/* [0x00001090] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 +-+/* [0x00001098] */ 0xfffffbb0, 0xf02809e7, // brr.anyz -, r:per_block_setup +-+/* [0x000010a0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x000010a8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x000010b0] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +-+/* [0x000010b8] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x000010c0] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 +-+/* [0x000010c8] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 +-+/* [0x000010d0] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 +-+/* [0x000010d8] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 +-+/* [0x000010e0] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch +-+/* [0x000010e8] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 +-+/* [0x000010f0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x000010f8] */ 0xfffffda8, 0xf0f809e7, // brr -, r:yloop +-+/* [0x00001100] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001108] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001110] */ 0x009e7000, 0x100009e7, // nop +-+// ::mc_filter_b +-+// :yloopb +-+/* [0x00001118] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+/* [0x00001120] */ 0x8e4539bf, 0xb2029819, // shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+/* [0x00001128] */ 0x956a7d9b, 0x1004461f, // 
mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+/* [0x00001130] */ 0x954d0dbf, 0x14244463, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+/* [0x00001138] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +-+/* [0x00001140] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +-+/* [0x00001148] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00001150] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+/* [0x00001158] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 +-+/* [0x00001160] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0 +-+/* [0x00001168] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +-+/* [0x00001170] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+/* [0x00001178] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +-+/* [0x00001180] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+/* [0x00001188] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0 +-+/* [0x00001190] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+/* [0x00001198] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+/* [0x000011a0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+/* [0x000011a8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+/* [0x000011b0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+/* [0x000011b8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+/* [0x000011c0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+/* [0x000011c8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+/* [0x000011d0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+/* [0x000011d8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+/* [0x000011e0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+/* [0x000011e8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+/* [0x000011f0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+/* [0x000011f8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+/* [0x00001200] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+/* [0x00001208] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31 +-+/* [0x00001210] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8 +-+/* [0x00001218] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9 +-+/* [0x00001220] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb +-+/* [0x00001228] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10 +-+/* [0x00001230] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +-+/* [0x00001238] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1 +-+/* [0x00001240] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a +-+/* [0x00001248] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b +-+/* [0x00001250] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+/* [0x00001258] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, 
r0 ; mul24 r0, rb11, ra2.8d +-+/* [0x00001260] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+/* [0x00001268] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+/* [0x00001270] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+/* [0x00001278] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+/* [0x00001280] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12 +-+/* [0x00001288] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +-+/* [0x00001290] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +-+/* [0x00001298] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14 +-+/* [0x000012a0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 +-+/* [0x000012a8] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait +-+/* [0x000012b0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8 +-+/* [0x000012b8] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb +-+/* [0x000012c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13 +-+/* [0x000012c8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255 +-+/* [0x000012d0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0 +-+/* [0x000012d8] */ 0x00000010, 0xe0020867, // mov r1, 16 +-+/* [0x000012e0] */ 0x0d427c40, 0x12020827, // sub r0, ra_height, r1 +-+/* [0x000012e8] */ 0x159e7000, 0x10120427, // mov ra_height, r0 +-+/* [0x000012f0] */ 0x139c01c0, 0xd0022827, // max.setf r0, r0, 0 +-+/* [0x000012f8] */ 0xfffff950, 0xf02809e7, // brr.anyz -, r:per_block_setup +-+/* [0x00001300] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26 +-+/* [0x00001308] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29 +-+/* [0x00001310] */ 0x159d7fc0, 0x10021ca7, // mov vw_addr, rb_dest +-+/* [0x00001318] */ 0x129e7040, 0x10020827, // min r0, r0, r1 +-+/* [0x00001320] */ 0x0c9d2e00, 0x100214a7, // add rb18, rb18, r0 +-+/* [0x00001328] */ 0x0d9e7040, 0x10020827, // sub r0, r0, r1 +-+/* [0x00001330] */ 0x119d71c0, 0xd0020827, // shl r0, r0, i_shift23 +-+/* [0x00001338] */ 0x0c9dae00, 0x100216a7, // add rb26, rb26, r0 +-+/* [0x00001340] */ 0x409d000f, 0x100049e0, // nop ; mul24 r0, r1, rb_pitch +-+/* [0x00001348] */ 0x0c9d7e00, 0x100215e7, // add rb_dest, rb_dest, r0 +-+/* [0x00001350] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28 +-+/* [0x00001358] */ 0xfffffda0, 0xf0f809e7, // brr -, r:yloopb +-+/* [0x00001360] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001368] */ 0x009e7000, 0x100009e7, // nop +-+/* [0x00001370] */ 0x009e7000, 0x100009e7, // nop +++/* [0x000000f0] */ 0x0c80ff80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif +++/* [0x000000f8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x00000100] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +++/* [0x00000108] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +++/* [0x00000110] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +++/* [0x00000118] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x00000120] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +++/* [0x00000128] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x00000130] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +++/* [0x00000138] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +++/* [0x00000140] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00000148] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x00000150] */ 0x15827d80, 0x10020667, // mov ra_base2, unif +++/* [0x00000158] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x00000160] */ 
0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a +++/* [0x00000168] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00000170] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00000178] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00000180] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000188] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +++/* [0x00000190] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +++/* [0x00000198] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000001a0] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 +++/* [0x000001a8] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 +++/* [0x000001b0] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y +++// :1 +++/* [0x000001b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x000001c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x000001c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x000001d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000001d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x000001e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x000001e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x000001f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x000001f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00000200] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x00000208] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000210] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +++/* [0x00000218] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000220] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +++/* [0x00000228] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +++/* [0x00000230] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +++// ::mc_filter_c_p +++/* [0x00000238] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00000240] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00000248] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +++/* [0x00000250] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +++/* [0x00000258] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +++/* [0x00000260] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++/* [0x00000268] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +++/* [0x00000270] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 +++/* [0x00000278] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000280] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00000288] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000290] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +++/* [0x00000298] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +++/* [0x000002a0] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +++/* [0x000002a8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x000002b0] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif +++/* [0x000002b8] */ 0x910c73f6, 0xd8024808, // shl r0, r1, 
v_dma_h_shift ; mov rb8, ra3.8a +++/* [0x000002c0] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +++/* [0x000002c8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +++/* [0x000002d0] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +++/* [0x000002d8] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y +++/* [0x000002e0] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d +++/* [0x000002e8] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif +++/* [0x000002f0] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 +++// :1 +++/* [0x000002f8] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 +++/* [0x00000300] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +++/* [0x00000308] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++/* [0x00000310] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +++/* [0x00000318] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++/* [0x00000320] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++/* [0x00000328] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 +++/* [0x00000330] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++/* [0x00000338] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x00000340] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00000348] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00000350] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00000358] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00000360] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00000368] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++/* [0x00000370] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00000378] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 +++/* [0x00000380] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 +++/* [0x00000388] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +++/* [0x00000390] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 +++/* [0x00000398] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x000003a0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x000003a8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x000003b0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x000003b8] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x000003c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +++/* [0x000003c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x000003d0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000003d8] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 +++/* [0x000003e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000003e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000003f0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000003f8] */ 
0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00000400] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00000408] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00000410] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00000418] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00000420] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b +++/* [0x00000428] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00000430] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00000438] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_c_p_l1 +++/* [0x00000440] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00000448] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00000450] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +++/* [0x00000458] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +++/* [0x00000460] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +++/* [0x00000468] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++/* [0x00000470] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +++/* [0x00000478] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 +++/* [0x00000480] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000488] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00000490] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000498] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +++/* [0x000004a0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +++/* [0x000004a8] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +++/* [0x000004b0] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x000004b8] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif +++/* [0x000004c0] */ 0x910c73f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a +++/* [0x000004c8] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +++/* [0x000004d0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +++/* [0x000004d8] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +++/* [0x000004e0] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y +++/* [0x000004e8] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d +++/* [0x000004f0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif +++/* [0x000004f8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 +++// :1 +++/* [0x00000500] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00000508] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +++/* [0x00000510] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++/* [0x00000518] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +++/* [0x00000520] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++/* [0x00000528] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++/* [0x00000530] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, 
r2 +++/* [0x00000538] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++/* [0x00000540] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x00000548] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00000550] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00000558] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00000560] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00000568] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00000570] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++/* [0x00000578] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00000580] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 +++/* [0x00000588] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 +++/* [0x00000590] */ 0x4d108437, 0x100241e0, // sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +++/* [0x00000598] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 +++/* [0x000005a0] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x000005a8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x000005b0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x000005b8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x000005c0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x000005c8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +++/* [0x000005d0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x000005d8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000005e0] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 +++/* [0x000005e8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000005f0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000005f8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00000600] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00000608] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00000610] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00000618] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00000620] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00000628] */ 0xfffffeb8, 0xf0f809e7, // brr -, r:1b +++/* [0x00000630] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00000638] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00000640] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_c_b +++/* [0x00000648] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00000650] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00000658] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 +++/* [0x00000660] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +++/* [0x00000668] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif +++/* [0x00000670] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++/* [0x00000678] */ 0x928191f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif +++/* 
[0x00000680] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00000688] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif +++/* [0x00000690] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00000698] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000006a0] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height +++/* [0x000006a8] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next +++/* [0x000006b0] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif +++/* [0x000006b8] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x000006c0] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif +++/* [0x000006c8] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif +++/* [0x000006d0] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif +++/* [0x000006d8] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a +++/* [0x000006e0] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif +++/* [0x000006e8] */ 0x110c1dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift +++/* [0x000006f0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif +++/* [0x000006f8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif +++/* [0x00000700] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a +++/* [0x00000708] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b +++/* [0x00000710] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00000718] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif +++/* [0x00000720] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c +++/* [0x00000728] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000730] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif +++/* [0x00000738] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 +++/* [0x00000740] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d +++/* [0x00000748] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 +++/* [0x00000750] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif +++// :1 +++/* [0x00000758] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 +++/* [0x00000760] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next +++/* [0x00000768] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +++/* [0x00000770] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +++/* [0x00000778] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y +++/* [0x00000780] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +++/* [0x00000788] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++/* [0x00000790] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++/* [0x00000798] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask +++/* [0x000007a0] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 +++/* [0x000007a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x000007b0] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* 
[0x000007b8] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x000007c0] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x000007c8] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x000007d0] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 +++/* [0x000007d8] */ 0x8d9c64ff, 0xb00240c5, // sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++/* [0x000007e0] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 +++/* [0x000007e8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 +++/* [0x000007f0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 +++/* [0x000007f8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +++/* [0x00000800] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++/* [0x00000808] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++/* [0x00000810] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask +++/* [0x00000818] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 +++/* [0x00000820] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00000828] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00000830] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00000838] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00000840] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 +++/* [0x00000848] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b +++/* [0x00000850] */ 0xfffffee8, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00000858] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 +++/* [0x00000860] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a +++/* [0x00000868] */ 0x8f0c05f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 +++/* [0x00000870] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +++/* [0x00000878] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d +++/* [0x00000880] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 +++/* [0x00000888] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 +++/* [0x00000890] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x00000898] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 +++/* [0x000008a0] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 +++/* [0x000008a8] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 +++/* [0x000008b0] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 +++/* [0x000008b8] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height +++/* [0x000008c0] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x000008c8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000008d0] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 +++/* [0x000008d8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000008e0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000008e8] */ 0x959da03f, 0x10126431, // mov.setf 
ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000008f0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x000008f8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00000900] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00000908] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00000910] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00000918] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +++/* [0x00000920] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00000928] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00000930] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_sync_q0 +++/* [0x00000938] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000940] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000948] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000950] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000958] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000960] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000968] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000970] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000978] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q1 +++/* [0x00000980] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000988] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000990] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000998] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x000009a0] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000009a8] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q2 +++/* [0x000009b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000009b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000009c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000009c8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x000009d0] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000009d8] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q3 +++/* [0x000009e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000009e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000009f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000009f8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000a00] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a08] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync_q4 +++/* [0x00000a10] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000a18] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000a20] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a28] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a30] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a38] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000a40] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a48] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000a50] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q5 +++/* [0x00000a58] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000a60] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000a68] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000a70] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000a78] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000a80] */ 0x00000006, 0xe80009e7, // 
mov dst, srel(i) +++// ::mc_sync_q6 +++/* [0x00000a88] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000a90] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000a98] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000aa0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000aa8] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000ab0] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q7 +++/* [0x00000ab8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000ac0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000ac8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000ad0] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000ad8] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000ae0] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync_q8 +++/* [0x00000ae8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000af0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000af8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b00] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b08] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000b18] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b20] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000b28] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q9 +++/* [0x00000b30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000b38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000b40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000b48] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000b50] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b58] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q10 +++/* [0x00000b60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000b68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000b70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000b78] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000b80] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000b88] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync_q11 +++/* [0x00000b90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000b98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000ba0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000ba8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00000bb0] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000bb8] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c_qn +++// ::mc_exit_y_qn +++/* [0x00000bc0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x00000bc8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00000bd0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +++/* [0x00000bd8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x00000be0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00000be8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000bf0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x00000bf8] */ 0x009e7000, 0x100009e7, // nop +++/* [0x00000c00] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c_q0 +++// ::mc_exit_y_q0 +++/* [0x00000c08] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x00000c10] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00000c18] */ 0x009e7000, 0xa00009e7, // nop ; nop ; 
ldtmu0 +++/* [0x00000c20] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x00000c28] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00000c30] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00000c38] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00000c40] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x00000c48] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 +++/* [0x00000c50] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_setup_y_q0 +++/* [0x00000c58] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_y_qn +++/* [0x00000c60] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +++/* [0x00000c68] */ 0x15827d80, 0x10020267, // mov ra9, unif +++/* [0x00000c70] */ 0x15827d80, 0x10020067, // mov ra1, unif +++/* [0x00000c78] */ 0x15827d80, 0x100202e7, // mov ra11, unif +++/* [0x00000c80] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x00000c88] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00000c90] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00000c98] */ 0x000000ff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00000ca0] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00000ca8] */ 0x15827d80, 0x100200e7, // mov ra3, unif +++/* [0x00000cb0] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd4021667, // sub rb_max_x, ra3.16b, 1 +++/* [0x00000cc0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +++/* [0x00000cc8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00000cd0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00000cd8] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch +++/* [0x00000ce0] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num +++/* [0x00000ce8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x00000cf0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00000cf8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00000d00] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00000d08] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x00000d10] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +++/* [0x00000d18] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x00000d20] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000d28] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x00000d30] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +++/* [0x00000d38] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00000d40] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00000d48] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00000d50] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00000d58] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000d60] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x00000d68] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000d70] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x00000d78] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 +++/* [0x00000d80] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a +++/* [0x00000d88] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a +++// :1 +++/* [0x00000d90] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00000d98] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x00000da0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y 
+++/* [0x00000da8] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00000db0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x00000db8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x00000dc0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00000dc8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00000dd0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00000dd8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x00000de0] */ 0x0c80fdc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth +++/* [0x00000de8] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x00000df0] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 +++/* [0x00000df8] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 +++/* [0x00000e00] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 +++/* [0x00000e08] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x00000e10] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) +++/* [0x00000e18] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x00000e20] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) +++/* [0x00000e28] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 +++/* [0x00000e30] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00000e38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00000e40] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +++/* [0x00000e48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00000e50] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +++/* [0x00000e58] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +++/* [0x00000e60] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +++// :per_block_setup_8 +++/* [0x00000e68] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++/* [0x00000e70] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00000e78] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00000e80] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00000e88] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif +++/* [0x00000e90] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00000e98] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000ea0] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif +++/* [0x00000ea8] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 +++/* [0x00000eb0] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00000eb8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a +++/* [0x00000ec0] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif +++/* [0x00000ec8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00000ed0] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif +++/* [0x00000ed8] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init +++/* [0x00000ee0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00000ee8] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul +++/* [0x00000ef0] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 +++/* [0x00000ef8] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x00000f00] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x00000f08] */ 
0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 +++/* [0x00000f10] */ 0x119c71c0, 0xd0020827, // shl r0, r0, v_dma_h_shift +++/* [0x00000f18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x00000f20] */ 0x119d01c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift +++/* [0x00000f28] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif +++/* [0x00000f30] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif +++/* [0x00000f38] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 +++/* [0x00000f40] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +++/* [0x00000f48] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d +++/* [0x00000f50] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c +++/* [0x00000f58] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 +++/* [0x00000f60] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d +++/* [0x00000f68] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c +++/* [0x00000f70] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 +++/* [0x00000f78] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d +++/* [0x00000f80] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c +++/* [0x00000f88] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 +++/* [0x00000f90] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d +++/* [0x00000f98] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c +++/* [0x00000fa0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +++/* [0x00000fa8] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +++/* [0x00000fb0] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 +++/* [0x00000fb8] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 +++/* [0x00000fc0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x00000fc8] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 +++/* [0x00000fd0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +++/* [0x00000fd8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x00000fe0] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 +++/* [0x00000fe8] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif +++/* [0x00000ff0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 +++/* [0x00000ff8] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x00001000] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00001008] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 +++/* [0x00001010] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 +++/* [0x00001018] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif +++// ::mc_filter_y_pxx +++/* [0x00001020] */ 0xfffffe28, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +++/* [0x00001028] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00001030] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00001038] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00001040] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 +++// :1 +++/* [0x00001048] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00001050] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00001058] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00001060] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00001068] */ 
0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00001070] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00001078] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00001080] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00001088] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00001090] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00001098] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x000010a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x000010a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x000010b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x000010b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x000010c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x000010c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x000010d0] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x000010d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x000010e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x000010e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x000010f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x000010f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x00001100] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x00001108] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x00001110] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001118] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x00001120] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00001128] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00001130] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00001138] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001140] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00001148] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00001150] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00001158] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00001160] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00001168] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +++/* [0x00001170] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00001178] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00001180] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00001188] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x00001190] */ 0x4d512bce, 
0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00001198] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x000011a0] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x000011a8] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height +++/* [0x000011b0] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x000011b8] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000011c0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x000011c8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000011d0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000011d8] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000011e0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x000011e8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x000011f0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x000011f8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001200] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001208] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +++/* [0x00001210] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001218] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001220] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y_bxx +++/* [0x00001228] */ 0xfffffc20, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +++/* [0x00001230] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00001238] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00001240] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++// :1 +++/* [0x00001248] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00001250] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00001258] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00001260] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00001268] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00001270] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00001278] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00001280] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00001288] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00001290] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00001298] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x000012a0] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x000012a8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x000012b0] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x000012b8] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x000012c0] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x000012c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x000012d0] */ 0x40036031, 0xdc0109e3, // nop ; 
mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x000012d8] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x000012e0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x000012e8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x000012f0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x000012f8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x00001300] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x00001308] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x00001310] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001318] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x00001320] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00001328] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00001330] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00001338] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001340] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00001348] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00001350] */ 0x8f1c05f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00001358] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00001360] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00001368] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +++/* [0x00001370] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00001378] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00001380] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00001388] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off +++/* [0x00001390] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00001398] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x000013a0] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 +++/* [0x000013a8] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 +++/* [0x000013b0] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height +++/* [0x000013b8] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x000013c0] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x000013c8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x000013d0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000013d8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000013e0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000013e8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x000013f0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x000013f8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001400] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001408] 
*/ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001410] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b +++/* [0x00001418] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001420] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001428] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y_p00 +++/* [0x00001430] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00001438] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next +++/* [0x00001440] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x00001448] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00001450] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00001458] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00001460] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x00001468] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif +++/* [0x00001470] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00001478] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001480] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif +++/* [0x00001488] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init +++/* [0x00001490] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift +++/* [0x00001498] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x000014a0] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x000014a8] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +++/* [0x000014b0] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif +++/* [0x000014b8] */ 0x918101f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif +++/* [0x000014c0] */ 0x0c9db1c0, 0x100216a7, // add rb_dma0, r0, rb_dma0_base +++/* [0x000014c8] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 +++/* [0x000014d0] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif +++// :1 +++/* [0x000014d8] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +++/* [0x000014e0] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +++/* [0x000014e8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x000014f0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x000014f8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00001500] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00001508] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x00001510] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +++/* [0x00001518] */ 0x915cf3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x00001520] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00001528] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001530] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00001538] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00001540] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00001548] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov 
vw_setup, rb_dma0 +++/* [0x00001550] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001558] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00001560] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001568] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001570] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001578] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b +++/* [0x00001580] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001588] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001590] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y_b00 +++/* [0x00001598] */ 0xfffff8b0, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 +++/* [0x000015a0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x000015a8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x000015b0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x000015b8] */ 0x00000007, 0xe0020827, // mov r0, 7 +++/* [0x000015c0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 +++/* [0x000015c8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 +++/* [0x000015d0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 +++/* [0x000015d8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 +++/* [0x000015e0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +++// :1 +++/* [0x000015e8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x000015f0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x000015f8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00001600] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00001608] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00001610] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00001618] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00001620] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00001628] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +++/* [0x00001630] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00001638] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00001640] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +++/* [0x00001648] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +++/* [0x00001650] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 +++/* [0x00001658] */ 0x915ce3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x00001660] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00001668] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001670] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00001678] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00001680] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00001688] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00001690] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001698] */ 0x929dd0ff, 0x10024831, // min 
r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x000016a0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x000016a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x000016b0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x000016b8] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b +++/* [0x000016c0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x000016c8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x000016d0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_setup_c10_q0 +++/* [0x000016d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_c10_qn +++/* [0x000016e0] */ 0x00000001, 0xe0020927, // mov tmurs, 1 +++/* [0x000016e8] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x000016f0] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x000016f8] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00001700] */ 0x15827d80, 0x10020627, // mov ra_base, unif +++/* [0x00001708] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 +++/* [0x00001710] */ 0x119c21c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift +++/* [0x00001718] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 +++/* [0x00001720] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00001728] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00001730] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00001738] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00001740] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00001748] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00001750] */ 0x0c9d03c0, 0x10021627, // add rb_dma1_base, r1, rb_pitch +++/* [0x00001758] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num +++/* [0x00001760] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 +++/* [0x00001768] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num +++/* [0x00001770] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 +++/* [0x00001778] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x00001780] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x +++/* [0x00001788] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a +++/* [0x00001790] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00001798] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +++/* [0x000017a0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +++/* [0x000017a8] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +++/* [0x000017b0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000017b8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x000017c0] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 +++/* [0x000017c8] */ 0x0c80df80, 0xd0021367, // add rb_wt_den_p15, 23 - v_bit_depth, unif +++/* [0x000017d0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x000017d8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +++/* [0x000017e0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 +++/* [0x000017e8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +++/* [0x000017f0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x000017f8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +++/* [0x00001800] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x00001808] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +++/* [0x00001810] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 
+++/* [0x00001818] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00001820] */ 0x15827d80, 0x10020027, // mov ra0, unif +++/* [0x00001828] */ 0x15827d80, 0x10020667, // mov ra_base2, unif +++/* [0x00001830] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift +++/* [0x00001838] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a +++/* [0x00001840] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00001848] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00001850] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch +++/* [0x00001858] */ 0x149e7040, 0x10020867, // and r1, r0, r1 +++/* [0x00001860] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001868] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 +++/* [0x00001870] */ 0x0c667c00, 0x10020667, // add ra_base2, ra_base2, r0 +++/* [0x00001878] */ 0x95444ff6, 0xd40248e0, // mov r3, PREREAD ; mov r0, ra_y +++// :1 +++/* [0x00001880] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00001888] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x00001890] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00001898] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000018a0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x000018a8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x000018b0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x000018b8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x000018c0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000018c8] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x000018d0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000018d8] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 +++/* [0x000018e0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000018e8] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 +++/* [0x000018f0] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 +++/* [0x000018f8] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 +++// ::mc_filter_c10_p +++/* [0x00001900] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00001908] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00001910] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +++/* [0x00001918] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +++/* [0x00001920] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +++/* [0x00001928] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++/* [0x00001930] */ 0x920991f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +++/* [0x00001938] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00001940] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001948] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +++/* [0x00001950] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +++/* [0x00001958] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +++/* [0x00001960] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x00001968] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif +++/* [0x00001970] */ 0x910c83f6, 
0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a +++/* [0x00001978] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +++/* [0x00001980] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +++/* [0x00001988] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +++/* [0x00001990] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y +++/* [0x00001998] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d +++/* [0x000019a0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif +++/* [0x000019a8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 +++// :1 +++/* [0x000019b0] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 +++/* [0x000019b8] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +++/* [0x000019c0] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++/* [0x000019c8] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +++/* [0x000019d0] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++/* [0x000019d8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++/* [0x000019e0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 +++/* [0x000019e8] */ 0x55150d9f, 0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++/* [0x000019f0] */ 0x8c616c87, 0x10024e20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x000019f8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00001a00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00001a08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00001a10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00001a18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001a20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++/* [0x00001a28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 +++/* [0x00001a30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001a38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 +++/* [0x00001a40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 +++/* [0x00001a48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 +++/* [0x00001a50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 +++/* [0x00001a58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x00001a60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x00001a68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00001a70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00001a78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x00001a80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +++/* [0x00001a88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00001a90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001a98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 +++/* [0x00001aa0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00001aa8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00001ab0] */ 
0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00001ab8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001ac0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00001ac8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001ad0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001ad8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001ae0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b +++/* [0x00001ae8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001af0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001af8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_c10_p_l1 +++/* [0x00001b00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00001b08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00001b10] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 +++/* [0x00001b18] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif +++/* [0x00001b20] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif +++/* [0x00001b28] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++/* [0x00001b30] */ 0x920991f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a +++/* [0x00001b38] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00001b40] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001b48] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif +++/* [0x00001b50] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height +++/* [0x00001b58] */ 0x8d818eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif +++/* [0x00001b60] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x00001b68] */ 0x8c8033f6, 0xd0039496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif +++/* [0x00001b70] */ 0x910c83f6, 0xd8024808, // shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a +++/* [0x00001b78] */ 0x8c0e70b6, 0x1a024809, // add r0, r0, r2 ; mov rb9, ra3.8b +++/* [0x00001b80] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c +++/* [0x00001b88] */ 0x8c59b1f6, 0x140256a1, // add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 +++/* [0x00001b90] */ 0x9581edbf, 0x100255c9, // mov rb_dest, unif ; mov ra9, rb_max_y +++/* [0x00001b98] */ 0x910cd3f6, 0x1e02484b, // shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d +++/* [0x00001ba0] */ 0x8f8023f6, 0xd002531e, // asr rb_wt_off, r1, 2 ; mov ra_link, unif +++/* [0x00001ba8] */ 0x0d50df80, 0x1a0200e7, // sub ra3, rb_wt_den_p15, ra_k1 +++// :1 +++/* [0x00001bb0] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00001bb8] */ 0x8e5539bf, 0x12029899, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +++/* [0x00001bc0] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++/* [0x00001bc8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +++/* [0x00001bd0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++/* [0x00001bd8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++/* [0x00001be0] */ 0x92267792, 0x1003c8e0, // min r3, r3, ra9 ; mov.ifnc r0, r2 +++/* [0x00001be8] */ 0x55150d9f, 
0x10024122, // mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++/* [0x00001bf0] */ 0x8c656c87, 0x10024f20, // add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x00001bf8] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00001c00] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00001c08] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00001c10] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00001c18] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001c20] */ 0x4d004bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++/* [0x00001c28] */ 0x8c1a74f6, 0x10025885, // add r2, r2, r3 ; mov ra5, ra6 +++/* [0x00001c30] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001c38] */ 0x551cadb7, 0x100241a1, // mov ra6, ra7 ; mul24 r1, ra7, rb10 +++/* [0x00001c40] */ 0x4d108437, 0x100248a0, // sub r2, r2, r0 ; mul24 r0, ra4, rb8 +++/* [0x00001c48] */ 0x0f9c25c0, 0xd00201e7, // asr ra7, r2, v_bit_depth - 8 +++/* [0x00001c50] */ 0x4d149237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra5, rb9 +++/* [0x00001c58] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x00001c60] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x00001c68] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00001c70] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00001c78] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x00001c80] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height +++/* [0x00001c88] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00001c90] */ 0xffffff00, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001c98] */ 0x0f0e7380, 0x10020867, // asr r1, r1, ra3 +++/* [0x00001ca0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00001ca8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00001cb0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00001cb8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001cc0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00001cc8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001cd0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001cd8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001ce0] */ 0xfffffeb0, 0xf0f809e7, // brr -, r:1b +++/* [0x00001ce8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001cf0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001cf8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_c10_b +++/* [0x00001d00] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif +++/* [0x00001d08] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif +++/* [0x00001d10] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 +++/* [0x00001d18] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +++/* [0x00001d20] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif +++/* [0x00001d28] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++/* [0x00001d30] */ 0x928191f6, 0x10025800, // min 
r0, r0, rb_max_x ; mov ra0, unif +++/* [0x00001d38] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif +++/* [0x00001d40] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul +++/* [0x00001d48] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001d50] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height +++/* [0x00001d58] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00001d60] */ 0x8d818eb6, 0x10125756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif +++/* [0x00001d68] */ 0x8c5df3ce, 0xdc025461, // add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++/* [0x00001d70] */ 0x8c8033f6, 0xd0139496, // add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif +++/* [0x00001d78] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif +++/* [0x00001d80] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif +++/* [0x00001d88] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a +++/* [0x00001d90] */ 0x8c81b1f6, 0x10025681, // add rb_dma0, r0, rb_dma0_base ; mov ra1, unif +++/* [0x00001d98] */ 0x110c2dc0, 0xd4020827, // shl r0, ra3.16b, v_x_shift +++/* [0x00001da0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif +++/* [0x00001da8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif +++/* [0x00001db0] */ 0x930e7176, 0x18024808, // max r0, r0, r5 ; mov rb8, ra3.8a +++/* [0x00001db8] */ 0x920d91f6, 0x1a024809, // min r0, r0, rb_max_x ; mov rb9, ra3.8b +++/* [0x00001dc0] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif +++/* [0x00001dc8] */ 0x940e7076, 0x1c02484a, // and r1, r0, r1 ; mov rb10, ra3.8c +++/* [0x00001dd0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00001dd8] */ 0x8c827076, 0x10024817, // add r0, r0, r1 ; mov rb_dest, unif +++/* [0x00001de0] */ 0x0c9e7600, 0x100214e7, // add rb_base2_next, r3, r0 +++/* [0x00001de8] */ 0x950deff6, 0x1e02424b, // mov ra9, rb_max_y ; mov rb11, ra3.8d +++/* [0x00001df0] */ 0x1148ddc0, 0x14020867, // shl r1, ra_wt_off_l1, rb_wt_den_p15 +++/* [0x00001df8] */ 0x8f8093f6, 0xd002531e, // asr rb_wt_off, r1, 9 ; mov ra_link, unif +++// :1 +++/* [0x00001e00] */ 0xcd511bee, 0xaa0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 +++/* [0x00001e08] */ 0x8e5539bf, 0x12029899, // shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next +++/* [0x00001e10] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +++/* [0x00001e18] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +++/* [0x00001e20] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y +++/* [0x00001e28] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +++/* [0x00001e30] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++/* [0x00001e38] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++/* [0x00001e40] */ 0x8c616cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask +++/* [0x00001e48] */ 0x95145ff6, 0x10025104, // mov rb4, rb5 ; mov ra4, ra5 +++/* [0x00001e50] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00001e58] */ 0x4003e030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00001e60] */ 0x40034031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++/* 
[0x00001e68] */ 0x4d03c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00001e70] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001e78] */ 0x4c0274f1, 0x1e0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d, r1 +++/* [0x00001e80] */ 0x8d9c64ff, 0xb0024885, // sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++/* [0x00001e88] */ 0x0f9c25c0, 0xd00200e7, // asr ra3, r2, (v_bit_depth - 8) +++/* [0x00001e90] */ 0x8e1809f6, 0x10025885, // shr r2, r4, rb_xshift2 ; mov ra5, ra6 +++/* [0x00001e98] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 +++/* [0x00001ea0] */ 0x8c5077bf, 0x1a124446, // add ra_y2, r3, ra_k1 ; mov rb6, rb7 +++/* [0x00001ea8] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 +++/* [0x00001eb0] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++/* [0x00001eb8] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++/* [0x00001ec0] */ 0x8c656cc7, 0x10024f20, // add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask +++/* [0x00001ec8] */ 0x540563f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 +++/* [0x00001ed0] */ 0x4007e030, 0xda0049e2, // nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00001ed8] */ 0x40074031, 0xda0109e2, // nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00001ee0] */ 0x4d07c4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00001ee8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00001ef0] */ 0x4d044bf1, 0xde0269e0, // sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 +++/* [0x00001ef8] */ 0x4c0854fe, 0x1a0248a1, // add r2, r2, r3 ; mul24 r1, rb5, ra2.8b +++/* [0x00001f00] */ 0xfffffee0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001f08] */ 0x551cadb7, 0x100241a3, // mov ra6, ra7 ; mul24 r3, ra7, rb10 +++/* [0x00001f10] */ 0x4d08443e, 0x180248a0, // sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a +++/* [0x00001f18] */ 0x8f0c25f6, 0xd00241c7, // asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 +++/* [0x00001f20] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c +++/* [0x00001f28] */ 0x4c08723e, 0x1e024860, // add r1, r1, r0 ; mul24 r0, rb7, ra2.8d +++/* [0x00001f30] */ 0x4d108237, 0x100248a0, // sub r2, r1, r0 ; mul24 r0, ra4, rb8 +++/* [0x00001f38] */ 0x4d149637, 0x10024860, // sub r1, r3, r0 ; mul24 r0, ra5, rb9 +++/* [0x00001f40] */ 0x4c1cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb11 +++/* [0x00001f48] */ 0x4d527216, 0x12024862, // sub r1, r1, r0 ; mul24 r2, r2, ra_k256 +++/* [0x00001f50] */ 0x4f50e5ce, 0xd20248a1, // asr r2, r2, 14 ; mul24 r1, r1, ra_k256 +++/* [0x00001f58] */ 0x4f58e3d6, 0xd2024862, // asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 +++/* [0x00001f60] */ 0x4c48c5ce, 0x120248a1, // add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 +++/* [0x00001f68] */ 0x8c5e72b6, 0x1c024863, // add r1, r1, r2 ; mov r3, ra_blk_height +++/* [0x00001f70] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00001f78] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00001f80] */ 0xef40d3f3, 0x12024860, // asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 +++/* [0x00001f88] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00001f90] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00001f98] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 
+++/* [0x00001fa0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00001fa8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00001fb0] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00001fb8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00001fc0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00001fc8] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b +++/* [0x00001fd0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00001fd8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00001fe0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_sync10_q0 +++/* [0x00001fe8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00001ff0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00001ff8] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002000] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002008] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002010] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002018] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002020] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002028] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q1 +++/* [0x00002030] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002038] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002040] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002048] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002050] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002058] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q2 +++/* [0x00002060] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002068] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002070] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002078] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002080] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002088] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q3 +++/* [0x00002090] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002098] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000020a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000020a8] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) +++/* [0x000020b0] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020b8] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync10_q4 +++/* [0x000020c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000020c8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000020d0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020d8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020e0] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020e8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000020f0] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000020f8] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002100] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q5 +++/* [0x00002108] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002110] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002118] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002120] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002128] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002130] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) +++// 
::mc_sync10_q6 +++/* [0x00002138] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002140] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002148] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002150] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002158] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002160] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q7 +++/* [0x00002168] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002170] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002178] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002180] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002188] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002190] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_sync10_q8 +++/* [0x00002198] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000021a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000021a8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021b0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021b8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000021c8] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000021d0] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) +++/* [0x000021d8] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q9 +++/* [0x000021e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x000021e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000021f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000021f8] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002200] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002208] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q10 +++/* [0x00002210] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002218] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002220] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002228] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002230] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002238] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) +++// ::mc_sync10_q11 +++/* [0x00002240] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002248] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x00002250] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002258] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) +++/* [0x00002260] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) +++/* [0x00002268] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c10_q0 +++// ::mc_exit_y10_q0 +++/* [0x00002270] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x00002278] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00002280] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +++/* [0x00002288] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x00002290] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00002298] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000022a0] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) +++/* [0x000022a8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x000022b0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 +++/* [0x000022b8] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_exit_c10_qn +++// ::mc_exit_y10_qn +++/* [0x000022c0] */ 0x00000003, 0xe00228e7, // mov.setf r3, PREREAD - 1 +++// :1 +++/* [0x000022c8] */ 0xffffffe0, 0xf03809e7, // 
brr.anynz -, r:1b +++/* [0x000022d0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 +++/* [0x000022d8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 +++/* [0x000022e0] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x000022e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait +++/* [0x000022f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend +++/* [0x000022f8] */ 0x009e7000, 0x100009e7, // nop +++/* [0x00002300] */ 0x009e7000, 0x100009e7, // nop +++// ::mc_setup_y10_q0 +++/* [0x00002308] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) +++// ::mc_setup_y10_qn +++/* [0x00002310] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif +++/* [0x00002318] */ 0x15827d80, 0x10020267, // mov ra9, unif +++/* [0x00002320] */ 0x15827d80, 0x10020067, // mov ra1, unif +++/* [0x00002328] */ 0x15827d80, 0x100202e7, // mov ra11, unif +++/* [0x00002330] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++/* [0x00002338] */ 0x119de1c0, 0xd00210e7, // shl rb_ef, r0, i_shift30 +++/* [0x00002340] */ 0xff100100, 0xe0020527, // mov ra_kff100100, 0xff100100 +++/* [0x00002348] */ 0x0000ffff, 0xe00215a7, // mov rb_pmask, v_pmask +++/* [0x00002350] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++/* [0x00002358] */ 0x15827d80, 0x100200e7, // mov ra3, unif +++/* [0x00002360] */ 0x15827d80, 0x10021527, // mov rb_xpitch, unif +++/* [0x00002368] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 +++/* [0x00002370] */ 0x119c11c0, 0xd0021667, // shl rb_max_x, r0, v_x_shift +++/* [0x00002378] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 +++/* [0x00002380] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif +++/* [0x00002388] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) +++/* [0x00002390] */ 0x159d03c0, 0x10021627, // or rb_dma1_base, r1, rb_pitch +++/* [0x00002398] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num +++/* [0x000023a0] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x000023a8] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x000023b0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x000023b8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x000023c0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x000023c8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x000023d0] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch +++/* [0x000023d8] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x000023e0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000023e8] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x000023f0] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 +++/* [0x000023f8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00002400] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002408] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00002410] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002418] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x00002420] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00002428] */ 0x149e7080, 0x10020867, // and r1, r0, r2 +++/* [0x00002430] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002438] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x00002440] */ 0x0c2e7c00, 0x10020667, // add ra_base2, ra11, r0 +++/* [0x00002448] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a +++/* [0x00002450] */ 0x95044ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, 
ra1.16a +++// :1 +++/* [0x00002458] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 +++/* [0x00002460] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 +++/* [0x00002468] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00002470] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x00002478] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 +++/* [0x00002480] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 +++/* [0x00002488] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b +++/* [0x00002490] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y +++/* [0x00002498] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++/* [0x000024a0] */ 0x8c667c52, 0x10125f11, // add t1s, ra_base2, r1 ; mov ra_y2, r2 +++/* [0x000024a8] */ 0x0c80ddc0, 0xd0021367, // add rb_wt_den_p15, unif, 23 - v_bit_depth +++/* [0x000024b0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num +++/* [0x000024b8] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 +++/* [0x000024c0] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 +++/* [0x000024c8] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 +++/* [0x000024d0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 +++/* [0x000024d8] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) +++/* [0x000024e0] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 +++/* [0x000024e8] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) +++/* [0x000024f0] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 +++/* [0x000024f8] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 +++/* [0x00002500] */ 0x15827d80, 0x100207a7, // mov ra_link, unif +++/* [0x00002508] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 +++/* [0x00002510] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x00002518] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 +++/* [0x00002520] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 +++/* [0x00002528] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 +++// :per_block_setup_10 +++/* [0x00002530] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002538] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++/* [0x00002540] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002548] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00002550] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 +++/* [0x00002558] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif +++/* [0x00002560] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00002568] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002570] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif +++/* [0x00002578] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 +++/* [0x00002580] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 +++/* [0x00002588] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002590] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a +++/* [0x00002598] */ 0x928191f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif +++/* [0x000025a0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 +++/* [0x000025a8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif +++/* [0x000025b0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init +++/* [0x000025b8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x000025c0] */ 0x4c402077, 0xd4024821, // add 
r0, r0, r1 ; mul24 r1, ra_width, v_x_mul +++/* [0x000025c8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 +++/* [0x000025d0] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x000025d8] */ 0x8c5c31c6, 0xdc025460, // add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x000025e0] */ 0x0c9c71c0, 0xd00214a7, // add rb_lcount, r0, 7 +++/* [0x000025e8] */ 0x119c81c0, 0xd0020827, // shl r0, r0, v_dma_h_shift +++/* [0x000025f0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 +++/* [0x000025f8] */ 0x119cf1c0, 0xd0020827, // shl r0, r0, v_dma_wh_shift +++/* [0x00002600] */ 0x8c81b1f6, 0x100256a0, // add rb_dma0, r0, rb_dma0_base ; mov r0, unif +++/* [0x00002608] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif +++/* [0x00002610] */ 0x915031f6, 0xde024223, // shl ra8, r0, 3 ; mov r3, ra_k255 +++/* [0x00002618] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 +++/* [0x00002620] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d +++/* [0x00002628] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c +++/* [0x00002630] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 +++/* [0x00002638] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d +++/* [0x00002640] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c +++/* [0x00002648] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00 +++/* [0x00002650] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d +++/* [0x00002658] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c +++/* [0x00002660] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40 +++/* [0x00002668] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d +++/* [0x00002670] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c +++/* [0x00002678] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 +++/* [0x00002680] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +++/* [0x00002688] */ 0x90227383, 0x1c424044, // ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 +++/* [0x00002690] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500 +++/* [0x00002698] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026a0] */ 0x90227383, 0x1c524045, // ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 +++/* [0x000026a8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 +++/* [0x000026b0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026b8] */ 0x90227383, 0x1c624046, // ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 +++/* [0x000026c0] */ 0x954a0dbf, 0x10084597, // mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif +++/* [0x000026c8] */ 0x01010000, 0xe0020867, // mov r1,0x01010000 +++/* [0x000026d0] */ 0x10227380, 0x1e020827, // ror r0, r1, ra8.8d +++/* [0x000026d8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link +++/* [0x000026e0] */ 0x90227383, 0x1c724047, // ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 +++/* [0x000026e8] */ 0x1158ddc0, 0x14020827, // shl r0, ra_wt_off_l0, rb_wt_den_p15 +++/* [0x000026f0] */ 0x8f8091f6, 0xd002531e, // asr rb_wt_off, r0, 9 ; mov ra_link, unif +++// ::mc_filter_y10_pxx +++/* [0x000026f8] */ 0xfffffe18, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002700] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002708] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002710] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00002718] */ 0x11581dc0, 0xd21205a7, // shl ra_wt_mul_l0, ra_wt_mul_l0, 1 +++// :1 +++/* [0x00002720] */ 0xcd511bee, 
0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002728] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002730] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002738] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002740] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002748] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002750] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002758] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002760] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00002768] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002770] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002778] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x00002780] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00002788] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x00002790] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x00002798] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x000027a0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x000027a8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x000027b0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x000027b8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x000027c0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x000027c8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x000027d0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x000027d8] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x000027e0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x000027e8] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x000027f0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x000027f8] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00002800] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00002808] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00002810] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002818] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00002820] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00002828] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00002830] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00002838] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00002840] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, 
rb4 +++/* [0x00002848] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00002850] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00002858] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00002860] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 +++/* [0x00002868] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00002870] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00002878] */ 0x405a700e, 0x120049e1, // nop ; mul24 r1, r1, ra_wt_mul_l0 +++/* [0x00002880] */ 0x8c5cc3f6, 0x1c024863, // add r1, r1, rb_wt_off ; mov r3, ra_blk_height +++/* [0x00002888] */ 0xf14083f3, 0xd2024860, // shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x00002890] */ 0xfffffe70, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002898] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x000028a0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x000028a8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x000028b0] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x000028b8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x000028c0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x000028c8] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x000028d0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x000028d8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x000028e0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b +++/* [0x000028e8] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x000028f0] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x000028f8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_p00 +++/* [0x00002900] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002908] */ 0x15567d80, 0x14120567, // mov ra_xshift, ra_xshift_next +++/* [0x00002910] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 +++/* [0x00002918] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift +++/* [0x00002920] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 +++/* [0x00002928] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_max_x +++/* [0x00002930] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 +++/* [0x00002938] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 +++/* [0x00002940] */ 0x8d8105f6, 0x1002589a, // sub r2, r2, rb_pitch ; mov ra_base_next, unif +++/* [0x00002948] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a +++/* [0x00002950] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++/* [0x00002958] */ 0x8c827076, 0x10025810, // add r0, r0, r1 ; mov ra_width_height, unif +++/* [0x00002960] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init +++/* [0x00002968] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift +++/* [0x00002970] */ 0x8d418e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++/* [0x00002978] */ 0x8d5c41c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +++/* [0x00002980] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +++/* [0x00002988] */ 0x8c827076, 0x10025816, // add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif +++/* [0x00002990] */ 0x9180f1f6, 0xd0024817, // shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif +++/* [0x00002998] */ 0x0c9db1c0, 
0x100216a7, // add rb_dma0, r0, rb_dma0_base +++/* [0x000029a0] */ 0xf158dddb, 0x14024825, // shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 +++/* [0x000029a8] */ 0x8f8011f6, 0xd002531e, // asr rb_wt_off, r0, 1 ; mov ra_link, unif +++// :1 +++/* [0x000029b0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +++/* [0x000029b8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +++/* [0x000029c0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x000029c8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x000029d0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x000029d8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x000029e0] */ 0x8c616c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +++/* [0x000029e8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +++/* [0x000029f0] */ 0x915cd3f6, 0xdc024863, // shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x000029f8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00002a00] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002a08] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002a10] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002a18] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002a20] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002a28] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002a30] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002a38] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002a40] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002a48] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002a50] */ 0xffffff40, 0xf0f809e7, // brr -, r:1b +++/* [0x00002a58] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002a60] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002a68] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_bxx +++/* [0x00002a70] */ 0xfffffaa0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002a78] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002a80] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002a88] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++// :1 +++/* [0x00002a90] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002a98] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002aa0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002aa8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002ab0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002ab8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002ac0] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002ac8] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002ad0] */ 0x9221e5f6, 0x10025887, // min r2, r2, rb_max_y ; mov ra7, ra8 +++/* [0x00002ad8] */ 0x4c441dd3, 0xd2124462, // add ra_y2, 
ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002ae0] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002ae8] */ 0x8c243ff6, 0x100279c8, // add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++/* [0x00002af0] */ 0x540163f0, 0x18024863, // and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++/* [0x00002af8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++/* [0x00002b00] */ 0x40038031, 0xd80109e3, // nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++/* [0x00002b08] */ 0x40037031, 0xda0109e2, // nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++/* [0x00002b10] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++/* [0x00002b18] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++/* [0x00002b20] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++/* [0x00002b28] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++/* [0x00002b30] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++/* [0x00002b38] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++/* [0x00002b40] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++/* [0x00002b48] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++/* [0x00002b50] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++/* [0x00002b58] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++/* [0x00002b60] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++/* [0x00002b68] */ 0x40071031, 0xde0109e3, // nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++/* [0x00002b70] */ 0x8d288bf6, 0xd00279c9, // sub.setf -, r5, 8 ; mov ra9, ra10 +++/* [0x00002b78] */ 0x4d0894fe, 0x180248a0, // sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++/* [0x00002b80] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002b88] */ 0x5508affe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++/* [0x00002b90] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11 +++/* [0x00002b98] */ 0x8f1c25f6, 0xd00242cb, // asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++/* [0x00002ba0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++/* [0x00002ba8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++/* [0x00002bb0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4 +++/* [0x00002bb8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5 +++/* [0x00002bc0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++/* [0x00002bc8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7 +++/* [0x00002bd0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off +++/* [0x00002bd8] */ 0x4d512bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++/* [0x00002be0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14 +++/* [0x00002be8] */ 0x405a700e, 0x120049e0, // nop ; mul24 r0, r1, ra_wt_mul_l0 +++/* [0x00002bf0] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 +++/* [0x00002bf8] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height +++/* [0x00002c00] */ 0xf14083f3, 0xd2024860, // 
shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++/* [0x00002c08] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002c10] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002c18] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002c20] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002c28] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002c30] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002c38] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002c40] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002c48] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002c50] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002c58] */ 0xfffffe18, 0xf0f809e7, // brr -, r:1b +++/* [0x00002c60] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002c68] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002c70] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init +++// ::mc_filter_y10_b00 +++/* [0x00002c78] */ 0xfffff898, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 +++/* [0x00002c80] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num +++/* [0x00002c88] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 +++/* [0x00002c90] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++/* [0x00002c98] */ 0x00000007, 0xe0020827, // mov r0, 7 +++/* [0x00002ca0] */ 0x0d9d1e00, 0x10021467, // sub rb_i_tmu, rb_i_tmu, r0 +++/* [0x00002ca8] */ 0x0d9d2e00, 0x100214a7, // sub rb_lcount, rb_lcount, r0 +++/* [0x00002cb0] */ 0x95588ff6, 0xd0024821, // mov r0, 8 ; mov r1, ra_wt_off_mul_l0 +++/* [0x00002cb8] */ 0x119cce00, 0x10021327, // shl rb_wt_off, rb_wt_off, r0 +++/* [0x00002cc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +++// :1 +++/* [0x00002cc8] */ 0xcd511bee, 0xba0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++/* [0x00002cd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++/* [0x00002cd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch +++/* [0x00002ce0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 +++/* [0x00002ce8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++/* [0x00002cf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++/* [0x00002cf8] */ 0x8c613cbf, 0x10029e19, // add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++/* [0x00002d00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 +++/* [0x00002d08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y +++/* [0x00002d10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++/* [0x00002d18] */ 0x8c656c87, 0x10024f20, // add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask +++/* [0x00002d20] */ 0x545963c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +++/* [0x00002d28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +++/* [0x00002d30] */ 0x0c9e7040, 0x10020867, // add r1, r0, r1 +++/* [0x00002d38] */ 0x915cc3f6, 0xdc024863, // shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height +++/* [0x00002d40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++/* [0x00002d48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b +++/* [0x00002d50] */ 
0x0f9cd3c0, 0x10020867, // asr r1, r1, rb_wt_den_p15 +++/* [0x00002d58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait +++/* [0x00002d60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++/* [0x00002d68] */ 0x959da03f, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 +++/* [0x00002d70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link +++/* [0x00002d78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 +++/* [0x00002d80] */ 0x8d9d70ff, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, rb_dest +++/* [0x00002d88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 +++/* [0x00002d90] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 +++/* [0x00002d98] */ 0xffffff10, 0xf0f809e7, // brr -, r:1b +++/* [0x00002da0] */ 0x0c9dae40, 0x100216a7, // add rb_dma0, rb_dma0, r1 +++/* [0x00002da8] */ 0x0c9d7e80, 0x100215e7, // add rb_dest, rb_dest, r2 +++/* [0x00002db0] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb_vpm_init + +// ::mc_end + +}; + +#ifdef __HIGHC__ +@@ -16421,35 +25062,79 @@ index 0000000..0898ecd + +#endif + diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h + new file mode 100644 +-index 0000000..d17b9fd ++index 0000000000..82bf380eb4 + --- /dev/null + +++ b/libavcodec/rpi_shader.h +-@@ -0,0 +1,19 @@ ++@@ -0,0 +1,63 @@ + +#ifndef rpi_shader_H + +#define rpi_shader_H + + + +extern unsigned int rpi_shader[]; + + +-+#define mc_setup_c (rpi_shader + 0) +-+#define mc_filter_uv (rpi_shader + 152) +-+#define mc_filter_uv_b0 (rpi_shader + 280) +-+#define mc_interrupt_exit8c (rpi_shader + 554) +-+#define mc_exit (rpi_shader + 582) +-+#define mc_exit_c (rpi_shader + 582) +-+#define mc_interrupt_exit12 (rpi_shader + 598) +-+#define mc_exit1 (rpi_shader + 634) +-+#define mc_setup (rpi_shader + 650) +-+#define mc_filter (rpi_shader + 942) +-+#define mc_filter_b (rpi_shader + 1094) +-+#define mc_end (rpi_shader + 1246) +++#define mc_setup_c_q0 (rpi_shader + 0) +++#define mc_start (rpi_shader + 0) +++#define mc_setup_c_qn (rpi_shader + 2) +++#define mc_filter_c_p (rpi_shader + 142) +++#define mc_filter_c_p_l1 (rpi_shader + 272) +++#define mc_filter_c_b (rpi_shader + 402) +++#define mc_sync_q0 (rpi_shader + 590) +++#define mc_sync_q1 (rpi_shader + 608) +++#define mc_sync_q2 (rpi_shader + 620) +++#define mc_sync_q3 (rpi_shader + 632) +++#define mc_sync_q4 (rpi_shader + 644) +++#define mc_sync_q5 (rpi_shader + 662) +++#define mc_sync_q6 (rpi_shader + 674) +++#define mc_sync_q7 (rpi_shader + 686) +++#define mc_sync_q8 (rpi_shader + 698) +++#define mc_sync_q9 (rpi_shader + 716) +++#define mc_sync_q10 (rpi_shader + 728) +++#define mc_sync_q11 (rpi_shader + 740) +++#define mc_exit_c_qn (rpi_shader + 752) +++#define mc_exit_y_qn (rpi_shader + 752) +++#define mc_exit_c_q0 (rpi_shader + 770) +++#define mc_exit_y_q0 (rpi_shader + 770) +++#define mc_setup_y_q0 (rpi_shader + 790) +++#define mc_setup_y_qn (rpi_shader + 792) +++#define mc_filter_y_pxx (rpi_shader + 1032) +++#define mc_filter_y_bxx (rpi_shader + 1162) +++#define mc_filter_y_p00 (rpi_shader + 1292) +++#define mc_filter_y_b00 (rpi_shader + 1382) +++#define mc_setup_c10_q0 (rpi_shader + 1462) +++#define mc_setup_c10_qn (rpi_shader + 1464) +++#define mc_filter_c10_p (rpi_shader + 1600) +++#define mc_filter_c10_p_l1 (rpi_shader + 1728) +++#define mc_filter_c10_b (rpi_shader + 1856) +++#define mc_sync10_q0 (rpi_shader + 2042) +++#define mc_sync10_q1 (rpi_shader + 2060) +++#define mc_sync10_q2 (rpi_shader + 2072) +++#define mc_sync10_q3 (rpi_shader + 2084) 
+++#define mc_sync10_q4 (rpi_shader + 2096) +++#define mc_sync10_q5 (rpi_shader + 2114) +++#define mc_sync10_q6 (rpi_shader + 2126) +++#define mc_sync10_q7 (rpi_shader + 2138) +++#define mc_sync10_q8 (rpi_shader + 2150) +++#define mc_sync10_q9 (rpi_shader + 2168) +++#define mc_sync10_q10 (rpi_shader + 2180) +++#define mc_sync10_q11 (rpi_shader + 2192) +++#define mc_exit_c10_q0 (rpi_shader + 2204) +++#define mc_exit_y10_q0 (rpi_shader + 2204) +++#define mc_exit_c10_qn (rpi_shader + 2224) +++#define mc_exit_y10_qn (rpi_shader + 2224) +++#define mc_setup_y10_q0 (rpi_shader + 2242) +++#define mc_setup_y10_qn (rpi_shader + 2244) +++#define mc_filter_y10_pxx (rpi_shader + 2494) +++#define mc_filter_y10_p00 (rpi_shader + 2624) +++#define mc_filter_y10_bxx (rpi_shader + 2716) +++#define mc_filter_y10_b00 (rpi_shader + 2846) +++#define mc_end (rpi_shader + 2926) + + + +#endif + diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm + new file mode 100644 +-index 0000000..aa3fe47 ++index 0000000000..ba6cc13a95 + --- /dev/null + +++ b/libavcodec/rpi_shader.qasm +-@@ -0,0 +1,1259 @@ ++@@ -0,0 +1,1741 @@ + + + +# The @ "mul_used", 0 annotations that occur by various mul blocks suppress + +# the warning that we are using rotation & ra/rb registers. r0..3 can be +@@ -16457,102 +25142,197 @@ index 0000000..aa3fe47 + +# local 4. As it happens this is what is wanted here as we do not want the + +# constants from the other half of the calc. + + +-+# register allocation +++# PREREAD is the number of requests that we have sitting in the TMU request +++# queue. + +# +-+# ra0...ra7 eight horizontal filter coefficients +++# There are 8 slots availible in the TMU request Q for tm0s requests, but +++# only 4 output FIFO entries and overflow is bad (corruption or crash) +++# (If threaded then only 2 out FIFO entries, but we aren't.) +++# In s/w we are effectively limited to the min vertical read which is >= 4 +++# so output FIFO is the limit. + +# +-+# rb0 rx_shift2 +-+# rb1 rb_y2_next +-+# +-+# rb4...rb7 +-+# +-+# rb8..rb11, ra8...ra11 Y: eight filtered rows of context (ra11 == most recent) +-+# +-+# (ra15 isn't clamped to zero - this happens during the +-+# copy to ra14, and during its use in the vertical filter) +-+# +-+# rb8...rb11 eight vertical filter coefficients +++# However in the current world there seems to be no benefit (and a small +++# overhead) in setting this bigger than 2. 
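The FIFO constraint described above is what pins down the preload depth; a minimal C restatement for readers who prefer it spelled out (the constant names here are illustrative, not part of the patch):

/* 8 TMU request-queue slots, but only 4 output-FIFO entries when
 * unthreaded, and overflowing the FIFO risks corruption or a crash,
 * so the output FIFO is the binding limit on in-flight reads. */
#define TMU_REQ_SLOTS      8
#define TMU_OUT_FIFO_SLOTS 4
#define PREREAD_LIMIT (TMU_OUT_FIFO_SLOTS < TMU_REQ_SLOTS ? \
                       TMU_OUT_FIFO_SLOTS : TMU_REQ_SLOTS)  /* == 4, matching .set PREREAD below */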
+ + +-+# ra4 y: Fiter, UV: part -of b0 -> b stash +++.set PREREAD, 4 + + +-+# rb12 offset to add before shift (round + weighting offsets) +-+# rb13 shift: denom + 6 + 9 +-+# rb14 L0 weight (U on left, V on right) +-+# rb15 -- free -- +-+# +-+# ra16 width:height +-+# ra17 ra_y:ra_xshift +-+# ra18 L1 weight (Y) +-+# ra19 ra_y_next:ra_xshift_next +-+# +-+# rb16 pitch +-+# rb17 height + 1 +-+# rb18 max(height,16) + 3 +-+# rb19 frame_base2_next +-+# +-+# ra20 1 +-+# ra21 ra_y2_next:ra_y2 (luma); free (chroma) +-+# ra22 ra_k256 256 +-+# ra23 0 +-+# +-+# rb20 -- free -- +-+# rb21 -- free -- +-+# rb22 rb_k255 255 +-+# rb23 dest (Y) +-+# +-+# rb24 vdw_setup_1(dst_pitch) +-+# rb25 frame width-1 +-+# rb26 height<<23 + width<<16 + vdw_setup_0 +-+# rb27 vdw_setup_0 (depends on QPU number) +-+# rb28 vpm_setup (depends on QPU number) for writing 8bit results into VPM +-+# rb29 vdw_setup_1(dst_pitch-width) +-+# rb30 frame height-1 +-+# rb31 used as temp to count loop iterations +-+# +-+# ra24 src frame base +-+# ra25 src frame base 2 +-+# ra26 next ra24 +-+# ra27 next ra25 +-+# ra28 -- free -- +-+# ra29 -- free -- +++# Block heights - 8 & 16 are the only numbers we currently support +++ +++.set C_BLK_HEIGHT_8, 16 +++.set C_BLK_HEIGHT_16, 8 +++.set Y_BLK_HEIGHT_8, 16 +++.set Y_BLK_HEIGHT_16, 8 +++ +++# QPU counts - depend on block size +++# If we have a 2-byte format & block_size > 8 then can only afford +++# 8 QPUs +++# These numbers must match the numbers in rpi_shader_cmd.h +++ +++.set N_QPU_8, 12 +++.set N_QPU_16, 12 +++ +++# register allocation + +# +-+# Use an even numbered register as a link register to avoid corrupting flags +-+# ra30 next kernel address +-+# ra31 chroma-B height+3; free otherwise + + +-+.set rb_max_x, rb25 +-+.set rb_max_y, rb30 +-+.set rb_pitch, rb16 +++# ra0-3 +++# Used as temp and may be loop filter coeffs (split into .8s) +++# or temp in loop. Check usage on an individual basis. +++ +++# ra4-7 +++# C: L0 H filter out FIFO +++# otherwise -- free -- +++ +++# ra8-11 +++# temp in some places - check usage +++# Y: (with rb8-11) horiz out FIFO +++ +++# ra12-15 +++# -- free -- +++ +++# uniform: width:height + +.set ra_width_height, ra16 + +.set ra_width, ra16.16b + +.set ra_height, ra16.16a +-+.set ra_y2, ra21.16a +-+.set ra_y2_next, ra21.16b + + +-+.set rb_base2_next, rb19 +++# y:y2 same layout as y_y2_next so we can update both together +++.set ra_y_y2, ra17 +++.set ra_y2, ra17.16a +++.set ra_y, ra17.16b + + +-+.set rb_dest, rb23 +++# uniform: L1 weight (U on left, V on right) +++# Only used in Y B +++.set ra_wt_off_mul_l1, ra18 +++.set ra_wt_off_l1, ra18.16b +++.set ra_wt_mul_l1, ra18.16a +++ +++# y_next:y2_next same layout as y_y2 so we can update both together +++.set ra_y_y2_next, ra19 +++.set ra_y_next, ra19.16b +++.set ra_y2_next, ra19.16a +++ +++# Setup: consts - subdivide a single register +++.set ra_kff100100, ra20 +++.set ra_k256, ra20.16a +++.set ra_k0, ra20.8a +++.set ra_k1, ra20.8b +++.set ra_k16, ra20.8c +++.set ra_k255, ra20.8d +++ +++# Loop: xshifts +++.set ra_xshift, ra21.16a +++.set ra_xshift_next, ra21.16b +++ +++# Loop var: L0 weight (U on left, V on right) +++# _off_ is not used in loop as we want to modify it before use +++.set ra_wt_off_mul_l0, ra22 +++.set ra_wt_mul_l0, ra22.16a +++.set ra_wt_off_l0, ra22.16b +++ +++# Max pel value (for 8 bit we can get away with sat ops but not 9+) +++# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the +++# 2nd byte but as the source should never be > 3 there 0x3ff should do +++.set ra_blk_height_pmax, ra23 +++.set ra_pmax, ra23.16a +++.set ra_blk_height, ra23.8c +++# -- free -- ra23.8d +++ +++# Loop: src frame base (L0) + +.set ra_base, ra24 +-+.set ra_base_next, ra26 +-+.set ra_xshift, ra17.16a + + +++# Loop: src frame base (L1) + +.set ra_base2, ra25 + + +-+# Note ra_xy & ra_xy_next should have same structure! +-+.set ra_xshift_next, ra19.16a +++# Loop: next src frame base (L0) +++.set ra_base_next, ra26 +++ +++# -- free -- ra27 +++# -- free -- ra28 +++# -- free -- ra29 +++ +++# Use an even numbered register as a link register to avoid corrupting flags +++.set ra_link, ra30 +++ +++# -- free -- ra31 +++ + +.set rb_xshift2, rb0 + +.set rb_xshift2_next, rb1 + + +-+.set ra_y_next, ra19.16b +-+.set ra_y, ra17.16b +++# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2 +++.set rb_elem_x, rb2 +++ +++# El Flags +++# After adding to self we to have el even/odd on nc/c and lo/hi on nn/n +++.set rb_ef, rb3 +++ +++# rb4-7 +++# C-B: L1 H filter out FIFO +++# Y: (with ra2.8x) Y vertical filter coeffs + + +-+.set ra_k1, ra20 +++# rb8-11 +++# C: Vertical filter coeffs +++# Y: (with ra8-11) horiz out FIFO +++ +++# Loop var: offset to add before shift (round + weighting offsets) +++# Exact value varies by loop +++.set rb_wt_off, rb12 +++ +++# Setup: denom + 6 + 9 +++.set rb_wt_den_p15, rb13 +++ +++# -- free -- rb14 +++# -- free -- rb15 +++ +++# Line pitch (128 for sand128) +++.set rb_pitch, rb16 +++ +++# Loop count - 2 (set up TMU for next xfer) +++.set rb_i_tmu, rb17 +++ +++# Loop count for min(height, 16) +++# Y will reset & loop again if height > 16 +++.set rb_lcount, rb18 +++ +++# frame_base2_next +++.set rb_base2_next, rb19 +++ +++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give +++# offset to the slice + +.set rb_xpitch, rb20 +-+.set rb_k255, rb22 +-+.set ra_k256, ra22 +-+.set ra_k0, ra23 + + +-+.set ra_link, ra30 +++# -- free -- rb21 +++ +++# Setup: 0xff (8-bit) / 0xffff (9+ bit) +++.set rb_pmask, rb22 +++ +++# Loop: destination address +++.set rb_dest, rb23 +++ +++# vdw_setup_1(dst_pitch) +++.set rb_dma1_base, rb24 +++ +++# Setup: pic width - 1 +++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. +++.set rb_max_x, rb25 +++ +++# Loop: height<<23 + width<<16 + vdw_setup_0 +++.set rb_dma0, rb26 +++ +++# vdw_setup_0 (depends on QPU number) +++.set rb_dma0_base, rb27 +++ +++# Setup: vw_setup value to reset VPM write pointer +++.set rb_vpm_init, rb28 +++ +++# Loop: vdw_setup_1(dst_pitch-width) = stride +++.set rb_dma1, rb29 +++ +++# Setup: pic_height - 1 +++.set rb_max_y, rb30 +++ +++# -- free -- rb31 +++ +++ +++ + + + +# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
+ +.set i_shift16, -16 +@@ -16564,8 +25344,10 @@ index 0000000..aa3fe47 + +# Macros that express this - obviously these can't be overlapped + +# so are probably unsuitable for loop code + + +-+.macro m_calc_dma_regs, r_vpm, r_dma +++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma + + mov r2, qpu_num +++.if v_bit_depth <= 8 +++ # 8 bit version + + asr r1, r2, 2 + + shl r1, r1, 6 + + and r0, r2, 3 +@@ -16576,811 +25358,983 @@ index 0000000..aa3fe47 + + + + mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later + + shl r0, r0, 5 +-+ add r_dma, r0, r1 # DMA out +-+.endm + + +-+# For chroma use packed H = (qpu_num & 1), Y = (qpu_num >> 1) * 16 +-+.macro m_calc_dma_regs_c, r_vpm, r_dma +-+ mov r2, qpu_num +++.else +++ # 16 bit version +++ # Limited to 8 QPUs if blk height > 8 + + asr r1, r2, 1 +++.if v_blk_height <= 8 +++ shl r1, r1, 4 +++.else + + shl r1, r1, 5 +++.endif + + and r0, r2, 1 + + or r0, r0, r1 + + +-+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit +-+ add r_vpm, r0, r1 # VPM 8bit storage +++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR +++ add r_vpm, r0, r1 + + + + # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into + + # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) +-+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later +++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later + + shl r0, r0, 6 +++.endif + + add r_dma, r0, r1 # DMA out + +.endm + + + + +++.macro m_setup_q0 +++ srel -, 12 +++.endm +++ +++# Code start label +++::mc_start +++ + +################################################################################ + +# mc_setup_uv(next_kernel, x, y, ref_c_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id) +-+::mc_setup_c +-+ mov tmurs, 1 ; mov -, unif # No swap TMUs ; Next fn (ignored) +++ +++.macro m_setup_c, v_bit_depth +++ +++# Cannot use mul24 on x as x might be -ve, so must use shift +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 +++.set v_pmask, 0xff +++.set v_blk_height, C_BLK_HEIGHT_8 +++.else +++.set v_x_shift, 2 +++.set v_pmask, 0xffff +++.set v_blk_height, C_BLK_HEIGHT_16 +++.endif +++ +++ mov tmurs, 1 # No swap TMUs + + + +# Load first request location +-+ mov ra0, unif # next_x_y +++ mov ra0, unif # next_x_y +++ +++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++ shl rb_ef, r0, i_shift30 + + + + mov ra_base, unif # Store frame c base + + + +# Read image dimensions +-+ sub rb_max_x, unif, 1 # pic c width +-+ sub rb_max_y, unif, 1 # pic c height +++ sub r0, unif, 1 # pic c width +++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes +++ sub rb_max_y, unif, 1 # pic c height + + + +# load constants +-+ mov ra_k1, 1 +-+ mov ra_k256, 256 +-+ mov rb_k255, 255 +-+ mov ra_k0, 0 +-+ +-+# touch registers to keep simulator happy +++ mov ra_kff100100, 0xff100100 +++ mov rb_pmask, v_pmask +++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) + + +-+ # ra/b4..7: B0 -> B stash registers +-+ mov ra4, 0 ; mov rb4, 0 +-+ mov ra5, 0 ; mov rb5, 0 +-+ mov ra6, 0 ; mov rb6, 0 +-+ mov ra7, 0 ; mov rb7, 0 +-+ +-+ mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_base +-+ +-+# ; ra12..15: vertical scroll registers + +# get source pitch +-+ mov rb_xpitch, unif ; mov ra12, 0 # stride2 +-+ mov rb_pitch, unif ; mov ra13, 0 # stride1 +-+ mov r0, elem_num ; mov ra14, 0 +-+# get destination vdw setup +-+ add rb24, r1, rb_pitch ; mov ra15, ra_k0 # 
vdw_setup_1 +++ mov rb_xpitch, unif # stride2 +++ mov rb_pitch, unif # stride1 +++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly +++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 +++ +++ and r0, 1, elem_num +++ nop ; mul24 r0, r0, 5 +++.if v_bit_depth <= 8 +++ add rb_elem_x, r0, elem_num +++.else +++ add r0, r0, elem_num +++ add rb_elem_x, r0, r0 +++.endif + + + +# Compute base address for first and second access + +# ra_base ends up with t0s base + +# ra_base2 ends up with t1s base + + +-+ add r0, r0, ra0.16b # Add elem no to x to get X for this slice +++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] +++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice + + max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y + + min r0, r0, rb_max_x + + + +# Get shift +-+ and r1, r0, 1 +-+ shl ra_xshift_next, r1, 4 +-+ +-+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs +-+ +-+ and r0, r0, -2 +-+ add r0, r0, r0 ; v8subs r1, r1, r1 +-+ sub r1, r1, rb_pitch +++# Shift will always calculate as 0 for 9+ bit +++# Ideally we can optimize the shift out of the code in these cases but for now +++# it is tidier to leave it in +++.if v_bit_depth <= 8 +++ shl ra_xshift_next, r0, 3 +++.else +++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 +++.endif +++ +++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to +++ +++.if v_bit_depth <= 8 +++ and r0, r0, -4 +++.endif +++ sub r1, ra_k0, rb_pitch + + and r1, r0, r1 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra_y +++ add r0, r0, r1 + + add ra_base, ra_base, r0 + + +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+# submit texture requests for first line +-+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t0s, ra_base, r0 +-+ +-+# submit texture requests for 2nd line +-+ +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+ add ra_y, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t0s, ra_base, r0 +-+ +-+ add rb13, 9, unif # denominator +-+ mov -, unif # Unused +++ add rb_wt_den_p15, 23 - v_bit_depth, unif # denominator + + + +# Compute part of VPM to use for DMA output +-+ m_calc_dma_regs_c rb28, rb27 +++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? 
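The and/xor/mul24 sequence a few instructions back is the sand-format address split: the x byte offset is divided into a position within a stripe and a stripe base scaled by stride2 ("(x&mask)*xpitch will give offset to the slice", per the register notes earlier). A hedged C equivalent of my reading, with illustrative names:

#include <stdint.h>

/* Sand layout: the frame is tiled into vertical stripes rb_pitch bytes
 * wide (a power of two, 128 for sand128). The byte offset of column x is
 * its offset within the stripe plus the stripe base scaled by xpitch
 * (the stride2 uniform). */
static uint32_t sand_col_offset(uint32_t x, uint32_t pitch, uint32_t xpitch)
{
    uint32_t stripe = x & ~(pitch - 1);  /* and r1, r0, r1  (r1 = 0 - rb_pitch) */
    uint32_t within = x ^ stripe;        /* xor r0, r0, r1 */
    return within + stripe * xpitch;     /* mul24 r1, r1, rb_xpitch ; add */
}

Rows within a stripe then advance by rb_pitch, which is why the y*rb_pitch terms used in the loops stay valid across stripe boundaries.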
+++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + +-+# ----------------- + +# And again for L1, but only worrying about frame2 stuff + + +-+ mov ra_link, unif # Next fn +-+ + +# Load first request location +-+ mov ra0, unif # next_x_y +++ mov ra0, unif # next_x_y + + +-+ mov ra_base2, unif # Store frame c base +++ mov ra_base2, unif # [ra0 delay] Store frame c base + + + +# Compute base address for first and second access + +# ra_base ends up with t0s base + +# ra_base2 ends up with t1s base + + +-+ mov ra_y2, ra0.16a # Store y +-+ mov r0, ra0.16b # Load x +-+ add r0, r0, elem_num # Add QPU slice +-+ max r0, r0, 0 ; mov -, unif # Unused 0 +-+ min r0, r0, rb_max_x ; mov -, unif # Unused 1 +++ shl r0, ra0.16b, v_x_shift +++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset +++ max r0, r0, 0 +++ min r0, r0, rb_max_x + + +-+# Get shift +-+ and r1, r0, 1 ; mov -, unif # Unused 2 +-+ shl rb_xshift2_next, r1, 4 +++# Get shift (already zero if 9+ bit so ignore) +++.if v_bit_depth <= 8 +++ shl rb_xshift2_next, r0, 3 +++.endif + + + +# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs + + +-+ and r0, r0, -2 +-+ add r0, r0, r0 ; v8subs r1, r1, r1 +-+ sub r1, r1, rb_pitch +++.if v_bit_depth <= 8 +++ and r0, r0, -4 +++.endif +++ sub r1, ra_k0, rb_pitch + + and r1, r0, r1 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra_y2 +++ add r0, r0, r1 ; mov r2, ra_y2 + + add ra_base2, ra_base2, r0 + + +-+ max r0, r1, 0 +-+ min r0, r0, rb_max_y +-+ +-+# submit texture requests for first line +-+ add r1, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t1s, ra_base2, r0 ; mov -, unif # Unused 3 +++# Do preloads +++# r0 = ra_y, r2 = ra_y2 +++ mov r3, PREREAD ; mov r0, ra_y + + +-+# submit texture requests for 2nd line +++:1 +++ sub.setf r3, r3, 1 +++ max r1, r0, 0 +++ min r1, r1, rb_max_y +++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t0s, ra_base, r1 ; mov ra_y, r0 + + +-+ max r0, r1, 0 ; mov -, unif # Unused 4 +++ max r1, r2, 0 +++ brr.anynz -, r:1b +++ min r1, r1, rb_max_y +++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t1s, ra_base2, r1 ; mov ra_y2, r2 +++# >>> .anynz 1b + + +++ mov ra_link, unif # link +++# touch registers to keep simulator happy +++ # ra/b4..7: B0 -> B stash registers +++ mov ra4, 0 ; mov rb4, 0 + + bra -, ra_link +-+ +-+ min r0, r0, rb_max_y ; mov -, unif # Unused 5 +-+ add ra_y2, r1, ra_k1 ; mul24 r0, r0, rb_pitch +-+ add t1s, ra_base2, r0 +-+ +++ mov ra5, 0 ; mov rb5, 0 +++ mov ra6, 0 ; mov rb6, 0 +++ mov ra7, 0 ; mov rb7, 0 + +# >>> ra_link +-+ +-+ +-+.macro setf_nz_if_v +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + +.endm + + +++::mc_setup_c_q0 +++ m_setup_q0 +++::mc_setup_c_qn +++ m_setup_c 8 + + + +################################################################################ + + +-+# mc_filter_uv(next_kernel, x, y, frame_u_base, frame_v_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) +++# mc_filter_uv(next_kernel, x, y, frame_c_base, width_height, hcoeffs, vcoeffs, offset_weight_u, offset_weight_v, this_u_dst, this_v_dst) + + + +# At this point we have already issued two pairs of texture requests for the current block + +# ra_x, ra_x16_base point to the current coordinates for this block +-+::mc_filter_uv +-+ mov ra_link, unif ; mov vw_setup, rb28 # ; x_y + + +-+# per-channel shifts were calculated on the *previous* invocation +++.macro m_filter_c_p, v_tmu, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 
+++.set v_x_mul, 2 +++.set v_v_shift, 8 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 2 +++.set v_x_mul, 4 +++.set v_v_shift, i_shift16 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++.if v_tmu == 0 +++.set vrx_xshift, rb_xshift2 # b side more convienient +++.set vrx_xshift_next, ra_xshift_next +++.set vra_y_next, ra_y_next +++.set vrx_base_next, ra_base_next +++.set vra_y, ra_y +++.set vra_base, ra_base +++.set vr_txs, t0s +++.else +++.set vrx_xshift, ra_xshift # a side more convienient +++.set vrx_xshift_next, rb_xshift2_next +++.set vra_y_next, ra_y2_next +++.set vrx_base_next, rb_base2_next +++.set vra_y, ra_y2 +++.set vra_base, ra_base2 +++.set vr_txs, t1s +++.endif + + +++# per-channel shifts were calculated on the *previous* invocation + +# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height +++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 +++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height +++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs +++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next +++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a + + +-+ shl ra_xshift_next, r0, 4 +-+ +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) +++.if v_bit_depth <= 8 +++ shl vrx_xshift_next, r0, 3 +++ and r0, r0, -4 +++.endif +++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
+ + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height +-+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs +++ add vrx_base_next, r3, r0 ; mov r1, ra_height + + + +# set up VPM write +-+ +-+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs +-+ add rb17, r1, 1 ; mov ra1, unif # ; U offset/weight +-+ add rb18, r1, 3 ; mov.ifnz ra1, unif # ; V offset/weight +++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight +++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight + + + +# ; unpack filter coefficients + + +-+ add r0, r0, r2 ; mov rb8, ra3.8a # Combine width and height of destination area +-+ shl r0, r0, 15 ; mov rb9, ra3.8b # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 ; mov r1, ra1.16b # ; r1=weight +++ shl r0, r1, v_dma_h_shift ; mov rb8, ra3.8a +++ add r0, r0, r2 ; mov rb9, ra3.8b # Combine width and height of destination area (r0=h<<8, r2=w*2) +++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight + + +-+ shl r1, r1, rb13 ; mov rb10, ra3.8c +-+ mov r3, 0 ; mov rb11, ra3.8d # Loop count +++ mov rb_dest, unif ; mov ra9, rb_max_y # dst_addr ; alias rb_max_y + + +-+ asr rb12, r1, 1 +-+ shl rb14, ra1.16a, 1 # b14 = weight*2 +++ shl r1, r1, rb_wt_den_p15 ; mov rb11, ra3.8d + + +-+# rb14 - weight L0 * 2 +-+# rb13 = weight denom + 6 + 9 +-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) +++ asr rb_wt_off, r1, 2 ; mov ra_link, unif # ; Link +++ sub ra3, rb_wt_den_p15, ra_k1 + + +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +++# r5 = 0 (loop counter) +++# ra9 = alias for rb_max_y +++# ra_wt_mul_l0 = weight L0 +++# ra3 = weight denom + 22 - bit_depth [= rb_wt_den_p15 - 1, max 19] +++# rb_wt_off = (offset * 2 + 1) << (ra3 - 1) +++ +++# We want (r0r1) +++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2U5 : ... +++# We fetch (after shift) +++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
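A hedged C model of the weighting tail of the loop that follows (single-ref chroma P, 8-bit path); the names mirror the register aliases above, and the QPU's 24-bit multiply is assumed not to overflow for in-range inputs:

/* v2d is the raw output of the 2-D filter for one pel. */
static int chroma_p_weight(int v2d, int wt_mul_l0, int rb_wt_off,
                           int ra3 /* = rb_wt_den_p15 - 1 */, int pmax)
{
    int v = (v2d * 256) >> 14;              /* mul24 ra_k256 + asr 14: >>6, dropping bad top bits */
    v = ((v * wt_mul_l0) << 8) + rb_wt_off; /* rb_wt_off = (offset*2 + 1) << (ra3 - 1), per above */
    v >>= ra3;                              /* asr r1, r1, ra3 */
    if (v < 0)    v = 0;                    /* max vpm, r1, ra_k0 */
    if (v > pmax) v = pmax;                 /* min r1, r1, ra_pmax */
    return v;                               /* written to the VPM */
}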
+ + +-+# r3 = 0 +-+:uvloop +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests + + +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+ +-+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+ min r2, r2, rb_max_y +-+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+ setf_nz_if_v +++.if v_tmu == 0 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment +++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next +++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next +++.else +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 # loop counter increment +++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next +++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y +++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next +++.endif +++ +++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 +++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 +++ min r3, r3, ra9 ; mov.ifnc r0, r2 +++ +++ mov ra4, ra5 ; mul24 r2, r3, rb_pitch +++ add vr_txs, vra_base, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + + + +# apply horizontal filter + +# The filter coeffs for the two halves of this are the same (unlike in the + +# Y case) so it doesn't matter which ra0 we get them from +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+ mov ra14, ra15 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop +-+ +-+# apply vertical filter and write to VPM +-+ +-+ sub r1, r1, r0 ; mul24 r0, ra14, rb10 +-+ add r1, r1, r0 ; mul24 r0, ra15, rb11 +++# Also as the two halves are locked together we don't need to separate the 1st +++# r0 mul or the last r1 mul as they are vaild for all QPUs +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++ sub.setf -, r5, 4 ; mul24 r0, ra0.8d, r1 +++ +++# V filter =- ra4 * rb8-+ ra5 * rb9 + ra6 * rb10 - ra7 * rb11 (post FIFO shift) +++# Have to dup block as we need to move the brr - code is more common than it +++# looks at first glance +++.if v_bit_depth <= 8 +++ brr.anyn -, r:1b +++ add r2, r2, r3 ; mov ra5, ra6 +++ mov ra6, ra7 ; mul24 r1, ra7, rb10 +++ sub ra7, r2, r0 ; mul24 r0, ra4, rb8 +++.else +++ add r2, r2, r3 ; mov ra5, ra6 +++ brr.anyn -, r:1b +++ mov ra6, ra7 ; mul24 r1, ra7, rb10 +++ sub r2, r2, r0 ; mul24 r0, ra4, rb8 +++ asr ra7, r2, v_bit_depth - 8 +++.endif +++# >>> .anyn 1b +++ +++ sub r1, r1, r0 ; mul24 r0, ra5, rb9 # 
[ra7 delay] +++ add r1, r1, r0 ; mul24 r0, ra7, rb11 + + sub r1, r1, r0 +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 + + asr r1, r1, 14 +-+ nop ; mul24 r1, r1, rb14 +-+ shl r1, r1, 8 +++ nop ; mul24 r1, r1, ra_wt_mul_l0 +++ shl r1, r1, 8 ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, ra3 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> .anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm + + +-+ add r1, r1, rb12 +-+ asr ra1.8as, r1, rb13 +-+ nop ; mov r1, r1 << 8 +-+ brr.anyn -, r:uvloop +-+ asr ra1.8bs, r1, rb13 +-+ mov -, vw_wait +-+ mov vpm, ra1 +++# At 10 bits +++# Worst case +ve after 1st filter = 74 * 0x3ff >> 2 = 18925 0x49ed (15 bits) +++# Worst case -ve after 1st filter = -10 * 0x3ff >> 2 = -10230 +++# after 2nd (really we can't get this) = 74 * 18925 + 10 * 10230 >> 6 = 23480 = 0x5bb8 (15 bits) +++# (P) +++# * weight (255) = 5987400 = 0x5b5c48 (23 bits) +++# + 0x3ff << (13 - bit_depth + 7) = 0x6b5848 (23 bits) +++# ... should be OK +++# +++# (B) +++# *2 (L0+L1) = 5963920 = 0x5b0090 (23 bits) +++# + (offset * 2 + 1) << (15 - bit_depth + 7) = 5963920 + (0x3ff << 12) = 5963920 + 4190208 = 10154128 = 0x9af090 (24 bits) +++# So signed overflow if we sign extend here :-( +++# +++# In practice this doesn't happen (we need a maximal offset and a very unlucky +++# filter). +++# +++# This could be fixed by offsetting the filters s.t. 
they are unsigned until +++# weight mul and then removing the offset with the weighting offset (I think +++# this should work) or splitting the rounding & offsetting + + +-+# >>> +++::mc_filter_c_p +++ m_filter_c_p 0, 8 + + +-+# DMA out for U & stash for V +-+ bra -, ra_link +-+ mov vw_setup, rb26 +-+ mov vw_setup, rb29 +-+ mov vw_addr, unif # u_dst_addr +-+# >>> +++::mc_filter_c_p_l1 +++ m_filter_c_p 1, 8 + + + +################################################################################ + + +-+# mc_filter_uv_b0(next_kernel, x, y, frame_c_base, height, hcoeffs[0], hcoeffs[1], vcoeffs[0], vcoeffs[1], this_u_dst, this_v_dst) +++# mc_filter_c_b + + + +# At this point we have already issued two pairs of texture requests for the current block + +# ra_x, ra_x16_base point to the current coordinates for this block +-+::mc_filter_uv_b0 +-+ mov -, unif ; mov vw_setup, rb28 # next_fn ignored - always uv_b +++ +++.macro m_filter_c_b, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 1 +++.set v_v_shift, 8 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 2 +++.set v_v_shift, i_shift16 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++.set v_x_mul, (1 << v_x_shift) + + + +# per-channel shifts were calculated on the *previous* invocation + + + +# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, 0 ; mov rb_xshift2, ra_xshift_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov ra1, unif # ; width_height +++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 +++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a +++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height +++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next +++ min r0, r0, rb_max_x ; mov ra0, unif # L0 H filter coeffs + + +-+ shl ra_xshift_next, r0, 4 +++.if v_bit_depth <= 8 +++ shl ra_xshift_next, r0, 3 +++.endif + + +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y_next, ra2.16a +-+ and r1, r0, r1 ; mul24 r2, ra1.16b, 2 # r2=x*2 (we are working in pel pairs) +++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs +++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov r1, ra1.16a # Add stripe offsets ; r1=height +-+ add ra_base_next, r3, r0 ; mul24 r0, r1, ra_k256 +++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height +++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B + + + +# set up VPM write + + +-+ sub rb29, rb24, r2 ; mov ra3, unif # Compute vdw_setup1(dst_pitch-width) ; V filter coeffs +-+ add rb17, r1, 1 +-+ add ra31, r1, 3 ; mov rb8, ra3.8a # Combine width and height of destination area +-+ +-+# ; unpack filter coefficients +-+ +-+ add r0, r0, r2 ; mov rb9, ra3.8b +-+ shl r0, r0, 15 ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 +-+ +-+ mov r3, 0 ; mov rb11, ra3.8d # Loop count +-+ +-+ mov rb14, 
unif # U weight +-+ mov.ifnz rb14, unif # V weight +-+ +-+# rb14 unused in b0 but will hang around till the second pass +-+ +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+# r3 = 0 +-+:uvloop_b0 +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu0 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y +-+ +-+ max r2, r3, 0 ; mov.ifz ra_base, ra_base_next +-+ min r2, r2, rb_max_y +-+ add ra_y, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 # Need to wait 1 cycle for rotated r1 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop_b0 +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13 +-+ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop_b0 +-+ +-+# apply vertical filter and write to B-FIFO +-+ +-+ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes +-+ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. 
ra15 write gap +-+ sub r1, r1, r0 ; mov ra7, rb6 +-+ +-+# FIFO goes: +-+# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b +-+# This arrangement optimizes the inner loop FIFOs at the expense of making the +-+# bulk shift between loops quite a bit nastier +-+# a8 used as temp +-+ +-+ sub.setf -, r3, ra31 +-+ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad +-+ brr.anyn -, r:uvloop_b0 +-+ mov ra5, rb4 ; mov rb4, ra4 +-+ mov ra4, rb5 ; mov rb5, ra6 +-+ mov ra6, rb7 ; mov rb7, ra8 +-+# >>> +-+ +-+# 1st half done all results now in the a/b4..7 fifo +-+ +-+# Need to bulk rotate FIFO for heights other than 16 +-+# plausible heights are 16, 12, 8, 6, 4, 2 and that is all we deal with +-+# we are allowed 3/4 cb_size w/h :-( +-+ +-+# Destination uniforms discarded +-+# At the end drop through to _b - we will always do b after b0 +-+ +-+ sub.setf -, 15, r3 # 12 + 3 of preroll +-+ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing) +-+ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr +-+ mov r0, i_shift16 ; mov ra_link, unif +-+ mov r1, 0x10000 +-+# >>> +-+ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially +-+# If h != 16 && h != 12 then h <= 8 so +-+# shift 8 with discard (.16b = .16a on all regs) +-+ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1 +-+ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1 +-+ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1 +-+# >>> +-+ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1 +-+ +-+ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N +-+# Shift 4 +-+ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5 +-+ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7 +-+ # If we shifted by 4 here then the max length remaining is 4 +-+ # so that is it +-+ +-+ brr -, r:uv_b0_post_fin +-+# Shift 2 +-+ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4 +-+ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5 +-+ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7 +-+ # 6 / 2 so need 6 outputs +-+# >>> +-+ +-+:uv_b0_post12 +-+# this one is annoying as we need to swap halves of things that don't +-+# really want to be swapped +-+ +-+# b7a, a6a, b5a, a4a +-+# b4a, a5a, b6a, a7a +-+# b7b, a6b, b5b, a4b +-+# b4b, a5b, b6b, a7b +-+ +-+ mov r2, ra6 ; mov r3, rb7 +-+ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1 +-+ mov ra5, r2 ; mov rb4, r3 +-+ +-+ mov r2, ra4 ; mov r3, rb5 +-+ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1 +-+ mov ra7, r2 ; mov rb6, r3 +-+ +-+:uv_b0_post_fin +-+ +-+##### L1 B processing +-+ +-+# per-channel shifts were calculated on the *previous* invocation +++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight +++ add rb_i_tmu, r1, 3 - PREREAD ; v8min r1, r1, ra_blk_height +++ add rb_lcount, r1, 3 ; mov.ifc ra_wt_mul_l0, unif # ; V weight + + +-+# get base addresses and per-channel shifts for *next* invocation +-+ mov ra2, unif ; mov r0, elem_num +++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 +++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base +++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov ra1, unif # ; H filter coeffs + + +-+ setf_nz_if_v # Also acts as delay slot for ra2 +++# L1 - uniform layout could possibly be optimized + + +-+ add r0, ra2.16b, r0 ; v8subs r1, r1, r1 # x ; r1=0 +-+ sub r1, r1, rb_pitch ; mov r3, unif # r1=pitch2 mask ; r3=base +-+ max r0, r0, ra_k0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B +-+ min r0, r0, rb_max_x ; mov -, unif # ; width_height +++ shl r0, ra3.16b, 
v_x_shift # r0=x*2 +++ add r0, r0, rb_elem_x ; mov ra3, unif # ; V filter coeffs +++ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight +++ max r0, r0, r5 ; mov rb8, ra3.8a # ; start unpacking filter coeffs +++ min r0, r0, rb_max_x ; mov rb9, ra3.8b + + +-+ shl rb_xshift2_next, r0, 4 +++.if v_bit_depth <= 8 +++ shl rb_xshift2_next, r0, 3 +++.endif + + +-+ and r0, r0, -2 ; mov ra0, unif # H filter coeffs +-+ add r0, r0, r0 ; mov ra_y2_next, ra2.16a +-+ and r1, r0, r1 ; mov ra3, unif # ; V filter coeffs +++ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight +++ and r1, r0, r1 ; mov rb10, ra3.8c + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 ; mov rb8, ra3.8a # Add stripe offsets ; start unpacking filter coeffs +++ add r0, r0, r1 ; mov rb_dest, unif # Add stripe offsets ; dst_addr + + add rb_base2_next, r3, r0 + + +-+ mov ra1, unif ; mov rb9, ra3.8b # U offset/weight +-+ mov.ifnz ra1, unif ; mov rb10, ra3.8c # V offset/weight +-+ +-+ nop ; mov rb11, ra3.8d +-+ shl r1, ra1.16b, rb13 ; v8subs r3, r3, r3 # ; r3 (loop counter) = 0 +-+ asr rb12, r1, 1 +-+ +-+# ra1.16a used directly in the loop +-+ +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +-+ +-+# r3 = 0 +-+ +-+:uvloop_b +++ mov ra9, rb_max_y ; mov rb11, ra3.8d +++ shl r1, ra_wt_off_l1, rb_wt_den_p15 +++ asr rb_wt_off, r1, 9 ; mov ra_link, unif # link +++ +++# r5 loop counter +++# ra0 H coeffs L0 +++# ra1 H coeffs L1 +++# ra2 V coeffs L0 +++# ra3 temp +++# ra4-7 L0 H FIFO +++# rb4-7 L1 H FIFO +++# rb8-rb11 V coeffs L1 +++# ra9 rb_max_y alias +++ +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu0 # loop counter increment +++ shr r2, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next +++ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next +++ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next +++ add ra_y, 1, ra_y ; mov r3, ra_y +++ +++ max r3, r3, ra_k0 ; mov r0, r1 << 15 +++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++ +++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++# L0 H-filter +++# H FIFO scrolls are spread all over this loop +++ mov rb4, rb5 ; mov ra4, ra5 # ? 
Just moves +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra0.8d, r1 +++.if v_bit_depth <= 8 +++ sub ra3, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++.else +++ sub r2, r2, r3 ; mov rb5, rb6 ; ldtmu1 +++ asr ra3, r2, (v_bit_depth - 8) +++.endif +++ +++ shr r2, r4, rb_xshift2 ; mov ra5, ra6 +++ shr r1, r2, v_v_shift ; mov r3, ra_y2 +++ add ra_y2, r3, ra_k1 ; mov rb6, rb7 +++ +++ max r3, r3, ra_k0 ; mov r0, r1 << 15 +++ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 +++ +++ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch +++ add t1s, ra_base2, r3 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++# L1 H-filter +++ +++ and r1, r1, rb_pmask ; mul24 r3, ra1.8a, r0 +++ nop ; mul24 r2, ra1.8b << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra1.8b << 12, r1 << 12 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ sub.setf -, r5, 4 ; mul24 r0, ra1.8d, r1 +++# V filters - start in branch delay slots of H +++# Final asr not needed for 8-bit but we can#t (currently) save a whole instruction +++ add r2, r2, r3 ; mul24 r1, rb5, ra2.8b +++ brr.anyn -, r:1b +++ mov ra6, ra7 ; mul24 r3, ra7, rb10 +++ sub r2, r2, r0 ; mul24 r0, rb4, ra2.8a +++ asr ra7, r2, (v_bit_depth - 8) ; mov rb7, ra3 +++# >>> .anyn 1b +++ +++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c # [rb7 delay] +++ add r1, r1, r0 ; mul24 r0, rb7, ra2.8d +++ sub r2, r1, r0 ; mul24 r0, ra4, rb8 +++ sub r1, r3, r0 ; mul24 r0, ra5, rb9 +++ add r1, r1, r0 ; mul24 r0, ra7, rb11 +++ sub r1, r1, r0 ; mul24 r2, r2, ra_k256 +++ +++ asr r2, r2, 14 ; mul24 r1, r1, ra_k256 +++ asr r1, r1, 14 ; mul24 r2, r2, ra_wt_mul_l0 +++ +++ add r2, r2, rb_wt_off ; mul24 r1, r1, ra_wt_mul_l1 # rb_wt_off = (offsetL0 + offsetL1 + 1) << (rb_wt_den_p15 - 9) +++ add r1, r1, r2 ; mov r3, ra_blk_height +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend +++ +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 ; v8subs r0, ra_height, r3 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> .anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + + +-+ sub.setf -, r3, rb17 ; v8adds rb31, r3, ra_k1 ; ldtmu1 # loop counter increment +-+ shr r0, r4, rb_xshift2 ; mov.ifz r3, ra_y2_next +-+ shr r1, r0, 8 ; mov.ifnz r3, ra_y2 +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm + + +-+ max r2, r3, ra_k0 ; mov.ifz ra_base2, 
rb_base2_next +-+ min r2, r2, rb_max_y +-+ add ra_y2, r3, ra_k1 ; mul24 r2, r2, rb_pitch +-+ add t1s, ra_base2, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +-+ +-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ +-+ and r1, r1, rb_k255 ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ sub.setf -, r3, 4 ; mov ra12, ra13 +-+ brr.anyn -, r:uvloop_b +-+ mov ra13, ra14 ; mul24 r1, ra14, rb9 +-+ mov ra14, ra15 ; mul24 r2, ra15, rb10 +-+ mov ra15, r0 ; mul24 r0, ra12, rb8 +-+# >>> .anyn uvloop_b +-+ +-+# apply vertical filter and write to VPM +-+ +-+ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7) +-+ add r1, r1, r2 ; mul24 r0, ra15, rb11 +-+ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14 +-+ mov ra7, rb6 ; mul24 r1, r1, ra_k256 +-+ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6 +-+ +-+ mov ra5, rb4 ; mul24 r1, r1, ra1.16a +-+ add r1, r1, r0 ; mov rb4, ra4 +-+ +-+ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend +-+ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1) +-+ +-+ sub.setf -, r3, ra31 ; mov ra6, rb7 +-+ asr ra3.8as, r1, rb13 +-+ nop ; mov r1, r1 << 8 +-+ brr.anyn -, r:uvloop_b +-+ asr ra3.8bs, r1, rb13 +-+ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov +-+ mov vpm, ra3 +-+# >>> +++::mc_filter_c_b +++ m_filter_c_b 8 + + +-+# DMA out +++################################################################################ +++# Exit code used by both Luma & Chroma so place between them to avoid I-cache +++# conflicts +++ +++.macro m_exit_drain +++.if PREREAD == 2 +++# Special case 2 as loop is wasteful +++ nop ; nop ; ldtmu0 +++ nop ; nop ; ldtmu1 +++ nop ; nop ; ldtmu0 +++ mov -, vw_wait ; nop ; ldtmu1 +++.else +++ mov.setf r3, PREREAD - 1 +++:1 +++ brr.anynz -, r:1b +++ nop ; nop ; ldtmu0 +++ nop ; nop ; ldtmu1 +++ sub.setf r3, r3, 1 +++ # >>> +++ mov -, vw_wait +++.endif +++.endm + + +++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) +++# All qpus start at the beginning and after that (group - 1) must have finished +++# before (group) can start +++# +++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain +++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - +++# lockup otherwise) +++# +++# There is some, currently ill defined, potential lockup if we have the VDM active +++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? +++# +++# The code stalled when I had many waiters on a single sem so we have a +++# "ripple" of srels to restart. Unsure why, may have been bug, but this works +++# and we currently have both the memory & sems to support it. 
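Since the semaphore wiring in m_sync_q below is easy to misread, the same numbering written out as plain C may help (a descriptive aid only; srel/sacq behave as post/wait on the hardware semaphores):

/* QPUs are grouped in quads (0-3, 4-7, 8-11). Within a quad, QPUs chain
 * through per-QPU sems; quad leaders additionally chain quad-to-quad
 * through sems 12..12+n_quads-1. QPU 0 kicks the chain by srel'ing sem 12
 * in m_setup_q0, and the exit code sacq's it back so everything is 0 on
 * exit (avoiding the lockup noted above). */
static int sem_sync(int qpu)      { return qpu - qpu % 4; }  /* quad leader's sem */
static int sem_in(int qpu)        { return qpu; }
static int sem_out(int qpu)       { return qpu + 1; }
static int sem_quad_in(int qpu)   { return 12 + qpu / 4; }
static int sem_quad_out(int qpu, int n_quads)
                                  { return 12 + (qpu / 4 + 1) % n_quads; }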
+++.macro m_sync_q, n_qpu, n_quads +++# Do not generate code for qpu >= quads * 4 - fns should never be called +++.if n_qpu < n_quads * 4 +++ mov ra_link, unif # Can only branch to an a reg (not r0) +++ mov -, vw_wait # [ra_link delay] +++ +++.set n_sem_sync, n_qpu - (n_qpu % 4) +++.set n_sem_in, n_qpu +++.set n_sem_out, n_qpu + 1 +++ +++.if n_qpu % 4 == 0 +++ +++.set n_sem_quad_in, 12 + n_qpu / 4 +++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) +++ +++ sacq -, n_sem_sync +++ sacq -, n_sem_sync +++ sacq -, n_sem_sync + + bra -, ra_link +-+ mov vw_setup, rb26 +-+ mov vw_setup, rb29 +-+ mov vw_addr, unif # c_dst_addr +++ sacq -, n_sem_quad_in +++ srel -, n_sem_out +++ srel -, n_sem_quad_out + + +++.else +++ bra -, ra_link +++ srel -, n_sem_sync +++ sacq -, n_sem_in +++.if n_sem_out % 4 != 0 +++ srel -, n_sem_out +++.else +++ nop +++.endif +++.endif +++.endif +++.endm + + +-+################################################################################ +++.set v_quads8, N_QPU_8 / 4 +++ +++::mc_sync_q0 +++ m_sync_q 0, v_quads8 +++::mc_sync_q1 +++ m_sync_q 1, v_quads8 +++::mc_sync_q2 +++ m_sync_q 2, v_quads8 +++::mc_sync_q3 +++ m_sync_q 3, v_quads8 +++::mc_sync_q4 +++ m_sync_q 4, v_quads8 +++::mc_sync_q5 +++ m_sync_q 5, v_quads8 +++::mc_sync_q6 +++ m_sync_q 6, v_quads8 +++::mc_sync_q7 +++ m_sync_q 7, v_quads8 +++::mc_sync_q8 +++ m_sync_q 8, v_quads8 +++::mc_sync_q9 +++ m_sync_q 9, v_quads8 +++::mc_sync_q10 +++ m_sync_q 10, v_quads8 +++::mc_sync_q11 +++ m_sync_q 11, v_quads8 + + + +# mc_exit() +-+ +-+::mc_interrupt_exit8c +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu1 +-+ mov -, vw_wait ; nop ; ldtmu0 # wait on the VDW +-+ +-+ mov -,sacq(0) # 1 +-+ mov -,sacq(0) # 2 +-+ mov -,sacq(0) # 3 +-+ mov -,sacq(0) # 4 +-+ mov -,sacq(0) # 5 +-+ mov -,sacq(0) # 6 +-+ mov -,sacq(0) # 7 +-+# mov -,sacq(0) # 8 +-+# mov -,sacq(0) # 9 +-+# mov -,sacq(0) # 10 +-+# mov -,sacq(0) # 11 +-+ +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +-+ + +# Chroma & Luma the same now +-+::mc_exit_c +-+::mc_exit +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW + + +-+ mov -,srel(0) +++.macro m_exit_qn +++ m_exit_drain +++ nop ; nop ; thrend +++ nop +++ nop +++# >>> thrend <<< +++.endm +++ +++::mc_exit_c_qn +++::mc_exit_y_qn +++ m_exit_qn + + +-+ nop ; nop ; thrend +-+ nop ; nop # delay slot 1 +-+ nop ; nop # delay slot 2 + + + + + +# mc_interrupt_exit12() +-+::mc_interrupt_exit12 +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW +-+ +-+ mov -,sacq(0) # 1 +-+ mov -,sacq(0) # 2 +-+ mov -,sacq(0) # 3 +-+ mov -,sacq(0) # 4 +-+ mov -,sacq(0) # 5 +-+ mov -,sacq(0) # 6 +-+ mov -,sacq(0) # 7 +-+ mov -,sacq(0) # 8 +-+ mov -,sacq(0) # 9 +-+ mov -,sacq(0) # 10 +-+ mov -,sacq(0) # 11 +-+ +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +-+ +-+ +-+::mc_exit1 +-+ mov -, vw_wait # wait on the VDW +-+ +-+ ldtmu0 +-+ ldtmu1 +-+ ldtmu0 +-+ ldtmu1 +-+ nop ; nop ; thrend +-+ mov interrupt, 1; nop # delay slot 1 +-+ nop ; nop # delay slot 2 +++ +++.macro m_exit_q0 +++ m_exit_drain +++ sacq -, 12 +++ nop ; nop ; thrend +++ mov interrupt, 1 +++ nop +++# >>> thrend <<< +++.endm +++ +++::mc_exit_c_q0 +++::mc_exit_y_q0 +++ m_exit_q0 + + + +# LUMA CODE + + + +# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
+ +# For P frames we make the second x,y coordinates offset by +8 + + +++ + +################################################################################ +-+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel) +-+::mc_setup +++# mc_setup +++# +++# typedef struct qpu_mc_pred_y_s_s { +++# qpu_mc_src_t next_src1; +++# qpu_mc_src_t next_src2; +++# uint16_t pic_h; +++# uint16_t pic_w; +++# uint32_t stride2; +++# uint32_t stride1; +++# uint32_t wdenom; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_s_t; +++ +++.macro m_setup_y, v_bit_depth +++ +++# Cannot use mul24 on x as x might be -ve, so must use shift +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_pmask, 0xff +++.set v_blk_height, Y_BLK_HEIGHT_8 +++.else +++.set v_x_shift, 1 +++.set v_pmask, 0xffff +++.set v_blk_height, Y_BLK_HEIGHT_16 +++.endif +++ +++ + + # Need to save these because we need to know the frame dimensions before computing texture coordinates +-+ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x +-+ mov ra9, unif # ref_y_base +-+ mov ra10, unif # y2_x2 +-+ mov ra11, unif # ref_y2_base +++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y +++ mov ra9, unif # ref_y_base +++ mov ra1, unif # x2_y2 +++ mov ra11, unif # ref_y2_base +++ +++# load constants +++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] +++ shl rb_ef, r0, i_shift30 +++ +++ +++ mov ra_kff100100, 0xff100100 +++ mov rb_pmask, v_pmask +++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) +++ +++# Compute part of VPM to use + + + +# Read image dimensions +-+ mov ra3, unif # width_height +-+ mov rb_xpitch, unif # stride2 +++ mov ra3, unif # width_height +++ mov rb_xpitch, unif # stride2 +++.if v_x_shift == 0 + + sub rb_max_x, ra3.16b, 1 +++.else +++ sub r0, ra3.16b, 1 +++ shl rb_max_x, r0, v_x_shift +++.endif + + sub rb_max_y, ra3.16a, 1 +-+ mov rb_pitch, unif # stride1 +++ mov rb_pitch, unif # stride1 + + + +# get destination pitch + + mov r1, vdw_setup_1(0) +-+ or rb24, r1, rb_pitch +++ or rb_dma1_base, r1, rb_pitch + + + +# Compute base address for first and second access + + mov r3, elem_num +-+ add r0, ra8.16a, r3 # Load x + elem_num +++ add r0, ra0.16b, r3 # Load x + elem_num +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif + + max r0, r0, 0 + + min r0, r0, rb_max_x + + shl ra_xshift_next, r0, 3 # Compute shifts + + +-+ +-+# In a single 32 bit word we get 4 Y Pels so mask 2 bottom bits of xs +++# X is byte offset - we can only load words - mask + + + + and r0, r0, -4 ; v8subs r2, r2, r2 + + sub r2, r2, rb_pitch + + and r1, r0, r2 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +++ add r0, r0, r1 # Add stripe offsets + + add ra_base, ra9, r0 + + +-+ mov r1, ra8.16b # Load y +-+ add ra_y, r1, 1 # Set for next +-+ max r1, r1, 0 +-+ min r1, r1, rb_max_y +-+ +-+# submit texture requests for first line +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t0s, ra_base, r1 +-+ +-+ + + # r3 still contains elem_num +-+ add r0, ra10.16a, r3 # Load x +++ add r0, ra1.16b, r3 # Load x +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif + + max r0, r0, 0 + + min r0, r0, rb_max_x +-+ shl rb_xshift2_next, r0, 3 # Compute shifts +++ shl rb_xshift2_next, r0, 3 # Compute shifts + + + + # r2 still contains mask + + and r0, r0, -4 + + and r1, r0, r2 + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +++ add r0, r0, r1 # Add stripe offsets + + add ra_base2, ra11, r0 + + +-+ mov r1, ra10.16b # Load y +-+ add ra_y2, r1, 1 # 
Set for next +-+ max r1, r1, 0 +++# Do preloads +++ nop ; mov r0, ra0.16a # ; r0 = y +++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 +++ +++:1 +++ sub.setf r3, r3, 1 +++ max r1, r0, 0 +++ min r1, r1, rb_max_y +++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t0s, ra_base, r1 ; mov ra_y, r0 +++ +++ max r1, r2, 0 +++ brr.anynz -, r:1b + + min r1, r1, rb_max_y +++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch +++ add t1s, ra_base2, r1 ; mov ra_y2, r2 +++# >>> .anynz 1b + + +-+# submit texture requests for first line +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t1s, ra_base2, r1 +++ add rb_wt_den_p15, unif, 23 - v_bit_depth # weight denom + + +-+# load constants +++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base + + +-+ mov ra_k1, 1 +-+ mov ra_k256, 256 +-+ mov rb_k255, 255 +-+ mov ra_k0, 0 +++ mov ra_link, unif # Next fn + + + +# touch vertical context to keep simulator happy +-+ + + mov ra8, 0 ; mov rb8, 0 +++ bra -, ra_link + + mov ra9, 0 ; mov rb9, 0 + + mov ra10, 0 ; mov rb10, 0 + + mov ra11, 0 ; mov rb11, 0 +++# >>> ra_link +++.endm + + +-+# Compute part of VPM to use +-+ m_calc_dma_regs rb28, rb27 +-+ +-+# Weighted prediction denom +-+ add rb13, unif, 9 # unif = weight denom + 6 +-+ +-+# submit texture requests for second line +-+ max r1, ra_y, 0 +-+ min r1, r1, rb_max_y +-+ add ra_y, ra_y, 1 +-+ mov -, unif ; mul24 r1, r1, rb_pitch # unused ; +-+ add t0s, r1, ra_base +-+ +-+ max r1, ra_y2, 0 +-+ min r1, r1, rb_max_y +-+ add ra_y2, ra_y2, 1 +-+ nop ; mul24 r1, r1, rb_pitch +-+ add t1s, r1, ra_base2 +-+ +-+# FALL THROUGHT TO PER-BLOCK SETUP +++::mc_setup_y_q0 +++ m_setup_q0 +++::mc_setup_y_qn +++ m_setup_y 8 + + +++################################################################################ +++# + +# Start of per-block setup code + +# P and B blocks share the same setup code to save on Icache space +-+:per_block_setup +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +-+ mov ra_link, unif +-+#### We do all the setup even if we are about to exit - reading junk from unif.... +-+ +-+ mov ra1, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? + + +-+# per-channel shifts were calculated on the *previous* invocation +-+ mov ra_xshift, ra_xshift_next +-+ mov rb_xshift2, rb_xshift2_next +++# luma_setup_delay3 done in delay slots of branch that got us here + + + +# get base addresses and per-channel shifts for *next* invocation +++# per-channel shifts were calculated on the *previous* invocation + + +-+ add r0, ra1.16a, r3 # Load x +-+ max r0, r0, 0 +-+ min r0, r0, rb_max_x +++# 1st 3 instructions of per_block-setup in branch delay +++# +++# typedef struct qpu_mc_pred_y_p_s { +++# qpu_mc_src_t next_src1; +++# qpu_mc_src_t next_src2; +++# uint16_t h; +++# uint16_t w; +++# uint32_t mymx21; +++# uint32_t wo1; +++# uint32_t wo2; +++# uint32_t dst_addr; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_p_t; +++# + + +-+ shl ra_xshift_next, r0, 3 # Compute shifts +-+ and r0, r0, -4 ; v8subs r2, r2, r2 +-+ sub r2, r2, rb_pitch +-+ and r1, r0, r2 +-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +-+ add ra_base_next, unif, r0 # Base1 +-+ mov ra_y_next, ra1.16b # Load y +-+ mov ra1, unif # x2_y2 +-+ nop # ra1 delay +++.macro m_luma_setup, v_bit_depth +++# Hack - QASM may well have have label pasting but I have no idea how... 
+++.if v_bit_depth == 8 +++ brr ra_link, r:per_block_setup_8 +++.elif v_bit_depth == 10 +++ brr ra_link, r:per_block_setup_10 +++.endif +++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? +++ add.setf -, rb_ef, rb_ef; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 +++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next +++.endm + + +-+ add r0, ra1.16a, r3 # Load x2 +-+ max r0, r0, 0 +++.macro m_per_block_setup, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_x_mul, 1 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 1 +++.set v_x_mul, 2 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next + + min r0, r0, rb_max_x + + +-+ shl rb_xshift2_next, r0, 3 # Compute shifts +++ shl ra_xshift_next, r0, 3 # Compute shifts + + and r0, r0, -4 +-+ and r1, r0, r2 +++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # src1.base +++ and r1, r0, r2 ; mov ra_y_next, ra0.16a + + xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +-+ add r0, r0, r1 # Add stripe offsets +-+ add rb_base2_next, unif, r0 # Base1 +-+ mov ra_y2_next, ra1.16b # Load y +-+ mov ra_width_height, unif # width_height +-+ +-+# set up VPM write +-+ mov vw_setup, rb28 # [ra1 delay] +-+ +-+# get width,height of block (unif load above) +-+ sub rb29, rb24, ra_width # Compute vdw_setup1(dst_pitch-width) +-+ add rb17, ra_height, 5 ; mov r0, ra_height +-+ mov r1, 16 +-+ min r0, r0, r1 +-+ add rb18, r0, 7 +-+ shl r0, r0, 7 +-+ add r0, r0, ra_width # Combine width and height of destination area +-+ shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register +-+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets +++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y +++ add ra_base_next, ra_base_next, r0 # [ra1 delay] +++ +++ add r0, ra1.16b, r3 # Load x2 +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a +++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base +++ shl rb_xshift2_next, r0, 3 # Compute shifts +++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height +++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write +++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes +++ add rb_base2_next, rb_base2_next, r0 +++ +++# get width,height of block (unif load above), r1 = width * pel_size +++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) +++ add rb_i_tmu, r0, 7 - PREREAD ; v8min r0, r0, ra_blk_height +++ add rb_lcount, r0, 7 +++ shl r0, r0, v_dma_h_shift +++ add r0, r0, r1 # Combine width and height of destination area +++ shl r0, r0, v_dma_wh_shift # Shift into bits 16 upwards of the vdw_setup0 register +++ add rb_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets + + + +# get filter coefficients and discard unused B frame values +-+ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight +-+ mov r2, 0x01040400 # [ra5 delay] +-+ shl ra8, r0, 3 ; mov rb14, ra5.16a +++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight +++ shl ra8, r0, 3 ; mov r3, ra_k255 + + + +# Pack the 1st 4 filter coefs for H & V tightly +++# Coeffs are all 
abs values here as that means mul24 works (no sign extend from .8) + + +-+ mov r1,0x00010100 # -ve +++ mov r1,0x00010100 # -ve [ra8 delay] + + ror ra2.8a, r1, ra8.8d + + ror ra0.8a, r1, ra8.8c + + +-+ ror ra2.8b, r2, ra8.8d +-+ ror ra0.8b, r2, ra8.8c +++ mov r1, 0x01040400 +++ ror ra2.8b, r1, ra8.8d +++ ror ra0.8b, r1, ra8.8c + + + + mov r1,0x050b0a00 # -ve + + ror ra2.8c, r1, ra8.8d +@@ -17390,49 +26344,44 @@ index 0000000..aa3fe47 + + ror ra2.8d, r1, ra8.8d + + ror ra0.8d, r1, ra8.8c + + +-+# In the 2nd vertical half we use b registers due to +-+# using a-side fifo regs. The easiest way to achieve this to pack it +-+# and then unpack! +++# In the 2nd vertical half we use b registers due to using a-side fifo regs + + + + mov r1,0x3a281100 +-+ ror ra3.8a, r1, ra8.8d +-+ ror ra1.8a, r1, ra8.8c +++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif +++ ror ra1.8a, r1, ra8.8c ; v8min rb4, r0, r3 + + + + mov r1,0x0a0b0500 # -ve +-+ ror ra3.8b, r1, ra8.8d +-+ ror ra1.8b, r1, ra8.8c +++ ror r0, r1, ra8.8d +++ ror ra1.8b, r1, ra8.8c ; v8min rb5, r0, r3 + + + + mov r1,0x04040100 +-+ ror ra3.8c, r1, ra8.8d +-+ ror ra1.8c, r1, ra8.8c +++ ror r0, r1, ra8.8d +++ ror ra1.8c, r1, ra8.8c ; v8min rb6, r0, r3 +++ +++ mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ; mov rb_dest, unif # ; Destination address + + + + mov r1,0x01010000 # -ve +-+ ror ra3.8d, r1, ra8.8d +-+ ror ra1.8d, r1, ra8.8c +++ ror r0, r1, ra8.8d + + +-+# Extract weighted prediction information in parallel +-+# We are annoyingly A src limited here +++ bra -, ra_link +++ ror ra1.8d, r1, ra8.8c ; v8min rb7, r0, r3 + + +-+ mov rb4, ra3.8a ; mov ra18, unif +-+ mov rb5, ra3.8b +-+ mov rb6, ra3.8c +-+ mov.ifnz ra5, ra18 +++ shl r0, ra_wt_off_l0, rb_wt_den_p15 # Offset calc +++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use +++ asr rb_wt_off, r0, 9 ; mov ra_link, unif # ; link - load after we've used its previous val +++# >>> branch ra_link + + +-+ mov rb_dest, unif # Destination address +++# r5 = 0 +++# ra_wt_mul_l1 = weight L1 +++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +++# rb_wt_off = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb_wt_den_p15 - 1) +++# rb_wt_den_p15 = weight denom + 6 + 9 +++# rb_wt_mul_l0 = weight L0 +++.endm + + +-+ bra -, ra_link +++:per_block_setup_8 +++ m_per_block_setup 8 + + +-+ shl r0, ra5.16b, rb13 # Offset calc +-+ asr rb12, r0, 9 # For B l1 & L0 offsets should be identical so it doesn't matter which we use +-+ mov r3, 0 ; mov rb7, ra3.8d +-+# >>> branch ra_link +-+# +-+# r3 = 0 +-+# ra18.16a = weight L1 +-+# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred) +-+# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1) +-+# rb13 = weight denom + 6 + 9 +-+# rb14 = weight L0 + + + + + +################################################################################ +@@ -17440,381 +26389,1225 @@ index 0000000..aa3fe47 + +# In a P block, y2_x2 should be y_x+8 + +# At this point we have already issued two pairs of texture requests for the current block + + +-+::mc_filter +-+# ra5.16a = weight << 16; We want weight * 2 in rb14 +++.macro m_filter_y_pxx, v_bit_depth +++ m_luma_setup v_bit_depth + + +-+ shl rb14, ra5.16a, 1 +++ shl ra_wt_mul_l0, ra_wt_mul_l0, 1 + + +-+# r3 = 0 +++# r5 = 0 (loop count) + + +-+:yloop +++:1 + +# retrieve texture results and pick out bytes + +# then submit two more texture requests + + +-+# If we knew there was no clipping then this code would get simpler. 
+-+# Perhaps we could add on the pitch and clip using larger values? +-+ + +# N.B. Whilst y == y2 as far as this loop is concerned we will start + +# the grab for the next block before we finish with this block and that + +# might be B where y != y2 so we must do full processing on both y and y2 + + +-+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch + + + + max r2, ra_y, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +-+ +-+ max r2, ra_y2, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next + + +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y ; mov ra7, ra8 +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte + + +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] +++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 + + + +# apply horizontal filter +-+ nop ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ +-+ sub.setf -, r3, 8 ; mov r1, ra8 +-+ mov ra8, ra9 ; mov rb8, rb9 +-+ brr.anyn -, r:yloop +-+ mov ra9, ra10 ; mov rb9, rb10 +-+ mov ra10, ra11 ; mov rb10, rb11 +-+ mov ra11, r0 ; mov rb11, r1 +-+ # >>> .anyn yloop +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++ 
add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++ +++ sub.setf -, r5, 8 ; mov ra9, ra10 +++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++ brr.anyn -, r:1b +++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++ mov ra10, ra11 ; mov rb10, rb11 +++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++ # >>> .anyn 1b + + + + # apply vertical filter and write to VPM +-+ +-+ nop ; mul24 r0, rb8, ra2.8a +-+ nop ; mul24 r1, rb9, ra2.8b +-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+ add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+ add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+ add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+ sub r1, r1, r0 ; mov -, vw_wait +++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++ add r1, r1, r0 ; mul24 r0, ra8, rb4 +++ add r1, r1, r0 ; mul24 r0, ra9, rb5 +++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++ add r1, r1, r0 ; mul24 r0, ra11, rb7 +++ sub r1, r1, r0 + +# At this point r1 is a 22-bit signed quantity: 8 (original sample), + +# +6, +6 (each pass), +1 (the passes can overflow slightly), +1 (sign) + +# The top 8 bits have rubbish in them as mul24 is unsigned + +# The low 6 bits need discard before weighting +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 # x256 - sign extend & discard rubbish + + asr r1, r1, 14 +-+ nop ; mul24 r1, r1, rb14 +-+ add r1, r1, rb12 +-+ +-+ shl r1, r1, 8 +-+ brr.anyn -, r:yloop +-+ asr r1, r1, rb13 +-+# We have a saturating pack unit - I can't help feeling it should be useful here +-+ min r1, r1, rb_k255 # Delay 2 rb_k255 = 255 +-+ max vpm, r1, 0 # Delay 3 +++ nop ; mul24 r1, r1, ra_wt_mul_l0 +++ add r1, r1, rb_wt_off ; mov r3, ra_blk_height # ; r3 = block height for outside loop +++ +++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++ + +# >>> branch.anyn yloop + + +-+# If looping again the we consumed 16 height last loop +-+ # rb29 (stride) remains constant +-+ # rb17 remains const (based on total height) +-+ # recalc rb26, rb18 based on new segment height +-+ # N.B. 
r3 is loop counter still +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_pxx +++ m_filter_y_pxx 8 +++ +++ +++################################################################################ +++ +++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +++# In a P block, only the first half of coefficients contain used information. +++# At this point we have already issued two pairs of texture requests for the current block +++# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +++# Or possibly by taking advantage of symmetry? +++ +++.macro m_filter_y_bxx, v_bit_depth +++ m_luma_setup v_bit_depth +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++ +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y ; mov ra7, ra8 +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ +++ add.setf -, rb_ef, rb_ef ; mov ra8, ra9 +++ +++# apply horizontal filter +++ and r1, r1, rb_pmask ; mul24 r3, ra0.8a, r0 +++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +++ nop ; mul24.ifn r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +++ nop ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +++ +++ sub.setf -, r5, 8 ; mov ra9, ra10 +++ sub r2, r2, r3 ; mul24 r0, rb9, ra2.8a +++ brr.anyn -, r:1b +++ mov rb9, rb10 ; mul24 r1, rb10, ra2.8b +++ mov ra10, ra11 ; mov rb10, rb11 +++ asr ra11, r2, v_bit_depth - 8 ; mov rb11, ra7 +++ # >>> .anyn 1b +++ +++ # apply vertical filter and write to 
VPM +++ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +++ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +++ add r1, r1, r0 ; mul24 r0, ra8, rb4 +++ add r1, r1, r0 ; mul24 r0, ra9, rb5 +++ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +++ add r1, r1, r0 ; mul24 r0, ra11, rb7 +++ sub r1, r1, r0 ; mov r2, rb_wt_off +++# As with P-pred r1 is a 22-bit signed quantity in 32-bits +++# Top 8 bits are bad - low 6 bits should be discarded +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_k256 +++ +++ asr r1, r1, 14 +++ nop ; mul24 r0, r1, ra_wt_mul_l0 +++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra_wt_mul_l1 << 8 @ "mul_used", 0 +++ +++ add r1, r1, r0 ; mov r3, ra_blk_height +++ shl r1, r1, 8 ; v8subs r0, ra_height, r3 +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_bxx +++ m_filter_y_bxx 8 +++ +++################################################################################ +++# +++# typedef struct qpu_mc_pred_y_p00_s { +++# qpu_mc_src_t next_src1; +++# uint16_t h; +++# uint16_t w; +++# uint32_t wo1; +++# uint32_t dst_addr; +++# uint32_t next_fn; +++# } qpu_mc_pred_y_p00_t; +++ +++.macro m_filter_y_p00, v_bit_depth +++ +++.if v_bit_depth <= 8 +++.set v_x_shift, 0 +++.set v_x_mul, 1 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 7 +++.set v_dma_wh_shift, i_shift16 +++.else +++.set v_x_shift, 1 +++.set v_x_mul, 2 +++# Shifts to get width & height in the right place in rb_dma0 +++.set v_dma_h_shift, 8 +++.set v_dma_wh_shift, 15 +++.endif +++ +++ mov ra0, unif ; mov r3, elem_num # y_x +++ mov ra_xshift, ra_xshift_next # [ra0 delay] +++ add r0, ra0.16b, r3 +++.if v_x_shift != 0 +++ shl r0, r0, v_x_shift +++.endif +++ +++ max r0, r0, 0 +++ min r0, r0, rb_max_x +++ +++ shl ra_xshift_next, r0, 3 # Compute shifts +++ and r0, r0, -4 ; v8subs r2, r2, r2 +++ sub r2, r2, rb_pitch ; mov ra_base_next, unif # src1.base +++ and r1, r0, r2 ; mov ra_y_next, ra0.16a +++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch +++ add r0, r0, r1 ; mov ra_width_height, unif # Add stripe offsets ; width_height +++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write +++ +++# get width,height of block (unif load above) +++# Compute vdw_setup1(dst_pitch-width) +++ shl r1, ra_width, v_x_shift +++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height +++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height +++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 +++ add r0, r0, r1 ; mov ra_wt_off_mul_l0, unif # Combine width and height of destination 
area ; weight_offset +++ shl r0, r0, v_dma_wh_shift ; mov rb_dest, unif # Shift into bits 16 upwards of the vdw_setup0 register ; dest addr +++ add rb_dma0, r0, rb_dma0_base +++ +++ shl r0, ra_wt_off_l0, rb_wt_den_p15 ; v8subs r5rep, r3, r3 # Offset calc ; r5 = 0 +++ # For B l1 & L0 offsets should be identical so it doesn't matter which we use +++ asr rb_wt_off, r0, 1 ; mov ra_link, unif # ; link +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 +++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 +++ shl r1, r1, 23 - v_bit_depth ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 +++ +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 +++ +++# DMA out +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_p00 +++ m_filter_y_p00 8 +++ +++################################################################################ +++ +++.macro m_filter_y_b00, v_bit_depth +++# luma setup does a fair bit more than we need calculating filter coeffs +++# that we will never use but it saves I-cache to use it (also simple!) 
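+++# (The fix-up block just below m_luma_setup undoes the filter assumptions:
+++# the shared setup allows 7 extra rows for the 8-tap filter, so rb_i_tmu
+++# and rb_lcount must each drop by 7 for this unfiltered case.)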
+++ m_luma_setup v_bit_depth +++ +++# Fix up vals that were expecting a filter (somewhat icky) +++ mov r0, 7 +++ sub rb_i_tmu, rb_i_tmu, r0 +++ sub rb_lcount, rb_lcount, r0 +++ mov r0, 8 ; mov r1, ra_wt_off_mul_l0 +++ shl rb_wt_off, rb_wt_off, r0 +++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 +++ +++:1 +++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ; ldtmu1 +++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 +++ shr r0, r4, ra_xshift ; mov r3, rb_pitch +++ +++ max r2, ra_y, 0 # y +++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next +++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +++ add t0s, ra_base, r2 ; mov.ifz ra_base2, rb_base2_next +++ +++ max r2, ra_y2, 0 +++ min r2, r2, rb_max_y +++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +++ add t1s, ra_base2, r2 ; v8min r0, r0, rb_pmask # v8subs masks out all but bottom byte +++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 +++ +++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 +++ add r1, r0, r1 +++ shl r1, r1, 22 - v_bit_depth ; mov r3, ra_blk_height +++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 + + +-+ mov r1, 16 +-+ sub r0, ra_height, r1 +-+ mov ra_height, r0 +-+ max.setf r0, r0, 0 # Done if Z now +++ brr.anyn -, r:1b +++ asr r1, r1, rb_wt_den_p15 +++ min r1, r1, ra_pmax ; mov -, vw_wait +++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch +++# >>> branch.anyn 1b +++ +++# r0 = remaining height (min 0) +++# r2 = r3 * rb_pitch +++# r3 = block_height (currently always 16) +++ +++# If looping again then we consumed 16 height last loop +++# rb_dma1 (stride) remains constant +++# rb_i_tmu remains const (based on total height) +++# recalc rb_dma0, rb_lcount based on new segment height +++ +++ mov.setf ra_height, r0 ; mov vw_setup, rb_dma0 # VDW setup 0 + + + +# DMA out +-+ brr.anyz -, r:per_block_setup +-+ mov vw_setup, rb26 # VDW setup 0 Delay 1 +-+ mov vw_setup, rb29 # Stride Delay 2 +-+ mov vw_addr, rb_dest # start the VDW Delay 3 +-+# >>> .anyz per_block_setup +++ bra.anyz -, ra_link +++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride +++ sub r1, r0, r3 ; mov vw_addr, rb_dest # start the VDW +++ shl r1, r1, i_shift23 +++# >>> .anyz ra_link +++ +++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve +++# We add to dma0 to reduce the number of output lines in the final block +++ add rb_lcount, rb_lcount, r0 +++ brr -, r:1b +++ add rb_dma0, rb_dma0, r1 +++ add rb_dest, rb_dest, r2 +++ mov vw_setup, rb_vpm_init # Reset our VDM write pointer +++# >>> 1b +++.endm +++ +++::mc_filter_y_b00 +++ m_filter_y_b00 8 +++ +++################################################################################ +++################################################################################ +++# 10 BIT +++ +++::mc_setup_c10_q0 +++ m_setup_q0 +++::mc_setup_c10_qn +++ m_setup_c 10 +++ +++::mc_filter_c10_p +++ m_filter_c_p 0, 10 +++ +++::mc_filter_c10_p_l1 +++ m_filter_c_p 1, 10 +++ +++ +++::mc_filter_c10_b +++ m_filter_c_b 10 +++ +++# Even if these fns are the same as for other bit depths we want our own copy +++# to keep the code we are using in a single lump to avoid (direct map) cache +++# thrashing +++.set v_quads10, N_QPU_16 / 4 +++ +++::mc_sync10_q0 +++ m_sync_q 0, v_quads10 +++::mc_sync10_q1 +++ m_sync_q 1, v_quads10 +++::mc_sync10_q2 +++ m_sync_q 2, v_quads10 +++::mc_sync10_q3 +++ m_sync_q 3, v_quads10 +++::mc_sync10_q4 +++ m_sync_q 4, v_quads10 +++::mc_sync10_q5 +++ m_sync_q 5, v_quads10 +++::mc_sync10_q6 +++ m_sync_q 6, v_quads10 +++::mc_sync10_q7 +++ m_sync_q 7, v_quads10 +++::mc_sync10_q8 +++ m_sync_q 8, v_quads10 
+++::mc_sync10_q9 +++ m_sync_q 9, v_quads10 +++::mc_sync10_q10 +++ m_sync_q 10, v_quads10 +++::mc_sync10_q11 +++ m_sync_q 11, v_quads10 +++ +++::mc_exit_y10_q0 +++::mc_exit_c10_q0 +++ m_exit_q0 +++ +++::mc_exit_y10_qn +++::mc_exit_c10_qn +++ m_exit_qn +++ +++::mc_setup_y10_q0 +++ m_setup_q0 +++::mc_setup_y10_qn +++ m_setup_y 10 +++ +++:per_block_setup_10 +++ m_per_block_setup 10 +++ +++::mc_filter_y10_pxx +++ m_filter_y_pxx 10 +++ +++::mc_filter_y10_p00 +++ m_filter_y_p00 10 +++ +++::mc_filter_y10_bxx +++ m_filter_y_bxx 10 +++ +++::mc_filter_y10_b00 +++ m_filter_y_b00 10 +++ +++ +++ +++::mc_end +++# Do not add code here because mc_end must appear after all other code. ++diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h ++new file mode 100644 ++index 0000000000..9f8983da52 ++--- /dev/null +++++ b/libavcodec/rpi_shader_cmd.h ++@@ -0,0 +1,128 @@ +++#ifndef RPI_SHADER_CMD_H +++#define RPI_SHADER_CMD_H +++ +++#pragma pack(push, 4) +++ +++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y +++// If mixed then we are just confused and get a lot of warnings.... +++typedef const uint8_t * qpu_mc_src_addr_t; +++typedef uint8_t * qpu_mc_dst_addr_t; +++#else +++typedef uint32_t qpu_mc_src_addr_t; +++typedef uint32_t qpu_mc_dst_addr_t; +++#endif +++ +++typedef struct qpu_mc_src_s +++{ +++ int16_t y; +++ int16_t x; +++ qpu_mc_src_addr_t base; +++} qpu_mc_src_t; +++ +++ +++typedef struct qpu_mc_pred_c_p_s { +++ qpu_mc_src_t next_src; +++ uint16_t h; +++ uint16_t w; +++ uint32_t coeffs_x; +++ uint32_t coeffs_y; +++ uint32_t wo_u; +++ uint32_t wo_v; +++ qpu_mc_dst_addr_t dst_addr_c; +++ uint32_t next_fn; +++} qpu_mc_pred_c_p_t; +++ +++typedef struct qpu_mc_pred_c_b_s { +++ qpu_mc_src_t next_src1; +++ uint16_t h; +++ uint16_t w; +++ uint32_t coeffs_x1; +++ uint32_t coeffs_y1; +++ uint32_t weight_u1; +++ uint32_t weight_v1; +++ qpu_mc_src_t next_src2; +++ uint32_t coeffs_x2; +++ uint32_t coeffs_y2; +++ uint32_t wo_u2; +++ uint32_t wo_v2; +++ qpu_mc_dst_addr_t dst_addr_c; +++ uint32_t next_fn; +++} qpu_mc_pred_c_b_t; +++ +++typedef struct qpu_mc_pred_c_s_s { +++ qpu_mc_src_t next_src1; +++ uint32_t pic_cw; // C Width (== Y width / 2) +++ uint32_t pic_ch; // C Height (== Y Height / 2) +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++ qpu_mc_src_t next_src2; +++ uint32_t next_fn; +++} qpu_mc_pred_c_s_t; +++ +++typedef struct qpu_mc_pred_c_s { +++ union { +++ qpu_mc_pred_c_p_t p; +++ qpu_mc_pred_c_b_t b; +++ qpu_mc_pred_c_s_t s; +++ }; +++} qpu_mc_pred_c_t; +++ +++ +++typedef struct qpu_mc_pred_y_p_s { +++ qpu_mc_src_t next_src1; +++ qpu_mc_src_t next_src2; +++ uint16_t h; +++ uint16_t w; +++ uint32_t mymx21; +++ uint32_t wo1; +++ uint32_t wo2; +++ qpu_mc_dst_addr_t dst_addr; +++ uint32_t next_fn; +++} qpu_mc_pred_y_p_t; +++ +++typedef struct qpu_mc_pred_y_p00_s { +++ qpu_mc_src_t next_src1; +++ uint16_t h; +++ uint16_t w; +++ uint32_t wo1; +++ qpu_mc_dst_addr_t dst_addr; +++ uint32_t next_fn; +++} qpu_mc_pred_y_p00_t; +++ +++typedef struct qpu_mc_pred_y_s_s { +++ qpu_mc_src_t next_src1; +++ qpu_mc_src_t next_src2; +++ uint16_t pic_h; +++ uint16_t pic_w; +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++ uint32_t next_fn; +++} qpu_mc_pred_y_s_t; + + +-+ min r0, r0, r1 +-+ add rb18, rb18, r0 +-+ sub r0, r0, r1 +-+ shl r0, r0, i_shift23 +-+ add rb26, rb26, r0 +++// Only a useful structure in that it allows us to return something other than a void * +++typedef struct qpu_mc_pred_y_s { +++ union { +++ qpu_mc_pred_y_p_t p; +++ qpu_mc_pred_y_p00_t p00; +++ qpu_mc_pred_y_s_t s; 
+++ }; +++} qpu_mc_pred_y_t; +++ +++typedef union qpu_mc_pred_cmd_u { +++ qpu_mc_pred_y_t y; +++ qpu_mc_pred_c_t c; +++ uint32_t data[1]; +++} qpu_mc_pred_cmd_t; +++ +++#define QPU_MC_PRED_N_Y8 12 +++#define QPU_MC_PRED_N_C8 12 +++ +++#define QPU_MC_PRED_N_Y10 12 +++#define QPU_MC_PRED_N_C10 12 +++ +++#pragma pack(pop) +++ +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template.c b/libavcodec/rpi_shader_template.c ++new file mode 100644 ++index 0000000000..1925ab7a79 ++--- /dev/null +++++ b/libavcodec/rpi_shader_template.c ++@@ -0,0 +1,65 @@ +++#ifdef RPI +++ +++#include "hevc.h" +++#include "libavutil/rpi_sand_fns.h" +++#include "rpi_shader_cmd.h" +++#include "rpi_shader_template.h" +++ +++typedef struct shader_track_s +++{ +++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; +++ const struct qpu_mc_src_s *last_l0; +++ const struct qpu_mc_src_s *last_l1; +++ uint32_t width; // pic_width * PW +++ uint32_t height; +++ uint32_t stride2; +++ uint32_t stride1; +++ uint32_t wdenom; +++} shader_track_t; +++ +++static int wtoidx(const unsigned int w) +++{ +++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; +++ return pel_weight[w]; +++} +++ +++static const int fctom(uint32_t x) +++{ +++ int rv; +++ // As it happens we can take the 2nd filter term & divide it by 8 +++ // (dropping fractions) to get the fractional move +++ rv = 8 - ((x >> 11) & 0xf); +++ av_assert2(rv >= 0 && rv <= 7); +++ return rv; +++} +++ +++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) +++{ +++ return (x << shl) >> shr; +++} +++ +++static inline int woff_p(HEVCContext *const s, int32_t x) +++{ +++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); +++} +++ +++static inline int woff_b(HEVCContext *const s, int32_t x) +++{ +++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); +++} +++ +++static inline int wweight(int32_t x) +++{ +++ return ext(x, 16, 16); +++} +++ +++ +++#define PW 1 +++#include "rpi_shader_template_fn.h" +++ +++#undef PW +++#define PW 2 +++#include "rpi_shader_template_fn.h" +++ +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template.h b/libavcodec/rpi_shader_template.h ++new file mode 100644 ++index 0000000000..ecf5b8185a ++--- /dev/null +++++ b/libavcodec/rpi_shader_template.h ++@@ -0,0 +1,24 @@ +++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H +++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H +++ +++#ifdef RPI +++struct HEVCContext; +++struct HEVCRpiInterPredEnv; +++ +++void rpi_shader_c8(struct HEVCContext *const s, +++ const struct HEVCRpiInterPredEnv *const ipe_y, +++ const struct HEVCRpiInterPredEnv *const ipe_c); + + +-+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 +-+ add rb_dest, rb_dest, r0 +++void rpi_shader_c16(struct HEVCContext *const s, +++ const struct HEVCRpiInterPredEnv *const ipe_y, +++ const struct HEVCRpiInterPredEnv *const ipe_c); +++ +++void rpi_sand_dump8(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); +++ +++void rpi_sand_dump16(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); +++ +++#endif +++#endif +++ ++diff --git a/libavcodec/rpi_shader_template_fn.h b/libavcodec/rpi_shader_template_fn.h ++new file mode 100644 ++index 0000000000..b5ac2ceed6 ++--- /dev/null +++++ b/libavcodec/rpi_shader_template_fn.h ++@@ -0,0 +1,477 @@ +++#define STRCAT(x,y) x##y +++ +++#if PW == 1 +++#define pixel uint8_t 
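+++// FUNC(f) pastes the bit width onto f, so FUNC(get_patch_y) expands to
+++// get_patch_y8 in this branch and to get_patch_y16 when this header is
+++// re-included with PW == 2 (see rpi_shader_template.c).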
+++#define FUNC(f) STRCAT(f, 8) +++#elif PW == 2 +++#define pixel uint16_t +++#define FUNC(f) STRCAT(f, 16) +++#else +++#error Unexpected PW +++#endif +++ +++#define PATCH_STRIDE (16 * PW) +++ +++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +++{ +++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { +++ const pixel s = *(const pixel *)src; +++ pixel * d = (pixel *)dst; +++ for (unsigned int j = 0; j < w; j += PW) { +++ *d++ = s; +++ } +++ } +++} + + +-+ mov vw_setup, rb28 # Reset our VDM write pointer +++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) +++{ +++ for (unsigned int i = 0; i != h; ++i, dst += stride) { +++ memcpy(dst, src, w); +++ } +++} + + +-+ brr -, r:yloop +-+ nop +-+ nop +-+ nop +-+# >>> +++static void FUNC(get_patch_y)(const shader_track_t * const st, +++ uint8_t * dst, const unsigned int dst_stride, +++ const qpu_mc_src_t *src, +++ unsigned int _w, unsigned int _h) +++{ +++ int x = src->x * PW; +++ int y = src->y; +++ int w = _w * PW; +++ int h = _h; +++ int dl = 0; +++ int dr = 0; +++ int dt = 0; +++ int db = 0; +++ +++ if (x < 0) { +++ if (-x >= w) +++ x = PW - w; +++ dl = -x; +++ w += x; +++ x = 0; +++ } +++ if (x + w > st->width) { +++ if (x >= st->width) +++ x = st->width - PW; +++ dr = (x + w) - st->width; +++ w = st->width - x; +++ } + + +++ // Y +++ if (y < 0) { +++ if (-y >= h) +++ y = 1 - h; +++ dt = -y; +++ h += y; +++ y = 0; +++ } +++ if (y + h > st->height) { +++ if (y >= st->height) +++ y = st->height - 1; +++ db = (y + h) - st->height; +++ h = st->height - y; +++ } + + +++ dst += dl + dt * dst_stride; +++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); +++ +++ // Edge dup +++ if (dl != 0) +++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); +++ if (dr != 0) +++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); +++ w += dl + dr; +++ dst -= dl; +++ +++ if (dt != 0) +++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride); +++ if (db != 0) +++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); +++} + + + + + + +-+################################################################################ +++static void FUNC(get_patch_c)(const shader_track_t * const st, +++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, +++ const qpu_mc_src_t *src, +++ unsigned int _w, unsigned int _h) +++{ +++ int x = src->x * PW; +++ int y = src->y; +++ int w = _w * PW; +++ int h = _h; +++ int dl = 0; +++ int dr = 0; +++ int dt = 0; +++ int db = 0; +++ const int width = st->width; +++ const int height = st->height; +++ +++ if (x < 0) { +++ if (-x >= w) +++ x = PW - w; +++ dl = -x; +++ w += x; +++ x = 0; +++ } +++ if (x + w > width) { +++ if (x >= width) +++ x = width - PW; +++ dr = (x + w) - width; +++ w = width - x; +++ } + + +-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) +-+# In a P block, only the first half of coefficients contain used information. +-+# At this point we have already issued two pairs of texture requests for the current block +-+# May be better to just send 16.16 motion vector and figure out the coefficients inside this block (only 4 cases so can compute hcoeffs in around 24 cycles?) +-+# Can fill in the coefficients so only +-+# Can also assume default weighted prediction for B frames. 
+-+# Perhaps can unpack coefficients in a more efficient manner by doing H/V for a and b at the same time? +-+# Or possibly by taking advantage of symmetry? +-+# From 19->7 32bits per command. +++ // Y +++ if (y < 0) { +++ if (-y >= h) +++ y = 1 - h; +++ dt = -y; +++ h += y; +++ y = 0; +++ } +++ if (y + h > height) { +++ if (y >= height) +++ y = height - 1; +++ db = (y + h) - height; +++ h = height - y; +++ } + + +-+::mc_filter_b +-+ # r0 = weightL0 << 16, we want it in rb14 +-+# asr rb14, r0, i_shift16 +++ dst_u += dl + dt * dst_stride; +++ dst_v += dl + dt * dst_stride; +++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); + + +-+:yloopb +-+# retrieve texture results and pick out bytes +-+# then submit two more texture requests +++ // Edge dup +++ if (dl != 0) +++ { +++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); +++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); +++ } +++ if (dr != 0) +++ { +++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); +++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); +++ } +++ w += dl + dr; +++ dst_u -= dl; +++ dst_v -= dl; + + +-+# If we knew there was no clipping then this code would get simpler. +-+# Perhaps we could add on the pitch and clip using larger values? +++ if (dt != 0) +++ { +++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); +++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); +++ } +++ if (db != 0) +++ { +++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); +++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); +++ } +++} + + +-+ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 +-+ shr r0, r4, ra_xshift ; mov.ifz ra_base2, rb_base2_next ; ldtmu1 +-+ mov.ifz ra_base, ra_base_next ; mov rb31, r3 +-+ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch +-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y2, ra_y2_next +++// w, y, w, h in pixels +++// stride1, stride2 in bytes +++void FUNC(rpi_sand_dump)(const char * const name, +++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) +++{ +++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; + + +-+ max r2, ra_y, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 +-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_k255 # v8subs masks out all but bottom byte +++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); + + +-+ max r2, ra_y2, 0 # y +-+ min r2, r2, rb_max_y +-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 +-+ add t1s, ra_base2, r2 ; v8min r1, r1, rb_k255 +++ if (is_c) { +++ x *= 2; +++ w *= 2; +++ } + + +-+# generate seven shifted versions +-+# interleave with scroll of vertical context +++ for (int i = y; i != y + h; ++i) { +++ for (int j = x; j != x + w; ++j) { +++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; +++ char sep = is_c && (j & 1) == 0 ? 
':' : ' '; +++#if PW == 1 +++ if (j < 0 || i < 0) +++ printf("..%c", sep); +++ else +++ printf("%02x%c", *(const pixel*)p, sep); +++#else +++ if (j < 0 || i < 0) +++ printf("...%c", sep); +++ else +++ printf("%03x%c", *(const pixel*)p, sep); +++#endif +++ } +++ printf("\n"); +++ } +++} + + +-+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] + + +-+# apply horizontal filter +-+ nop ; mul24 r3, ra0.8a, r0 +-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0 +-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0 +-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 +-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 +-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 +-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 +-+ sub r0, r2, r3 ; mov r3, rb31 +-+ +-+ sub.setf -, r3, 8 ; mov r1, ra8 +-+ mov ra8, ra9 ; mov rb8, rb9 +-+ brr.anyn -, r:yloopb +-+ mov ra9, ra10 ; mov rb9, rb10 +-+ mov ra10, ra11 ; mov rb10, rb11 +-+ mov ra11, r0 ; mov rb11, r1 +-+ # >>> .anyn yloopb +++void FUNC(rpi_shader_c)(HEVCContext *const s, +++ const HEVCRpiInterPredEnv *const ipe_y, +++ const HEVCRpiInterPredEnv *const ipe_c) +++{ +++ for (int c_idx = 0; c_idx < 2; ++c_idx) +++ { +++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; +++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; +++ unsigned int exit_n = 0; + + +-+ # apply vertical filter and write to VPM +-+ nop ; mul24 r0, rb8, ra2.8a +-+ nop ; mul24 r1, rb9, ra2.8b +-+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c +-+ sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d +-+ add r1, r1, r0 ; mul24 r0, ra8, rb4 +-+ add r1, r1, r0 ; mul24 r0, ra9, rb5 +-+ sub r1, r1, r0 ; mul24 r0, ra10, rb6 +-+ add r1, r1, r0 ; mul24 r0, ra11, rb7 +-+ sub r1, r1, r0 ; mov r2, rb12 +-+# As with P-pred r1 is a 22-bit signed quantity in 32-bits +-+# Top 8 bits are bad - low 6 bits should be discarded +-+ sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256 +++ if (ipe == NULL || !ipe->used) { +++ continue; +++ } + + +-+ asr r1, r1, 14 +-+ nop ; mul24 r0, r1, rb14 +-+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0 +++ do { +++ for (unsigned int i = 0; i != ipe->n; ++i) { +++ const HEVCRpiInterPredQ * const q = ipe->q + i; +++ shader_track_t * const st = tracka + i; +++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; +++ +++ for (;;) { +++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; +++ +++ if (link == q->code_setup) { +++ if (c_idx == 0) { +++ // Luma +++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; +++ +++ st->height = c->pic_h; +++ st->width = c->pic_w * PW; +++ st->stride1 = c->stride1; +++ st->stride2 = c->stride2; +++ st->wdenom = c->wdenom; +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else { +++ // Chroma +++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; +++ +++ st->height = c->pic_ch; +++ st->width = c->pic_cw * PW; +++ st->stride1 = c->stride1; +++ st->stride2 = c->stride2; +++ st->wdenom = c->wdenom; +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ } +++ else if (link == s->qpu.y_pxx) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ const int w1 = FFMIN(c->w, 8); +++ const int w2 = c->w - w1; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); +++ if (w2 > 0) { +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h + 7); +++ } + + +-+ add r1, r1, r0 ; mov -, vw_wait +-+ shl r1, r1, 8 +++ // wo[offset] = offset*2+1 +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); +++ if (w2 > 0) { +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( +++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); +++ } +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_bxx) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h + 7); +++ +++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( +++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, +++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); +++ +++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, +++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), +++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_p00) { +++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; + + +-+ brr.anyn -, r:yloopb +-+ asr r1, r1, rb13 # Delay 1 +-+ min r1, r1, rb_k255 # Delay 2 +-+ max vpm, r1, 0 # 
Delay 3 +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) + + +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h + 7); + + +-+# If looping again the we consumed 16 height last loop +-+ # rb29 (stride) remains constant +-+ # rb17 remains const (based on total height) +-+ # recalc rb26, rb18 based on new segment height +-+ # N.B. r3 is loop counter still +++ // wo[offset] = offset*2+1 +++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); + + +-+ mov r1, 16 +-+ sub r0, ra_height, r1 +-+ mov ra_height, r0 +-+ max.setf r0, r0, 0 # Done if Z now +++ st->last_l0 = &c->next_src1; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.y_b00) { +++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; +++ +++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ av_assert0(c->w <= 16 && c->h <= 64); +++ +++ FUNC(get_patch_y)(st, +++ patch_y1, PATCH_STRIDE, +++ st->last_l0, +++ 16, c->h); +++ FUNC(get_patch_y)(st, +++ patch_y2, PATCH_STRIDE, +++ st->last_l1, +++ 16, c->h); +++ +++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( +++ patch_y3, patch_y1, PATCH_STRIDE, +++ c->h, 0, 0, c->w); +++ +++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( +++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, +++ c->h, st->wdenom, wweight(c->wo1), wweight(c->wo2), +++ 0, woff_b(s, c->wo2), 0, 0, c->w); +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_pxx) { +++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; +++ const int mx = fctom(c->coeffs_x); +++ const int my = fctom(c->coeffs_y); + + +-+# DMA out +-+ brr.anyz -, r:per_block_setup +-+ mov vw_setup, rb26 # VDW setup 0 Delay 1 +-+ mov vw_setup, rb29 # Stride Delay 2 +-+ mov vw_addr, rb_dest # start the VDW Delay 3 +-+# >>> .anyz per_block_setup +++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; + + +-+ min r0, r0, r1 +-+ add rb18, rb18, r0 +-+ sub r0, r0, r1 +-+ shl r0, r0, i_shift23 +-+ add rb26, rb26, r0 +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); + + +-+ nop ; mul24 r0, r1, rb_pitch # r0 = pitch*16 +-+ add rb_dest, rb_dest, r0 +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + +-+ mov vw_setup, rb28 # Reset our VDM write pointer +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + +-+ brr -, r:yloopb +-+ nop +-+ nop +-+ nop +++ st->last_l0 = &c->next_src; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_pxx_l1) { +++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; +++ 
const int mx = fctom(c->coeffs_x); +++ const int my = fctom(c->coeffs_y); + + +-+################################################################################ +++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; + + +-+::mc_end +-+# Do not add code here because mc_end must appear after all other code. +-diff --git a/libavcodec/rpi_shader_cmd.h b/libavcodec/rpi_shader_cmd.h +-new file mode 100644 +-index 0000000..27cbb59 +---- /dev/null +-+++ b/libavcodec/rpi_shader_cmd.h +-@@ -0,0 +1,88 @@ +-+#ifndef RPI_SHADER_CMD_H +-+#define RPI_SHADER_CMD_H +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); + + +-+#pragma pack(push, 4) +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); +++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( +++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, st->wdenom, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); + + +-+typedef struct qpu_mc_pred_c_s { +-+ uint32_t next_fn; +-+ int16_t next_src_y; +-+ int16_t next_src_x; +-+ uint32_t next_src_base_c; +-+ union { +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t wo_u; +-+ uint32_t wo_v; +-+ uint32_t dst_addr_c; +-+ } p; +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t weight_u; +-+ uint32_t weight_v; +-+ uint32_t dummy0; +-+ } b0; +-+ struct { +-+ uint32_t dummy0; +-+ uint32_t coeffs_x; +-+ uint32_t coeffs_y; +-+ uint32_t wo_u; +-+ uint32_t wo_v; +-+ uint32_t dst_addr_c; +-+ } b1; +-+ struct { +-+ uint32_t pic_cw; // C Width (== Y width / 2) +-+ uint32_t pic_ch; // C Height (== Y Height / 2) +-+ uint32_t stride2; +-+ uint32_t stride1; +-+ uint32_t wdenom; +-+ uint32_t dummy0; +-+ } s0; +-+ struct { +-+ uint32_t dummy0; +-+ uint32_t dummy1; +-+ uint32_t dummy2; +-+ uint32_t dummy3; +-+ uint32_t dummy4; +-+ uint32_t dummy5; +-+ } s1; +-+ }; +-+} qpu_mc_pred_c_t; +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); + + +-+typedef struct qpu_mc_pred_y_s { +-+ int16_t next_src1_x; +-+ int16_t next_src1_y; +-+ uint32_t next_src1_base; +-+ int16_t next_src2_x; +-+ int16_t next_src2_y; +-+ uint32_t next_src2_base; +-+ union { +-+ struct { +-+ uint16_t h; +-+ uint16_t w; +-+ uint32_t mymx21; +-+ uint32_t wo1; +-+ uint32_t wo2; +-+ uint32_t dst_addr; +-+ } p; +-+ struct { +-+ uint16_t pic_h; +-+ uint16_t pic_w; +-+ uint32_t stride2; +-+ uint32_t stride1; +-+ uint32_t wdenom; +-+ uint32_t dummy0; +-+ } s; +-+ }; +-+ uint32_t next_fn; +-+} qpu_mc_pred_y_t; +++ st->last_l1 = &c->next_src; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == s->qpu.c_bxx) { +++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; +++ const int mx1 = fctom(c->coeffs_x1); +++ const int my1 = fctom(c->coeffs_y1); +++ const int mx2 = fctom(c->coeffs_x2); +++ const int my2 = fctom(c->coeffs_y2); +++ +++ uint8_t patch_u1[PATCH_STRIDE * 72]; +++ uint8_t patch_v1[PATCH_STRIDE * 72]; +++ uint8_t patch_u2[PATCH_STRIDE * 72]; +++ uint8_t patch_v2[PATCH_STRIDE * 72]; +++ uint8_t patch_u3[8 * 16 * PW]; +++ uint8_t patch_v3[8 * 16 * PW]; +++ 
uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; +++ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; +++ +++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); +++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); +++ +++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( +++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, mx1, my1, c->w); +++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( +++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, +++ c->h, mx1, my1, c->w); +++ +++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( +++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, +++ c->h, st->wdenom, c->weight_u1, wweight(c->wo_u2), +++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); +++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( +++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, +++ c->h, st->wdenom, c->weight_v1, wweight(c->wo_v2), +++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); +++ +++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); +++ +++ st->last_l0 = &c->next_src1; +++ st->last_l1 = &c->next_src2; +++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); +++ } +++ else if (link == q->code_sync) { +++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); +++ break; +++ } +++ else if (link == q->code_exit) { +++ // We expect exit to occur without other sync +++ av_assert0(i == exit_n); +++ ++exit_n; +++ break; +++ } +++ else { +++ av_assert0(0); +++ } +++ } + + +-+#pragma pack(pop) +++ st->qpu_mc_curr = cmd; +++ } +++ } while (exit_n == 0); +++ } +++} + + +-+#endif +++#undef FUNC +++#undef pixel + + + diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c + new file mode 100644 +-index 0000000..b061fe0 ++index 0000000000..b502de0a2c + --- /dev/null + +++ b/libavcodec/rpi_zc.c +-@@ -0,0 +1,581 @@ ++@@ -0,0 +1,745 @@ + +#include "config.h" + +#ifdef RPI +++#include "libavcodec/avcodec.h" + +#include "rpi_qpu.h" + +#include "rpi_mailbox.h" + +#include "rpi_zc.h" + +#include "libavutil/avassert.h" +++#include "libavutil/rpi_sand_fns.h" + +#include + + + +#include "libavutil/buffer_internal.h" +@@ -17841,21 +27634,11 @@ index 0000000..b061fe0 + + struct ZcPool * pool; + +} ZcPoolEnt; + + +-+#if 1 +-+//#define ALLOC_PAD 0x1000 +-+#define ALLOC_PAD 0 +-+#define ALLOC_ROUND 0x1000 +-+//#define ALLOC_N_OFFSET 0x100 +-+#define ALLOC_N_OFFSET 0 +-+#define STRIDE_ROUND 0x80 +-+#define STRIDE_OR 0x80 +-+#else + +#define ALLOC_PAD 0 + +#define ALLOC_ROUND 0x1000 + +#define ALLOC_N_OFFSET 0 +-+#define STRIDE_ROUND 32 +++#define STRIDE_ROUND 64 + +#define STRIDE_OR 0 +-+#endif + + + +#define DEBUG_ZAP0_BUFFERS 0 + + +@@ -18032,13 +27815,22 @@ index 0000000..b061fe0 + + { + + case AV_PIX_FMT_YUV420P: + + geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; +-+ // geo.stride_y = ((video_width + 32 + 31) & ~31); + + geo.stride_c = geo.stride_y / 2; +-+ // geo.height_y = (video_height + 15) & ~15; + + geo.height_y = (video_height + 32 + 31) & ~31; + + geo.height_c = geo.height_y / 2; + + geo.planes_c = 2; + + geo.stripes = 1; +++ geo.bytes_per_pel = 1; +++ break; +++ +++ case AV_PIX_FMT_YUV420P10: +++ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; +++ geo.stride_c = geo.stride_y / 2; +++ geo.height_y = (video_height + 32 + 31) & ~31; +++ geo.height_c = 
geo.height_y / 2;
+++ geo.planes_c = 2;
+++ geo.stripes = 1;
+++ geo.bytes_per_pel = 2;
+ + break;
+ +
+ + case AV_PIX_FMT_SAND128:
+@@ -18073,6 +27865,7 @@ index 0000000..b061fe0
+ + geo.height_c = img.pitch / stripe_w - geo.height_y;
+ + geo.planes_c = 1;
+ + geo.stripes = (video_width + stripe_w - 1) / stripe_w;
+++ geo.bytes_per_pel = 1;
+ +
+ + pthread_mutex_unlock(&sand_lock);
+ +
+@@ -18081,6 +27874,45 @@ index 0000000..b061fe0
+ + break;
+ + }
+ +
+++ case AV_PIX_FMT_SAND64_16:
+++ case AV_PIX_FMT_SAND64_10:
+++ {
+++ const unsigned int stripe_w = 128; // bytes
+++
+++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
+++ static VC_IMAGE_T img = {0};
+++
+++ // Given the overhead of calling the mailbox, keep a stashed
+++ // copy as we will almost certainly just want the same numbers again,
+++ // but that means we need a lock
+++ pthread_mutex_lock(&sand_lock);
+++
+++ if (img.width != video_width || img.height != video_height)
+++ {
+++ VC_IMAGE_T new_img = {
+++ .type = VC_IMAGE_YUV_UV_16,
+++ .width = video_width,
+++ .height = video_height
+++ };
+++
+++ gpu_ref();
+++ mbox_get_image_params(gpu_get_mailbox(), &new_img);
+++ gpu_unref();
+++ img = new_img;
+++ }
+++
+++ geo.stride_y = stripe_w;
+++ geo.stride_c = stripe_w;
+++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
+++ geo.height_c = img.pitch / stripe_w - geo.height_y;
+++ geo.planes_c = 1;
+++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
+++ geo.bytes_per_pel = 2;
+++
+++ pthread_mutex_unlock(&sand_lock);
+++ break;
+++ }
+++
+ + default:
+ + memset(&geo, 0, sizeof(geo));
+ + break;
+@@ -18153,8 +27985,12 @@ index 0000000..b061fe0
+ + frame->linesize[0] = geo.stride_y;
+ + frame->linesize[1] = geo.stride_c;
+ + frame->linesize[2] = geo.stride_c;
+++ // abuse: linesize[3] = "stripe stride"
+++ // stripe_stride is NOT the stride between slices; it is (that / geo.stride_y).
+++ // In a general case this makes the calculation an xor and multiply rather +++ // than a divide and multiply + + if (geo.stripes > 1) +-+ frame->linesize[3] = geo.height_y + geo.height_c; // abuse: linesize[3] = stripe stride +++ frame->linesize[3] = geo.height_y + geo.height_c; + + + + frame->data[0] = buf->data; + + frame->data[1] = frame->data[0] + size_y; +@@ -18164,6 +28000,11 @@ index 0000000..b061fe0 + + frame->extended_data = frame->data; + + // Leave extended buf alone + + +++#if RPI_ZC_SAND_8_IN_10_BUF != 0 +++ // *** If we intend to use this for real we will want a 2nd buffer pool +++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge +++#endif +++ + + return 0; + +} + + +@@ -18182,7 +28023,7 @@ index 0000000..b061fe0 + + rv = avcodec_default_get_buffer2(s, frame, flags); + + } + + else if (frame->format == AV_PIX_FMT_YUV420P || +-+ frame->format == AV_PIX_FMT_SAND128) +++ av_rpi_is_sand_frame(frame)) + + { + + rv = rpi_get_display_buffer(s->get_buffer_context, frame); + + } +@@ -18212,6 +28053,7 @@ index 0000000..b061fe0 + + unsigned int i; + + uint8_t * psrc, * pdest; + + +++ dest->format = src->format; + + dest->width = src->width; + + dest->height = src->height; + + +@@ -18243,29 +28085,142 @@ index 0000000..b061fe0 + +} + + + + +++static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s, +++ const AVFrame * const src) +++{ +++ AVFrame dest_frame; +++ AVFrame * const dest = &dest_frame; +++ unsigned int i; +++ uint8_t * psrc, * psrc2, * pdest; +++ +++ memset(dest, 0, sizeof(*dest)); +++ dest->format = AV_PIX_FMT_SAND128; +++ dest->width = src->width; +++ dest->height = src->height; +++ +++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) +++ { +++ return NULL; +++ } +++ +++ // Y +++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; +++ i != dest->height; +++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) +++ { +++ uint16_t * s = (uint16_t*)psrc; +++ uint8_t * d = pdest; +++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0]) +++ { +++ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k); +++ for (unsigned int j = 0; j != n; ++j) +++ *d++ = (uint8_t)(*s++ >> 2); +++ d += (dest->linesize[3] - 1) * dest->linesize[0]; +++ } +++ } +++ +++ // C +++ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1]; +++ i != dest->height / 2; +++ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1]) +++ { +++ const uint16_t * su = (uint16_t*)psrc; +++ const uint16_t * sv = (uint16_t*)psrc2; +++ uint8_t * d = pdest; +++ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1]) +++ { +++ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2; +++ for (unsigned int j = 0; j != n; ++j) +++ { +++ *d++ = (uint8_t)(*su++ >> 2); +++ *d++ = (uint8_t)(*sv++ >> 2); +++ } +++ d += (dest->linesize[3] - 1) * dest->linesize[1]; +++ } +++ } +++ +++ return dest->buf[0]; +++} +++ +++ +++static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s, +++ const AVFrame * const src, const unsigned int src_bits) +++{ +++ AVFrame dest_frame = { +++ .format = AV_PIX_FMT_SAND128, +++ .width = src->width, +++ .height = src->height +++ }; +++ AVFrame * const dest = &dest_frame; +++ const unsigned int shr = src_bits - 8; +++ +++ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0) +++ { +++ return NULL; +++ } +++ +++ // Y +++ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], 
av_rpi_sand_frame_stride2(dest),
+++ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest),
+++ src->width, src->height, shr);
+++ // C
+++ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest),
+++ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest),
+++ src->width, src->height / 2, shr);
+++
+++ return dest->buf[0];
+++}
+++
+++
+++
+ +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
+-+ const AVFrame * const frame, const int maycopy)
+++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
+ +{
+ + assert(s != NULL);
+ +
+ + if (frame->format != AV_PIX_FMT_YUV420P &&
+-+ frame->format != AV_PIX_FMT_SAND128)
+++ frame->format != AV_PIX_FMT_YUV420P10 &&
+++ !av_rpi_is_sand_frame(frame))
+ + {
+ + av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
+ + return NULL;
+ + }
+ +
+-+ if (frame->buf[1] != NULL)
+++ if (frame->buf[1] != NULL || frame->format != expected_format)
+ + {
+-+ av_assert0(frame->format == AV_PIX_FMT_YUV420P);
+++#if RPI_ZC_SAND_8_IN_10_BUF
+++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
+++ {
+++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
+++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
+++ }
+++#endif
+++
+ + if (maycopy)
+ + {
+-+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+-+ return zc_copy(s, frame);
+++ if (frame->buf[1] != NULL)
+++ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
+++ else
+++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
+++
+++ switch (frame->format)
+++ {
+++ case AV_PIX_FMT_YUV420P10:
+++ return zc_420p10_to_sand128(s, frame);
+++
+++ case AV_PIX_FMT_SAND64_10:
+++ return zc_sand64_16_to_sand128(s, frame, 10);
+++
+++ default:
+++ return zc_copy(s, frame);
+++ }
+ + }
+ + else
+ + {
+-+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: NULL\n", __func__);
+++ if (frame->buf[1] != NULL)
+++ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
+++ else
+++ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
+ + return NULL;
+ + }
+ + }
+@@ -18392,10 +28347,10 @@ index 0000000..b061fe0
+ +
+ diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+ new file mode 100644
+-index 0000000..f4aeb78
++index 0000000000..26fb3be999
+ --- /dev/null
+ +++ b/libavcodec/rpi_zc.h
+-@@ -0,0 +1,137 @@
++@@ -0,0 +1,105 @@
+ +#ifndef LIBAVCODEC_RPI_ZC_H
+ +#define LIBAVCODEC_RPI_ZC_H
+ +
+@@ -18406,23 +28361,33 @@ index 0000000..f4aeb78
+ +// bit of memory for the frame which can then be reference counted until
+ +// display has finished with it.
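To make the intended life-cycle concrete, here is a minimal usage sketch (illustrative only, not part of the patch): it assumes an avctx already set up via av_rpi_zc_init(), and the av_rpi_zc_unref() / av_rpi_zc_vc_handle() helpers implied by the declarations further down this header.

    // Hold a zero-copy reference across display, then release it.
    AVRpiZcRefPtr ref = av_rpi_zc_ref(avctx, frame, AV_PIX_FMT_SAND128, 1 /* maycopy */);
    if (ref != NULL) {
        const int vc_handle = av_rpi_zc_vc_handle(ref); // assumed accessor: GPU-side handle for MMAL
        /* ... queue vc_handle for display ... */
        av_rpi_zc_unref(ref); // assumed release fn: display has finished with the buffer
    }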
+ + +-+#include "libavutil/frame.h" +-+#include "libavcodec/avcodec.h" +++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame +++// 0 disables +++// *** This option still in development +++// Only works if SAO active +++// Allocates buffers that are twice the required size +++#define RPI_ZC_SAND_8_IN_10_BUF 0 +++ +++struct AVBufferRef; +++struct AVFrame; +++struct AVCodecContext; +++enum AVPixelFormat; + + + +// "Opaque" pointer to whatever we are using as a buffer reference +-+typedef AVBufferRef * AVRpiZcRefPtr; +++typedef struct AVBufferRef * AVRpiZcRefPtr; + + + +struct AVZcEnv; + +typedef struct AVZcEnv * AVZcEnvPtr; + + + +typedef struct AVRpiZcFrameGeometry + +{ +-+ unsigned int stride_y; +-+ unsigned int height_y; +-+ unsigned int stride_c; +-+ unsigned int height_c; +-+ unsigned int planes_c; +-+ unsigned int stripes; +++ unsigned int stride_y; // Luma stride (bytes) +++ unsigned int height_y; // Luma height (lines) +++ unsigned int stride_c; // Chroma stride (bytes) +++ unsigned int height_c; // Chroma stride (lines) +++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) +++ unsigned int stripes; // Number of stripes (sand) +++ unsigned int bytes_per_pel; + +} AVRpiZcFrameGeometry; + + + + +@@ -18448,7 +28413,7 @@ index 0000000..f4aeb78 + +// the data, then allocate a new buffer and copy the data into it + +// Otherwise return NULL + +AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s, +-+ const AVFrame * const frame, const int maycopy); +++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); + + + +// Get the vc_handle from the frame ref + +// Returns -1 if ref doesn't look valid +@@ -18469,72 +28434,30 @@ index 0000000..f4aeb78 + +// Allocate an environment for the buffer pool used by the ZC code + +// This should be put in avctx->get_buffer_context so it can be found by + +// av_rpi_zc_get_buffer2 when it is called from ffmpeg +-+AVZcEnvPtr av_rpi_zc_env_alloc(void); +-+ +-+// Allocate the environment used by the ZC code +-+void av_rpi_zc_env_free(AVZcEnvPtr); +-+ +-+// Test to see if the context is using zc (checks get_buffer2) +-+int av_rpi_zc_in_use(const struct AVCodecContext * const s); +-+ +-+// Init ZC into a context +-+// There is nothing magic in this fn - it just packages setting +-+// get_buffer2 & get_buffer_context +-+int av_rpi_zc_init(struct AVCodecContext * const s); +-+ +-+// Free ZC from a context +-+// There is nothing magic in this fn - it just packages unsetting +-+// get_buffer2 & get_buffer_context +-+void av_rpi_zc_uninit(struct AVCodecContext * const s); +-+ +-+ +-+ +-+static inline unsigned int rpi_sliced_frame_stride2(const AVFrame * const frame) +-+{ +-+ return frame->linesize[3]; +-+} +-+ +-+static inline unsigned int rpi_sliced_frame_off_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ const unsigned int stride1 = frame->linesize[0]; +-+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); +-+ const unsigned int x1 = x & (stride1 - 1); +-+ const unsigned int x2 = x ^ x1; +-+ +-+ return x1 + stride1 * y + stride2 * x2; +-+} +-+ +-+static inline unsigned int rpi_sliced_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) +-+{ +-+ const unsigned int stride1 = frame->linesize[0]; +-+ const unsigned int stride2 = rpi_sliced_frame_stride2(frame); +-+ const unsigned int x = x_c * 2; +-+ const unsigned int x1 = x & (stride1 - 1); +-+ const unsigned int x2 = x ^ x1; +++AVZcEnvPtr 
av_rpi_zc_env_alloc(void); + + +-+ return x1 + stride1 * y_c + stride2 * x2; +-+} +++// Allocate the environment used by the ZC code +++void av_rpi_zc_env_free(AVZcEnvPtr); + + +-+static inline uint8_t * rpi_sliced_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ return frame->data[0] + rpi_sliced_frame_off_y(frame, x, y); +-+} +++// Test to see if the context is using zc (checks get_buffer2) +++int av_rpi_zc_in_use(const struct AVCodecContext * const s); + + +-+static inline uint8_t * rpi_sliced_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) +-+{ +-+ return frame->data[1] + rpi_sliced_frame_off_c(frame, x, y); +-+} +++// Init ZC into a context +++// There is nothing magic in this fn - it just packages setting +++// get_buffer2 & get_buffer_context +++int av_rpi_zc_init(struct AVCodecContext * const s); +++ +++// Free ZC from a context +++// There is nothing magic in this fn - it just packages unsetting +++// get_buffer2 & get_buffer_context +++void av_rpi_zc_uninit(struct AVCodecContext * const s); + + +-+static inline int rpi_sliced_frame(const AVFrame * const frame) +-+{ +-+ return frame->format == AV_PIX_FMT_SAND128; +-+} + + + + + +#endif + + + diff --git a/libavcodec/utils.c b/libavcodec/utils.c +-index f7adb52..3b398a3 100644 ++index c4af9cbb17..c1b806e51b 100644 + --- a/libavcodec/utils.c + +++ b/libavcodec/utils.c + @@ -26,6 +26,12 @@ +@@ -18550,7 +28473,15 @@ index f7adb52..3b398a3 100644 + #include "libavutil/atomic.h" + #include "libavutil/attributes.h" + #include "libavutil/avassert.h" +-@@ -64,6 +70,10 @@ ++@@ -39,6 +45,7 @@ ++ #include "libavutil/mathematics.h" ++ #include "libavutil/mem_internal.h" ++ #include "libavutil/pixdesc.h" +++#include "libavutil/rpi_sand_fns.h" ++ #include "libavutil/imgutils.h" ++ #include "libavutil/samplefmt.h" ++ #include "libavutil/dict.h" ++@@ -64,6 +71,10 @@ + #include "libavutil/ffversion.h" + const char av_codec_ffversion[] = "FFmpeg version " FFMPEG_VERSION; + +@@ -18561,7 +28492,7 @@ index f7adb52..3b398a3 100644 + #if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS + static int default_lockmgr_cb(void **arg, enum AVLockOp op) + { +-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, ++@@ -508,6 +519,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels, + return ret; + } + +@@ -18609,7 +28540,7 @@ index f7adb52..3b398a3 100644 + static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) + { + FramePool *pool = avctx->internal->pool; +-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) ++@@ -555,6 +607,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame) + av_buffer_pool_uninit(&pool->pools[i]); + pool->linesize[i] = linesize[i]; + if (size[i]) { +@@ -18624,20 +28555,20 @@ index f7adb52..3b398a3 100644 + pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1, + CONFIG_MEMORY_POISONING ? 
+ NULL : +-@@ -724,6 +783,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags ++@@ -729,6 +789,11 @@ int avcodec_default_get_buffer2(AVCodecContext *avctx, AVFrame *frame, int flags + { + int ret; + + +#ifdef RPI + + // This is going to end badly if we let it continue +-+ av_assert0(frame->format != AV_PIX_FMT_SAND128); +++ av_assert0(!av_rpi_is_sand_frame(frame)); + +#endif + + + if ((ret = update_frame_pool(avctx, frame)) < 0) + return ret; + + diff --git a/libavfilter/avfilter.c b/libavfilter/avfilter.c +-index 21f8d9e..71ce7b9 100644 ++index 21f8d9e00d..71ce7b9186 100644 + --- a/libavfilter/avfilter.c + +++ b/libavfilter/avfilter.c + @@ -915,6 +915,7 @@ int avfilter_init_str(AVFilterContext *filter, const char *args) +@@ -18649,7 +28580,7 @@ index 21f8d9e..71ce7b9 100644 + #if FF_API_OLD_FILTER_OPTS || FF_API_OLD_FILTER_OPTS_ERROR + if ( !strcmp(filter->filter->name, "format") || + diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c +-index b31d233..2767306 100644 ++index 6767b65ec8..f270190d57 100644 + --- a/libavformat/mpegts.c + +++ b/libavformat/mpegts.c + @@ -701,7 +701,7 @@ static const StreamType ISO_types[] = { +@@ -18662,10 +28593,10 @@ index b31d233..2767306 100644 + { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC }, + { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS }, + diff --git a/libavformat/utils.c b/libavformat/utils.c +-index 6f343f2..83f26d5 100644 ++index 5a35953d24..d36fdc3199 100644 + --- a/libavformat/utils.c + +++ b/libavformat/utils.c +-@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in ++@@ -694,7 +694,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in + int default_stream_index = av_find_default_stream_index(s); + if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) { + for (i = 0; i < s->nb_streams; i++) { +@@ -18674,8 +28605,84 @@ index 6f343f2..83f26d5 100644 + continue; + s->streams[i]->pts_wrap_reference = pts_wrap_reference; + s->streams[i]->pts_wrap_behavior = pts_wrap_behavior; ++diff --git a/libavutil/Makefile b/libavutil/Makefile ++index 1e061763a2..cbc9bc145b 100644 ++--- a/libavutil/Makefile +++++ b/libavutil/Makefile ++@@ -59,6 +59,8 @@ HEADERS = adler32.h \ ++ rational.h \ ++ replaygain.h \ ++ ripemd.h \ +++ rpi_sand_fns.h \ +++ rpi_sand_fn_pw.h \ ++ samplefmt.h \ ++ sha.h \ ++ sha512.h \ ++@@ -136,6 +138,7 @@ OBJS = adler32.o \ ++ reverse.o \ ++ rc4.o \ ++ ripemd.o \ +++ rpi_sand_fns.o \ ++ samplefmt.o \ ++ sha.o \ ++ sha512.o \ ++diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile ++index 5da44b0542..b74b7c4e2f 100644 ++--- a/libavutil/arm/Makefile +++++ b/libavutil/arm/Makefile ++@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ ++ ++ NEON-OBJS += arm/float_dsp_init_neon.o \ ++ arm/float_dsp_neon.o \ +++ arm/rpi_sand_neon.o \ ++diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S ++new file mode 100644 ++index 0000000000..dbffdaefa4 ++--- /dev/null +++++ b/libavutil/arm/rpi_sand_neon.S ++@@ -0,0 +1,40 @@ +++#include "libavutil/arm/asm.S" +++ +++@ void rpi_sand128b_stripe_to_8_10( +++@ uint8_t * dest, [r0] +++@ const uint8_t * src1, [r1] +++@ const uint8_t * src2, [r2] +++@ unsigned int lines); [r3] +++ +++.macro stripe2_to_8, bit_depth +++ vpush {q4-q7} +++1: +++ vldm r1!, {q0-q7} +++ subs r3, #1 +++ vldm r2!, {q8-q15} +++ vqrshrn.u16 d0, q0, #\bit_depth - 8 +++ vqrshrn.u16 d1, q1, #\bit_depth - 8 +++ vqrshrn.u16 d2, q2, #\bit_depth - 8 +++ vqrshrn.u16 d3, q3, 
#\bit_depth - 8 +++ vqrshrn.u16 d4, q4, #\bit_depth - 8 +++ vqrshrn.u16 d5, q5, #\bit_depth - 8 +++ vqrshrn.u16 d6, q6, #\bit_depth - 8 +++ vqrshrn.u16 d7, q7, #\bit_depth - 8 +++ vqrshrn.u16 d8, q8, #\bit_depth - 8 +++ vqrshrn.u16 d9, q9, #\bit_depth - 8 +++ vqrshrn.u16 d10, q10, #\bit_depth - 8 +++ vqrshrn.u16 d11, q11, #\bit_depth - 8 +++ vqrshrn.u16 d12, q12, #\bit_depth - 8 +++ vqrshrn.u16 d13, q13, #\bit_depth - 8 +++ vqrshrn.u16 d14, q14, #\bit_depth - 8 +++ vqrshrn.u16 d15, q15, #\bit_depth - 8 +++ vstm r0!, {q0-q7} +++ bne 1b +++ vpop {q4-q7} +++ bx lr +++.endm +++ +++function rpi_sand128b_stripe_to_8_10, export=1 +++ stripe2_to_8 10 +++endfunc +++ + diff --git a/libavutil/buffer.c b/libavutil/buffer.c +-index 694e116..203ca7b 100644 ++index 694e116a3c..203ca7b3a8 100644 + --- a/libavutil/buffer.c + +++ b/libavutil/buffer.c + @@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool) +@@ -18689,7 +28696,7 @@ index 694e116..203ca7b 100644 + + return buf->opaque; + +} + diff --git a/libavutil/buffer.h b/libavutil/buffer.h +-index 0c0ce12..82e0bc3 100644 ++index 0c0ce12cf2..82e0bc3058 100644 + --- a/libavutil/buffer.h + +++ b/libavutil/buffer.h + @@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool); +@@ -18702,11 +28709,51 @@ index 0c0ce12..82e0bc3 100644 + /** + * @} + */ ++diff --git a/libavutil/frame.h b/libavutil/frame.h ++index 2b5c3320c3..990347e484 100644 ++--- a/libavutil/frame.h +++++ b/libavutil/frame.h ++@@ -120,7 +120,20 @@ enum AVFrameSideDataType { ++ * The GOP timecode in 25 bit timecode format. Data format is 64-bit integer. ++ * This is set on the first frame of a GOP that has a temporal reference of 0. ++ */ ++- AV_FRAME_DATA_GOP_TIMECODE +++ AV_FRAME_DATA_GOP_TIMECODE, +++ +++ /** +++ * The data represents the AVSphericalMapping structure defined in +++ * libavutil/spherical.h. +++ */ +++ AV_FRAME_DATA_SPHERICAL, +++ +++ /** +++ * Extra data required to deal with a cropped Sand frame +++ * AVFrame holds the cropped size, but we cannot simply offset the start +++ * address to get the picture as we can for planar formats +++ */ +++ AV_FRAME_DATA_SAND_INFO, ++ }; ++ ++ enum AVActiveFormatDescription { ++@@ -133,6 +146,13 @@ enum AVActiveFormatDescription { ++ AV_AFD_SP_4_3 = 15, ++ }; ++ +++typedef struct AVFrameDataSandInfo +++{ +++ unsigned int left_offset; +++ unsigned int top_offset; +++ unsigned int pic_width; +++ unsigned int pic_height; +++} AVFrameDataSandInfo; ++ ++ /** ++ * Structure to hold side data for an AVFrame. 
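As a hedged sketch of how the new AV_FRAME_DATA_SAND_INFO side data might be attached by a decoder, using the stock libavutil side-data API; frame, coded_width and coded_height are placeholders assumed to be in scope:

    // Publish the uncropped Sand picture geometry on a frame.
    AVFrameSideData *const sd =
        av_frame_new_side_data(frame, AV_FRAME_DATA_SAND_INFO, sizeof(AVFrameDataSandInfo));
    if (sd != NULL) {
        AVFrameDataSandInfo *const si = (AVFrameDataSandInfo *)sd->data;
        si->left_offset = 0;            // no crop offset in this example
        si->top_offset  = 0;
        si->pic_width   = coded_width;  // placeholder: full (uncropped) width
        si->pic_height  = coded_height; // placeholder: full (uncropped) height
    }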
+ diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +-index 0dffa4d..5644176 100644 ++index 0dffa4dbdb..17134b4f38 100644 + --- a/libavutil/pixdesc.c + +++ b/libavutil/pixdesc.c +-@@ -2088,6 +2088,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { ++@@ -2088,6 +2088,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .flags = AV_PIX_FMT_FLAG_BE | AV_PIX_FMT_FLAG_PLANAR | + AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_ALPHA, + }, +@@ -18721,35 +28768,486 @@ index 0dffa4d..5644176 100644 + + { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ + + }, + + .flags = 0, +-+ } +++ }, +++ [AV_PIX_FMT_SAND64_10] = { +++ .name = "sand64_10", +++ .nb_components = 3, +++ .log2_chroma_w = 1, +++ .log2_chroma_h = 1, +++ .comp = { +++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ +++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ +++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ +++ }, +++ .flags = 0, +++ }, + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS + diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +-index 0ed01c4..4705e80 100644 ++index 0ed01c4844..2155b78704 100644 + --- a/libavutil/pixfmt.h + +++ b/libavutil/pixfmt.h +-@@ -303,7 +303,10 @@ enum AVPixelFormat { ++@@ -303,7 +303,22 @@ enum AVPixelFormat { + AV_PIX_FMT_GBRAP10BE, ///< planar GBR 4:4:4:4 40bpp, big-endian + AV_PIX_FMT_GBRAP10LE, ///< planar GBR 4:4:4:4 40bpp, little-endian + + - AV_PIX_FMT_NB, ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions +++ AV_PIX_FMT_MEDIACODEC, ///< hardware decoding through MediaCodec +++ +++ AV_PIX_FMT_GRAY12BE, ///< Y , 12bpp, big-endian +++ AV_PIX_FMT_GRAY12LE, ///< Y , 12bpp, little-endian +++ AV_PIX_FMT_GRAY10BE, ///< Y , 10bpp, big-endian +++ AV_PIX_FMT_GRAY10LE, ///< Y , 10bpp, little-endian +++ +++ AV_PIX_FMT_P016LE, ///< like NV12, with 16bpp per component, little-endian +++ AV_PIX_FMT_P016BE, ///< like NV12, with 16bpp per component, big-endian +++ + +// RPI - not on ifdef so can be got at by calling progs +-+ AV_PIX_FMT_SAND128, ///< 4:2:0 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding +++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding + + + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; + + #define AV_PIX_FMT_Y400A AV_PIX_FMT_GRAY8A ++diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h ++new file mode 100644 ++index 0000000000..52d52a2a83 ++--- /dev/null +++++ b/libavutil/rpi_sand_fn_pw.h ++@@ -0,0 +1,182 @@ +++// * Included twice from rpi_sand_fn with different PW +++ +++#define STRCAT(x,y) x##y +++ +++#if PW == 1 +++#define pixel uint8_t +++#define FUNC(f) STRCAT(f, 8) +++#elif PW == 2 +++#define pixel uint16_t +++#define FUNC(f) STRCAT(f, 16) +++#else +++#error Unexpected PW +++#endif +++ +++// Fetches a single patch - offscreen fixup not done here +++// w <= stride1 +++// unclipped +++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, +++ const uint8_t * src, +++ unsigned int stride1, unsigned int stride2, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned 
int x = _x; +++ const unsigned int w = _w; +++ const unsigned int mask = stride1 - 1; +++ +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All in one sand stripe +++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { +++ memcpy(dst, p, w); +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const uint8_t * p = p2; +++ uint8_t * d = dst; +++ memcpy(d, p1, w1); +++ d += w1; +++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { +++ memcpy(d, p, stride1); +++ } +++ memcpy(d, p, w3); +++ } +++ } +++} +++ +++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V) +++ +++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, +++ uint8_t * dst_v, const unsigned int dst_stride_v, +++ const uint8_t * src, +++ unsigned int stride1, unsigned int stride2, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned int x = _x * 2; +++ const unsigned int w = _w * 2; +++ const unsigned int mask = stride1 - 1; +++ +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All in one sand stripe +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { +++ pixel * du = (pixel *)dst_u; +++ pixel * dv = (pixel *)dst_v; +++ const pixel * p = (const pixel *)p1; +++ for (unsigned int k = 0; k < w; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const unsigned int sstride_p = (sstride - stride1) / PW; +++ +++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const pixel * p = (const pixel *)p1; +++ pixel * du = (pixel *)dst_u; +++ pixel * dv = (pixel *)dst_v; +++ for (unsigned int k = 0; k < w1; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { +++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ for (unsigned int k = 0; k < w3; k += 2 * PW) { +++ *du++ = *p++; +++ *dv++ = *p++; +++ } +++ } +++ } +++} +++ +++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, +++ unsigned int stride1, unsigned int stride2, +++ const uint8_t * src_u, const unsigned int src_stride_u, +++ const uint8_t * src_v, const unsigned int src_stride_v, +++ unsigned int _x, unsigned int y, +++ unsigned int _w, unsigned int h) +++{ +++ const unsigned int x = _x * 2; +++ const unsigned int w = _w * 2; +++ const unsigned int mask = stride1 - 1; +++ if ((x & ~mask) == ((x + w) & ~mask)) { +++ // All 
in one sand stripe +++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { +++ const pixel * su = (const pixel *)src_u; +++ const pixel * sv = (const pixel *)src_v; +++ pixel * p = (pixel *)p1; +++ for (unsigned int k = 0; k < w; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ } +++ else +++ { +++ // Two+ stripe +++ const unsigned int sstride = stride1 * stride2; +++ const unsigned int sstride_p = (sstride - stride1) / PW; +++ +++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; +++ const uint8_t * p2 = p1 + sstride - (x & mask); +++ const unsigned int w1 = stride1 - (x & mask); +++ const unsigned int w3 = (x + w) & mask; +++ const unsigned int w2 = w - (w1 + w3); +++ +++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { +++ unsigned int j; +++ const pixel * su = (const pixel *)src_u; +++ const pixel * sv = (const pixel *)src_v; +++ pixel * p = (pixel *)p1; +++ for (unsigned int k = 0; k < w1; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { +++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ for (unsigned int k = 0; k < w3; k += 2 * PW) { +++ *p++ = *su++; +++ *p++ = *sv++; +++ } +++ } +++ } +++} +++ +++ +++#undef pixel +++#undef STRCAT +++#undef FUNC +++ ++diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c ++new file mode 100644 ++index 0000000000..ec4cfadf8a ++--- /dev/null +++++ b/libavutil/rpi_sand_fns.c ++@@ -0,0 +1,99 @@ +++#include "config.h" +++#ifdef RPI +++#include +++#include +++#include "rpi_sand_fns.h" +++#include "avassert.h" +++ +++#define PW 1 +++#include "rpi_sand_fn_pw.h" +++#undef PW +++ +++#define PW 2 +++#include "rpi_sand_fn_pw.h" +++#undef PW +++ +++#if HAVE_NEON +++void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines); +++#endif +++ +++#if 1 +++// Simple round +++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +++{ +++ const unsigned int rnd = (1 << shr) >> 1; +++ const uint16_t * src = (const uint16_t *)_src; +++ +++ for (; n != 0; --n) { +++ *dst++ = (*src++ + rnd) >> shr; +++ } +++} +++#else +++// Dithered variation +++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) +++{ +++ unsigned int rnd = (1 << shr) >> 1; +++ const unsigned int mask = ((1 << shr) - 1); +++ const uint16_t * src = (const uint16_t *)_src; +++ +++ for (; n != 0; --n) { +++ rnd = *src++ + (rnd & mask); +++ *dst++ = rnd >> shr; +++ } +++} +++#endif +++ +++// w/h in pixels +++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, +++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, +++ unsigned int w, unsigned int h, const unsigned int shr) +++{ +++ const unsigned int n = dst_stride1 / 2; +++ unsigned int j; +++ +++ // This is true for our current layouts +++ av_assert0(dst_stride1 == src_stride1); +++ +++ // As we have the same stride1 for src & dest and src is wider than dest +++ // then if we loop on src we can always write contiguously to dest +++ // We make no effort to copy an exact width - round up to nearest src stripe +++ // as we will always have storage 
in dest for that
+++
+++#if HAVE_NEON
+++ if (shr == 3 && src_stride1 == 128) {
+++ for (j = 0; j + n < w; j += dst_stride1) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+++
+++ rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
+++ }
+++ }
+++ else
+++#endif
+++ {
+++ for (j = 0; j + n < w; j += dst_stride1) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
+++
+++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
+++ cpy16_to_8(d, s1, n, shr);
+++ cpy16_to_8(d + n, s2, n, shr);
+++ }
+++ }
+++ }
+++
+++ // Fix up a trailing dest half stripe
+++ if (j < w) {
+++ uint8_t * d = dst + j * dst_stride2;
+++ const uint8_t * s1 = src + j * 2 * src_stride2;
+++
+++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
+++ cpy16_to_8(d, s1, n, shr);
+++ }
+++ }
+++}
+++
+++#endif // RPI
+++
++diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
++new file mode 100644
++index 0000000000..aa880d0f63
++--- /dev/null
+++++ b/libavutil/rpi_sand_fns.h
++@@ -0,0 +1,129 @@
+++#ifndef AVUTIL_RPI_SAND_FNS
+++#define AVUTIL_RPI_SAND_FNS
+++#ifdef RPI
+++
+++#include "libavutil/frame.h"
+++
+++// For all these fns _x & _w are measured as coord * PW
+++// For the C fns coords are in chroma pels (so luma / 2)
+++// Strides are in bytes
+++
+++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
+++ uint8_t * dst_v, const unsigned int dst_stride_v,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
+++ uint8_t * dst_v, const unsigned int dst_stride_v,
+++ const uint8_t * src,
+++ unsigned int stride1, unsigned int stride2,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
+++ unsigned int stride1, unsigned int stride2,
+++ const uint8_t * src_u, const unsigned int src_stride_u,
+++ const uint8_t * src_v, const unsigned int src_stride_v,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
+++ unsigned int stride1, unsigned int stride2,
+++ const uint8_t * src_u, const unsigned int src_stride_u,
+++ const uint8_t * src_v, const unsigned int src_stride_v,
+++ unsigned int _x, unsigned int y,
+++ unsigned int _w, unsigned int h);
+++
+++// w/h in pixels
+++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
+++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
+++ unsigned int w, unsigned int h, const unsigned int shr);
+++
+++
+++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
+++{
+++ // * We could replace this with a fixed 128 which would allow the compiler
+++ // to optimize a whole lot better
+++ return frame->linesize[0];
+++}
+++
+++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
+++{
+++ return frame->linesize[3];
+++}
+++
+++
+++static inline int av_rpi_is_sand_format(const int format)
+++{
+++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
+++}
+++
+++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
+++{
+++ return av_rpi_is_sand_format(frame->format);
+++}
+++
+++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
+++{
+++ return (frame->format == AV_PIX_FMT_SAND128);
+++}
+++
+++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
+++{
+++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
+++}
+++
+++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
+++{
+++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
+++}
+++
+++// If x is measured in bytes (not pixels) then this works for sand64_16 as
+++// well as sand128 - but in the general case we work that out
+++
+++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
+++{
+++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
+++ const unsigned int x1 = x & (stride1 - 1);
+++ const unsigned int x2 = x ^ x1;
+++
+++ return x1 + stride1 * y + stride2 * x2;
+++}
+++
+++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
+++{
+++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
+++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
+++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
+++ const unsigned int x1 = x & (stride1 - 1);
+++ const unsigned int x2 = x ^ x1;
+++
+++ return x1 + stride1 * y_c + stride2 * x2;
+++}
+++
+++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
+++}
+++
+++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
+++{
+++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
+++}
+++
+++#endif
+++#endif
+++
+ diff --git a/libswscale/input.c b/libswscale/input.c
+-index 14ab5ab..e61b67a 100644
++index 14ab5abb3a..7a827c71e3 100644
+ --- a/libswscale/input.c
+ +++ b/libswscale/input.c
+-@@ -719,6 +719,14 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
++@@ -719,6 +719,13 @@ static void p010BEToUV_c(uint8_t *dstU, uint8_t *dstV,
+ }
+ }
+ 
+-+
+ +static void sand128ToUV_c(uint8_t *dstU, uint8_t *dstV,
+ + const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
+ + int width, uint32_t *unused)
@@ -18760,112 +29258,418 @@ index 14ab5ab..e61b67a 100644
+ #define input_pixel(pos) (isBE(origin) ?
AV_RB16(pos) : AV_RL16(pos)) + + static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, +-@@ -1085,6 +1093,9 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) ++@@ -1085,6 +1092,10 @@ av_cold void ff_sws_init_input_funcs(SwsContext *c) + case AV_PIX_FMT_P010BE: + c->chrToYV12 = p010BEToUV_c; + break; + + case AV_PIX_FMT_SAND128: +-+ c->chrToYV12 = sand128ToUV_c; +++ case AV_PIX_FMT_SAND64_10: +++ c->chrToYV12 = sand128ToUV_c; // NIF + + break; + } + if (c->chrSrcHSubSample) { + switch (srcFormat) { + diff --git a/libswscale/utils.c b/libswscale/utils.c +-index 576d8f0..d7206cc 100644 ++index 576d8f0d5a..fd88a5e51e 100644 + --- a/libswscale/utils.c + +++ b/libswscale/utils.c +-@@ -248,6 +248,9 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { ++@@ -248,6 +248,10 @@ static const FormatEntry format_entries[AV_PIX_FMT_NB] = { + [AV_PIX_FMT_AYUV64LE] = { 1, 1}, + [AV_PIX_FMT_P010LE] = { 1, 0 }, + [AV_PIX_FMT_P010BE] = { 1, 0 }, + +#ifdef RPI + + [AV_PIX_FMT_SAND128] = { 1, 0 }, +++ [AV_PIX_FMT_SAND64_10] = { 1, 0 }, + +#endif + }; + + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) +-diff --git a/pi-util/conf.sh b/pi-util/conf.sh +-new file mode 100755 +-index 0000000..8b596a2 ++diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt ++new file mode 100644 ++index 0000000000..b1e99a6a89 + --- /dev/null +-+++ b/pi-util/conf.sh +-@@ -0,0 +1,33 @@ +-+echo "Configure for Pi2/3" +++++ b/pi-util/BUILD.txt ++@@ -0,0 +1,25 @@ +++Building Pi FFmpeg +++================== + + +-+RPI_BUILDROOT=`pwd`/build +-+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot +-+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +-+RPI_OPT_VC=$RPI_ROOTFS/opt/vc +-+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +-+#RPI_DEFS="-D__VCCOREVER__=0x04000000" +-+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" +-+#RPI_KEEPS="-save-temps=obj" +-+RPI_KEEPS="" +++Configuration: +++============= + + +-+./configure --enable-cross-compile\ +-+ --arch=armv6t2\ +-+ --cpu=cortex-a7\ +-+ --target-os=linux\ +-+ --disable-stripping\ +-+ --disable-thumb\ +-+ --enable-mmal\ +-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ +-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++pi-util/conf_pi2.sh + + +-+# --enable-extra-warnings\ +-+# --arch=armv71\ +-+# --enable-shared\ +++contains suitable options to build the code for Pi2/3. 
It expects to find +++git clones of + + +-+# gcc option for getting asm listing +-+# -Wa,-ahls +-diff --git a/pi-util/conf1.sh b/pi-util/conf1.sh +-new file mode 100644 +-index 0000000..160e149 +---- /dev/null +-+++ b/pi-util/conf1.sh +-@@ -0,0 +1,34 @@ +-+echo "Configure for Pi1" +++https://github.com/raspberrypi/tools +++https://github.com/raspberrypi/firmware + + +-+RPI_BUILDROOT=`pwd`/build +-+RPI_ROOTFS=$RPI_BUILDROOT/linux/raspian_jessie_pi1-sysroot +-+RPI_TOOLROOT=$RPI_BUILDROOT/tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +-+RPI_OPT_VC=$RPI_ROOTFS/opt/vc +-+#RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_ROOTFS/usr/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +-+RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +-+#RPI_DEFS="-D__VCCOREVER__=0x04000000" +-+RPI_LIBDIRS="-L$RPI_ROOTFS/lib -L$RPI_ROOTFS/usr/lib -L$RPI_OPT_VC/lib" +-+#RPI_KEEPS="-save-temps=obj" +-+RPI_KEEPS="" +++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a +++lot of history you don't want. + + +-+./configure --enable-cross-compile\ +-+ --cpu=arm1176jzf-s\ +-+ --arch=armv\ +-+ --disable-neon\ +-+ --target-os=linux\ +-+ --disable-stripping\ +-+ --enable-mmal\ +-+ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_ROOTFS/lib,-rpath-link=$RPI_ROOTFS/usr/lib"\ +-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be +++rebuilt. Otherwise the prebuilt .c & .h files will be used. +++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild + + +++pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time +++H265 QPU acceleration is broken on Pi1 and so it is disabled. 
+ + +-+# --enable-extra-warnings\ +-+# --arch=armv71\ +-+# --enable-shared\ + + +-+# gcc option for getting asm listing +-+# -Wa,-ahls ++diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv ++new file mode 100644 ++index 0000000000..f05b7753f7 ++--- /dev/null +++++ b/pi-util/conf_h265.2016.csv ++@@ -0,0 +1,193 @@ +++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 +++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 +++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 +++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 +++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 +++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 
+++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 +++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 +++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 +++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 +++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 +++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 +++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 
+++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 +++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 +++2,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +++2,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 +++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 +++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 +++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 +++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 +++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 +++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth +++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
+++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5 +++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt +++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt +++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt +++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt +++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt +++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt +++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5 +++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5 +++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5 
+++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5 +++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5 +++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5 +++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5 +++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5 +++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5 +++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5 +++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5 +++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5 +++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5 +++2,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5 +++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5 +++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt +++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt +++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5 +++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5 +++1,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5 +++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5 +++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5 +++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5 +++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5 +++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5 +++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5 +++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5 +++2,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5 ++diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv ++new file mode 100644 ++index 0000000000..6082641271 ++--- /dev/null +++++ b/pi-util/conf_h265.2016_HEVC_v1.csv ++@@ -0,0 +1,147 @@ +++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 +++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 +++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 +++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 +++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 +++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 +++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 +++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 
+++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 +++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 +++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 +++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 +++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 +++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 +++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 +++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 +++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 +++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 +++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 +++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 +++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 +++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 +++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 +++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 +++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 +++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 +++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 +++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 +++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 +++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 +++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 +++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 +++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 +++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 +++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 +++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 +++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 +++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 +++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 +++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 +++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 +++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 +++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 +++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 +++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 +++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 +++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 +++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 +++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 +++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 +++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 +++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 +++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 +++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 +++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 +++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 +++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 +++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 +++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 +++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 +++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 +++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 +++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 +++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 
+++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 +++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 +++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 +++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 +++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 +++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 +++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 +++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 +++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 +++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 +++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 +++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 +++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 +++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 +++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 +++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 +++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 +++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 +++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 +++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 +++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 +++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 +++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 +++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 +++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 +++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 +++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 +++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 +++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 +++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 +++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 +++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 +++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 +++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 +++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 +++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 +++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 +++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 +++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 +++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 +++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 +++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 +++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 +++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 +++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 +++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 +++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt +++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt +++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 +++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 +++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 +++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 +++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 +++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 +++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 
+++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 +++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 +++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 +++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 +++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 +++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 +++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 +++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 +++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth +++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 +++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 +++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? +++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 +++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 +++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 +++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 +++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 +++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 +++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 +++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 +++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 +++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 +++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 +++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 +++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 +++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 +++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 +++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 + diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv + new file mode 100644 +-index 0000000..fc14f2a ++index 0000000000..fc14f2a3c2 + --- /dev/null + +++ b/pi-util/conf_h265.csv + @@ -0,0 +1,144 @@ +@@ -19013,14 +29817,88 @@ index 0000000..fc14f2a + +1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 + +1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 + +1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 ++diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh ++new file mode 100755 ++index 0000000000..ec25b81c31 ++--- /dev/null +++++ b/pi-util/conf_pi1.sh ++@@ -0,0 +1,31 @@ +++echo "Configure for Pi1" +++ +++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +++RPI_OPT_VC=`pwd`/../firmware/opt/vc +++ +++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" +++#RPI_KEEPS="-save-temps=obj" +++RPI_KEEPS="" +++ +++./configure --enable-cross-compile\ +++ --cpu=arm1176jzf-s\ +++ --arch=arm\ +++ --disable-neon\ +++ --target-os=linux\ +++ --disable-stripping\ +++ --enable-mmal\ +++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +++ 
--extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ +++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++ +++ +++# --enable-extra-warnings\ +++# --arch=armv71\ +++# --enable-shared\ +++ +++# gcc option for getting asm listing +++# -Wa,-ahls ++diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh ++new file mode 100755 ++index 0000000000..f8e5e75375 ++--- /dev/null +++++ b/pi-util/conf_pi2.sh ++@@ -0,0 +1,30 @@ +++echo "Configure for Pi2/3" +++ +++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf +++RPI_OPT_VC=`pwd`/../firmware/opt/vc +++ +++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" +++RPI_DEFS="-D__VCCOREVER__=0x04000000 -DRPI=1" +++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" +++#RPI_KEEPS="-save-temps=obj" +++RPI_KEEPS="" +++ +++./configure --enable-cross-compile\ +++ --arch=armv6t2\ +++ --cpu=cortex-a7\ +++ --target-os=linux\ +++ --disable-stripping\ +++ --disable-thumb\ +++ --enable-mmal\ +++ --extra-cflags="-g $RPI_KEEPS $RPI_DEFS $RPI_INCLUDES"\ +++ --extra-cxxflags="$RPI_DEFS $RPI_INCLUDES"\ +++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ +++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ +++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- +++ +++# --enable-extra-warnings\ +++# --arch=armv71\ +++# --enable-shared\ +++ +++# gcc option for getting asm listing +++# -Wa,-ahls + diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +-new file mode 100644 +-index 0000000..c896bc6 ++new file mode 100755 ++index 0000000000..70f7be22bb + --- /dev/null + +++ b/pi-util/ffconf.py +-@@ -0,0 +1,154 @@ ++@@ -0,0 +1,174 @@ + +#!/usr/bin/env python + + +++import string + +import os + +import subprocess + +import re +@@ -19029,12 +29907,20 @@ index 0000000..c896bc6 + +import csv + +from stat import * + + +-+conf_root = "/opt/conform/h265" + +ffmpeg_exec = "./ffmpeg" + + +-+def testone(fileroot, name, es_file, md5_file): +++def testone(fileroot, srcname, es_file, md5_file): + + tmp_root = "/tmp" + + +++ names = srcname.split('/') +++ while len(names) > 1: +++ tmp_root = os.path.join(tmp_root, names[0]) +++ del names[0] +++ name = names[0] +++ +++ if not os.path.exists(tmp_root): +++ os.makedirs(tmp_root) +++ + + dec_file = os.path.join(tmp_root, name + ".dec.md5") + + try: + + os.remove(dec_file) +@@ -19079,10 +29965,10 @@ index 0000000..c896bc6 + + + +def scandir(root): + + aconf = [] +-+ ents = os.listdir(conf_root) +++ ents = os.listdir(root) + + ents.sort(key=str.lower) + + for name in ents: +-+ test_path = os.path.join(conf_root, name) +++ test_path = os.path.join(root, name) + + if S_ISDIR(os.stat(test_path).st_mode): + + files = os.listdir(test_path) + + es_file = "?" 
+@@ -19093,7 +29979,7 @@ index 0000000..c896bc6 + + pass + + elif ext == ".bit" or ext == ".bin": + + es_file = f +-+ elif ext == ".md5": +++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): + + if md5_file == "?": + + md5_file = f + + elif base[-3:] == "yuv": +@@ -19105,13 +29991,15 @@ index 0000000..c896bc6 + + if not tests: + + return True + + for t in tests: +-+ if name[0:len(t)] == t: +++ if name[0:len(t)] == t or name.find("/" + t) != -1: + + return True +-+ return False +++ return False + + +-+def doconf(csva, tests): +-+ failures = [] +++def doconf(csva, tests, test_root): +++ unx_failures = [] + + unx_success = [] +++ failures = 0 +++ successes = 0 + + for a in csva: + + exp_test = int(a[0]) + + if (exp_test and runtest(a[1], tests)): +@@ -19119,17 +30007,25 @@ index 0000000..c896bc6 + + print "==== ", name, + + sys.stdout.flush() + + +-+ rv = testone(os.path.join(conf_root, name), name, a[2], a[3]) +++ rv = testone(os.path.join(test_root, name), name, a[2], a[3]) +++ if (rv == 0): +++ successes += 1 +++ else: +++ failures += 1 +++ + + if (rv == 0): + + if exp_test == 2: + + print ": * OK *" + + unx_success.append(name) + + else: + + print ": ok" +-+ elif exp_test > 1 and rv == 1: +++ elif exp_test == 2 and rv == 1: + + print ": fail" +++ elif exp_test == 3 and rv == 2: +++ # Call an expected "crash" an abort +++ print ": abort" + + else: +-+ failures.append(name) +++ unx_failures.append(name) + + if rv == 1: + + print ": * FAIL *" + + elif (rv == 2) : +@@ -19139,11 +30035,11 @@ index 0000000..c896bc6 + + else : + + print ": * BANG *" + + +-+ if failures or unx_success: +-+ print "Unexpected Failures:", failures +++ if unx_failures or unx_success: +++ print "Unexpected Failures:", unx_failures + + print "Unexpected Success: ", unx_success + + else: +-+ print "All tests normal" +++ print "All tests normal:", successes, "ok,", failures, "failed" + + + + + +class ConfCSVDialect(csv.Dialect): +@@ -19159,2638 +30055,194 @@ index 0000000..c896bc6 + + + + argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") + + argp.add_argument("tests", nargs='*') +++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") + + argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") +-+ argp.add_argument("--csv", default="pi-util/conf_h265.csv", help="CSV filename") +++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") + + args = argp.parse_args() + + + + if args.csvgen: +-+ csv.writer(sys.stdout).writerows(scandir(conf_root)) +++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) + + exit(0) + + + + with open(args.csv, 'rt') as csvfile: + + csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] + + + + +-+ doconf(csva, args.tests) +++ doconf(csva, args.tests, args.test_root) + + +-diff --git a/pi-util/qasm.py b/pi-util/qasm.py +-new file mode 100644 +-index 0000000..1eacc04 ++diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py ++new file mode 100755 ++index 0000000000..27cc453963 + --- /dev/null +-+++ b/pi-util/qasm.py +-@@ -0,0 +1,2502 @@ +-+#!/usr/bin/env python +++++ b/pi-util/ffperf.py ++@@ -0,0 +1,124 @@ +++#!/usr/bin/env python3 + + +-+# add.ifz.setf -, r0, ra0 ; fmul rb1, rany2, 0 ; thrend # comment +-+# add r0, r0, 1 # implicit mul nop +-+# nop # explicit add nop, implicit mul nop +-+# bkpt # implicit add/mul nop +-+# mov r0, 0x1234 # hex immediate +-+# mov r0, 20 * 40 # expressions... 
+-+# mov r0, f(sqrt(2.0) * 3.0) # f() converts float to bits +-+# mov r0, a:label # put address of label in r0 +-+# :label +-+# bra.allnn ra2, a:1f # branch to label 1 (searching forward), using absolute address +-+# :1 +-+# brr.anyz -, r:1b # branch to label 1 (searching backward), using relative address +-+# :1 # multiple definitions of numeric labels (differentiated using f/b) +-+# .set my_val, 3 # introduce alias for 3 +-+# .set my_reg, r0 # and for r0 +-+# mov my_reg, my_val # then use them +-+# .set my_reg2, my_reg + my_val # r0 plus 3 is r3 +-+# .macro my_add, a, b, c # a, b, c act as if .set on entry +-+# .set my_val, 10 +-+# add a, b, c +-+# mov r0, my_val # 10 +-+# .endm # forget all .sets since .macro (including arg .sets) +-+# mov r0, my_val # 3 +-+# my_add my_reg2, my_reg, ra0 << 4 # << rotates left (>> rotates right) +-+ +-+import math +-+import optparse +++import time +++import string + +import os +-+import random +++import tempfile +++import subprocess + +import re +-+import struct +++import argparse + +import sys +-+import time +++import csv +++from stat import * + + +-+############################################################################### +-+# constants +-+############################################################################### +-+ +-+# ops +-+###### +-+ +-+# negatives are internal qasm ops +-+ +-+AOP_MOV = -3 # two operands +-+AOP_BRA = -2 # two operands +-+AOP_BRR = -1 # two operands +-+AOP_NOP = 0x00 # no operands +-+AOP_FADD = 0x01 +-+AOP_FSUB = 0x02 +-+AOP_FMIN = 0x03 +-+AOP_FMAX = 0x04 +-+AOP_FMINABS = 0x05 +-+AOP_FMAXABS = 0x06 +-+AOP_FTOI = 0x07 # two operands +-+AOP_ITOF = 0x08 # two operands +-+AOP_ADD = 0x0c +-+AOP_SUB = 0x0d +-+AOP_SHR = 0x0e +-+AOP_ASR = 0x0f +-+AOP_ROR = 0x10 +-+AOP_SHL = 0x11 +-+AOP_MIN = 0x12 +-+AOP_MAX = 0x13 +-+AOP_AND = 0x14 +-+AOP_OR = 0x15 +-+AOP_XOR = 0x16 +-+AOP_NOT = 0x17 # two operands +-+AOP_CLZ = 0x18 # two operands +-+AOP_V8ADDS = 0x1e +-+AOP_V8SUBS = 0x1f +-+ +-+MOP_MOV = -1 # two operands +-+MOP_NOP = 0x0 # no operands +-+MOP_FMUL = 0x1 +-+MOP_MUL24 = 0x2 +-+MOP_V8MULD = 0x3 +-+MOP_V8MIN = 0x4 +-+MOP_V8MAX = 0x5 +-+MOP_V8ADDS = 0x6 +-+MOP_V8SUBS = 0x7 +-+ +-+# ldi modes +-+############ +-+ +-+LDI_32 = 0 +-+LDI_EL_SIGNED = 1 +-+LDI_EL_UNSIGNED = 3 +-+LDI_SEMA = 4 +-+ +-+# conds +-+######## +-+ +-+COND_NEVER = 0 +-+COND_ALWAYS = 1 +-+COND_IFZ = 2 +-+COND_IFNZ = 3 +-+COND_IFN = 4 +-+COND_IFNN = 5 +-+COND_IFC = 6 +-+COND_IFNC = 7 +-+ +-+BCOND_ALLZ = 0 +-+BCOND_ALLNZ = 1 +-+BCOND_ANYZ = 2 +-+BCOND_ANYNZ = 3 +-+BCOND_ALLN = 4 +-+BCOND_ALLNN = 5 +-+BCOND_ANYN = 6 +-+BCOND_ANYNN = 7 +-+BCOND_ALLC = 8 +-+BCOND_ALLNC = 9 +-+BCOND_ANYC = 10 +-+BCOND_ANYNC = 11 +-+BCOND_ALWAYS = 15 +-+ +-+# packing/unpacking +-+#################### +-+ +-+# regfile a pack modes +-+PACK_A_NOP = 0 +-+PACK_A_16A = 1 +-+PACK_A_16B = 2 +-+PACK_A_8888 = 3 +-+PACK_A_8A = 4 +-+PACK_A_8B = 5 +-+PACK_A_8C = 6 +-+PACK_A_8D = 7 +-+PACK_A_32S = 8 +-+PACK_A_16AS = 9 +-+PACK_A_16BS = 10 +-+PACK_A_8888S = 11 +-+PACK_A_8AS = 12 +-+PACK_A_8BS = 13 +-+PACK_A_8CS = 14 +-+PACK_A_8DS = 15 +-+ +-+# mul unit pack modes +-+PACK_MUL_NOP = 0 +-+PACK_MUL_8888 = 3 +-+PACK_MUL_8A = 4 +-+PACK_MUL_8B = 5 +-+PACK_MUL_8C = 6 +-+PACK_MUL_8D = 7 +-+ +-+# regfile a unpack modes +-+UNPACK_A_NOP = 0 +-+UNPACK_A_16A = 1 +-+UNPACK_A_16B = 2 +-+UNPACK_A_8R = 3 +-+UNPACK_A_8A = 4 +-+UNPACK_A_8B = 5 +-+UNPACK_A_8C = 6 +-+UNPACK_A_8D = 7 +-+ +-+# r4 unpack modes +-+UNPACK_R4_NOP = 0 +-+UNPACK_R4_16A = 1 +-+UNPACK_R4_16B = 2 +-+UNPACK_R4_8R = 3 +-+UNPACK_R4_8A = 4 +-+UNPACK_R4_8B = 5 
+-+UNPACK_R4_8C = 6 +-+UNPACK_R4_8D = 7 +-+ +-+PACK_TYPE_INT = 0 +-+PACK_TYPE_FLOAT = 1 +-+PACK_TYPE_EITHER = -1 +-+ +-+PACK_MODE_A = 0 # regfile a +-+PACK_MODE_M = 1 # mul unit +-+PACK_MODE_EITHER = -1 +-+ +-+UNPACK_LOC_A = 0 # regfile a +-+UNPACK_LOC_R4 = 1 # r4 +-+UNPACK_LOC_AB = 2 # either regfile a or regfile b +-+UNPACK_LOC_OTHER = 3 # somewhere else +-+ +-+# args +-+####### +-+ +-+# loc_t, ie internal +-+MUX_AC = 0 +-+MUX_ANY = 1 +-+MUX_A = 2 +-+MUX_B = 3 +-+RW_EITHER = 0 +-+RW_READ = 1 +-+RW_WRITE = 2 +-+ +-+RADDR_NOP = 39 +-+ +-+# negatives are for internal use +-+RMUX_SEMA = -6 +-+RMUX_LABEL = -5 +-+RMUX_IMMV = -4 +-+RMUX_IMM = -3 +-+RMUX_AC = -2 +-+RMUX_ANY = -1 +-+RMUX_A0 = 0 # followed by A1, A2, A3, A4, A5 +-+RMUX_A = 6 +-+RMUX_B = 7 +-+ +-+WADDR_R0 = 32 # followed by R1, R2, R3 +-+WADDR_NOP = 39 +-+ +-+WMUX_ANY = 0 +-+WMUX_A = 1 +-+WMUX_B = 2 +-+ +-+# signals +-+########## +-+ +-+SIG_BKPT = 0 +-+SIG_NORMAL = 1 +-+SIG_THRSW = 2 +-+SIG_THREND = 3 +-+SIG_SBWAIT = 4 +-+SIG_SBDONE = 5 +-+SIG_INT = 6 # on a0 +-+SIG_LTHRSW = 6 # on b0 +-+SIG_LOADCV = 7 +-+SIG_LOADC = 8 +-+SIG_LDCEND = 9 +-+SIG_LDTMU0 = 10 +-+SIG_LDTMU1 = 11 +-+SIG_ROTATE = 12 # on a0 +-+SIG_LOADAM = 12 # on b0 +-+SIG_SMALLIMMED = 13 +-+SIG_IMMED = 14 +-+SIG_BRANCH = 15 +-+ +-+# multi-line assembler constructs +-+################################## +-+ +-+CONSTRUCT_MACRO = 0x1 +-+CONSTRUCT_IF = 0x2 +-+CONSTRUCT_ELSE = 0x4 +-+CONSTRUCT_REP = 0x8 +-+ +-+############################################################################### +-+# helpers +-+############################################################################### +-+ +-+def asm_error(message, location = None): +-+ if location is None: +-+ location = current_location +-+ if location == '': +-+ sys.stderr.write('qasm ERROR: %s\n' % message) +-+ else: +-+ sys.stderr.write('qasm ERROR: %s: %s\n' % (location, message)) +-+ sys.exit(-1) +-+ +-+def asm_warning(message, location = None): +-+ if disable_warnings or (nwarn_level != 0): +-+ return +-+ if location is None: +-+ location = current_location +-+ if location == '': +-+ sys.stderr.write('qasm WARNING: %s\n' % message) +-+ else: +-+ sys.stderr.write('qasm WARNING: %s: %s\n' % (location, message)) +-+ if warnings_are_errors: +-+ asm_error('warnings are errors!', location) +-+ +-+# smart_split('') = [] +-+# smart_split('a') = ['a'] +-+# smart_split('a(1, 2),[3, 4, 5],6') = ['a(1, 2)', '[3, 4, 5]', '6'] +-+def smart_split(s, delim = ',', count = 0): +-+ if len(s) == 0: +-+ return [] +-+ parts = [] +-+ depth = 0 +-+ i = 0 +-+ for j in xrange(len(s)): +-+ if s[j] in '([{': +-+ depth += 1 +-+ elif s[j] in ')]}': +-+ depth -= 1 +-+ elif (s[j] == delim) and (depth == 0): +-+ parts.append(s[i:j]) +-+ i = j + 1 +-+ if len(parts) == count: +-+ break +-+ if depth != 0: +-+ asm_error('bracket nesting fail') +-+ parts.append(s[i:]) +-+ return parts +-+ +-+def is_int(x): +-+ return isinstance(x, int) or isinstance(x, long) +-+ +-+############################################################################### +-+# "parsing" stuff +-+############################################################################### +-+ +-+re_macro = re.compile('\\.macro\\s+(?P\\w+)(?P(\\s*,\\s*\\w+)*)$') +-+re_if = re.compile('\\.if((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') +-+re_elif = re.compile('\\.elif((?Pn?set)\\s+(?P\\w+)|\\s(?P.+))$') +-+re_rep = re.compile('\\.rep\\s+(?P\\w+)\\s*,(?P.+)$') +-+re_include = re.compile('\\.include\\s(?P.+)$') +-+re_set = re.compile('\\.set\\s+(?P\\w+)\\s*,(?P.+)$') +-+re_unset = 
re.compile('\\.unset\\s+(?P\\w+)$') +-+re_eval = re.compile('\\.eval\\s(?P.+)$') +-+re_print_info_warn_error = re.compile('\\.(?Pprint|info|warn|error)\\s(?P.+)$') +-+re_assert = re.compile('\\.assert\\s(?P.+)$') +-+re_data = re.compile('\\.d(?P[124])\\s(?P.+)$') +-+re_macro_inst = re.compile('(?P\\w+)(?P\\s.+|)$') +-+re_label = re.compile(':(?P:?[a-zA-Z_]\\w*|\\d+)$') +-+re_op = re.compile('(?P\\w+)(\\.(?P\\w+))??(\\.(?Psetf))?(?P\\s.+|)$') +-+re_label_ref_left = re.compile('\\b([ar]):') +-+re_label_ref_right = re.compile('[a-zA-Z_]\\w*|\\d+[bf]$') +-+re_pack = re.compile('\\.([0-9]\\w*[a-df-zA-DF-Z_])') # a bit weird because we don't want to pick up float literals... +-+ +-+# ops +-+###### +-+ +-+aops = { +-+ 'mov': (AOP_MOV, 2), +-+ 'bra': (AOP_BRA, 2), +-+ 'brr': (AOP_BRR, 2), +-+ 'nop': (AOP_NOP, 0), +-+ 'fadd': (AOP_FADD, 3), +-+ 'fsub': (AOP_FSUB, 3), +-+ 'fmin': (AOP_FMIN, 3), +-+ 'fmax': (AOP_FMAX, 3), +-+ 'fminabs': (AOP_FMINABS, 3), +-+ 'fmaxabs': (AOP_FMAXABS, 3), +-+ 'ftoi': (AOP_FTOI, 2), +-+ 'itof': (AOP_ITOF, 2), +-+ 'add': (AOP_ADD, 3), +-+ 'sub': (AOP_SUB, 3), +-+ 'shr': (AOP_SHR, 3), +-+ 'asr': (AOP_ASR, 3), +-+ 'ror': (AOP_ROR, 3), +-+ 'shl': (AOP_SHL, 3), +-+ 'min': (AOP_MIN, 3), +-+ 'max': (AOP_MAX, 3), +-+ 'and': (AOP_AND, 3), +-+ 'or': (AOP_OR, 3), +-+ 'xor': (AOP_XOR, 3), +-+ 'not': (AOP_NOT, 2), +-+ 'clz': (AOP_CLZ, 2), +-+ 'v8adds': (AOP_V8ADDS, 3), +-+ 'v8subs': (AOP_V8SUBS, 3)} +-+ +-+def get_aop(aop): +-+ if aop not in aops: +-+ asm_error('invalid aop') +-+ return aops[aop] +-+ +-+mops = { +-+ 'mov': (MOP_MOV, 2), +-+ 'nop': (MOP_NOP, 0), +-+ 'fmul': (MOP_FMUL, 3), +-+ 'mul24': (MOP_MUL24, 3), +-+ 'v8muld': (MOP_V8MULD, 3), +-+ 'v8min': (MOP_V8MIN, 3), +-+ 'v8max': (MOP_V8MAX, 3), +-+ 'v8adds': (MOP_V8ADDS, 3), +-+ 'v8subs': (MOP_V8SUBS, 3)} +-+ +-+def get_mop(mop): +-+ if mop not in mops: +-+ asm_error('invalid mop') +-+ return mops[mop] +-+ +-+# conds +-+######## +-+ +-+conds = { +-+ 'ifz': COND_IFZ, +-+ 'ifnz': COND_IFNZ, +-+ 'ifn': COND_IFN, +-+ 'ifnn': COND_IFNN, +-+ 'ifc': COND_IFC, +-+ 'ifnc': COND_IFNC} +-+ +-+def get_cond(cond): +-+ if not cond: +-+ return COND_ALWAYS +-+ if cond not in conds: +-+ asm_error('invalid cond') +-+ return conds[cond] +-+ +-+bconds = { +-+ 'allz': BCOND_ALLZ, +-+ 'allnz': BCOND_ALLNZ, +-+ 'anyz': BCOND_ANYZ, +-+ 'anynz': BCOND_ANYNZ, +-+ 'alln': BCOND_ALLN, +-+ 'allnn': BCOND_ALLNN, +-+ 'anyn': BCOND_ANYN, +-+ 'anynn': BCOND_ANYNN, +-+ 'allc': BCOND_ALLC, +-+ 'allnc': BCOND_ALLNC, +-+ 'anyc': BCOND_ANYC, +-+ 'anync': BCOND_ANYNC} +-+ +-+def get_bcond(bcond): +-+ if not bcond: +-+ return BCOND_ALWAYS +-+ if bcond not in bconds: +-+ asm_error('invalid bcond') +-+ return bconds[bcond] +-+ +-+def get_setf(setf): +-+ if not setf: +-+ return False +-+ return True +-+ +-+# packing/unpacking +-+#################### +-+ +-+packs = { +-+ '16a': (PACK_A_16A, PACK_TYPE_INT, PACK_MODE_A), +-+ '16b': (PACK_A_16B, PACK_TYPE_INT, PACK_MODE_A), +-+ '16af': (PACK_A_16A, PACK_TYPE_FLOAT, PACK_MODE_A), +-+ '16bf': (PACK_A_16B, PACK_TYPE_FLOAT, PACK_MODE_A), +-+ '8abcd': (PACK_A_8888, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8a': (PACK_A_8A, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8b': (PACK_A_8B, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8c': (PACK_A_8C, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8d': (PACK_A_8D, PACK_TYPE_EITHER, PACK_MODE_A), +-+ 's': (PACK_A_32S, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '16as': (PACK_A_16AS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '16bs': (PACK_A_16BS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8abcds': (PACK_A_8888S, PACK_TYPE_EITHER, 
PACK_MODE_A), +-+ '8as': (PACK_A_8AS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8bs': (PACK_A_8BS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8cs': (PACK_A_8CS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8ds': (PACK_A_8DS, PACK_TYPE_EITHER, PACK_MODE_A), +-+ '8abcdc': (PACK_MUL_8888, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8ac': (PACK_MUL_8A, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8bc': (PACK_MUL_8B, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8cc': (PACK_MUL_8C, PACK_TYPE_EITHER, PACK_MODE_M), +-+ '8dc': (PACK_MUL_8D, PACK_TYPE_EITHER, PACK_MODE_M)} +-+ +-+def get_pack(pack): +-+ if not pack: +-+ return (0, PACK_TYPE_EITHER, PACK_MODE_EITHER) +-+ if pack not in packs: +-+ asm_error('invalid pack') +-+ return packs[pack] +-+ +-+a_unpacks = { +-+ '16a': (UNPACK_A_16A, PACK_TYPE_INT), +-+ '16b': (UNPACK_A_16B, PACK_TYPE_INT), +-+ '16af': (UNPACK_A_16A, PACK_TYPE_FLOAT), +-+ '16bf': (UNPACK_A_16B, PACK_TYPE_FLOAT), +-+ '8dr': (UNPACK_A_8R, PACK_TYPE_EITHER), +-+ '8a': (UNPACK_A_8A, PACK_TYPE_INT), +-+ '8b': (UNPACK_A_8B, PACK_TYPE_INT), +-+ '8c': (UNPACK_A_8C, PACK_TYPE_INT), +-+ '8d': (UNPACK_A_8D, PACK_TYPE_INT), +-+ '8ac': (UNPACK_A_8A, PACK_TYPE_FLOAT), +-+ '8bc': (UNPACK_A_8B, PACK_TYPE_FLOAT), +-+ '8cc': (UNPACK_A_8C, PACK_TYPE_FLOAT), +-+ '8dc': (UNPACK_A_8D, PACK_TYPE_FLOAT)} +-+ +-+def get_a_unpack(unpack): +-+ if not unpack: +-+ return (UNPACK_A_NOP, PACK_TYPE_EITHER, UNPACK_LOC_A) +-+ if unpack not in a_unpacks: +-+ asm_error('invalid ra unpack') +-+ return a_unpacks[unpack] + (UNPACK_LOC_A,) +-+ +-+r4_unpacks = { +-+ '16af': UNPACK_R4_16A, +-+ '16bf': UNPACK_R4_16B, +-+ '8dr': UNPACK_R4_8R, +-+ '8ac': UNPACK_R4_8A, +-+ '8bc': UNPACK_R4_8B, +-+ '8cc': UNPACK_R4_8C, +-+ '8dc': UNPACK_R4_8D} +-+ +-+def get_r4_unpack(unpack): +-+ if not unpack: +-+ return (UNPACK_R4_NOP, PACK_TYPE_EITHER, UNPACK_LOC_R4) +-+ if unpack not in r4_unpacks: +-+ asm_error('invalid r4 unpack') +-+ return (r4_unpacks[unpack], PACK_TYPE_EITHER, UNPACK_LOC_R4) +-+ +-+# args +-+####### +-+ +-+class loc_t: +-+ def __init__(self, mux, i, rot, r5_rot, pack, rw): +-+ self.mux = mux +-+ self.i = i +-+ self.rot = rot % 16 +-+ self.r5_rot = r5_rot % 16 +-+ self.pack = pack +-+ self.rw = rw +-+ +-+ def copy(self): +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __add__(self, i): +-+ if not is_int(i): +-+ raise Exception('can only add integer to loc') +-+ return loc_t(self.mux, self.i + i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __sub__(self, i): +-+ if not is_int(i): +-+ raise Exception('can only subtract integer from loc') +-+ return loc_t(self.mux, self.i - i, self.rot, self.r5_rot, self.pack, self.rw) +-+ +-+ def __cmp__(self, other): +-+ if is_int(other): +-+ return cmp(self.i, other) +-+ if not isinstance(other, loc_t): +-+ raise Exception('can only compare loc to integer or other loc') +-+ if self.mux != other.mux: +-+ return cmp(self.mux, other.mux) +-+ if self.i != other.i: +-+ return cmp(self.i, other.i) +-+ if self.rot != other.rot: +-+ return cmp(self.rot, other.rot) +-+ if self.r5_rot != other.r5_rot: +-+ return cmp(self.r5_rot, other.r5_rot) +-+ return cmp(self.pack, other.pack) +-+ +-+ def is_r5(self): +-+ return (self.mux == MUX_AC) and (self.i == 5) +-+ +-+ def shift(self, rot, left): +-+ if isinstance(rot, loc_t) and rot.is_r5(): +-+ if (rot.rot != 0) or (rot.r5_rot != 0) or rot.pack: +-+ raise Exception('can\'t rotate by rotated/unpacked r5') +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot + (-1 if left else 1), self.pack, self.rw) +-+ if not is_int(rot): +-+ raise 
Exception('can only rotate by integer or r5') +-+ return loc_t(self.mux, self.i, self.rot + (-rot if left else rot), self.r5_rot, self.pack, self.rw) +-+ +-+ def __lshift__(self, rot): +-+ return self.shift(rot, True) +-+ +-+ def __rshift__(self, rot): +-+ return self.shift(rot, False) +-+ +-+ def __getattr__(self, name): +-+ # discard the first character if it is an underscore. this is a total hack +-+ # to allow packs starting with a digit to work +-+ if name[0] == '_': +-+ name = name[1:] +-+ if (name in packs) or (name in a_unpacks) or (name in r4_unpacks): +-+ if self.pack: +-+ raise Exception('can\'t specify two packs') +-+ return loc_t(self.mux, self.i, self.rot, self.r5_rot, name, self.rw) +-+ raise AttributeError() +-+ +-+ def __str__(self): +-+ if self.mux == MUX_AC: +-+ return 'r%d' % self.i +-+ if self.mux == MUX_ANY: +-+ return 'rany%d' % self.i +-+ if self.mux == MUX_A: +-+ return 'ra%d' % self.i +-+ if self.mux == MUX_B: +-+ return 'rb%d' % self.i +-+ assert 0 +-+ +-+class sema_t: +-+ def __init__(self, acq, i): +-+ if not is_int(i): +-+ raise Exception('semaphore index must be integer') +-+ self.acq = acq +-+ self.i = i +-+ +-+class label_t: +-+ def __init__(self, rel, name, offset): +-+ self.rel = rel +-+ self.name = name +-+ self.offset = offset +-+ +-+ def __add__(self, offset): +-+ return label_t(self.rel, self.name, self.offset + offset) +-+ +-+ def __sub__(self, offset): +-+ return label_t(self.rel, self.name, self.offset - offset) +-+ +-+class label_maker_t: +-+ def __init__(self, rel): +-+ self.rel = rel +-+ +-+ def __getattr__(self, name): +-+ # we discard the first character. this is a total hack to allow numeric labels to work +-+ if not re_label_ref_right.match(name[1:]): +-+ raise Exception('invalid label reference') +-+ return label_t(self.rel, name[1:], 0) +-+ +-+def bits(x, n): +-+ if (x >> n) != 0: +-+ raise Exception('%d doesn\'t fit in %d bits' % (x, n)) +-+ return x +-+ +-+def bitsw(x, n): +-+ if x == (1 << n): +-+ x = 0 +-+ return bits(x, n) +-+ +-+def bitsws(x, n): +-+ if x == (1 << (n - 1)): +-+ x = 0 +-+ if -(1 << (n - 1)) <= x < 0: +-+ x += 1 << n +-+ return bits(x, n) +-+ +-+def vpm_setup(n, stride, addr, v2 = False): +-+ horiz, laned, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ if horiz: +-+ if x != 0: +-+ raise Exception('horizontal accesses must have x of 0') +-+ else: +-+ if (y & 0xf) != 0: +-+ raise Exception('vertical accesses must be 16 row aligned') +-+ hls = (bits(horiz, 1) << 3) | (bits(laned, 1) << 2) | (2 - size) +-+ if v2: +-+ return ((1 << 29) | (bitsw(n, 5) << 24) | (bitsws(stride, 7) << 16) | +-+ (hls << 12) | ((bits(y, 8) | bits(x, 4)) << size) | bits(p, size)) +-+ return ((bitsw(n, 4) << 20) | (bitsw(stride, 6) << 12) | +-+ (hls << 8) | ((bits(y, 6) | bits(x, 4)) << size) | bits(p, size)) +-+ +-+def vdw_setup_0(n, m, addr): +-+ horiz, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ return ((2 << 30) | (bitsw(n, 7) << 23) | (bitsw(m, 7) << 16) | +-+ (bits(horiz, 1) << 14) | (bits(y, 7) << 7) | (bits(x, 4) << 3) | (size << 1) | bits(p, size)) +-+ +-+def vdr_setup_0(n, m, addr, vpm_stride, stride): +-+ horiz, size, y, x, p = addr +-+ if size not in (0, 1, 2): +-+ raise Exception('addr size should be 0, 1, or 2') +-+ if (stride < 8) or (stride & (stride - 1)): +-+ raise Exception('stride must be power of 2 >= 8, 8 meaning use extended stride') +-+ log2_stride = 3 +-+ while (1 << log2_stride) != stride: +-+ 
log2_stride += 1 +-+ return ((1 << 31) | (size << 29) | (bits(p, size) << 28) | (bits(log2_stride - 3, 4) << 24) | +-+ (bitsw(m, 4) << 20) | (bitsw(n, 4) << 16) | (bitsw(vpm_stride, 4) << 12) | +-+ (bits(1 - horiz, 1) << 11) | (bits(y, 7) << 4) | bits(x, 4)) +-+ +-+class allocator_t: +-+ def __init__(self, *available): +-+ self.available = list(available) +-+ self.allocated = {} +-+ self.reserved = [] +-+ +-+ def copy(self): +-+ a = allocator_t() +-+ a.available = self.available[:] +-+ a.allocated = self.allocated.copy() +-+ a.reserved = self.reserved[:] +-+ return a +-+ +-+ def forget(self): +-+ self.__init__(self.available + self.allocated.values() + self.reserved) +-+ +-+ def reserve(self, *rs): +-+ for r in rs: +-+ self.available.remove(r) +-+ self.reserved.append(r) +-+ +-+ def retire(self, name): +-+ r = self.allocated.pop(name) +-+ del r.__invert__ +-+ del r.retire +-+ self.available.append(r) +-+ return r +-+ +-+ def __getattr__(self, name): +-+ if name not in self.allocated: +-+ r = self.available.pop() +-+ r.retire = lambda: self.retire(name) # this is an ugly hack to get nicer retire syntax +-+ r.__invert__ = r.retire +-+ self.allocated[name] = r +-+ return self.allocated[name] +-+ +-+def pragma_allow_xor_0(x): +-+ global allow_xor_0 +-+ +-+ if not isinstance(x, bool): +-+ raise Exception('allow_xor_0 must be bool') +-+ x, allow_xor_0 = allow_xor_0, x +-+ return x +-+ +-+def pragma_dont_warn_when_mul_rot_inp_r5(x): +-+ global dont_warn_when_mul_rot_inp_r5 +-+ +-+ if not isinstance(x, bool): +-+ raise Exception('dont_warn_when_mul_rot_inp_r5 must be bool') +-+ x, dont_warn_when_mul_rot_inp_r5 = dont_warn_when_mul_rot_inp_r5, x +-+ return x +-+ +-+arg_defs = { +-+ # special reg names (these alias the regular names, but also have appropriate read/write restrictions) +-+ 'w': loc_t(MUX_A, 15, 0, 0, None, RW_EITHER), +-+ 'z': loc_t(MUX_B, 15, 0, 0, None, RW_EITHER), +-+ 'unif': loc_t(MUX_ANY, 32, 0, 0, None, RW_READ), +-+ 'vary': loc_t(MUX_ANY, 35, 0, 0, None, RW_READ), +-+ 'tmurs': loc_t(MUX_ANY, 36, 0, 0, None, RW_WRITE), +-+ 'r5quad': loc_t(MUX_A, 37, 0, 0, None, RW_WRITE), +-+ 'r5rep': loc_t(MUX_B, 37, 0, 0, None, RW_WRITE), +-+ 'elem_num': loc_t(MUX_A, 38, 0, 0, None, RW_READ), +-+ 'qpu_num': loc_t(MUX_B, 38, 0, 0, None, RW_READ), +-+ 'unif_addr': loc_t(MUX_A, 40, 0, 0, None, RW_WRITE), +-+ 'unif_addr_rel': loc_t(MUX_B, 40, 0, 0, None, RW_WRITE), +-+ 'x_coord': loc_t(MUX_A, 41, 0, 0, None, RW_EITHER), +-+ 'y_coord': loc_t(MUX_B, 41, 0, 0, None, RW_EITHER), +-+ 'ms_mask': loc_t(MUX_A, 42, 0, 0, None, RW_EITHER), +-+ 'rev_flag': loc_t(MUX_B, 42, 0, 0, None, RW_EITHER), +-+ 'stencil': loc_t(MUX_ANY, 43, 0, 0, None, RW_WRITE), +-+ 'tlbz': loc_t(MUX_ANY, 44, 0, 0, None, RW_WRITE), +-+ 'tlbm': loc_t(MUX_ANY, 45, 0, 0, None, RW_WRITE), +-+ 'tlbc': loc_t(MUX_ANY, 46, 0, 0, None, RW_WRITE), +-+ 'vpm': loc_t(MUX_ANY, 48, 0, 0, None, RW_EITHER), +-+ 'vr_busy': loc_t(MUX_A, 49, 0, 0, None, RW_READ), +-+ 'vw_busy': loc_t(MUX_B, 49, 0, 0, None, RW_READ), +-+ 'vr_setup': loc_t(MUX_A, 49, 0, 0, None, RW_WRITE), +-+ 'vw_setup': loc_t(MUX_B, 49, 0, 0, None, RW_WRITE), +-+ 'vr_wait': loc_t(MUX_A, 50, 0, 0, None, RW_READ), +-+ 'vw_wait': loc_t(MUX_B, 50, 0, 0, None, RW_READ), +-+ 'vr_addr': loc_t(MUX_A, 50, 0, 0, None, RW_WRITE), +-+ 'vw_addr': loc_t(MUX_B, 50, 0, 0, None, RW_WRITE), +-+ 'mutex': loc_t(MUX_ANY, 51, 0, 0, None, RW_EITHER), +-+ 'recip': loc_t(MUX_ANY, 52, 0, 0, None, RW_WRITE), +-+ 'recipsqrt': loc_t(MUX_ANY, 53, 0, 0, None, RW_WRITE), +-+ 'rsqrt': loc_t(MUX_ANY, 53, 0, 0, None, 
RW_WRITE), +-+ 'exp': loc_t(MUX_ANY, 54, 0, 0, None, RW_WRITE), +-+ 'log': loc_t(MUX_ANY, 55, 0, 0, None, RW_WRITE), +-+ 't0s': loc_t(MUX_ANY, 56, 0, 0, None, RW_WRITE), +-+ 't0t': loc_t(MUX_ANY, 57, 0, 0, None, RW_WRITE), +-+ 't0r': loc_t(MUX_ANY, 58, 0, 0, None, RW_WRITE), +-+ 't0b': loc_t(MUX_ANY, 59, 0, 0, None, RW_WRITE), +-+ 't1s': loc_t(MUX_ANY, 60, 0, 0, None, RW_WRITE), +-+ 't1t': loc_t(MUX_ANY, 61, 0, 0, None, RW_WRITE), +-+ 't1r': loc_t(MUX_ANY, 62, 0, 0, None, RW_WRITE), +-+ 't1b': loc_t(MUX_ANY, 63, 0, 0, None, RW_WRITE), +-+ +-+ # semaphore acq/rel +-+ 'sacq': lambda i: sema_t(True, i), +-+ 'srel': lambda i: sema_t(False, i), +-+ +-+ # label makers (before evaluating, the syntax x:label gets transformed to x_label_maker._label) +-+ 'r_label_maker': label_maker_t(True), +-+ 'a_label_maker': label_maker_t(False), +-+ +-+ # handy functions +-+ 'f': lambda x: struct.unpack('I', struct.pack('f', x))[0], +-+ 'sqrt': math.sqrt, +-+ 'sin': math.sin, +-+ 'cos': math.cos, +-+ 'atan2': math.atan2, +-+ 'pi': math.pi, +-+ 'rseed': random.seed, +-+ 'rand': lambda: int(random.getrandbits(32)), +-+ 'bits': bits, +-+ 'bitsw': bitsw, +-+ 'bitsws': bitsws, +-+ +-+ # handy vpm/vdw/vdr stuff +-+ 'h32': lambda y: (1, 0, 0, y, 0, 0), +-+ 'h16l': lambda y, p: (1, 1, 1, y, 0, p), +-+ 'h16p': lambda y, p: (1, 0, 1, y, 0, p), +-+ 'h8l': lambda y, p: (1, 1, 2, y, 0, p), +-+ 'h8p': lambda y, p: (1, 0, 2, y, 0, p), +-+ 'v32': lambda y, x: (0, 0, 0, y, x, 0), +-+ 'v16l': lambda y, x, p: (0, 1, 1, y, x, p), +-+ 'v16p': lambda y, x, p: (0, 0, 1, y, x, p), +-+ 'v8l': lambda y, x, p: (0, 1, 2, y, x, p), +-+ 'v8p': lambda y, x, p: (0, 0, 2, y, x, p), +-+ 'dma_h32': lambda y, x: (1, 0, y, x, 0), +-+ 'dma_h16p': lambda y, x, p: (1, 1, y, x, p), +-+ 'dma_h8p': lambda y, x, p: (1, 2, y, x, p), +-+ 'dma_v32': lambda y, x: (0, 0, y, x, 0), +-+ 'dma_v16p': lambda y, x, p: (0, 1, y, x, p), +-+ 'dma_v8p': lambda y, x, p: (0, 2, y, x, p), +-+ 'vpm_setup': vpm_setup, +-+ 'vpm_setup_v2': lambda n, stride, addr: vpm_setup(n, stride, addr, True), +-+ 'vdw_setup_0': vdw_setup_0, +-+ 'vdw_setup_1': lambda stride: (3 << 30) | bits(stride, 13), +-+ 'vdr_setup_0': vdr_setup_0, +-+ 'vdr_setup_ext_stride': 8, # stride of 8 means use extended stride +-+ 'vdr_setup_1': lambda stride: (9 << 28) | bits(stride, 13), +-+ +-+ # annotations +-+ 'mul_used': lambda *is_: ('mul_used', sum(1 << i for i in is_)), +-+ 'mul_unused': lambda *is_: ('mul_used', sum(1 << i for i in is_) ^ 0xffff), +-+ 'preserve_cond': ('preserve_cond', 1), +-+ +-+ # somewhat experimental register allocator +-+ 'allocator_t': allocator_t, +-+ +-+ # pragmas +-+ 'pragma_allow_xor_0': pragma_allow_xor_0, +-+ 'pragma_dont_warn_when_mul_rot_inp_r5': pragma_dont_warn_when_mul_rot_inp_r5} +-+ +-+# accumulators and regs (regular names -- r0, ra0, etc) +-+arg_defs.update(('r%d' % i, loc_t(MUX_AC, i, 0, 0, None, RW_EITHER)) for i in xrange(6)) +-+arg_defs.update(('rany%d' % i, loc_t(MUX_ANY, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+arg_defs.update(('ra%d' % i, loc_t(MUX_A, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+arg_defs.update(('rb%d' % i, loc_t(MUX_B, i, 0, 0, None, RW_EITHER)) for i in xrange(64)) +-+ +-+def arg_eval(arg, sets): +-+ s = (arg.strip().split('.', 1) + [None])[:2] +-+ if s[0] == '-': +-+ return loc_t(MUX_ANY, WADDR_NOP, 0, 0, s[1], RW_WRITE) +-+ arg = re_label_ref_left.sub('\\1_label_maker._', arg) # todo: we probably don't want to replace in strings... 
+-+ arg = re_pack.sub('._\\1', arg) +-+ try: +-+ # todo: i would like to be able to pass both arg_defs and sets in here +-+ # (with sets hiding arg_defs in the case of conflicts), but the obvious +-+ # dict(arg_defs, **sets) won't permit things such as: +-+ # .set f, lambda x: y +-+ # .set y, 4 +-+ # (the y in the lambda will be looked up in the temporary dict we created +-+ # when evaluating the f .set, which doesn't contain y) +-+ # +-+ # instead, sets is initially set to (a copy of) arg_defs. to simulate the +-+ # hiding behaviour, on an unset, we restore any hidden arg_defs value. +-+ # also, before dumping sets at the end, we strip out the arg_defs stuff +-+ # (this isn't entirely correct as we want to dump sets that are hiding +-+ # arg_defs) +-+ return eval(arg, sets) +-+ except Exception, e: +-+ asm_error(e) +-+ except: +-+ asm_error('unknown error while evaluating argument') +-+ +-+# doesn't check/fixup pack +-+def check_and_fixup_loc(loc, read): +-+ if (not read) and (loc.rw == RW_READ): +-+ asm_error('writing to read-only hardware register') +-+ if read and (loc.rw == RW_WRITE): +-+ asm_error('reading from write-only hardware register') +-+ if not read: +-+ # conceptually, we are writing to a location rotated right by +-+ # loc.rot/loc.r5_rot. but we are actually rotating the output right by +-+ # -loc.rot/-loc.r5_rot then writing it to the unrotated location +-+ loc.rot = -loc.rot % 16 +-+ loc.r5_rot = -loc.r5_rot % 16 +-+ if (loc.rot != 0) and (loc.r5_rot != 0): +-+ asm_error('can\'t rotate by both r5 and immediate') +-+ if (loc.r5_rot != 0) and (loc.r5_rot != 1): +-+ asm_error('only supported rotation by r5 is once to the %s' % ('left', 'right')[read]) +-+ if (not mulw_rotate) and ((loc.rot != 0) or loc.r5_rot): # mulw_rotate source checking is done later +-+ if not read: +-+ asm_error('target doesn\'t support write rotation') +-+ if loc.mux == MUX_ANY: +-+ loc.mux = MUX_A # can't do rotated read from regfile b +-+ if loc.mux != MUX_A: +-+ asm_error('rotation on read only allowed from regfile a') +-+ if loc.i >= 32: +-+ asm_warning('rotation only works from physical regfile') +-+ if loc.mux == MUX_AC: +-+ if (loc.i < 0) or (loc.i >= 6): +-+ asm_error('reg out of range') +-+ if not read: +-+ if loc.i == 4: +-+ asm_error('not allowed to write to r4') +-+ if loc.i == 5: +-+ +-+ asm_error('not allowed to write to r5 -- please specify r5quad or r5rep') +-+ elif (loc.mux == MUX_ANY) or (loc.mux == MUX_A) or (loc.mux == MUX_B): +-+ if (loc.i < 0) or (loc.i >= 64): +-+ asm_error('reg out of range') +-+ else: +-+ assert 0 +-+ +-+def get_dst(dst, sets): +-+ if not dst: +-+ return None, None, (0, PACK_TYPE_EITHER, PACK_MODE_EITHER), 0, 0 +-+ dst = arg_eval(dst, sets) +-+ if not isinstance(dst, loc_t): +-+ asm_error('invalid dst') +-+ dst = dst.copy() +-+ check_and_fixup_loc(dst, False) +-+ pack = get_pack(dst.pack) +-+ if dst.mux == MUX_AC: +-+ if pack[2] == PACK_MODE_A: +-+ asm_warning('ra packing only works when writing to physical regfile') +-+ return WADDR_R0 + dst.i, WMUX_A, pack, dst.rot, dst.r5_rot +-+ return WADDR_R0 + dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot +-+ if (dst.mux == MUX_A) or ((dst.mux == MUX_ANY) and (pack[2] == PACK_MODE_A)): # can't pack to regfile b with this operation +-+ if (pack[2] == PACK_MODE_A) and (dst.i >= 32): +-+ asm_warning('ra packing only works when writing to physical regfile') +-+ return dst.i, WMUX_A, pack, dst.rot, dst.r5_rot +-+ if dst.mux == MUX_ANY: +-+ return dst.i, WMUX_ANY, pack, dst.rot, dst.r5_rot +-+ if dst.mux == MUX_B: +-+ if pack[2] 
== PACK_MODE_A: +-+ asm_error('this packing operation can only be used for regfile a') +-+ return dst.i, WMUX_B, pack, dst.rot, dst.r5_rot +-+ assert 0 +-+ +-+def get_src(src, sets): +-+ if not src: +-+ return None, None, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), None, None +-+ src = arg_eval(src, sets) +-+ if isinstance(src, sema_t): +-+ if not have_sema: +-+ asm_error('target does not support semaphores') +-+ if (src.i < 0) or (src.i >= 16): +-+ asm_error('semaphore number must be in [0, 16)') +-+ return src.i | (src.acq << 4), RMUX_SEMA, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if isinstance(src, label_t): +-+ return (src.name, src.rel, src.offset), RMUX_LABEL, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if isinstance(src, list): +-+ if len(src) != 16: +-+ asm_error('vector immediate must have length 16') +-+ src = src[:] +-+ for i in xrange(16): +-+ if not is_int(src[i]): +-+ asm_error('all elements of vector immediate must be integers') +-+ src[i] &= (1 << 32) - 1 +-+ return src, RMUX_IMMV, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if is_int(src): +-+ return src & ((1 << 32) - 1), RMUX_IMM, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), 0, 0 +-+ if not isinstance(src, loc_t): +-+ asm_error('invalid src') +-+ src = src.copy() +-+ check_and_fixup_loc(src, True) +-+ if mulw_rotate: +-+ srot, sr5rot = 0, 0 +-+ drot, dr5rot = src.rot, src.r5_rot +-+ else: +-+ srot, sr5rot = src.rot, src.r5_rot +-+ drot, dr5rot = 0, 0 +-+ if src.mux == MUX_AC: +-+ if src.i == 4: +-+ return 4, RMUX_AC, get_r4_unpack(src.pack), drot, dr5rot +-+ if src.pack: +-+ asm_error('unpack only allowed for regfile a or r4') +-+ return src.i, RMUX_AC, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot +-+ if (src.mux == MUX_A) or ((src.mux == MUX_ANY) and src.pack): # can't unpack from regfile b +-+ return (src.i, srot, sr5rot), RMUX_A, get_a_unpack(src.pack), drot, dr5rot +-+ if src.mux == MUX_ANY: +-+ return src.i, RMUX_ANY, (0, PACK_TYPE_EITHER, UNPACK_LOC_AB), drot, dr5rot +-+ if src.mux == MUX_B: +-+ if src.pack: +-+ asm_error('unpack only allowed for regfile a or r4') +-+ return src.i, RMUX_B, (0, PACK_TYPE_EITHER, UNPACK_LOC_OTHER), drot, dr5rot +-+ assert 0 +-+ +-+# signals +-+########## +-+ +-+sigs = { +-+ 'bkpt': SIG_BKPT, +-+ 'thrsw': SIG_THRSW, +-+ 'thrend': SIG_THREND, +-+ 'sbwait': SIG_SBWAIT, +-+ 'sbdone': SIG_SBDONE, +-+ 'int': SIG_INT, +-+ 'loadcv': SIG_LOADCV, +-+ 'loadc': SIG_LOADC, +-+ 'ldcend': SIG_LDCEND, +-+ 'ldtmu0': SIG_LDTMU0, +-+ 'ldtmu1': SIG_LDTMU1} +-+ +-+def get_sig(sig): +-+ if sig not in sigs: +-+ return SIG_NORMAL +-+ return sigs[sig] +-+ +-+# annotations +-+############## +-+ +-+def get_annots(annot, sets): +-+ annots = arg_eval(annot, sets) +-+ if isinstance(annots, list): +-+ annots = annots[:] +-+ else: +-+ annots = [annots] +-+ for i, annot in enumerate(annots): +-+ if ((not isinstance(annot, tuple)) or (len(annot) != 2) or (not isinstance(annot[0], str)) or +-+ (not is_int(annot[1]))): +-+ asm_error('annotation must be (string, integer) pair, or a list of such pairs') +-+ annots[i] = (annot[0], annot[1] & ((1 << 32) - 1)) +-+ return annots +-+ +-+############################################################################### +-+# core +-+############################################################################### +-+ +-+def calculate_pack_modes(rpacks, rfloats, couldrfloat, wpacks, wfloats): +-+ needfloat = PACK_TYPE_EITHER +-+ havefloata = False +-+ havefloatr4 = False +-+ unpacka = None +-+ unpackr4 = None +-+ forcebs = [False, False, False, False] +-+ 
forcerafloat = False +-+ +-+ pm = PACK_MODE_EITHER +-+ for i in (0, 1, 2, 3): +-+ if (rpacks[i][2] == UNPACK_LOC_OTHER) or (rpacks[i][2] == UNPACK_LOC_AB): +-+ assert rpacks[i][0] == 0 +-+ else: +-+ if rpacks[i][2] == UNPACK_LOC_A: +-+ if unpacka is None: +-+ unpacka = rpacks[i][0] +-+ elif unpacka != rpacks[i][0]: +-+ asm_error('conflicting unpack operations on regfile a') +-+ havefloata = havefloata or rfloats[i] +-+ elif rpacks[i][2] == UNPACK_LOC_R4: +-+ if unpackr4 is None: +-+ unpackr4 = rpacks[i][0] +-+ elif unpackr4 != rpacks[i][0]: +-+ asm_error('conflicting unpack operations on r4') +-+ havefloatr4 = havefloatr4 or rfloats[i] +-+ else: +-+ assert 0 +-+ +-+ if rpacks[i][1] != PACK_TYPE_EITHER: +-+ if (needfloat != PACK_TYPE_EITHER) and (needfloat != rpacks[i][1]): +-+ asm_error('conflicting unpack float requirements') +-+ needfloat = rpacks[i][1] +-+ for i in (0, 1, 2, 3): +-+ if rpacks[i][2] == UNPACK_LOC_AB: +-+ if (unpacka is not None) and (unpacka != UNPACK_A_NOP): +-+ forcebs[i] = True # non-nop unpack from regfile a. must use b +-+ +-+ if unpacka: +-+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloata) and couldrfloat: +-+ havefloata = True +-+ forcerafloat = True +-+ havefloat = havefloata +-+ else: +-+ havefloat = havefloatr4 +-+ +-+ if (needfloat == PACK_TYPE_FLOAT) and (not havefloat): +-+ asm_error('float unpack operation used in integer alu operations') +-+ if (needfloat == PACK_TYPE_INT) and havefloat: +-+ asm_error('integer unpack operation used in float alu operation') +-+ +-+ unpack = 0 +-+ if unpacka and unpackr4: +-+ asm_error('cannot specify pack operation for both regfile a and r4') +-+ if unpacka: +-+ pm = PACK_MODE_A +-+ unpack = unpacka +-+ elif unpackr4: +-+ pm = PACK_MODE_M +-+ unpack = unpackr4 +-+ +-+ pack = 0 +-+ if wpacks[0][2] == PACK_MODE_M: +-+ asm_error('mul-unit pack operation used on add result') +-+ for i in (0, 1): +-+ if wpacks[i][2] == PACK_MODE_A: +-+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_A): +-+ asm_error('conflicting pack modes') +-+ pm = PACK_MODE_A +-+ pack = wpacks[i][0] +-+ elif wpacks[i][2] == PACK_MODE_M: +-+ if (pm != PACK_MODE_EITHER) and (pm != PACK_MODE_M): +-+ asm_error('conflicting pack modes') +-+ pm = PACK_MODE_M +-+ pack = wpacks[i][0] +-+ +-+ if (wpacks[i][1] == PACK_TYPE_FLOAT) and (not wfloats[i]): +-+ asm_error('float pack operation used with integer alu result') +-+ if (wpacks[i][1] == PACK_TYPE_INT) and wfloats[i]: +-+ asm_error('integer pack operation used with float alu result') +-+ +-+ if pm == PACK_MODE_EITHER: +-+ pm = PACK_MODE_A +-+ return pm, pack, unpack, forcebs, forcerafloat +-+ +-+# immediates that can be encoded with SIG_SMALLIMMED +-+bimms = {} +-+bimms.update((i, i) for i in xrange(16)) +-+bimms.update(((i - 32) + (1 << 32), i) for i in xrange(16, 32)) +-+bimms.update(((127 + (i - 32)) << 23, i) for i in xrange(32, 40)) +-+bimms.update(((127 + (i - 48)) << 23, i) for i in xrange(40, 48)) +-+ +-+def merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux): +-+ if rmux == RMUX_SEMA: +-+ asm_error('semaphore op can only be used with mov') +-+ if rmux == RMUX_LABEL: +-+ asm_error('label not allowed here') +-+ if rmux == RMUX_IMMV: +-+ asm_error('vector immediate can only be used with mov') +-+ if rmux == RMUX_IMM: +-+ if raddr not in bimms: +-+ asm_error('can\'t encode immediate 0x%08x' % raddr) +-+ raddr = bimms[raddr] +-+ if not immb: +-+ if raddr_b is not None: +-+ asm_error('regfile b and immediates don\'t mix') +-+ raddr_b = raddr +-+ immb = True +-+ elif raddr_b != raddr: +-+ 
asm_error('can only encode one rotation/immediate') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ if rmux == RMUX_AC: +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A0 + raddr +-+ if rmux == RMUX_ANY: +-+ if (mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5))) and (raddr_a == raddr): +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if (not immb) and (raddr_b == raddr): +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ if raddr_a is None: +-+ assert mulw_rotate or (((not immb) or (raddr_b < 48)) and (not arot_r5)) +-+ raddr_a = raddr +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if raddr_b is None: +-+ assert not immb +-+ raddr_b = raddr +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ asm_error('no free read slots') +-+ if rmux == RMUX_A: +-+ if (not mulw_rotate) and (raddr_a is not None) and ( +-+ ((raddr[1] != 0) | ((raddr[2] != 0) << 1)) != ((immb and (raddr_b >= 48)) | (arot_r5 << 1))): +-+ asm_error('conflicting rotations from regfile a') +-+ if raddr_a is None: +-+ raddr_a = raddr[0] +-+ elif raddr_a != raddr[0]: +-+ asm_error('can only read from one location in each regfile') +-+ arot_r5 = raddr[2] +-+ if raddr[1] == 0: +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ raddr = 48 + raddr[1] +-+ if not immb: +-+ if raddr_b is not None: +-+ asm_error('regfile b and rotation don\'t mix') +-+ raddr_b = raddr +-+ immb = True +-+ elif raddr_b != raddr: +-+ asm_error('can only encode one rotation/immediate') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_A +-+ if rmux == RMUX_B: +-+ if immb: +-+ asm_error('regfile b and rotation/immediates don\'t mix') +-+ if raddr_b is None: +-+ raddr_b = raddr +-+ elif raddr_b != raddr: +-+ asm_error('can only read from one location in each regfile') +-+ return raddr_a, raddr_b, immb, arot_r5, RMUX_B +-+ assert 0 +-+ +-+# ok if: +-+# - accumulator (r0-r3) +-+# - uniform (ie all elements identical). this is true of unif, qpu_num, vr_busy, +-+# and vw_busy. it's also true of r5 if it was written by r5rep, but not if it +-+# was written by r5quad. so, by default, r5 isn't considered uniform. todo: +-+# what about vr_wait/vw_wait/mutex? 
+-+def read_rot_ok(rmux, raddr_a, raddr_b): +-+ return ((rmux < 4) or ((rmux == 5) and dont_warn_when_mul_rot_inp_r5) or +-+ ((rmux == 6) and (raddr_a in (32, 49))) or # unif/vr_busy +-+ ((rmux == 7) and (raddr_b in (32, 38, 49)))) # unif/qpu_num/vw_busy +-+ +-+def asm_flush_prog_data(): +-+ global prog_data +-+ +-+ while len(prog_data) & 7: +-+ prog_data.append(0) +-+ for i in xrange(0, len(prog_data), 8): +-+ prog.append(((prog_data[i + 3] << 24) | (prog_data[i + 2] << 16) | (prog_data[i + 1] << 8) | (prog_data[i + 0] << 0), +-+ (prog_data[i + 7] << 24) | (prog_data[i + 6] << 16) | (prog_data[i + 5] << 8) | (prog_data[i + 4] << 0), 'data', {})) +-+ prog_data = [] +-+ +-+def asm_line(sets, location, line): +-+ global current_location, construct, nwarn_level +-+ +-+ prev_location = current_location +-+ current_location = location +-+ +-+ try: +-+ if construct != None: +-+ if re_macro.match(line): +-+ construct_stack.append(CONSTRUCT_MACRO) +-+ elif re_if.match(line): +-+ construct_stack.append(CONSTRUCT_IF) +-+ elif re_rep.match(line): +-+ construct_stack.append(CONSTRUCT_REP) +-+ else: +-+ else_m = line == '.else' +-+ elif_m = re_elif.match(line) +-+ if elif_m: +-+ end_construct = CONSTRUCT_IF +-+ else: +-+ end_construct = { +-+ '.endm': CONSTRUCT_MACRO, +-+ '.else': CONSTRUCT_IF, +-+ '.endif': CONSTRUCT_IF | CONSTRUCT_ELSE, +-+ '.endr': CONSTRUCT_REP}.get(line) +-+ if end_construct is not None: +-+ end_construct &= construct_stack.pop() +-+ if end_construct == 0: +-+ if elif_m: +-+ asm_error('unexpected .elif') +-+ asm_error('unexpected %s' % line) +-+ if len(construct_stack) == 0: +-+ lines = construct +-+ construct = None +-+ if end_construct == CONSTRUCT_MACRO: +-+ return +-+ if (end_construct == CONSTRUCT_IF) or (end_construct == CONSTRUCT_ELSE): +-+ condition_if, condition_else = lines[0] +-+ lines = lines[1:] +-+ if condition_if: +-+ for location, line in lines: +-+ asm_line(sets, location, line) +-+ if else_m: +-+ construct = [(condition_else, False)] +-+ construct_stack.append(CONSTRUCT_ELSE) +-+ elif elif_m: +-+ if elif_m.group('set'): +-+ condition_if = condition_else and ((elif_m.group('set') == 'nset') ^ (elif_m.group('name') in sets)) +-+ else: +-+ condition_if = condition_else and arg_eval(elif_m.group('condition'), sets) +-+ condition_else = condition_else and (not condition_if) +-+ construct = [(condition_if, condition_else)] +-+ construct_stack.append(CONSTRUCT_IF) +-+ return +-+ if end_construct == CONSTRUCT_REP: +-+ name, count = lines[0] +-+ lines = lines[1:] +-+ for i in xrange(count): +-+ sets[name] = i +-+ for location, line in lines: +-+ asm_line(sets, location, line) +-+ return +-+ assert 0 +-+ if else_m: +-+ construct_stack.append(CONSTRUCT_ELSE) +-+ elif elif_m: +-+ construct_stack.append(CONSTRUCT_IF) +-+ construct.append((current_location, line)) +-+ return +-+ +-+ if line in ('.endm', '.else', '.endif', '.endr'): +-+ asm_error('unexpected %s' % line) +-+ if re_elif.match(line): +-+ asm_error('unexpected .elif') +-+ +-+ m = re_macro.match(line) +-+ if m: +-+ construct = [] +-+ construct_stack.append(CONSTRUCT_MACRO) +-+ macros[m.group('name')] = ([param.strip() for param in m.group('params').split(',')[1:]], construct) +-+ return +-+ +-+ m = re_if.match(line) +-+ if m: +-+ if m.group('set'): +-+ condition = (m.group('set') == 'nset') ^ (m.group('name') in sets) +-+ else: +-+ # not not forces condition to a bool (this matters if condition is +-+ # something mutable like a list) +-+ condition = not not arg_eval(m.group('condition'), sets) +-+ construct = 
[(condition, not condition)] +-+ construct_stack.append(CONSTRUCT_IF) +-+ return +-+ +-+ m = re_rep.match(line) +-+ if m: +-+ count = arg_eval(m.group('count'), sets) +-+ if not is_int(count): +-+ asm_error('.rep count must be integer') +-+ construct = [(m.group('name'), count)] +-+ construct_stack.append(CONSTRUCT_REP) +-+ return +-+ +-+ m = re_include.match(line) +-+ if m: +-+ filename = arg_eval(m.group('filename'), sets) +-+ if not isinstance(filename, str): +-+ asm_error('expected string') +-+ asm_file(sets, '%s: %s' % (current_location, filename), filename) +-+ return +-+ +-+ m = re_set.match(line) +-+ if m: +-+ sets[m.group('name')] = arg_eval(m.group('val'), sets) +-+ return +-+ +-+ m = re_unset.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name not in sets: +-+ asm_error('%s not set' % name) +-+ if name in arg_defs: # todo: see arg_eval +-+ sets[name] = arg_defs[name] +-+ else: +-+ del sets[name] +-+ return +-+ +-+ m = re_eval.match(line) +-+ if m: +-+ arg_eval(m.group('expr'), sets) +-+ return +-+ +-+ m = re_print_info_warn_error.match(line) +-+ if m: +-+ def print_fn(message): +-+ print message +-+ def info_fn(message): +-+ sys.stderr.write('%s\n' % message) +-+ {'print': print_fn, 'info': info_fn, 'warn': asm_warning, 'error': asm_error}[ +-+ m.group('print_info_warn_error')](arg_eval(m.group('message'), sets)) +-+ return +-+ +-+ m = re_assert.match(line) +-+ if m: +-+ if not arg_eval(m.group('condition'), sets): +-+ asm_error('assertion failure: \'%s\'' % m.group('condition')) +-+ return +-+ +-+ m = re_data.match(line) +-+ if m: +-+ size = int(m.group('size')) +-+ for datum in smart_split(m.group('data')): +-+ datum = arg_eval(datum, sets) +-+ if not is_int(datum): +-+ asm_error('datum must be integer') +-+ prog_data.extend(((datum >> (i * 8)) & 0xff) for i in xrange(size)) +-+ return +-+ +-+ m = re_macro_inst.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name in macros: +-+ params, lines = macros[name] +-+ args = smart_split(m.group('args')) +-+ if len(args) > len(params): +-+ asm_error('too many arguments to macro') +-+ sets = sets.copy() +-+ sets.update(zip(params, (arg_eval(arg, sets) for arg in args))) +-+ for param in params[len(args):]: +-+ if param in sets: +-+ if param in arg_defs: # todo: see arg_eval +-+ sets[param] = arg_defs[param] +-+ else: +-+ del sets[param] +-+ for location, line in lines: +-+ asm_line(sets, '%s: %s' % (current_location, location), line) +-+ return +-+ +-+ if line == '.pushnwarn': +-+ nwarn_level += 1 +-+ return +-+ if line == '.popnwarn': +-+ if nwarn_level == 0: +-+ asm_error('.popnwarn without .pushnwarn') +-+ nwarn_level -= 1 +-+ return +-+ +-+ # everything below assumes prog is up to date +-+ asm_flush_prog_data() +-+ +-+ m = re_label.match(line) +-+ if m: +-+ name = m.group('name') +-+ if name[0].isdigit(): +-+ labels.setdefault(name, []).append(len(prog)) +-+ else: +-+ if name[0] == ':': +-+ undecorated_name = name[1:] +-+ else: +-+ undecorated_name = name +-+ if (undecorated_name in labels) or ((':' + undecorated_name) in labels): +-+ asm_error('named label defined twice') +-+ labels[name] = len(prog) +-+ return +-+ +-+ annots = line.split('@') +-+ ops = [op.strip() for op in annots[0].split(';')] +-+ annots = sum((get_annots(annot, sets) for annot in annots[1:]), []) +-+ sig = get_sig(ops[-1]) +-+ if sig != SIG_NORMAL: +-+ ops = ops[:-1] +-+ if len(ops) > 2: +-+ asm_error('too many ops') +-+ elif (len(ops) == 1) and (ops[0] == ''): +-+ ops = [] +-+ ops = (ops + ['nop', 'nop'])[:2] +-+ m = re_op.match(ops[0]) +-+ if 
not m: +-+ asm_error('invalid syntax') +-+ aop, aargs_n = get_aop(m.group('op')) +-+ if (aop == AOP_BRA) or (aop == AOP_BRR): +-+ acond = get_bcond(m.group('cond')) +-+ else: +-+ acond = get_cond(m.group('cond')) +-+ asf = get_setf(m.group('sf')) +-+ aargs = smart_split(m.group('args')) +-+ if len(aargs) != aargs_n: +-+ asm_error('wrong operand count') +-+ ard, ara, arb = (aargs + [None, None, None])[:3] +-+ m = re_op.match(ops[1]) +-+ if not m: +-+ asm_error('invalid syntax') +-+ mop, margs_n = get_mop(m.group('op')) +-+ mcond = get_cond(m.group('cond')) +-+ msf = get_setf(m.group('sf')) +-+ margs = smart_split(m.group('args')) +-+ if len(margs) != margs_n: +-+ asm_error('wrong operand count') +-+ mrd, mra, mrb = (margs + [None, None, None])[:3] +-+ # eval srcs first so allocator can retire and reuse registers for dst +-+ aaraddr, aarmux, aarpack, aadrot, aadrot_r5 = get_src(ara, sets) +-+ abraddr, abrmux, abrpack, abdrot, abdrot_r5 = get_src(arb, sets) +-+ maraddr, marmux, marpack, madrot, madrot_r5 = get_src(mra, sets) +-+ mbraddr, mbrmux, mbrpack, mbdrot, mbdrot_r5 = get_src(mrb, sets) +-+ awaddr, awmux, awpack, awrot, awrot_r5 = get_dst(ard, sets) +-+ mwaddr, mwmux, mwpack, mwrot, mwrot_r5 = get_dst(mrd, sets) +-+ if (((abrmux is not None) and ((aadrot != abdrot) or (aadrot_r5 != abdrot_r5))) or +-+ ((mbrmux is not None) and ((madrot != mbdrot) or (madrot_r5 != mbdrot_r5)))): +-+ asm_error('cannot have 2 arguments with different rotations') +-+ if aarmux is not None: +-+ awrot = (awrot + aadrot) % 16 +-+ awrot_r5 = (awrot_r5 + aadrot_r5) % 16 +-+ if (awrot != 0) or awrot_r5: +-+ asm_error('rotate not allowed on add write') +-+ if marmux is not None: +-+ mwrot = (mwrot + madrot) % 16 +-+ mwrot_r5 = (mwrot_r5 + madrot_r5) % 16 +-+ +-+ afloatr = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_FTOI) +-+ afloatw = aop in (AOP_FADD, AOP_FSUB, AOP_FMIN, AOP_FMAX, AOP_FMINABS, AOP_FMAXABS, AOP_ITOF) +-+ pm, pack, unpack, forcebs, forcerafloat = calculate_pack_modes( +-+ [aarpack, abrpack, marpack, mbrpack], +-+ [afloatr, afloatr, mop == MOP_FMUL, mop == MOP_FMUL], +-+ aop == AOP_FTOI, +-+ [awpack, mwpack], +-+ [afloatw, mop == MOP_FMUL]) +-+ if forcebs[0]: +-+ aarmux = RMUX_B +-+ if forcebs[1]: +-+ abrmux = RMUX_B +-+ if forcebs[2]: +-+ marmux = RMUX_B +-+ if forcebs[3]: +-+ mbrmux = RMUX_B +-+ +-+ # extend nops to 3 operands +-+ if aop == AOP_NOP: +-+ awaddr, awmux, aaraddr, aarmux, abraddr, abrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC +-+ if mop == MOP_NOP: +-+ mwaddr, mwmux, maraddr, marmux, mbraddr, mbrmux = WADDR_NOP, WMUX_ANY, 0, RMUX_AC, 0, RMUX_AC +-+ +-+ # extend 2 operand alu ops to 3 operands (by duplicating the 2nd operand) +-+ if (aop == AOP_FTOI) or (aop == AOP_ITOF) or (aop == AOP_NOT) or (aop == AOP_CLZ): +-+ if forcerafloat: +-+ assert aop == AOP_FTOI # can only forcerafloat if we have an unused float operand +-+ # instead of duplicating the 2nd operand, take the ra operand from +-+ # the mul op thus forcing the ra value to be considered a float for +-+ # the purposes of unpacking +-+ if marmux == RMUX_A: +-+ abraddr, abrmux = maraddr, marmux +-+ else: +-+ assert mbrmux == RMUX_A +-+ abraddr, abrmux = mbraddr, mbrmux +-+ else: +-+ abraddr, abrmux = aaraddr, aarmux +-+ else: +-+ assert not forcerafloat # can only forcerafloat if we have an unused operand +-+ +-+ # handle write addrs +-+ if (awmux == mwmux) and (awmux != WMUX_ANY): +-+ asm_error('add/mul ops not allowed to write to same regfile') +-+ ws = (awmux == WMUX_B) or (mwmux == 
WMUX_A) +-+ +-+ # handle branch +-+ if (aop == AOP_BRA) or (aop == AOP_BRR): +-+ # check setf +-+ if asf: +-+ asm_error('setf not allowed on bra/brr') +-+ +-+ # check pack/unpack +-+ if (pack != 0) or (unpack != 0): +-+ asm_error('pack/unpack not allowed with bra/brr') +-+ +-+ # handle read address +-+ if aarmux == RMUX_LABEL: +-+ if (aop == AOP_BRA) and aaraddr[1]: +-+ asm_warning('bra with rel label') +-+ if (aop == AOP_BRR) and (not aaraddr[1]): +-+ asm_warning('brr with abs label') +-+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM +-+ if aarmux == RMUX_ANY: +-+ aaraddr, aarmux = (aaraddr, 0, 0), RMUX_A +-+ if (aarmux != RMUX_IMM) and (aarmux != RMUX_A): +-+ asm_error('branch destination must be either label, immediate, or from regfile a') +-+ if aarmux == RMUX_IMM: +-+ imm = aaraddr +-+ raddr = 0 # can't use RADDR_NOP +-+ elif aarmux == RMUX_A: +-+ if (aaraddr[1] != 0) or (aaraddr[2] != 0): +-+ asm_error('rotation of read from regfile a not allowed with branch') +-+ if aop == AOP_BRR: +-+ asm_warning('brr with ra') +-+ imm = 0 +-+ raddr = aaraddr[0] +-+ else: +-+ assert 0 +-+ +-+ # check mul op is nop +-+ if mop != MOP_NOP: +-+ asm_error('mul op not allowed with branch') +-+ +-+ # check sig +-+ if sig != SIG_NORMAL: +-+ asm_error('no signal allowed with branch') +-+ +-+ if raddr >= 32: +-+ asm_error('can only branch to register locations in physical regfile') +-+ if raddr & 1: +-+ asm_warning('branch instruction will destroy flags (see hw-2780)') +-+ +-+ # construct branch instruction +-+ prog.append((imm, +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (raddr << 13) | ((aarmux == RMUX_A) << 18) | ((aop == AOP_BRR) << 19) | (acond << 20) | (SIG_BRANCH << 28), +-+ line, annots)) +-+ +-+ return +-+ +-+ # use COND_NEVER when possible (might save power / allow mul setf) +-+ if not dict(annots).get('preserve_cond', 0): +-+ if (awaddr == WADDR_NOP) and (not asf): +-+ acond = COND_NEVER +-+ if (mwaddr == WADDR_NOP) and (not msf): +-+ mcond = COND_NEVER +-+ +-+ # attempt to convert movs to ldi +-+ if (# no mul setf +-+ (not msf) and +-+ # ops must either be nop or mov of sema/label/imm/immv +-+ ((aop == AOP_NOP) or ((aop == AOP_MOV) and (aarmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and +-+ ((mop == MOP_NOP) or ((mop == MOP_MOV) and (marmux in (RMUX_SEMA, RMUX_LABEL, RMUX_IMMV, RMUX_IMM)))) and +-+ # but we don't want 2 nops +-+ ((aop != AOP_NOP) or (mop != MOP_NOP)) and +-+ # if both ops are movs, srcs must be identical +-+ ((aop != AOP_MOV) or (mop != MOP_MOV) or ((aarmux == marmux) and (aaraddr == maraddr))) and +-+ # no signal +-+ (sig == SIG_NORMAL)): +-+ # make sure aarmux/aaraddr contains the value +-+ if aop != AOP_MOV: +-+ aarmux = marmux +-+ aaraddr = maraddr +-+ +-+ # convert immediate +-+ if aarmux == RMUX_SEMA: +-+ ldi_mode = LDI_SEMA +-+ elif aarmux == RMUX_LABEL: +-+ ldi_mode = LDI_32 +-+ aaraddr, aarmux = (current_location,) + aaraddr, RMUX_IMM +-+ elif aarmux == RMUX_IMMV: +-+ signed, unsigned = True, True +-+ imm = 0 +-+ for i, elem in enumerate(aaraddr): +-+ if elem not in (-2 + (1 << 32), -1 + (1 << 32), 0, 1): +-+ signed = False +-+ if elem not in (0, 1, 2, 3): +-+ unsigned = False +-+ imm |= ((elem & 0x1) << i) | ((elem & 0x2) << (15 + i)) +-+ if not (signed or unsigned): +-+ asm_error('can\'t encode vector immediate') +-+ if signed: +-+ ldi_mode = LDI_EL_SIGNED +-+ else: +-+ ldi_mode = LDI_EL_UNSIGNED +-+ aaraddr, aarmux = imm, RMUX_IMM +-+ elif aarmux == RMUX_IMM: +-+ ldi_mode = LDI_32 +-+ else: +-+ assert 0 +-+ +-+ # construct ldi instruction 
+-+ prog.append((aaraddr, +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (asf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (ldi_mode << 25) | (SIG_IMMED << 28), +-+ line, annots)) +-+ +-+ return +-+ +-+ # convert movs to alu ops +-+ if aop == AOP_MOV: +-+ if allow_xor_0 and (aarmux == RMUX_IMM) and (aaraddr == 0): +-+ aop = AOP_XOR +-+ aaraddr, aarmux = 0, RMUX_AC +-+ abraddr, abrmux = 0, RMUX_AC +-+ else: +-+ aop = AOP_OR +-+ abraddr, abrmux = aaraddr, aarmux +-+ if mop == MOP_MOV: +-+ if allow_xor_0 and (marmux == RMUX_IMM) and (maraddr == 0): +-+ mop = MOP_V8SUBS +-+ maraddr, marmux = 0, RMUX_AC +-+ mbraddr, mbrmux = 0, RMUX_AC +-+ else: +-+ mop = MOP_V8MIN +-+ mbraddr, mbrmux = maraddr, marmux +-+ +-+ # normal alu instruction... +-+ +-+ # handle setf +-+ if asf and (aop == AOP_NOP): +-+ asm_error('nop.setf is not allowed in add pipe') +-+ if msf and (mop == MOP_NOP): +-+ asm_warning('nop.setf, really?') +-+ if (aop == AOP_NOP) or (acond == COND_NEVER): +-+ sf = msf +-+ else: +-+ if msf: +-+ asm_error('setf only allowed on mul op if add op is nop or add condition is never') +-+ sf = asf +-+ +-+ # handle read addrs +-+ raddr_a = None +-+ raddr_b = None +-+ immb = False +-+ arot_r5 = False +-+ muxes = [0, 0, 0, 0] +-+ if mwrot != 0: +-+ raddr_b = 48 + mwrot +-+ immb = True +-+ if mwrot_r5 and have_am: +-+ raddr_b = 48 +-+ immb = True +-+ for f in lambda rmux: rmux != RMUX_ANY, lambda rmux: rmux == RMUX_ANY: # do RMUX_ANY last +-+ for i, raddr, rmux in (0, aaraddr, aarmux), (1, abraddr, abrmux), (2, maraddr, marmux), (3, mbraddr, mbrmux): +-+ if f(rmux): +-+ raddr_a, raddr_b, immb, arot_r5, muxes[i] = merge_rmux(raddr_a, raddr_b, immb, arot_r5, raddr, rmux) +-+ add_a, add_b, mul_a, mul_b = muxes +-+ if (not read_rot_ok(mul_a, raddr_a, raddr_b)) or (not read_rot_ok(mul_b, raddr_a, raddr_b)): +-+ # some output elements might not be as expected +-+ if mwrot_r5 or ((mwrot >= 4) and (mwrot <= 12)): +-+ bad_elems = 0xffff +-+ else: +-+ bad_elems = ((1 << (mwrot & 0x3)) - 1) * 0x1111 +-+ if mwrot > 12: +-+ bad_elems ^= 0xffff +-+ bad_elems &= dict(annots).get('mul_used', 0xffff) +-+ if not msf: +-+ if mwaddr == WADDR_NOP: +-+ # not writing anywhere and not setting flags. no elements used +-+ bad_elems = 0 +-+ elif ((mwaddr in (36, 40, 43, 49, 50, 51)) or +-+ ((not ws) and (mwaddr == 37))): +-+ # writing to tmurs/r5rep/unif_addr/unif_addr_rel/stencil/ +-+ # vr_setup/vw_setup/vr_addr/vw_addr/mutex and not setting flags. +-+ # only use element 0 +-+ bad_elems &= 0x0001 +-+ elif ((mwaddr == 41) or (ws and (mwaddr == 37)) or +-+ ((not ws) and (mwaddr == 42))): +-+ # writing to r5quad/x_coord/y_coord/rev_flag and not setting +-+ # flags. only use elements 0, 4, 8, and 12 +-+ bad_elems &= 0x1111 +-+ if bad_elems: +-+ asm_warning('mul inputs don\'t come from accumulators (r0-r3). 
output may not be as expected') +-+ if raddr_a is None: +-+ raddr_a = RADDR_NOP +-+ if raddr_b is None: +-+ raddr_b = RADDR_NOP +-+ if immb: +-+ if sig != SIG_NORMAL: +-+ asm_error('rotation/immediates and signal don\'t mix') +-+ sig = SIG_SMALLIMMED +-+ if arot_r5 or (mwrot_r5 and (not have_am)): +-+ if sig != SIG_NORMAL: +-+ asm_error('rotation/immediates/signal don\'t mix') +-+ sig = SIG_ROTATE +-+ +-+ # construct instruction +-+ prog.append(((mul_b << 0) | (mul_a << 3) | (add_b << 6) | (add_a << 9) | (raddr_b << 12) | (raddr_a << 18) | (aop << 24) | (mop << 29), +-+ (mwaddr << 0) | (awaddr << 6) | (ws << 12) | (sf << 13) | (mcond << 14) | (acond << 17) | (pack << 20) | (pm << 24) | (unpack << 25) | (sig << 28), +-+ line, annots)) +-+ finally: +-+ current_location = prev_location +-+ +-+def preprocess_passthrough(file): +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ yield line_number, line +-+ +-+def asm_file(sets, location, filename, preprocess = None): +-+ global current_dir, current_location +-+ +-+ if filename is None: +-+ location = '' +-+ file = sys.stdin +-+ +-+ prev_dir = current_dir +-+ else: +-+ filename = os.path.normpath(os.path.join(current_dir, filename)) +-+ +-+ try: +-+ file = open(filename) +-+ except Exception, e: +-+ asm_error(e) +-+ except: +-+ asm_error('unknown error while opening file %s' % filename) +-+ +-+ prev_dir = current_dir +-+ current_dir = os.path.dirname(filename) +-+ +-+ prev_location = current_location +-+ current_location = location +-+ +-+ if preprocess is None: +-+ preprocess = preprocess_passthrough +-+ +-+ try: +-+ for line_number, line in preprocess(file): +-+ # strip off comments and whitespace +-+ line = line.split('#')[0].strip() +-+ if line == '': +-+ continue +-+ +-+ asm_line(sets, '%s: %d' % (current_location, line_number), line) +-+ finally: +-+ current_dir = prev_dir +-+ current_location = prev_location +-+ +-+def asm_end_prog(): +-+ # check we aren't in a multi-line construct (eg .macro or .rep) +-+ if construct != None: +-+ asm_error({ +-+ CONSTRUCT_MACRO: '.macro without .endm', +-+ CONSTRUCT_IF: '.if/.elif without .endif', +-+ CONSTRUCT_ELSE: '.else without .endif', +-+ CONSTRUCT_REP: '.rep without .endr'}[construct_stack[-1]]) +-+ +-+ # check no warnings level back to 0 +-+ if nwarn_level != 0: +-+ asm_error('.pushnwarn without .popnwarn') +-+ +-+ # flush queued up data +-+ asm_flush_prog_data() +-+ +-+ # fixup all the label references we can +-+ for pc in xrange(len(prog)): +-+ if isinstance(prog[pc][0], tuple): +-+ location, label, rel, offset = prog[pc][0] +-+ if label[0].isdigit(): +-+ label_pcs = labels.get(label[:-1], []) +-+ if label[-1] == 'b': +-+ label_pcs = filter(lambda label_pc: label_pc <= pc, label_pcs)[-1:] +-+ else: +-+ label_pcs = filter(lambda label_pc: label_pc > pc, label_pcs)[:1] +-+ if label_pcs == []: +-+ asm_error('search for label reached begin/end of file', location = location) +-+ imm = label_pcs[0] +-+ elif label in labels: +-+ imm = labels[label] +-+ elif (':' + label) in labels: +-+ imm = labels[':' + label] +-+ elif external_link: +-+ continue # let the external linker deal with it +-+ else: +-+ asm_error('undefined label', location = location) +-+ imm = (imm * 8) + offset +-+ if rel: +-+ imm -= (pc + 4) * 8 # relative to instruction after delay slots +-+ imm &= (1 << 32) - 1 +-+ else: +-+ if not external_link: +-+ asm_error('can\'t get absolute address without using an external linker. 
this mode doesn\'t have an external linker', location = location) +-+ imm = (location, label, rel, offset, imm) +-+ prog[pc] = (imm,) + prog[pc][1:] +-+ +-+def asm_init(): +-+ global current_dir, current_location, prog, prog_data, macros, labels, construct, construct_stack, nwarn_level +-+ +-+ current_dir = os.getcwd() +-+ current_location = '' +-+ prog = [] +-+ prog_data = [] +-+ macros = { +-+ 'sacq': (['dst', 'i'], [('candyland', 'mov dst, sacq(i)')]), +-+ 'srel': (['dst', 'i'], [('candyland', 'mov dst, srel(i)')])} +-+ labels = {} +-+ construct = None +-+ construct_stack = [] +-+ nwarn_level = 0 +-+ +-+def asm_reset_prog(): +-+ global prog, labels +-+ +-+ prog = [] +-+ labels = {} +-+ +-+############################################################################### +-+# dumping +-+############################################################################### +-+ +-+def print_lines(lines): +-+ for line in lines: +-+ print line +-+ +-+class dumper_t: +-+ def external_link(self): return False +-+ def begin(self): pass +-+ def label(self, pc, name): pass +-+ def line(self, pc, ls, ms, line, annots, first): pass +-+ def end(self): pass +-+ def sets(self, sets): pass +-+ def direct(self, line): pass +-+ +-+class clif_dumper_t(dumper_t): +-+ def __init__(self): +-+ self.annot_mode = 0 +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def parse_annot_mode(self, line): +-+ l = line.split(',') +-+ self.annot_mode = int(l[0]) +-+ if self.annot_mode not in (0, 1, 2): +-+ asm_error('bad annot mode') +-+ if self.annot_mode == 2: +-+ if len(l) != 2: +-+ asm_error('expected buffer name') +-+ self.annot_name = l[1].strip() +-+ self.annot_offset = 0 +-+ elif len(l) != 1: +-+ asm_error('unexpected comma') +-+ +-+ def label(self, pc, name): +-+ if (self.annot_mode != 1) and (name[0] == ':'): +-+ if self.annot_mode == 2: +-+ name = name + '_annotations' +-+ print '@label %s' % name[1:] +-+ else: +-+ print '// :%s' % name +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if self.annot_mode == 0: +-+ if isinstance(ls, tuple): +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ ls = '[. 
- %d + %d]' % (pc * 8, offset_from_prog) +-+ else: +-+ location, label, rel, offset = ls +-+ if rel: +-+ asm_error('relative external label references not allowed in this mode', location = location) +-+ ls = '[%s + %d]' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ print '%s 0x%08x // %s' % (ls, ms, line) +-+ elif self.annot_mode == 1: +-+ print '// %s' % line +-+ for annot in annots: +-+ print '0x%08x 0x%08x // %s' % ({ +-+ # todo: would rather not have these hard coded +-+ 'mul_used': 1, +-+ 'preserve_cond': 2, +-+ 'geomd_open': 3, +-+ 'geomd_i': 4, +-+ 'geomd_tris_clear': 5, +-+ 'geomd_verts': 6, +-+ 'geomd_tris_add': 7, +-+ 'geomd_tris_set_center': 8, +-+ 'geomd_region_clear': 9, +-+ 'geomd_region_set': 10, +-+ 'geomd_images_clear': 11, +-+ 'geomd_images_l': 12, +-+ 'geomd_images_b': 13, +-+ 'geomd_images_r': 14, +-+ 'geomd_images_t': 15, +-+ 'geomd_images_add_vpm': 16, +-+ 'trace_4c': 17, +-+ 'geomd_images_add_tex': 18,}[annot[0]], annot[1], annot[0]) +-+ if len(annots) != 0: +-+ print '0x00000000 // end' +-+ else: +-+ assert self.annot_mode == 2 +-+ if len(annots) == 0: +-+ print '0x00000000 // %s' % line +-+ else: +-+ print '[%s + %d] // %s' % (self.annot_name, self.annot_offset, line) +-+ self.annot_offset += (len(annots) * 8) + 4 +-+ +-+ def direct(self, line): +-+ print line +-+ +-+class plain_dumper_t(dumper_t): +-+ def line(self, pc, ls, ms, line, annots, first): +-+ print '0x%08x, 0x%08x, // %s' % (ls, ms, line) +-+ +-+class c_c_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, array_name): +-+ self.header_name = header_name +-+ self.array_name = array_name +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ self.external_labels = set() +-+ self.lines = [] +-+ +-+ print '#include "%s.h"' % self.header_name +-+ print '' +-+ print '#ifdef _MSC_VER' +-+ print ' #include ' +-+ print ' /* cast through uintptr_t to avoid warnings */' +-+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))' +-+ print '#else' +-+ print ' #define POINTER_TO_UINT(X) ((unsigned int)(X))' +-+ print '#endif' +-+ print '' +-+ print '#ifdef __cplusplus' +-+ print 'extern "C" { /* the types are probably wrong... 
*/' +-+ print '#endif' +-+ +-+ def label(self, pc, name): +-+ self.lines.append('// :%s' % name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple): +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ ls = 'POINTER_TO_UINT(%s) + %d' % (self.array_name, offset_from_prog) +-+ else: +-+ location, label, rel, offset = ls +-+ if rel: +-+ asm_error('relative external label references not allowed in this mode', location = location) +-+ if label not in self.external_labels: +-+ self.external_labels.add(label) +-+ print 'extern uint8_t %s[];' % label +-+ ls = 'POINTER_TO_UINT(%s) + %d' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) +-+ +-+ def end(self): +-+ print '#ifdef __cplusplus' +-+ print '}' +-+ print '#endif' +-+ print '' +-+ print '#ifdef _MSC_VER' +-+ print '__declspec(align(8))' +-+ print '#elif defined(__GNUC__)' +-+ print '__attribute__((aligned(8)))' +-+ print '#endif' +-+ print 'unsigned int %s[] = {' % self.array_name +-+ print_lines(self.lines) +-+ print '};' +-+ print '#ifdef __HIGHC__' +-+ print '#pragma Align_to(8, %s)' % self.array_name +-+ print '#endif' +-+ +-+class c_h_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, array_name): +-+ self.full_header_name = full_header_name +-+ self.array_name = array_name +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '#ifndef %s_H' % self.full_header_name +-+ print '#define %s_H' % self.full_header_name +-+ print '' +-+ print 'extern unsigned int %s[];' % self.array_name +-+ print '' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '#define %s (%s + %d)' % (name[1:], self.array_name, pc * 2) +-+ +-+ def end(self): +-+ print '' +-+ print '#endif' +-+ +-+class ml_c_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, name, annots): +-+ self.header_name = header_name +-+ self.name = name +-+ self.annots = annots +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ if self.annots: +-+ self.annot_lines = [] +-+ self.lines = [] +-+ self.external_labels = set() +-+ self.link_lines = [] +-+ +-+ print '#include "%s.h"' % self.header_name +-+ print '#include ' +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print '#include ' +-+ print '#include "v3d/verification/tools/2760sim/simpenrose.h"' +-+ print '' +-+ +-+ def label(self, pc, name): +-+ self.lines.append('// :%s' % name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if self.annots: +-+ if len(annots) == 0: +-+ self.annot_lines.append('NULL,') +-+ else: +-+ print 'static unsigned int const annotations_%d[] = {' % pc +-+ for annot in annots: +-+ print ' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1]) +-+ print ' SIMPENROSE_SHADER_ANNOTATION_END};' +-+ print '' +-+ self.annot_lines.append('annotations_%d,' % pc) +-+ if isinstance(ls, tuple): +-+ self.link_lines.append(' assert(p[%d] == 0xdeadbeef);' % (pc * 2)) +-+ if len(ls) == 5: +-+ location, label, rel, offset, offset_from_prog = ls +-+ assert not rel +-+ self.link_lines.append(' p[%d] = base + %d;' % (pc * 2, offset_from_prog)) +-+ else: +-+ location, label, rel, offset = ls +-+ self.external_labels.add(label) +-+ if rel: +-+ self.link_lines.append(' p[%d] = (%s + %d) - (base + %d);' % (pc * 2, label, offset, (pc + 4) * 8)) +-+ else: +-+ self.link_lines.append(' p[%d] = %s + %d;' % (pc * 2, label, offset)) 
+-+ ls = '0xdeadbeef' +-+ else: +-+ ls = '0x%08x' % ls +-+ self.lines.append('/* [0x%08x] */ %s, 0x%08x, // %s' % (pc * 8, ls, ms, line)) +-+ +-+ def end(self): +-+ if self.annots: +-+ print 'unsigned int const *const %s_annotations_array[] = {' % self.name +-+ print_lines(self.annot_lines) +-+ print '};' +-+ print '#endif' +-+ print '' +-+ print 'static unsigned int const array[] = {' +-+ print_lines(self.lines) +-+ print '};' +-+ print '' +-+ print 'void %s_link(void *p_in, unsigned int base' % self.name +-+ for label in sorted(self.external_labels): +-+ print ' , unsigned int %s' % label +-+ print ' )' +-+ print '{' +-+ print ' unsigned int *p = (unsigned int *)p_in;' +-+ print ' unsigned int i;' +-+ print ' for (i = 0; i != (%s_SIZE / 4); ++i) {' % self.name.upper() +-+ print ' p[i] = array[i];' +-+ print ' }' +-+ print_lines(self.link_lines) +-+ print '}' +-+ +-+class ml_h_dumper_t(dumper_t): +-+ def __init__(self, header_name, full_header_name, name, annots): +-+ self.full_header_name = full_header_name +-+ self.name = name +-+ self.annots = annots +-+ +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ self.external_labels = set() +-+ self.lines_n = 0 +-+ +-+ print '#ifndef %s_H' % self.full_header_name +-+ print '#define %s_H' % self.full_header_name +-+ print '' +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print ' extern unsigned int const *const %s_annotations_array[];' % self.name +-+ print '#endif' +-+ print '' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '#define %s_OFFSET %d' % (name[1:].upper(), pc * 8) +-+ if self.annots: +-+ print '#ifdef SIMPENROSE' +-+ print ' #define %s_annotations (%s_annotations_array + %d)' % (name[1:], self.name, pc) +-+ print '#endif' +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple) and (len(ls) != 5): +-+ self.external_labels.add(ls[1]) +-+ self.lines_n += 1 +-+ +-+ def end(self): +-+ print '' +-+ print 'extern void %s_link(void *p, unsigned int base' % self.name +-+ for label in sorted(self.external_labels): +-+ print ' , unsigned int %s' % label +-+ print ' );' +-+ print '' +-+ print '#define %s_SIZE %d' % (self.name.upper(), (self.lines_n * 8)) +-+ print '' +-+ print '#endif' +-+ +-+def print_lines_lc(lines): +-+ for line in lines: +-+ print '%s \\' % line +-+ +-+def print_groups_lc(groups): +-+ first = True +-+ for group in groups: +-+ if first: +-+ print '{ \\' +-+ else: +-+ print ', { \\' +-+ print_lines_lc(group) +-+ print '} \\' +-+ first = False +-+ +-+class inline_c_dumper_t(dumper_t): +-+ def __init__(self, annots): +-+ self.annots = annots +-+ self.iteration = False +-+ +-+ def begin_iteration(self): +-+ assert not self.iteration +-+ self.iteration = True +-+ self.iteration_lines = [] +-+ if self.annots: +-+ self.iteration_annot_lines = [] +-+ self.annot_arrs = [] +-+ +-+ def end_iteration(self): +-+ assert self.iteration +-+ self.iteration = False +-+ print '%d, \\' % self.iteration_n +-+ if self.annots: +-+ print '( \\' +-+ print_groups_lc(self.iteration_lines) +-+ if self.annots: +-+ print '), ( \\' +-+ print_groups_lc(self.iteration_annot_lines) +-+ print '), ( \\' +-+ for annot_arr in self.annot_arrs: +-+ print_lines_lc(annot_arr) +-+ print ') \\' +-+ +-+ def begin(self): +-+ self.n = 0 +-+ self.lines = [] +-+ if self.annots: +-+ self.annot_lines = [] +-+ if not self.iteration: +-+ self.annot_arrs = [] +-+ +-+ def label(self, pc, name): +-+ self.lines.append('/* :%s */' % name) +-+ if self.annots: +-+ self.annot_lines.append('/* :%s */' % 
name) +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ self.n += 1 +-+ if first: +-+ prefix = '' +-+ else: +-+ prefix = ', ' +-+ self.lines.append('%s0x%08x, 0x%08x /* %s */' % (prefix, ls, ms, line)) +-+ if self.annots: +-+ if len(annots) == 0: +-+ a = 'NULL' +-+ else: +-+ a = 'annotations_%d' % len(self.annot_arrs) +-+ annot_arr = ['static unsigned int const annotations_%d[] = {' % len(self.annot_arrs)] +-+ for annot in annots: +-+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_%s, 0x%08x,' % (annot[0].upper(), annot[1])) +-+ annot_arr.append(' SIMPENROSE_SHADER_ANNOTATION_END};') +-+ self.annot_arrs.append(annot_arr) +-+ self.annot_lines.append('%s%s /* %s */' % (prefix, a, line)) +-+ +-+ def end(self): +-+ if self.iteration: +-+ if len(self.iteration_lines) == 0: +-+ self.iteration_n = self.n +-+ elif self.iteration_n != self.n: +-+ asm_error('number of instructions differs between iterations') +-+ self.iteration_lines.append(self.lines) +-+ if self.annots: +-+ self.iteration_annot_lines.append(self.annot_lines) +-+ else: +-+ if self.annots: +-+ print '( \\' +-+ print_lines_lc(self.lines) +-+ if self.annots: +-+ print '), ( \\' +-+ print_lines_lc(self.annot_lines) +-+ print '), ( \\' +-+ for annot_arr in self.annot_arrs: +-+ print_lines_lc(annot_arr) +-+ print ') \\' +-+ +-+ def direct(self, line): +-+ print line +-+ +-+class asvc_dumper_t(dumper_t): +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '.align 8' +-+ +-+ def label(self, pc, name): +-+ if name[0] == ':': +-+ print '%s::' % name[1:] +-+ else: +-+ print '%s:' % name +-+ +-+ def line(self, pc, ls, ms, line, annots, first): +-+ if isinstance(ls, tuple): +-+ location, label, rel, offset = ls[:4] +-+ if rel: +-+ ls = '%s + %d - (. + 32)' % (label, offset) +-+ else: +-+ ls = '%s + %d' % (label, offset) +-+ else: +-+ ls = '0x%08x' % ls +-+ print '.word %s, 0x%08x ; %s' % (ls, ms, line) +-+ +-+def is_ra_or_rb(val): +-+ return isinstance(val, loc_t) and ((val.mux == MUX_A) or (val.mux == MUX_B)) +-+ +-+class aliases_dumper_t(dumper_t): +-+ def external_link(self): +-+ return True +-+ +-+ def begin(self): +-+ print '#ifndef JUST_DQASM_ARGS' +-+ +-+ def label(self, pc, name): +-+ if not name[0].isdigit(): +-+ if name[0] == ':': +-+ name = name[1:] +-+ print '"bs%s", "bs%x",' % (name, pc * 8) +-+ print '"bu%s", "bu%x",' % (name, pc * 8) +-+ +-+ def end(self): +-+ print '#endif' +-+ +-+ # todo: handle things other than ra and rb? 
dqasm only allows ra and rb atm +-+ def sets(self, sets): +-+ dqasm_args = [] +-+ print '#ifndef JUST_DQASM_ARGS' +-+ for name in sets: +-+ if is_ra_or_rb(sets[name]): +-+ dqasm_args.append('-r%s=%s' % (sets[name], name)) +-+ print '"%s", "%s",' % (name, sets[name]) +-+ elif isinstance(sets[name], list): +-+ for i, val in enumerate(sets[name]): +-+ if is_ra_or_rb(val): +-+ dqasm_args.append('-r%s=%s[%d]' % (val, name, i)) +-+ print '"%s[%d]", "%s",' % (name, i, val) +-+ print '#endif' +-+ print '#define DQASM_ARGS "%s"' % ' '.join(dqasm_args) +-+ +-+def dump(dumper): +-+ if (len(prog) != 0) or (len(labels) != 0): +-+ dumper.begin() +-+ +-+ sorted_labels = [] +-+ for name in labels: +-+ if name[0].isdigit(): +-+ for pc in labels[name]: +-+ sorted_labels.append((pc, name)) +-+ else: +-+ sorted_labels.append((labels[name], name)) +-+ sorted_labels.sort(reverse = True) +-+ +-+ first = True +-+ for pc in xrange(len(prog)): +-+ ls, ms, line, annots = prog[pc] +-+ while (len(sorted_labels) != 0) and (sorted_labels[-1][0] == pc): +-+ dumper.label(*sorted_labels.pop()) +-+ dumper.line(pc, ls, ms, line, annots, first) +-+ first = False +-+ for sorted_label in sorted_labels: +-+ assert sorted_label[0] == len(prog) +-+ dumper.label(*sorted_label) +-+ +-+ dumper.end() +-+ +-+############################################################################### +-+# preprocessing +-+############################################################################### +-+ +-+def preprocess_inline_c(dumper): +-+ def preprocess(file): +-+ ls = None +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ while True: +-+ if ls is None: +-+ l = line.split('%[', 1) +-+ if len(l) == 1: +-+ dumper.direct(l[0].rstrip()) +-+ break +-+ dumper.direct('%s \\' % l[0].rstrip()) +-+ line = l[1] +-+ ls = [] +-+ else: +-+ l = line.split('%]', 1) +-+ ls.append((line_number, l[0])) +-+ if len(l) == 1: +-+ break +-+ line = l[1] +-+ l = ls[-1][1].split('%|', 1) +-+ if len(l) == 1: +-+ for l_number, l in ls: +-+ yield l_number, l +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ else: +-+ ls[-1] = (ls[-1][0], l[0]) +-+ if hasattr(dumper, 'begin_iteration'): +-+ dumper.begin_iteration() +-+ for repls in l[1].split('%,'): +-+ repls = [repl.strip() for repl in repls.split('%/')] +-+ for l_number, l in ls: +-+ for i, repl in enumerate(repls): +-+ l = l.replace('%' + str(i), repl) +-+ yield l_number, l +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ if hasattr(dumper, 'end_iteration'): +-+ dumper.end_iteration() +-+ ls = None +-+ return preprocess +-+ +-+def preprocess_clif(dumper): +-+ def preprocess(file): +-+ in_asm = False +-+ line_number = 0 +-+ for line in file: +-+ line_number += 1 +-+ if in_asm: +-+ if line.strip() == '%]': +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ in_asm = False +-+ else: +-+ yield line_number, line +-+ else: +-+ if line.strip() == '%[': +-+ in_asm = True +-+ elif (line[:1] == '%') and (line[:2] != '%@'): +-+ yield line_number, line[1:] +-+ else: +-+ asm_end_prog() +-+ dump(dumper) +-+ asm_reset_prog() +-+ if line[:2] == '%@': +-+ if hasattr(dumper, 'parse_annot_mode'): +-+ dumper.parse_annot_mode(line[2:]) +-+ else: +-+ dumper.direct(line.rstrip()) +-+ return preprocess +-+ +-+############################################################################### +-+# main +-+############################################################################### +++class tstats: +++ close_threshold = 0.01 +++ +++ def __init__(self, stats_dict=None): +++ if stats_dict != None: +++ 
self.name = stats_dict["name"]
+++            self.elapsed = float(stats_dict["elapsed"])
+++            self.user = float(stats_dict["user"])
+++            self.sys = float(stats_dict["sys"])
+++
+++    def times_str(self):
+++        ctime = self.sys + self.user
+++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
+++
+++    def dict(self):
+++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
+++
+++    def is_close(self, other):
+++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
+++
+++    def __lt__(self, other):
+++        return self.elapsed < other.elapsed
+++    def __gt__(self, other):
+++        return self.elapsed > other.elapsed
+++
+++    def time_file(name, prefix):
+++        stats = tstats()
+++        stats.name = name
+++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+++        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
+++            "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
+++        pinfo = os.wait4(cproc.pid, 0)
+++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
+++        stats.elapsed = end_time - start_time
+++        stats.user = pinfo[2].ru_utime
+++        stats.sys = pinfo[2].ru_stime
+++        return stats
+++
+++
+++def common_prefix(s1, s2):
+++    for i in range(min(len(s1),len(s2))):
+++        if s1[i] != s2[i]:
+++            return s1[:i]
+++    return s1[:i+1]
+ +
+ +def main():
+-+    global external_link, allow_xor_0, dont_warn_when_mul_rot_inp_r5
+-+    global warnings_are_errors, disable_warnings, have_sema, have_am, mulw_rotate
+-+
+-+    asm_init() # do this first so we can use asm_error without having to pass a location and so asm_warning will work
+-+
+-+    # parse command line
+-+    parser = optparse.OptionParser(usage = 'usage: %prog [options] <filename>')
+-+    parser.add_option('-m', '--mode', dest = 'mode',
+-+        help = '<mode> should be clif, plain, ' +
+-+        'c_c:<header name>,<full header name>,<array name>, ' +
+-+        'c_h:<header name>,<full header name>,<array name>, ' +
+-+        'ml_c:<header name>,<full header name>,<name>[,annots], ' +
+-+        'ml_h:<header name>,<full header name>,<name>[,annots], ' +
+-+        'inline_c[:annots], asvc, or aliases[:]', metavar = '<mode>')
+-+    parser.add_option('-t', '--target', dest = 'target',
+-+        help = '<target> should be a0, b0, or hera', metavar = '<target>')
+-+    parser.add_option('-x', '--allow_xor_0', dest = 'allow_xor_0', action = 'store_true', default = False)
+-+    parser.add_option('-r', '--dont_warn_when_mul_rot_inp_r5', dest = 'dont_warn_when_mul_rot_inp_r5', action = 'store_true', default = False)
+-+    parser.add_option('-w', '--warnings_are_errors', dest = 'warnings_are_errors', action = 'store_true', default = False)
+-+    parser.add_option('-d', '--disable_warnings', dest = 'disable_warnings', action = 'store_true', default = False)
+-+    parser.add_option('-s', '--set', dest = 'sets', action = 'append', default = [], metavar = '<name>=<val>')
+-+    options, args = parser.parse_args()
+-+    if len(args) == 0:
+-+        filename = None
+-+    elif len(args) == 1:
+-+        filename = args[0]
+-+    else:
+-+        parser.print_help()
+-+        sys.exit(-1)
+-+
+-+    # handle mode
+-+    mode = options.mode or 'clif' # assume clif if no mode specified
+-+    if mode == 'clif':
+-+        dumper = clif_dumper_t()
+-+        preprocess = preprocess_clif(dumper)
+-+    elif mode == 'plain':
+-+        dumper = plain_dumper_t()
+-+        preprocess = None
+-+    elif (mode[:4] == 'c_c:') or (mode[:4] == 'c_h:'):
+-+        mode_options = mode[4:].split(',')
+-+        if len(mode_options) != 3:
+-+            asm_error('badly formatted mode on command line')
+-+        dumper = {'c_c': c_c_dumper_t, 'c_h': c_h_dumper_t}[mode[:3]](*mode_options)
+-+        preprocess = None
+-+    elif (mode[:5] == 'ml_c:') or (mode[:5] == 'ml_h:'):
+-+        mode_options = mode[5:].split(',')
+-+        if (len(mode_options) != 3) and ((len(mode_options) 
!= 4) or (mode_options[3] != 'annots')):
+-+            asm_error('badly formatted mode on command line')
+-+        dumper = {'ml_c': ml_c_dumper_t, 'ml_h': ml_h_dumper_t
+-+            }[mode[:4]](*(mode_options[:3] + [len(mode_options) == 4]))
+-+        preprocess = None
+-+    elif mode == 'inline_c':
+-+        dumper = inline_c_dumper_t(False)
+-+        preprocess = preprocess_inline_c(dumper)
+-+    elif mode == 'inline_c:annots':
+-+        dumper = inline_c_dumper_t(True)
+-+        preprocess = preprocess_inline_c(dumper)
+-+    elif mode == 'asvc':
+-+        dumper = asvc_dumper_t()
+-+        preprocess = None
+-+    elif mode == 'aliases':
+-+        dumper = aliases_dumper_t()
+-+        preprocess = None
+-+    elif mode == 'aliases:inline_c':
+-+        dumper = aliases_dumper_t()
+-+        preprocess = preprocess_inline_c(dumper)
+-+    else:
+-+        asm_error('invalid mode')
+-+    external_link = dumper.external_link()
+-+
+-+    # handle target
+-+    target = options.target or 'b0' # assume b0 if no target specified
+-+    if target == 'a0':
+-+        have_sema = False
+-+        have_am = False
+-+        mulw_rotate = False
+-+        have_lthrsw = False
+-+    elif target == 'b0':
+-+        have_sema = True
+-+        have_am = True
+-+        mulw_rotate = True
+-+        have_lthrsw = True
+-+    elif target == 'hera':
+-+        have_sema = True
+-+        have_am = False
+-+        mulw_rotate = True
+-+        have_lthrsw = True
+-+    else:
+-+        asm_error('invalid target')
+-+    if have_am:
+-+        sigs['loadam'] = SIG_LOADAM
+-+        arg_defs['tlbam'] = loc_t(MUX_ANY, 47, 0, 0, None, RW_WRITE)
+-+    if have_lthrsw:
+-+        sigs['lthrsw'] = SIG_LTHRSW
+-+        del sigs['int']
+-+        arg_defs['interrupt'] = loc_t(MUX_ANY, 38, 0, 0, None, RW_WRITE)
+-+
+-+    # handle misc options
+-+    allow_xor_0 = options.allow_xor_0
+-+    dont_warn_when_mul_rot_inp_r5 = options.dont_warn_when_mul_rot_inp_r5
+-+    warnings_are_errors = options.warnings_are_errors
+-+    disable_warnings = options.disable_warnings
+-+
+-+    # make options visible to asm
+-+    arg_defs['mode'] = mode
+-+    arg_defs['target'] = target
+-+
+-+    # arg_defs all setup at this point
+-+    sets = arg_defs.copy() # todo: see arg_eval
+-+
+-+    # handle command line sets
+-+    re_options_set = re.compile('(?P<name>\\w+)=(?P<val>.+)$')
+-+    for options_set in options.sets:
+-+        m = re_options_set.match(options_set)
+-+        if not m:
+-+            asm_error('badly formatted set on command line')
+-+        sets[m.group('name')] = arg_eval(m.group('val'), sets)
+-+
+-+    # assemble input file and dump
+-+    asm_file(sets, filename, filename, preprocess)
+-+    asm_end_prog()
+-+    dump(dumper)
+-+    for name in arg_defs: # todo: see arg_eval
+-+        del sets[name]
+-+    dumper.sets(sets)
+++    global flog
+ +
+-+if __name__ == '__main__':
+-+    main()
+-diff --git a/pi-util/qem.sh b/pi-util/qem.sh
+-new file mode 100644
+-index 0000000..47dd071
+---- /dev/null
+-+++ b/pi-util/qem.sh
+-@@ -0,0 +1,9 @@
+-+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
+-+QASM=python\ pi-util/qasm.py
+-+SRC_FILE=libavcodec/rpi_shader.qasm
+-+DST_BASE=shader
+ +
+-+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR
+-+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
+-+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
+ +
+-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+-new file mode 100755
+-index 0000000..6a9a33f
+---- /dev/null
+-+++ b/pi-util/rebase_liblinks.py
+-@@ -0,0 +1,37 @@
+-+#!/usr/bin/env python
+ +
+-+import os, sys
+-+from stat import *
+++    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
+++To blank the screen before starting use "xdg-screensaver activate"
+++(For some reason this doesn't seem to work from within python).
+++""") + + +-+cp libavcodec/rpi_shader_cmd.h $TARGET_DIR +-+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +-+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h +++ argp.add_argument("streams", nargs='*') +++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename") +++ argp.add_argument("--csv_in", help="CSV input filename") +++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).") + + +-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py +-new file mode 100755 +-index 0000000..6a9a33f +---- /dev/null +-+++ b/pi-util/rebase_liblinks.py +-@@ -0,0 +1,37 @@ +-+#!/usr/bin/env python +++ args = argp.parse_args() + + +-+import os, sys +-+from stat import * +++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"]) +++ csv_out.writeheader() +++ +++ stats_in = {} +++ if args.csv_in != None: +++ with open(args.csv_in, 'r', newline='') as f_in: +++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)} +++ +++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt") +++ +++ streams = args.streams +++ if not streams: +++ if not stats_in: +++ print ("No source streams specified") +++ return 1 +++ prefix = "" if args.prefix == None else args.prefix +++ streams = [k for k in stats_in] +++ elif args.prefix != None: +++ prefix = args.prefix +++ else: +++ prefix = streams[0] +++ for f in streams[1:]: +++ prefix = common_prefix(prefix, f) +++ pp = prefix.rpartition(os.sep) +++ prefix = pp[0] + pp[1] +++ streams = [s[len(prefix):] for s in streams] +++ +++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()): +++ print ("====", f) +++ +++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999}) +++ for i in range(3): +++ t = tstats.time_file(f, prefix) +++ print ("...", t.times_str()) +++ if t0 > t: +++ t0 = t +++ +++ if t0.name in stats_in: +++ pstat = stats_in[t0.name] +++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str()) +++ +++ csv_out.writerow(t0.dict()) +++ +++ print () +++ +++ return 0 + + +-+def walktree(top, callback, n, prefix): +-+ '''recursively descend the directory tree rooted at top, +-+ calling the callback function for each regular file''' +-+ +-+ for f in os.listdir(top): +-+ pathname = os.path.join(top, f) +-+ mode = os.lstat(pathname).st_mode +-+ if S_ISDIR(mode): +-+ # It's a directory, recurse into it +-+ walktree(pathname, callback, n+1, prefix) +-+ elif S_ISLNK(mode): +-+ # It's a file, call the callback function +-+ callback(pathname, os.readlink(pathname), n, prefix) +-+ +-+def visitfile(file, linkname, n, prefix): +-+ if (linkname.startswith(prefix + 'lib/')): +-+ newlink = "../" * n + linkname[len(prefix):] +-+ print 'relinking', file, "->", newlink +-+ os.remove(file) +-+ os.symlink(newlink, file) + + + +if __name__ == '__main__': +-+ argc = len(sys.argv) +-+ if argc == 2: +-+ walktree(sys.argv[1], visitfile, 0, "/") +-+ elif argc == 3: +-+ walktree(sys.argv[1], visitfile, 0, sys.argv[2]) +-+ else: +-+ print "rebase_liblinks.py []" +++ exit(main()) + + ++diff --git a/pi-util/make_array.py b/pi-util/make_array.py ++new file mode 100755 ++index 0000000000..864fa5e704 ++--- /dev/null +++++ b/pi-util/make_array.py ++@@ -0,0 +1,19 @@ +++#!/usr/bin/env python + + +++# Usage +++# make_array file.bin +++# Produces file.h with array of bytes. 
+++# +++import sys +++for file in sys.argv[1:]: +++ prefix,suffix = file.split('.') +++ assert suffix=='bin' +++ name=prefix.split('/')[-1] +++ print 'Converting',file +++ with open(prefix+'.h','wb') as out: +++ print >>out, 'static const unsigned char',name,'[] = {' +++ with open(file,'rb') as fd: +++ for byte in fd.read(): +++ print >>out, '%d,' % ord(byte) +++ print >>out,'};' + + +-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh ++diff --git a/pi-util/qem.sh b/pi-util/qem.sh + new file mode 100755 +-index 0000000..d8bdd91 ++index 0000000000..5ce2eeaf72 + --- /dev/null +-+++ b/pi-util/syncroot.sh +-@@ -0,0 +1,43 @@ +-+set -e +-+ +-+if [ "$1" == "" ]; then +-+ echo Usage: $0 \ [\] +-+ echo src_dir is a source for rsync so may contain m/c name. +-+ echo rootname will be set to \"raspian_jessie_pi1\" if missing +-+ echo e.g.: pi-util/syncroot.sh my-pi: raspian_jessie_pi1 +-+ exit 1 +-+fi +-+ +-+SYSROOT_NAME=$2 +-+if [ "$SYSROOT_NAME" == "" ]; then +-+ SYSROOT_NAME=raspian_jessie_pi1 +-+fi +-+ +-+DST_ROOT=`pwd` +-+DST=$DST_ROOT/build/linux/$SYSROOT_NAME-sysroot +-+SRC=$1 +-+ +-+echo Sync src: $SRC +-+echo Sync dest: $DST +-+ +-+mkdir -p $DST/lib +-+mkdir -p $DST/opt/vc/include +-+mkdir -p $DST/usr/lib/pkgconfig +-+mkdir -p $DST/usr/bin +-+mkdir -p $DST/usr/share +-+ +-+#### MUST NOT include /opt/vc/include/*GL* +-+# Creates conflicts with GL includes inside Chrome +-+ +-+rsync -rl $SRC/lib/arm-linux-gnueabihf $DST/lib +-+rsync -rl $SRC/opt/vc/lib $DST/opt/vc +-+rsync -l $SRC/opt/vc/include/bcm_host.h $DST/opt/vc/include +-+rsync -rl $SRC/opt/vc/include/interface $DST/opt/vc/include +-+rsync -rl $SRC/opt/vc/include/vcinclude $DST/opt/vc/include +-+rsync -rl $SRC/usr/lib/arm-linux-gnueabihf $DST/usr/lib +-+rsync -rl $SRC/usr/lib/gcc $DST/usr/lib +-+rsync -rl $SRC/usr/include $DST/usr +-+ +-+pi-util/rebase_liblinks.py $DST +++++ b/pi-util/qem.sh ++@@ -0,0 +1,9 @@ +++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex +++QASM=python\ ../local/bin/qasm.py +++SRC_FILE=libavcodec/rpi_shader.qasm +++DST_BASE=shader + + +++cp libavcodec/rpi_shader_cmd.h $TARGET_DIR +++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c +++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h + + + diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py +-new file mode 100644 +-index 0000000..5935a11 ++new file mode 100755 ++index 0000000000..5935a11ca5 + --- /dev/null + +++ b/pi-util/v3dusage.py + @@ -0,0 +1,128 @@ + +From 78b1b2b50f01ae8a61aec3b8efb839aa3b120827 Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 8 Sep 2017 17:06:43 +0100 +Subject: [PATCH 77/78] RBP: Request allocation are pre-pinned + +--- + xbmc/linux/RBP.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xbmc/linux/RBP.cpp b/xbmc/linux/RBP.cpp +index 79f932378cf37747be79e65fd0c2e2476f95474f..ee4a1d71fd8cc4517907952b14db86f310cb7ab0 100644 +--- a/xbmc/linux/RBP.cpp ++++ b/xbmc/linux/RBP.cpp +@@ -406,7 +406,7 @@ static int get_image_params(int file_desc, VC_IMAGE_T * img) + CGPUMEM::CGPUMEM(unsigned int numbytes, bool cached) + { + m_numbytes = numbytes; +- m_vcsm_handle = vcsm_malloc_cache(numbytes, cached ? VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE, (char *)"CGPUMEM"); ++ m_vcsm_handle = vcsm_malloc_cache(numbytes, (VCSM_CACHE_TYPE_T)(0x80 | (unsigned)(cached ? 
VCSM_CACHE_TYPE_HOST : VCSM_CACHE_TYPE_NONE)), (char *)"CGPUMEM"); + assert(m_vcsm_handle); + m_vc_handle = vcsm_vc_hdl_from_hdl(m_vcsm_handle); + assert(m_vc_handle); + +From d8396450f95c8119a99c27cb3b60730ff4f170af Mon Sep 17 00:00:00 2001 +From: popcornmix +Date: Fri, 13 Oct 2017 20:29:23 +0100 +Subject: [PATCH 78/78] MMALFFMpeg: Avoid crash with 10bit HEVC by accepting + format but failing + +--- + xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +index 8444d0df598caef958e4ac3254419f3b4f95c513..a5a28ab25a97417d8524e68b46d3e44fa8b35bad 100644 +--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp ++++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecFFmpeg.cpp +@@ -241,7 +241,7 @@ enum AVPixelFormat CDVDVideoCodecFFmpeg::GetFormat(struct AVCodecContext * avctx + #endif + + #ifdef HAS_MMAL +- if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128) ++ if (*cur == AV_PIX_FMT_YUV420P || *cur == AV_PIX_FMT_SAND128 || *cur == AV_PIX_FMT_SAND64_10) + { + MMAL::CDecoder* dec = new MMAL::CDecoder(ctx->m_processInfo, ctx->m_hints); + if(dec->Open(avctx, ctx->m_pCodecContext, *cur, ctx->m_uSurfacesCount))
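The reasoning behind this last hunk: GetFormat() now admits AV_PIX_FMT_SAND64_10 purely so that the MMAL decoder object is constructed and its Open() call can fail in a controlled way, after which (per the commit message) the caller falls back instead of crashing on the unhandled 10-bit format. A minimal Python model of that "accept, then fail gracefully" negotiation — illustrative only; negotiate() and open_mmal() are hypothetical stand-ins, not Kodi's actual C++ control flow:

    def open_mmal(fmt):
        # Hypothetical stand-in for MMAL::CDecoder::Open(): the hardware
        # path handles the 8-bit formats but refuses 10-bit sand64_10.
        return fmt in ('yuv420p', 'sand128')

    def negotiate(candidates):
        for fmt in candidates:
            if fmt in ('yuv420p', 'sand128', 'sand64_10'):  # formats GetFormat() now admits
                if open_mmal(fmt):
                    return 'mmal:' + fmt  # hardware decode path
                # Open() failed: move on to the next candidate rather than
                # hitting the unhandled-format crash the commit describes.
        return 'software:yuv420p'  # ffmpeg software fallback

    assert negotiate(['sand64_10', 'yuv420p']) == 'mmal:yuv420p'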