@@ -11592,22 +11789,23 @@ index 0000000..365f4a6
+#include "rpi_shader.h"
+#include "rpi_hevc_transform.h"
+
-+#include "rpi_user_vcsm.h"
-+#ifdef GPUSERVICE
+#pragma GCC diagnostic push
+// Many many redundant decls in the header files
+#pragma GCC diagnostic ignored "-Wredundant-decls"
+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
+#pragma GCC diagnostic pop
-+#endif
+
-+// QPU profile flags
-+#define NO_FLUSH 1
-+#define CLEAR_PROFILE 2
-+#define OUTPUT_COUNTS 4
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
+
-+#define FLAGS_FOR_PROFILING (NO_FLUSH)
++// QPU "noflush" flags
++// a mixture of flushing & profiling
+
++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
+
+// On Pi2 there is no way to access the VPU L2 cache
+// GPU_MEM_FLG should be 4 for uncached memory. (Or C for alias to allocate in the VPU L2 cache)
@@ -11664,65 +11862,212 @@ index 0000000..365f4a6
+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
+};
+
++// Code/constants on GPU
+struct GPU
+{
+ unsigned int qpu_code[QPU_CODE_SIZE];
+ unsigned int vpu_code[VPU_CODE_SIZE];
+ short transMatrix2even[16*16*2];
-+ int open_count; // Number of allocated video buffers
-+ int mb; // Mailbox handle
-+ int vc; // Address in GPU memory
-+ int mail[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the first QPU task
-+ int mail2[12*2]; // These are used to pass pairs of code/unifs to the QPUs for the second QPU task
+};
+
++
++#define WAIT_COUNT_MAX 16
++
++typedef struct trace_time_one_s
++{
++ int count;
++ int64_t start[WAIT_COUNT_MAX];
++ int64_t total[WAIT_COUNT_MAX];
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s
++{
++ unsigned int jcount;
++ int64_t start0;
++ int64_t last_update;
++ trace_time_one_t active;
++ trace_time_one_t wait;
++} trace_time_wait_t;
++
++typedef struct vq_wait_s
++{
++ sem_t sem;
++ unsigned int cost;
++ struct vq_wait_s * next;
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16
++typedef struct vq_wait_pool_s
++{
++ vq_wait_t * head;
++ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++typedef struct gpu_env_s
++{
++ int open_count;
++ int init_count;
++ int mb;
++ unsigned int current_load;
++ GPU_MEM_PTR_T code_gm_ptr;
++ vq_wait_pool_t wait_pool;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
+// Stop more than one thread trying to allocate memory or use the processing resources at once
+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static volatile struct GPU* gpu = NULL;
-+static GPU_MEM_PTR_T gpu_mem_ptr;
++static gpu_env_t * gpu = NULL;
+
-+#if defined(RPI_TIME_TOTAL_QPU) || defined(RPI_TIME_TOTAL_VPU) || defined(RPI_TIME_TOTAL_POSTED)
-+static unsigned int Microseconds(void) {
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++static int64_t ns_time(void)
++{
+ struct timespec ts;
-+ unsigned int x;
-+ static unsigned int base = 0;
-+ clock_gettime(CLOCK_REALTIME, &ts);
-+ x = ts.tv_sec*1000000 + ts.tv_nsec/1000;
-+ if (base==0) base=x;
-+ return x-base;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
+}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
++#define T_ARG(t) T_SEC(t), T_MS(t)
++#define T_FMT "%u.%03u"
++
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{
++ // Update totals for levels that are still pending
++ for (int i = 0; i < tto->count; ++i) {
++ tto->total[i] += now - tto->start[i];
++ tto->start[i] = now;
++ }
++
++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++ prefix,
++ T_ARG(now - start0 - tto->total[0]),
++ T_ARG(tto->total[0]),
++ T_ARG(tto->total[1]),
++ T_ARG(tto->total[2]),
++ T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{
++ av_assert0(tto->count < WAIT_COUNT_MAX);
++ tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{
++ const int n = --tto->count;
++ av_assert0(n >= 0);
++ tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{
++ printf("Jobs:%u, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); // jcount is unsigned int, so %u rather than %d
++ tto_print(&ttw->active, now, ttw->start0, "Active"); // per-depth totals for the "active" counters
++ tto_print(&ttw->wait, now, ttw->start0, " Wait"); // per-depth totals for the "wait" counters
++}
++
+#endif
+
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb);
-+static void gpu_free_internal(GPU_MEM_PTR_T *p);
++// GPU memory alloc fns (internal)
++
++// GPU_MEM_PTR_T alloc fns
++static int gpu_malloc_cached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static int gpu_malloc_uncached_internal(const int mb, const int numbytes, GPU_MEM_PTR_T * const p) {
++ p->numbytes = numbytes;
++ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
++ av_assert0(p->vcsm_handle);
++ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
++ av_assert0(p->vc_handle);
++ p->arm = vcsm_lock(p->vcsm_handle);
++ av_assert0(p->arm);
++ p->vc = mbox_mem_lock(mb, p->vc_handle);
++ av_assert0(p->vc);
++ return 0;
++}
++
++static void gpu_free_internal(const int mb, GPU_MEM_PTR_T * const p) {
++ mbox_mem_unlock(mb, p->vc_handle);
++ vcsm_unlock_ptr(p->arm);
++ vcsm_free(p->vcsm_handle);
++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{
++ gpu_env_t * const ge = gpu;
++ // Tear down the global GPU environment and release everything it holds.
++ // We have to hope that everything has terminated...
++ gpu = NULL;
++
++ vc_gpuserv_deinit();
++
++ gpu_free_internal(ge->mb, &ge->code_gm_ptr); // the buffer gpu_init() filled with struct GPU (code + constants)
++
++ vcsm_exit();
++
++ mbox_close(ge->mb);
++
++ vq_wait_pool_deinit(&ge->wait_pool); // destroys every semaphore in the wait pool
++
++ free(ge); // ge came from calloc() in gpu_init()
++}
++
+
+// Connect to QPU, returns 0 on success.
-+static int gpu_init(volatile struct GPU **gpu) {
-+ int mb = mbox_open();
-+ int vc;
++static int gpu_init(gpu_env_t ** const gpu) {
+ volatile struct GPU* ptr;
-+ if (mb < 0)
-+ return -1;
-+#ifndef RPI_ASYNC
-+ if (qpu_enable(mb, 1)) return -2;
-+#endif
++ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++ *gpu = NULL;
++
++ if (ge == NULL)
++ return -1;
++ if ((ge->mb = mbox_open()) < 0) {
++ free(ge); // nothing else owns ge yet - don't leak it when the mailbox can't be opened
++ return -1;
++ }
++ vq_wait_pool_init(&ge->wait_pool);
++
+ vcsm_init();
-+ vc_gpuserv_init();
-+ gpu_malloc_uncached_internal(sizeof(struct GPU), &gpu_mem_ptr, mb);
-+ ptr = (volatile struct GPU*)gpu_mem_ptr.arm;
-+ memset((void*)ptr, 0, sizeof *ptr);
-+ vc = gpu_mem_ptr.vc;
+
-+ ptr->mb = mb;
-+ ptr->vc = vc;
++ gpu_malloc_uncached_internal(ge->mb, sizeof(struct GPU), &ge->code_gm_ptr);
++ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
+
-+ printf("GPU allocated at 0x%x\n",vc);
-+
-+ *gpu = ptr;
++ // Zero everything so we have zeros between the code bits
++ memset((void *)ptr, 0, sizeof(*ptr));
+
+ // Now copy over the QPU code into GPU memory
+ {
-+ int num_bytes = qpu_get_fn(QPU_MC_END) - qpu_get_fn(QPU_MC_SETUP_UV);
++ int num_bytes = (char *)mc_end - (char *)rpi_shader;
+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
+ memcpy((void*)ptr->qpu_code, rpi_shader, num_bytes);
+ }
@@ -11735,106 +12080,56 @@ index 0000000..365f4a6
+ // And the transform coefficients
+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
+
-+#ifdef RPI_ASYNC
-+ {
-+ int err;
-+ vpu_async_tail = 0;
-+ vpu_async_head = 0;
-+ err = pthread_create(&vpu_thread, NULL, vpu_start, NULL);
-+ //printf("Created thread\n");
-+ if (err) {
-+ av_log(NULL, AV_LOG_FATAL, "Failed to create vpu thread\n");
-+ return -4;
-+ }
-+
-+ {
-+ struct sched_param param = {0};
-+ int policy = 0;
-+
-+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+ }
-+ else
-+ {
-+ av_log(NULL, AV_LOG_INFO, "VPU thread: policy=%d (%s), pri=%d\n",
-+ policy,
-+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+ param.sched_priority);
-+
-+ policy = SCHED_FIFO;
-+ param.sched_priority = sched_get_priority_max(SCHED_FIFO);
-+
-+ av_log(NULL, AV_LOG_INFO, "Attempt to set: policy=%d (%s), pri=%d\n",
-+ policy,
-+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+ param.sched_priority);
-+
-+ if (pthread_setschedparam(vpu_thread, policy, ¶m) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "Unable to set VPU thread scheduling parameters\n");
-+ }
-+ else
-+ {
-+ if (pthread_getschedparam(vpu_thread, &policy, ¶m) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "Unable to get VPU thread scheduling parameters\n");
-+ }
-+ else
-+ {
-+ av_log(NULL, AV_LOG_INFO, "VPU thread (after): policy=%d (%s), pri=%d\n",
-+ policy,
-+ policy == SCHED_RR ? "RR" : policy == SCHED_FIFO ? "FIFO" : "???" ,
-+ param.sched_priority);
-+ }
-+ }
-+ }
-+
-+ }
-+
-+ }
-+#endif
-+
++ *gpu = ge;
+ return 0;
+}
+
-+// Returns 1 if the gpu is currently idle
-+static int gpu_idle(void)
-+{
-+ int ret = pthread_mutex_trylock(&gpu_mutex);
-+ if (ret==0) {
-+ pthread_mutex_unlock(&gpu_mutex);
-+ return 1;
-+ }
-+ return 0;
-+}
+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static void gpu_lock(void) {
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ if (gpu==NULL) {
-+ gpu_init(&gpu);
-+ }
-+}
+
+static void gpu_unlock(void) {
+ pthread_mutex_unlock(&gpu_mutex);
+}
+
-+static int gpu_malloc_uncached_internal(int numbytes, GPU_MEM_PTR_T *p, int mb) {
-+ p->numbytes = numbytes;
-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mem_lock(mb, p->vc_handle);
-+ av_assert0(p->vc);
-+ return 0;
++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
++static gpu_env_t * gpu_lock(void) {
++ pthread_mutex_lock(&gpu_mutex);
++
++ av_assert0(gpu != NULL);
++ return gpu;
+}
+
++static gpu_env_t * gpu_lock_ref(void)
++{
++ pthread_mutex_lock(&gpu_mutex);
++
++ if (gpu == NULL) {
++ int rv = gpu_init(&gpu);
++ if (rv != 0) {
++ gpu_unlock();
++ return NULL;
++ }
++ }
++
++ ++gpu->open_count;
++ return gpu;
++}
++
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{
++ if (--ge->open_count == 0)
++ gpu_term();
++
++ gpu_unlock();
++}
++
++static inline gpu_env_t * gpu_ptr(void)
++{
++ av_assert0(gpu != NULL);
++ return gpu;
++}
++
++// Public gpu fns
++
+// Allocate memory on GPU
+// Fills in structure containing ARM pointer, videocore handle, videocore memory address, numbytes
+// Returns 0 on success.
@@ -11843,731 +12138,476 @@ index 0000000..365f4a6
+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
-+ gpu_lock();
-+ r = gpu_malloc_uncached_internal(numbytes, p, gpu->mb);
-+ gpu->open_count++;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_uncached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
-+int gpu_get_mailbox(void)
-+{
-+ av_assert0(gpu);
-+ return gpu->mb;
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void gpu_cache_flush(const GPU_MEM_PTR_T * const p)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+ struct vcsm_user_clean_invalid_s iocache = {};
-+ iocache.s[0].handle = p->vcsm_handle;
-+ iocache.s[0].cmd = 3; // clean+invalidate
-+ iocache.s[0].addr = (int) p->arm;
-+ iocache.s[0].size = p->numbytes;
-+ vcsm_clean_invalid( &iocache );
-+#else
-+ void *tmp = vcsm_lock(p->vcsm_handle);
-+ vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2)
-+{
-+#ifdef RPI_FAST_CACHEFLUSH
-+ struct vcsm_user_clean_invalid_s iocache = {};
-+ iocache.s[0].handle = p0->vcsm_handle;
-+ iocache.s[0].cmd = 3; // clean+invalidate
-+ iocache.s[0].addr = (int) p0->arm;
-+ iocache.s[0].size = p0->numbytes;
-+ iocache.s[1].handle = p1->vcsm_handle;
-+ iocache.s[1].cmd = 3; // clean+invalidate
-+ iocache.s[1].addr = (int) p1->arm;
-+ iocache.s[1].size = p1->numbytes;
-+ iocache.s[2].handle = p2->vcsm_handle;
-+ iocache.s[2].cmd = 3; // clean+invalidate
-+ iocache.s[2].addr = (int) p2->arm;
-+ iocache.s[2].size = p2->numbytes;
-+ vcsm_clean_invalid( &iocache );
-+#else
-+ void *tmp;
-+ tmp = vcsm_lock(p0->vcsm_handle);
-+ vcsm_unlock_ptr(tmp);
-+ tmp = vcsm_lock(p1->vcsm_handle);
-+ vcsm_unlock_ptr(tmp);
-+ tmp = vcsm_lock(p2->vcsm_handle);
-+ vcsm_unlock_ptr(tmp);
-+#endif
-+}
-+
-+static int gpu_malloc_cached_internal(int numbytes, GPU_MEM_PTR_T *p) {
-+ p->numbytes = numbytes;
-+ p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_VC, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_NONE, (char *)"Video Frame" );
-+ //p->vcsm_handle = vcsm_malloc_cache(numbytes, VCSM_CACHE_TYPE_HOST_AND_VC, (char *)"Video Frame" );
-+ av_assert0(p->vcsm_handle);
-+ p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle);
-+ av_assert0(p->vc_handle);
-+ p->arm = vcsm_lock(p->vcsm_handle);
-+ av_assert0(p->arm);
-+ p->vc = mem_lock(gpu->mb, p->vc_handle);
-+ av_assert0(p->vc);
-+ return 0;
-+}
-+
+// This allocates data that will be
+// Cached in ARM L2
+// Uncached in VPU L2
+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
+{
+ int r;
-+ gpu_lock();
-+ r = gpu_malloc_cached_internal(numbytes, p);
-+ gpu->open_count++;
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
++ r = gpu_malloc_cached_internal(ge->mb, numbytes, p);
+ gpu_unlock();
+ return r;
+}
+
-+static void gpu_term(void)
-+{
-+ int mb;
-+
-+ if (gpu==NULL)
-+ return;
-+ mb = gpu->mb;
-+
-+ // ??? Tear down anything needed for gpuexecute
-+
-+ qpu_enable(mb, 0);
-+ gpu_free_internal(&gpu_mem_ptr);
-+
-+ vc_gpuserv_deinit();
-+ vcsm_exit();
-+
-+ mbox_close(mb);
-+ gpu = NULL;
-+}
-+
-+void gpu_free_internal(GPU_MEM_PTR_T *p) {
-+ int mb = gpu->mb;
-+ mem_unlock(mb,p->vc_handle);
-+ vcsm_unlock_ptr(p->arm);
-+ vcsm_free(p->vcsm_handle);
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T *p) {
-+ gpu_lock();
-+
-+ gpu_free_internal(p);
-+
-+ gpu->open_count--;
-+ if (gpu->open_count==0) {
-+ printf("Closing GPU\n");
-+ gpu_term();
-+ gpu = NULL;
-+ }
-+ gpu_unlock();
++void gpu_free(GPU_MEM_PTR_T * const p) {
++ gpu_env_t * const ge = gpu_lock();
++ gpu_free_internal(ge->mb, p);
++ gpu_unlock_unref(ge);
+}
+
+unsigned int vpu_get_fn(void) {
+ // Make sure that the gpu is initialized
-+ if (gpu==NULL) {
-+ printf("Preparing gpu\n");
-+ gpu_lock();
-+ gpu_unlock();
-+ }
-+ return gpu->vc + offsetof(struct GPU,vpu_code);
++ av_assert0(gpu != NULL);
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code);
+}
+
+unsigned int vpu_get_constants(void) {
-+ if (gpu==NULL) {
-+ gpu_lock();
++ av_assert0(gpu != NULL);
++ return gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even);
++}
++
++int gpu_get_mailbox(void)
++{
++ av_assert0(gpu);
++ return gpu->mb;
++}
++
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
++
++
++rpi_cache_flush_env_t * rpi_cache_flush_init()
++{
++ rpi_cache_flush_env_t * const rfe = calloc(1, sizeof(rpi_cache_flush_env_t));
++ if (rfe == NULL)
++ return NULL;
++
++ return rfe;
++}
++
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{
++ if (rfe != NULL)
++ free(rfe);
++}
++
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++ // Issue the queued requests (skipped when none were added); rfe is freed whatever the outcome
++ const int rc = (rfe->n == 0) ? 0 : vcsm_clean_invalid(&rfe->a);
++ const int err = errno; // snapshot now: free() below is not guaranteed to leave errno alone
++ free(rfe);
++
++ if (rc == 0)
++ return 0;
++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid failed: errno=%d\n", err);
++ return rc;
++}
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{
++ // Deal with empty pointer trivially: nothing to flush, so no table slot is needed
++ if (gm == NULL || gm->numbytes == 0)
++ return;
++
++ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0])); // checked after the empty test, as in rpi_cache_flush_add_gm_range
++
++ rfe->a.s[rfe->n].cmd = mode;
++ rfe->a.s[rfe->n].handle = gm->vcsm_handle;
++ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm;
++ rfe->a.s[rfe->n].size = gm->numbytes;
++ ++rfe->n;
++}
++
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++ const unsigned int offset, const unsigned int size)
++{
++ // Deal with empty pointer trivially
++ if (gm == NULL || size == 0)
++ return;
++
++ av_assert0(rfe->n < sizeof(rfe->a.s) / sizeof(rfe->a.s[0]));
++ av_assert0(offset <= gm->numbytes);
++ av_assert0(size <= gm->numbytes);
++ av_assert0(offset + size <= gm->numbytes);
++
++ rfe->a.s[rfe->n].cmd = mode;
++ rfe->a.s[rfe->n].handle = gm->vcsm_handle;
++ rfe->a.s[rfe->n].addr = (unsigned int)gm->arm + offset;
++ rfe->a.s[rfe->n].size = size;
++ ++rfe->n;
++}
++
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++ if (gpu_is_buf1(frame)) {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++ }
++ else
++ {
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++ }
++}
++
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{
++ const unsigned int y_offset = frame->linesize[0] * start_line;
++ const unsigned int y_size = frame->linesize[0] * n;
++ // Round UV up/down to get everything
++ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++ const unsigned int uv_offset = frame->linesize[1] * (start_line >> uv_shift);
++ const unsigned int uv_size = frame->linesize[1] * ((start_line + n + uv_rnd) >> uv_shift) - uv_offset;
++
++ // As all unsigned they will also reject -ve
++ // Test individually as well as added to reject overflow
++ av_assert0(start_line <= (unsigned int)frame->height);
++ av_assert0(n <= (unsigned int)frame->height);
++ av_assert0(start_line + n <= (unsigned int)frame->height);
++
++ if (gpu_is_buf1(frame)) {
++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++ }
++ }
++ else
++ {
++ if (do_luma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++ }
++ if (do_chroma) {
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++ }
++ }
++}
++
++// Call this to clean and invalidate a region of memory
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
++{
++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init();
++ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++ rpi_cache_flush_finish(rfe);
++}
++
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{
++ unsigned int i;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_init(&wp->pool[i].sem, 0, 0);
++ wp->pool[i].next = wp->pool + i + 1;
++ }
++ wp->head = wp->pool + 0;
++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{
++ unsigned int i;
++ wp->head = NULL;
++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++ sem_destroy(&wp->pool[i].sem);
++ wp->pool[i].next = NULL;
++ }
++}
++
++
++// If sem_init actually takes time then maybe we want a pool...
++static vq_wait_t * vq_wait_new(const unsigned int cost)
++{ // Pop a wait object off the env's free list and charge its cost to current_load
++ gpu_env_t * const ge = gpu_lock_ref(); // takes a ref; vq_wait_delete() drops it via gpu_unlock_unref()
++ vq_wait_t * wait;
++ av_assert0(ge != NULL && ge->wait_pool.head != NULL); // GPU init failed, or all VQ_WAIT_POOL_SIZE waits are in use
++ wait = ge->wait_pool.head;
++ ge->wait_pool.head = wait->next;
++ ge->current_load += cost;
++ wait->cost = cost;
++ wait->next = NULL;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ tto_start(&ge->ttw.active, ns_time());
++#endif
++ gpu_unlock();
++ return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait)
++{
++ gpu_env_t * const ge = gpu_lock();
++ wait->next = ge->wait_pool.head;
++ ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ trace_time_wait_t * const ttw = &ge->ttw;
++ const int64_t now = ns_time();
++ ++ttw->jcount;
++ tto_end(&ttw->wait, now);
++
++ if (ttw->start0 == 0)
++ {
++ ttw->start0 = ttw->active.start[0];
++ ttw->last_update = ttw->start0;
++ }
++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
++ {
++ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++ ttw_print(ttw, now);
++ }
++ }
++#endif
++ gpu_unlock_unref(ge);
++}
++
++static void vq_wait_wait(vq_wait_t * const wait)
++{
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ {
++ const int64_t now = ns_time();
++ gpu_env_t * const ge = gpu_lock();
++ tto_start(&ge->ttw.wait, now);
++ gpu_unlock();
++ }
++#endif
++
++ while (sem_wait(&wait->sem) == -1 && errno == EINTR)
++ /* loop */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait)
++{
++#if !RPI_TRACE_TIME_VPU_QPU_WAIT
++ if (wait->cost != 0)
++#endif
++ {
++ gpu_env_t *const ge = gpu_lock();
++ ge->current_load -= wait->cost;
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ tto_end(&ge->ttw.active, ns_time());
++#endif
+ gpu_unlock();
+ }
-+ return gpu->vc + offsetof(struct GPU,transMatrix2even);
++
++ sem_post(&wait->sem);
+}
+
-+#ifdef GPUSERVICE
-+static void callback(void *cookie)
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU 1
++#define VPU_QPU_MASK_VPU 2
++
++#define VPU_QPU_JOB_MAX 4
++struct vpu_qpu_job_env_s
+{
-+ sem_post((sem_t *)cookie);
++ unsigned int n;
++ unsigned int mask;
++ unsigned int cost;
++ struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_new(void)
++{
++ vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++ return vqj;
+}
-+#endif
+
-+
-+static volatile uint32_t post_done = 0;
-+static volatile uint32_t post_qed = 0;
-+
-+static void post_code2_cb(void * v)
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
+{
-+ uint32_t n = (uint32_t)v;
-+ if ((int32_t)(n - post_done) > 0) {
-+ post_done = n;
++ memset(vqj, 0, sizeof(*vqj));
++ free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{
++ struct gpu_job_s * const j = vqj->j + vqj->n++;
++ av_assert0(vqj->n <= VPU_QPU_JOB_MAX);
++ return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{
++ if (vpu_code != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_VPU;
++
++ j->command = EXECUTE_VPU;
++ j->u.v.q[0] = vpu_code;
++ j->u.v.q[1] = r0;
++ j->u.v.q[2] = r1;
++ j->u.v.q[3] = r2;
++ j->u.v.q[4] = r3;
++ j->u.v.q[5] = r4;
++ j->u.v.q[6] = r5;
+ }
+}
+
-+
-+// Post a command to the queue
-+// Returns an id which we can use to wait for completion
-+int vpu_post_code2(unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf)
++// flags are QPU_FLAGS_xxx
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail)
+{
-+ struct gpu_job_s j[1] = {
-+ {
-+ .command = EXECUTE_VPU,
-+ .u.v.q = {code, r0, r1, r2, r3, r4, r5},
-+ .callback.func = post_code2_cb
-+ }
-+ };
-+ uint32_t id;
++ if (n != 0) {
++ struct gpu_job_s *const j = new_job(vqj);
++ vqj->mask |= VPU_QPU_MASK_QPU;
++ vqj->cost += cost;
+
-+ j[0].callback.cookie = (void *)(id = ++post_qed);
-+
-+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+ return id;
++ j->command = EXECUTE_QPU;
++ j->u.q.jobs = n;
++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
++ j->u.q.timeout = 5000;
++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++ }
+}
+
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+ int qpu0_n, const uint32_t * qpu0_mail,
-+ int qpu1_n, const uint32_t * qpu1_mail)
++// Convert callback to sem post
++static void vpu_qpu_job_callback_wait(void * v)
+{
-+#if 1
-+ sem_t sync0;
-+ struct gpu_job_s j[4];
++ vq_wait_post(v);
++}
+
-+ sem_init(&sync0, 0, 0);
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{
++ vq_wait_t * wait;
+
-+ j[0].command = EXECUTE_VPU;
-+ j[0].u.v.q[0] = vpu_code;
-+ j[0].u.v.q[1] = r0;
-+ j[0].u.v.q[2] = r1;
-+ j[0].u.v.q[3] = r2;
-+ j[0].u.v.q[4] = r3;
-+ j[0].u.v.q[5] = r4;
-+ j[0].u.v.q[6] = r5;
-+ j[0].callback.func = 0;
-+ j[0].callback.cookie = NULL;
++ if (vqj->mask == 0) {
++ *wait_h = NULL;
++ return;
++ }
+
-+ j[1].command = EXECUTE_QPU;
-+ j[1].u.q.jobs = qpu1_n;
-+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+ j[1].u.q.timeout = 5000;
-+ j[1].callback.func = 0;
-+ j[1].callback.cookie = NULL;
++ // We are going to want a sync object
++ wait = vq_wait_new(vqj->cost);
+
-+ j[2].command = EXECUTE_QPU;
-+ j[2].u.q.jobs = qpu0_n;
-+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ j[2].u.q.noflush = 1;
-+ j[2].u.q.timeout = 5000;
-+ j[2].callback.func = 0;
-+ j[2].callback.cookie = NULL;
++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++ // If we only posted one thing or only QPU jobs
++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++ {
++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1);
++ av_assert0(j->callback.func == 0);
+
-+ j[3].command = EXECUTE_SYNC;
-+ j[3].u.s.mask = 3;
-+ j[3].callback.func = callback;
-+ j[3].callback.cookie = (void *)&sync0;
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
++ else
++ {
++ struct gpu_job_s *const j = new_job(vqj);
+
-+ av_assert0(vc_gpuserv_execute_code(4, j) == 0);
++ j->command = EXECUTE_SYNC;
++ j->u.s.mask = vqj->mask;
++ j->callback.func = vpu_qpu_job_callback_wait;
++ j->callback.cookie = wait;
++ }
+
-+ sem_wait(&sync0);
-+#else
++ vqj->cost = 0;
++ vqj->mask = 0;
++ *wait_h = wait;
++}
+
-+ sem_t sync0, sync2;
-+ struct gpu_job_s j[3];
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{
++ return vqj->n == 0 ? 0 : vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
+
-+ sem_init(&sync0, 0, 0);
-+ sem_init(&sync2, 0, 0);
++// Simple wrapper of start + delete
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++ int rv;
++ rv = vpu_qpu_job_start(vqj);
++ vpu_qpu_job_delete(vqj);
++ return rv;
++}
+
-+ j[0].command = EXECUTE_VPU;
-+ j[0].u.v.q[0] = vpu_code;
-+ j[0].u.v.q[1] = r0;
-+ j[0].u.v.q[2] = r1;
-+ j[0].u.v.q[3] = r2;
-+ j[0].u.v.q[4] = r3;
-+ j[0].u.v.q[5] = r4;
-+ j[0].u.v.q[6] = r5;
-+ j[0].callback.func = callback;
-+ j[0].callback.cookie = (void *)&sync0;
++unsigned int vpu_qpu_current_load(void)
++{
++ return gpu_ptr()->current_load;
++}
+
-+ j[1].command = EXECUTE_QPU;
-+ j[1].u.q.jobs = qpu1_n;
-+ memcpy(j[1].u.q.control, qpu1_mail, qpu1_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ j[1].u.q.noflush = FLAGS_FOR_PROFILING;
-+ j[1].u.q.timeout = 5000;
-+ j[1].callback.func = 0;
-+ j[1].callback.cookie = NULL;
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{
++ if (wait_h != NULL)
++ {
++ vq_wait_t * const wait = *wait_h;
++ if (wait != NULL) {
++ *wait_h = NULL;
++ vq_wait_wait(wait);
++ vq_wait_delete(wait);
++ }
++ }
++}
+
-+ j[2].command = EXECUTE_QPU;
-+ j[2].u.q.jobs = qpu0_n;
-+ memcpy(j[2].u.q.control, qpu0_mail, qpu0_n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ j[2].u.q.noflush = 1;
-+ j[2].u.q.timeout = 5000;
-+ j[2].callback.func = callback;
-+ j[2].callback.cookie = (void *)&sync2;
++int vpu_qpu_init()
++{
++ gpu_env_t * const ge = gpu_lock_ref();
++ if (ge == NULL)
++ return -1;
+
-+ av_assert0(vc_gpuserv_execute_code(3, j) == 0);
-+
-+ sem_wait(&sync0);
-+ sem_wait(&sync2);
-+#endif
++ if (ge->init_count++ == 0)
++ {
++ vc_gpuserv_init();
++ }
+
++ gpu_unlock();
+ return 0;
+}
+
-+
-+// Wait for completion of the given command
-+void vpu_wait(int id)
++void vpu_qpu_term()
+{
-+ if (id == 0) {
-+#if 0
-+ sem_t sync0;
-+ struct gpu_job_s j[1] =
-+ {
-+ {
-+ .command = EXECUTE_SYNC,
-+ .u.s.mask = 3,
-+ .callback.func = callback,
-+ .callback.cookie = (void *)&sync0
-+ }
-+ };
++ gpu_env_t * const ge = gpu_lock();
+
-+ sem_init(&sync0, 0, 0);
++ if (--ge->init_count == 0) {
++ vc_gpuserv_deinit();
+
-+ av_assert0(vc_gpuserv_execute_code(1, j) == 0);
-+
-+ sem_wait(&sync0);
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++ ttw_print(&ge->ttw, ns_time());
+#endif
+ }
-+ else {
-+ while ((int32_t)(post_done - (uint32_t)id) < 0) {
-+ usleep(1000);
-+ }
-+ }
++
++ gpu_unlock_unref(ge);
+}
+
-+
-+unsigned int qpu_get_fn(int num) {
-+ // Make sure that the gpu is initialized
-+ unsigned int *fn;
-+ if (gpu==NULL) {
-+ printf("Preparing gpu\n");
-+ gpu_lock();
-+ gpu_unlock();
-+ }
-+ switch(num) {
-+ case QPU_MC_SETUP:
-+ fn = mc_setup;
-+ break;
-+ case QPU_MC_FILTER:
-+ fn = mc_filter;
-+ break;
-+ case QPU_MC_EXIT:
-+ fn = mc_exit;
-+ break;
-+ case QPU_MC_INTERRUPT_EXIT12:
-+ fn = mc_interrupt_exit12;
-+ break;
-+ case QPU_MC_FILTER_B:
-+ fn = mc_filter_b;
-+ break;
-+ //case QPU_MC_FILTER_HONLY:
-+ // fn = mc_filter_honly;
-+ // break;
-+ case QPU_MC_SETUP_UV:
-+ fn = mc_setup_uv;
-+ break;
-+ case QPU_MC_FILTER_UV:
-+ fn = mc_filter_uv;
-+ break;
-+ case QPU_MC_FILTER_UV_B0:
-+ fn = mc_filter_uv_b0;
-+ break;
-+ case QPU_MC_FILTER_UV_B:
-+ fn = mc_filter_uv_b;
-+ break;
-+ case QPU_MC_INTERRUPT_EXIT8:
-+ fn = mc_interrupt_exit8;
-+ break;
-+ case QPU_MC_END:
-+ fn = mc_end;
-+ break;
-+ default:
-+ printf("Unknown function\n");
-+ exit(-1);
-+ }
-+ return gpu->vc + 4*(int)(fn-rpi_shader);
-+ //return code[num] + gpu->vc;
-+}
-+
-+#if 0
-+typedef unsigned int uint32_t;
-+
-+typedef struct mvs_s {
-+ GPU_MEM_PTR_T unif_mvs_ptr;
-+ uint32_t *unif_mvs; // Base of memory for motion vector commands
-+
-+ // _base pointers are to the start of the row
-+ uint32_t *mvs_base[8];
-+ // these pointers are to the next free space
-+ uint32_t *u_mvs[8];
-+
-+} HEVCContext;
-+
-+#define RPI_CHROMA_COMMAND_WORDS 12
-+
-+static void rpi_inter_clear(HEVCContext *s)
++uint32_t qpu_fn(const int * const mc_fn)
+{
-+ int i;
-+ for(i=0;i<8;i++) {
-+ s->u_mvs[i] = s->mvs_base[i];
-+ *s->u_mvs[i]++ = 0;
-+ *s->u_mvs[i]++ = 0;
-+ *s->u_mvs[i]++ = 0;
-+ *s->u_mvs[i]++ = 0;
-+ *s->u_mvs[i]++ = 0;
-+ *s->u_mvs[i]++ = 128; // w
-+ *s->u_mvs[i]++ = 128; // h
-+ *s->u_mvs[i]++ = 128; // stride u
-+ *s->u_mvs[i]++ = 128; // stride v
-+ s->u_mvs[i] += 3; // Padding words
-+ }
++ return gpu->code_gm_ptr.vc + ((const char *)mc_fn - (const char *)rpi_shader) + offsetof(struct GPU, qpu_code);
+}
+
-+static void rpi_execute_inter_qpu(HEVCContext *s)
-+{
-+ int k;
-+ uint32_t *unif_vc = (uint32_t *)s->unif_mvs_ptr.vc;
-+
-+ for(k=0;k<8;k++) {
-+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_EXIT); // Add exit command
-+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+3] = qpu_get_fn(QPU_MC_SETUP); // A dummy texture location (maps to our code) - this is needed as the texture requests are pipelined
-+ s->u_mvs[k][-RPI_CHROMA_COMMAND_WORDS+4] = qpu_get_fn(QPU_MC_SETUP); // dummy location for V
-+ }
-+
-+ s->u_mvs[8-1][-RPI_CHROMA_COMMAND_WORDS] = qpu_get_fn(QPU_MC_INTERRUPT_EXIT8); // This QPU will signal interrupt when all others are done and have acquired a semaphore
-+
-+ qpu_run_shader8(qpu_get_fn(QPU_MC_SETUP_UV),
-+ (uint32_t)(unif_vc+(s->mvs_base[0 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[1 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[2 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[3 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[4 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[5 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[6 ] - (uint32_t*)s->unif_mvs_ptr.arm)),
-+ (uint32_t)(unif_vc+(s->mvs_base[7 ] - (uint32_t*)s->unif_mvs_ptr.arm))
-+ );
-+}
-+
-+void rpi_test_qpu(void)
-+{
-+ HEVCContext mvs;
-+ HEVCContext *s = &mvs;
-+ int i;
-+ int uv_commands_per_qpu = (1 + (256*64*2)/(4*4)) * RPI_CHROMA_COMMAND_WORDS;
-+ uint32_t *p;
-+ printf("Allocate memory\n");
-+ gpu_malloc_uncached( 8 * uv_commands_per_qpu * sizeof(uint32_t), &s->unif_mvs_ptr );
-+ s->unif_mvs = (uint32_t *) s->unif_mvs_ptr.arm;
-+
-+ // Set up initial locations for uniform streams
-+ p = s->unif_mvs;
-+ for(i = 0; i < 8; i++) {
-+ s->mvs_base[i] = p;
-+ p += uv_commands_per_qpu;
-+ }
-+ // Now run a simple program that should just quit immediately after a single texture fetch
-+ rpi_inter_clear(s);
-+ for(i=0;i<4;i++) {
-+ printf("Launch QPUs\n");
-+ rpi_execute_inter_qpu(s);
-+ printf("Done\n");
-+ }
-+ printf("Free memory\n");
-+ gpu_free(&s->unif_mvs_ptr);
-+ return;
-+}
-+#endif
-+
-+#if 0
-+
-+int32_t hcoeffs[] = {-4, 10, -21, 70, 90, -24, 11, -4};
-+//int32_t hcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+int32_t vcoeffs[] = {-2, 6, -13, 37, 115, -20, 9, -4};
-+//int32_t vcoeffs[] = {1, 1, 1, 1, 1, 1, 1, 1};
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0-1) & 0xff) | ((c1-1) & 0xff) << 8 | ((c2-1) & 0xff) << 16 | ((c3-1) & 0xff) << 24);
-+
-+static uint8_t av_clip_uint8(int32_t a)
-+{
-+ if (a&(~255)) return (-a)>>31;
-+ else return a;
-+}
-+
-+static int32_t filter8(const uint8_t *data, int pitch)
-+{
-+ int32_t vsum = 0;
-+ int x, y;
-+
-+ for (y = 0; y < 8; y++) {
-+ int32_t hsum = 0;
-+
-+ for (x = 0; x < 8; x++)
-+ hsum += hcoeffs[x]*data[x + y * pitch];
-+
-+ vsum += vcoeffs[y]*av_clip_uint8( (hsum + 64) >> 7); // Added brackets to stop compiler warning
-+ }
-+
-+ return av_clip_uint8( (vsum + 64) >> 7);
-+}
-+
-+// Note regression changes coefficients so is not thread safe
-+//#define REGRESSION
-+#ifdef REGRESSION
-+#define CMAX 100
-+#else
-+#define CMAX 2
-+#endif
-+#define YMAX 16
-+
-+int rpi_test_shader(void)
-+{
-+ int i, c;
-+
-+ uint32_t *unifs;
-+
-+ uint8_t *in_buffer;
-+ uint8_t *out_buffer[2];
-+
-+ GPU_MEM_PTR_T unifs_ptr;
-+ GPU_MEM_PTR_T in_buffer_ptr;
-+ GPU_MEM_PTR_T out_buffer_ptr[2];
-+
-+ // Addresses in GPU memory of filter programs
-+ uint32_t mc_setup = 0;
-+ uint32_t mc_filter = 0;
-+ uint32_t mc_exit = 0;
-+
-+ int pitch = 0x500;
-+
-+ if (gpu==NULL) {
-+ gpu_lock();
-+ gpu_unlock();
-+ }
-+
-+ printf("This needs to change to reflect new assembler\n");
-+ // Use table to compute locations of program start points
-+ mc_setup = code[0] + gpu->vc;
-+ mc_filter = code[1] + gpu->vc;
-+ mc_exit = code[2] + gpu->vc;
-+
-+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+ return -2;
-+ }
-+ unifs = (uint32_t*)unifs_ptr.arm;
-+
-+ if (!vcos_verify_ge0(gpu_malloc_uncached(64*23,&in_buffer_ptr))) {
-+ return -3;
-+ }
-+ in_buffer = (uint8_t*)in_buffer_ptr.arm;
-+
-+ if (!vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[0])) || !vcos_verify_ge0(gpu_malloc_uncached(16*pitch,&out_buffer_ptr[1]))) {
-+ return -4;
-+ }
-+ out_buffer[0] = (uint8_t*)out_buffer_ptr[0].arm;
-+ out_buffer[1] = (uint8_t*)out_buffer_ptr[1].arm;
-+
-+ for (c = 0; c < CMAX; c++) {
-+ int xo[] = {rand()&31, rand()&31};
-+
-+#ifdef REGRESSION
-+ for (i = 0; i < 8; i++) {
-+ hcoeffs[i] = (int8_t)rand();
-+ vcoeffs[i] = (int8_t)rand();
-+ if (hcoeffs[i]==-128)
-+ hcoeffs[i]++;
-+ if (vcoeffs[i]==-128)
-+ vcoeffs[i]++;
-+ }
-+#endif
-+
-+ for (i = 0; i < 64*23; i++) {
-+ //printf("%d %d %p\n",i,gpu->mb,&in_buffer[i]);
-+ in_buffer[i] = rand();
-+ }
-+
-+ // Clear output array
-+ {
-+ int b;
-+ for(b=0;b<2;b++) {
-+ for(i=0;i<16*16;i++) {
-+ out_buffer[b][i] = 3;
-+ }
-+ }
-+ }
-+
-+ unifs[0] = mc_filter;
-+ unifs[1] = in_buffer_ptr.vc+xo[0]+16;
-+ unifs[2] = 64; // src pitch
-+ unifs[3] = pitch; // dst pitch
-+ unifs[4] = 0; // Padding
-+ unifs[5] = 0;
-+ unifs[6] = 0;
-+ unifs[7 ] = mc_filter;
-+ unifs[8 ] = in_buffer_ptr.vc+xo[1]+16;
-+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+ unifs[13] = out_buffer_ptr[0].vc;
-+ unifs[14] = mc_exit;
-+ unifs[15] = in_buffer_ptr.vc+xo[1]+16; // dummy
-+ unifs[16] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+ unifs[17] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+ unifs[18] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+ unifs[19] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+ unifs[20] = out_buffer_ptr[1].vc;
-+
-+ printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+ // flush_dcache(); TODO is this needed on ARM side? - tried to use the direct alias to avoid this problem
-+
-+ //qpu_run_shader(mc_setup, unifs_ptr.vc);
-+ //qpu_run_shader(gpu, gpu->vc, unifs_ptr.vc);
-+ rpi_do_block(in_buffer_ptr.vc+xo[0]+16, 64, out_buffer_ptr[0].vc, pitch,out_buffer[0]);
-+ rpi_do_block(in_buffer_ptr.vc+xo[1]+16, 64, out_buffer_ptr[1].vc, pitch,out_buffer[1]);
-+
-+ if (1)
-+ {
-+ int x, y, b;
-+ int bad = 0;
-+
-+ for (b=0; b<2; ++b)
-+ for (y=0; yvc;
-+ mc_filter = code[1] + gpu->vc;
-+ mc_exit = code[2] + gpu->vc;
-+
-+ if (!vcos_verify_ge0(gpu_malloc_uncached(4*64,&unifs_ptr))) {
-+ return;
-+ }
-+ //gpu_malloc_uncached(16*dst_pitch,&out_buffer_ptr);
-+ //out_buffer = (uint8_t*)out_buffer_ptr.arm;
-+
-+ /*for (y=0; y<16; ++y) {
-+ for (x=0; x<16; ++x) {
-+ out_buffer[x+y*dst_pitch] = 7;
-+ }
-+ }*/
-+
-+ unifs = (uint32_t*)unifs_ptr.arm;
-+
-+ unifs[0] = mc_filter;
-+ unifs[1] = (int)in_buffer_vc;
-+ unifs[2] = src_pitch; // src pitch
-+ unifs[3] = dst_pitch; // dst pitch
-+ unifs[4] = 0; // Padding
-+ unifs[5] = 0;
-+ unifs[6] = 0;
-+ unifs[7 ] = mc_exit;
-+ unifs[8 ] = (int)in_buffer_vc;
-+ unifs[9 ] = ENCODE_COEFFS(hcoeffs[0], hcoeffs[1], hcoeffs[2], hcoeffs[3]);
-+ unifs[10] = ENCODE_COEFFS(hcoeffs[4], hcoeffs[5], hcoeffs[6], hcoeffs[7]);
-+ unifs[11] = ENCODE_COEFFS(vcoeffs[0], vcoeffs[1], vcoeffs[2], vcoeffs[3]);
-+ unifs[12] = ENCODE_COEFFS(vcoeffs[4], vcoeffs[5], vcoeffs[6], vcoeffs[7]);
-+ unifs[13] = (int)dst_vc;
-+ //unifs[13] = (int)out_buffer_ptr.vc;
-+
-+ //printf("Gpu->vc=%x Code=%x dst=%x\n",gpu->vc, mc_filter,out_buffer_ptr[1].vc);
-+
-+ qpu_run_shader(mc_setup, unifs_ptr.vc);
-+
-+ /*for (y=0; y<16; ++y) {
-+ for (x=0; x<16; ++x) {
-+ dst[x+y*dst_pitch] = out_buffer[x+y*dst_pitch];
-+ }
-+ }*/
-+
-+ gpu_free(&unifs_ptr);
-+ //gpu_free(&out_buffer_ptr);
-+}
-+
-+
-+
-+#endif
-+
+#endif // RPI
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+diff --git b/libavcodec/rpi_qpu.h a/libavcodec/rpi_qpu.h
new file mode 100644
-index 0000000..c6cdb2b
+index 0000000..bcde316
--- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,176 @@
++++ a/libavcodec/rpi_qpu.h
+@@ -0,0 +1,204 @@
+#ifndef RPI_QPU_H
+#define RPI_QPU_H
+
-+// Define RPI_FAST_CACHEFLUSH to use the VCSM cache flush code
-+// *** N.B. Code has rotted & crashes if this is unset (before this set of changes)
-+#define RPI_FAST_CACHEFLUSH
++#include <interface/vcsm/user-vcsm.h>
+
+#define RPI_ONE_BUF 1
+
@@ -12582,9 +12622,7 @@ index 0000000..c6cdb2b
+// General GPU functions
+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T *p);
-+extern void gpu_cache_flush(const GPU_MEM_PTR_T * const p);
-+extern void gpu_cache_flush3(GPU_MEM_PTR_T *p0,GPU_MEM_PTR_T *p1,GPU_MEM_PTR_T *p2);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
+
+#include "libavutil/frame.h"
+#if !RPI_ONE_BUF
@@ -12627,29 +12665,31 @@ index 0000000..c6cdb2b
+ return av_buffer_get_opaque(frame->buf[0]);
+}
+
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const int n)
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
+{
+ return av_buffer_pool_opaque(frame->buf[n]);
+}
+
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{
++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n);
++ return gm->vc + (frame->data[n] - gm->arm);
++}
++
+
+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ return gpu_is_buf1(frame) ? gpu_buf1_gmem(frame)->vc : gpu_buf3_gmem(frame, 0)->vc;
++ return get_vc_address3(frame, 0);
+}
+
+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ return gpu_is_buf1(frame) ?
-+ gpu_buf1_gmem(frame)->vc + frame->data[1] - frame->data[0] :
-+ gpu_buf3_gmem(frame, 1)->vc;
++ return get_vc_address3(frame, 1);
+}
+
+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ return gpu_is_buf1(frame) ?
-+ gpu_buf1_gmem(frame)->vc + frame->data[2] - frame->data[0] :
-+ gpu_buf3_gmem(frame, 2)->vc;
++ return get_vc_address3(frame, 2);
+}
+
-+
++#if 0
+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
+ if (gpu_is_buf1(frame))
+ {
@@ -12686,30 +12726,44 @@ index 0000000..c6cdb2b
+ else
+ return *gpu_buf3_gmem(frame, 2);
+}
-+
+#endif
++#endif
++
++// Cache flush stuff
++
++typedef struct rpi_flush_envss {
++ unsigned int n;
++ struct vcsm_user_clean_invalid_s a;
++} rpi_cache_flush_env_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(void);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum
++{
++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
++} rpi_cache_flush_mode_t;
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++ const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_lines(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++ const unsigned int start_line, const unsigned int n, const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
+
+
+// QPU specific functions
-+extern void rpi_test_qpu(void);
++uint32_t qpu_fn(const int * const mc_fn);
+
-+enum {
-+ QPU_MC_SETUP,
-+ QPU_MC_FILTER,
-+ QPU_MC_EXIT,
-+ QPU_MC_INTERRUPT_EXIT12,
-+ QPU_MC_FILTER_B,
-+ QPU_MC_FILTER_HONLY,
-+ QPU_MC_SETUP_UV,
-+ QPU_MC_FILTER_UV,
-+ QPU_MC_FILTER_UV_B0,
-+ QPU_MC_FILTER_UV_B,
-+ QPU_MC_INTERRUPT_EXIT8,
-+ QPU_MC_END
-+ };
-+extern unsigned int qpu_get_fn(int num);
-+
-+#define QPU_N_UV 8
++#define QPU_N_UV 12
+#define QPU_N_Y 12
+#define QPU_N_MAX 16
+
@@ -12718,16 +12772,32 @@ index 0000000..c6cdb2b
+#define QPU_MAIL_VALS_MAX (QPU_N_MAX * QPU_MAIL_EL_VALS)
+#define QPU_MAIL_SIZE (QPU_MAIL_VALS_MAX * sizeof(uint32_t))
+
++struct vq_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
++
+// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++vpu_qpu_job_h vpu_qpu_job_new(void);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const unsigned int cost, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++
+extern unsigned int vpu_get_fn(void);
+extern unsigned int vpu_get_constants(void);
-+//extern unsigned vpu_execute_code( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5);
-+extern int vpu_post_code2( unsigned code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5, GPU_MEM_PTR_T *buf);
-+int vpu_qpu_post_code2(unsigned vpu_code, unsigned r0, unsigned r1, unsigned r2, unsigned r3, unsigned r4, unsigned r5,
-+ int qpu0_n, const uint32_t * qpu0_mail,
-+ int qpu1_n, const uint32_t * qpu1_mail);
+
-+extern void vpu_wait( int id);
++// Waits for the previously posted job to complete and nulls out *wait_h after use
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++unsigned int vpu_qpu_current_load(void);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
+
+// Simple test of shader code
+extern int rpi_test_shader(void);
@@ -12738,12 +12808,12 @@ index 0000000..c6cdb2b
+extern int gpu_get_mailbox(void);
+
+#endif
-diff --git a/libavcodec/rpi_shader.c b/libavcodec/rpi_shader.c
+diff --git b/libavcodec/rpi_shader.c a/libavcodec/rpi_shader.c
new file mode 100644
-index 0000000..06fb166
+index 0000000..627cda9
--- /dev/null
-+++ b/libavcodec/rpi_shader.c
-@@ -0,0 +1,629 @@
++++ a/libavcodec/rpi_shader.c
+@@ -0,0 +1,624 @@
+#include "rpi_shader.h"
+
+#ifdef _MSC_VER
@@ -12768,642 +12838,645 @@ index 0000000..06fb166
+#endif
+unsigned int rpi_shader[] = {
+// ::mc_setup_uv
-+/* [0x00000000] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000008] */ 0x0c9a0f80, 0x10020427, // add ra_x, unif, elem_num
-+/* [0x00000010] */ 0x15827d80, 0x10020767, // mov ra_y, unif
-+/* [0x00000018] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
-+/* [0x00000020] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000028] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
-+/* [0x00000030] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
-+/* [0x00000038] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
-+/* [0x00000040] */ 0x15827d80, 0x10021427, // mov rb16, unif
-+/* [0x00000048] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000050] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000058] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x00000060] */ 0x00010000, 0xe0020127, // mov ra4, 0x10000
-+/* [0x00000068] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000070] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000078] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000080] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000088] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000090] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000098] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x000000a0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x000000a8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x000000b0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x000000b8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x000000c0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x000000c8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x000000d0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x000000d8] */ 0x15427d80, 0x10020827, // mov r0, ra_x
-+/* [0x000000e0] */ 0x937401f6, 0xd0024821, // max r0, r0, 0; mov r1, ra_y
-+/* [0x000000e8] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
-+/* [0x000000f0] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
-+/* [0x000000f8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x00000100] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
-+/* [0x00000108] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000110] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
-+/* [0x00000118] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000120] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
-+/* [0x00000128] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
-+/* [0x00000130] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
-+/* [0x00000138] */ 0x00000009, 0xe00208a7, // mov r2, 9
-+/* [0x00000140] */ 0x0c827580, 0x10021367, // add rb13, r2, unif
-+/* [0x00000148] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000150] */ 0x15827d80, 0x100208a7, // mov r2, unif
-+/* [0x00000158] */ 0x119c15c0, 0xd00208a7, // shl r2, r2, 1
-+/* [0x00000160] */ 0x149cf5c0, 0xd00208a7, // and r2, r2, 15
-+/* [0x00000168] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000170] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000178] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000180] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000188] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000190] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000198] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x000001a0] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x000001a8] */ 0x0f9c11c0, 0xd00208a7, // asr r2, r0, 1
-+/* [0x000001b0] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x000001b8] */ 0x0c9e7440, 0x10021567, // add rb21, r2, r1
-+/* [0x000001c0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x000001c8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x000001d0] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x000001d8] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x000001e0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x000001e8] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x000001f0] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x000001f8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000200] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
-+/* [0x00000208] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
++/* [0x00000000] */ 0x95801ff6, 0xd002591e, // mov tmurs, 1 ; mov ra_link, unif
++/* [0x00000008] */ 0x15827d80, 0x10020027, // mov ra0, unif
++/* [0x00000010] */ 0x159a7d80, 0x10020827, // mov r0, elem_num
++/* [0x00000018] */ 0x0c027c00, 0x14020427, // add ra_x, ra0.16b, r0
++/* [0x00000020] */ 0x15027d80, 0x12020767, // mov ra_y, ra0.16a
++/* [0x00000028] */ 0x15827d80, 0x10020627, // mov ra_frame_base, unif
++/* [0x00000030] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000038] */ 0x0d620f80, 0x10020667, // sub ra_u2v_ref_offset, unif, ra_frame_base
++/* [0x00000040] */ 0x0d801dc0, 0xd0021667, // sub rb25,unif,1
++/* [0x00000048] */ 0x0d801dc0, 0xd00217a7, // sub rb30,unif,1
++/* [0x00000050] */ 0x15827d80, 0x10021427, // mov rb16, unif
++/* [0x00000058] */ 0x0c827380, 0x10021627, // add rb24, r1, unif
++/* [0x00000060] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000068] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000070] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000078] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
++/* [0x00000080] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
++/* [0x00000088] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
++/* [0x00000090] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
++/* [0x00000098] */ 0x00000000, 0xe0020327, // mov ra12, 0
++/* [0x000000a0] */ 0x00000000, 0xe0020367, // mov ra13, 0
++/* [0x000000a8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
++/* [0x000000b0] */ 0x00000000, 0xe00203e7, // mov ra15, 0
++/* [0x000000b8] */ 0x00000000, 0xe0020267, // mov ra9, 0
++/* [0x000000c0] */ 0x15427d80, 0x10020827, // mov r0, ra_x
++/* [0x000000c8] */ 0x937401f6, 0xd0024821, // max r0, r0, 0 ; mov r1, ra_y
++/* [0x000000d0] */ 0x926191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base
++/* [0x000000d8] */ 0x916431f6, 0xd00244e2, // shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
++/* [0x000000e0] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
++/* [0x000000e8] */ 0x0c9e70c0, 0x10020827, // add r0, r0, r3
++/* [0x000000f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x000000f8] */ 0x939c03c0, 0xd0025850, // max r1, r1, 0 ; mov ra_x, r0
++/* [0x00000100] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000108] */ 0x4c9d040f, 0x100248a1, // add r2, r2, r0 ; mul24 r1, r1, rb_pitch
++/* [0x00000110] */ 0x8c9e7052, 0x10025e18, // add t0s, r0, r1 ; mov ra_frame_base, r2
++/* [0x00000118] */ 0x0c9e7440, 0x10020f27, // add t1s, r2, r1
++/* [0x00000120] */ 0x0c809f80, 0xd0021367, // add rb13, 9, unif
++/* [0x00000128] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000130] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000138] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000140] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000148] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000150] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000158] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000160] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000168] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000170] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000178] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000180] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000188] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x00000190] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000198] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x000001a0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000001a8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x000001b0] */ 0x0c427380, 0x10020e27, // add t0s, r1, ra_x
++/* [0x000001b8] */ 0x0c627380, 0x10020f27, // add t1s, r1, ra_frame_base
+// ::mc_filter_uv
-+/* [0x00000210] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000218] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000220] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
-+/* [0x00000228] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000230] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000238] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000240] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
-+/* [0x00000248] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
-+/* [0x00000250] */ 0x959dc27f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb28
-+/* [0x00000258] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000260] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000268] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000270] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000278] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
-+/* [0x00000280] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x00000288] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x00000290] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif
-+/* [0x00000298] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000002a0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
-+/* [0x000002a8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
-+/* [0x000002b0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
-+/* [0x000002b8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
-+/* [0x000002c0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000002c8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
-+/* [0x000002d0] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
++/* [0x000001c0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001c8] */ 0x15827d80, 0x100200a7, // mov ra2, unif
++/* [0x000001d0] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif
++/* [0x000001d8] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0
++/* [0x000001e0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000001e8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x000001f0] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
++/* [0x000001f8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000200] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
++/* [0x00000208] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000210] */ 0x9509cdbf, 0x12024731, // mov ra_y_next, ra2.16a ; mov vw_setup, rb28
++/* [0x00000218] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000220] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000228] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000230] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
++/* [0x00000238] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
++/* [0x00000240] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait
++/* [0x00000248] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:filter_uv_1
++/* [0x00000250] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
++/* [0x00000258] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
++/* [0x00000260] */ 0x8c81b1f6, 0x10025683, // add rb26, r0, rb27 ; mov ra3, unif
++/* [0x00000268] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x00000270] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x00000278] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++// :filter_uv_1
++/* [0x00000280] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000288] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
++/* [0x00000290] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
++/* [0x00000298] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
++/* [0x000002a0] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
++/* [0x000002a8] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x000002b0] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++/* [0x000002b8] */ 0x11041dc0, 0xd20213a7, // shl rb14, ra1.16a, 1
+// :uvloop
-+/* [0x000002d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
-+/* [0x000002e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
-+/* [0x000002e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x000002f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+/* [0x000002f8] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
-+/* [0x00000300] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000308] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000310] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000318] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
-+/* [0x00000320] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000328] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000330] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
-+/* [0x00000338] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000340] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+/* [0x00000348] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000350] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+/* [0x00000358] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000360] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+/* [0x00000368] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000370] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
-+/* [0x00000378] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
-+/* [0x00000380] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x00000388] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
-+/* [0x00000390] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x00000398] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
-+/* [0x000003a0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
-+/* [0x000003a8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
-+/* [0x000003b0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
-+/* [0x000003b8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
-+/* [0x000003c0] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000003c8] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
-+/* [0x000003d0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x000003d8] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x000003e0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
-+/* [0x000003e8] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x000003f0] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x000003f8] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000400] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000408] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000410] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000418] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000420] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000428] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000430] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000438] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000440] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x000002c0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
++/* [0x000002c8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
++/* [0x000002d0] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x000002d8] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x000002e0] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255
++/* [0x000002e8] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x000002f0] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x000002f8] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00000300] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
++/* [0x00000308] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000310] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000318] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
++/* [0x00000320] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00000328] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00000330] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00000338] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000340] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000348] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00000350] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000358] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
++/* [0x00000360] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
++/* [0x00000368] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x00000370] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
++/* [0x00000378] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
++/* [0x00000380] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
++/* [0x00000388] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
++/* [0x00000390] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
++/* [0x00000398] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
++/* [0x000003a0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
++/* [0x000003a8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x000003b0] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
++/* [0x000003b8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x000003c0] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x000003c8] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:uvloop
++/* [0x000003d0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x000003d8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000003e0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x000003e8] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26
++/* [0x000003f0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000003f8] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29
++/* [0x00000400] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000408] */ 0x15827d80, 0x100202e7, // mov ra11, unif
+// ::mc_filter_uv_b0
-+/* [0x00000448] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000450] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000458] */ 0x938001f6, 0xd0024821, // max r0, r0, 0 ; mov r1, unif
-+/* [0x00000460] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000468] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000470] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000478] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
-+/* [0x00000480] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
-+/* [0x00000488] */ 0x959d527f, 0x10024731, // mov ra_y_next, r1 ; mov vw_setup, rb21
-+/* [0x00000490] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000498] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x000004a0] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x000004a8] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x000004b0] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
-+/* [0x000004b8] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x000004c0] */ 0x918101f6, 0xd0025803, // shl r0, r0, i_shift16 ; mov ra3, unif
-+/* [0x000004c8] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000004d0] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
-+/* [0x000004d8] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
-+/* [0x000004e0] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
-+/* [0x000004e8] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
-+/* [0x000004f0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000004f8] */ 0x15827d80, 0x100213a7, // mov rb14, unif
-+/* [0x00000500] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0
++/* [0x00000410] */ 0x15827d80, 0x100009e7, // mov -, unif
++/* [0x00000418] */ 0x15827d80, 0x100200a7, // mov ra2, unif
++/* [0x00000420] */ 0x959a0dbf, 0x10024823, // mov r0, elem_num ; mov r3, unif
++/* [0x00000428] */ 0x0c0a7c00, 0x14020827, // add r0, ra2.16b, r0
++/* [0x00000430] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000438] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x00000440] */ 0x8d4e0ef6, 0x10025891, // sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
++/* [0x00000448] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000450] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
++/* [0x00000458] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000460] */ 0x150a7d80, 0x12020727, // mov ra_y_next, ra2.16a
++/* [0x00000468] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
++/* [0x00000470] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000478] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
++/* [0x00000480] */ 0x0c043dc0, 0xd20207e7, // add ra31, ra1.16a, 3
++/* [0x00000488] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
++/* [0x00000490] */ 0x8c0601bf, 0x14025803, // add r0, r0, ra1.16b ; mov ra3, unif
++/* [0x00000498] */ 0x918101f6, 0xd002480e, // shl r0, r0, i_shift16 ; mov rb14, unif
++/* [0x000004a0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
++/* [0x000004a8] */ 0x150e7d80, 0x18021227, // mov rb8, ra3.8a
++/* [0x000004b0] */ 0x150e7d80, 0x1a021267, // mov rb9, ra3.8b
++/* [0x000004b8] */ 0x150e7d80, 0x1c0212a7, // mov rb10, ra3.8c
++/* [0x000004c0] */ 0x150e7d80, 0x1e0212e7, // mov rb11, ra3.8d
++/* [0x000004c8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000004d0] */ 0x95800dbf, 0xd00653a3, // mov.ifnz rb14, unif ; mov r3, 0
+// :uvloop_b0
-+/* [0x00000508] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
-+/* [0x00000510] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
-+/* [0x00000518] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000520] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+/* [0x00000528] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
-+/* [0x00000530] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000538] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000540] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000548] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
-+/* [0x00000550] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000558] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000560] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
-+/* [0x00000568] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000570] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+/* [0x00000578] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000580] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+/* [0x00000588] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000590] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+/* [0x00000598] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x000005a0] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
-+/* [0x000005a8] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
-+/* [0x000005b0] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005b8] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
-+/* [0x000005c0] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000005c8] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
-+/* [0x000005d0] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
-+/* [0x000005d8] */ 0x0d9d27c0, 0x100229e7, // sub.setf -, r3, rb18
-+/* [0x000005e0] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:uvloop_b0
-+/* [0x000005e8] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
-+/* [0x000005f0] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
-+/* [0x000005f8] */ 0x0f9c63c0, 0xd0020c27, // asr vpm, r1, 6
-+/* [0x00000600] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000608] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000610] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000618] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000004d8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
++/* [0x000004e0] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
++/* [0x000004e8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x000004f0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x000004f8] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255
++/* [0x00000500] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000508] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000510] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00000518] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
++/* [0x00000520] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x00000528] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000530] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
++/* [0x00000538] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00000540] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00000548] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00000550] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000558] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000560] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00000568] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000570] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
++/* [0x00000578] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
++/* [0x00000580] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x00000588] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
++/* [0x00000590] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10
++/* [0x00000598] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
++/* [0x000005a0] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7
++/* [0x000005a8] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11
++/* [0x000005b0] */ 0x8d9c623f, 0x10025847, // sub r1, r1, r0 ; mov ra7, rb6
++/* [0x000005b8] */ 0x0d7e7780, 0x100229e7, // sub.setf -, r3, ra31
++/* [0x000005c0] */ 0x8f1463f6, 0xd0124206, // asr ra8.16a, r1, 6 ; mov rb6, ra5
++/* [0x000005c8] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:uvloop_b0
++/* [0x000005d0] */ 0x95104ff6, 0x10024144, // mov ra5, rb4 ; mov rb4, ra4
++/* [0x000005d8] */ 0x95185ff6, 0x10024105, // mov ra4, rb5 ; mov rb5, ra6
++/* [0x000005e0] */ 0x95207ff6, 0x10024187, // mov ra6, rb7 ; mov rb7, ra8
++/* [0x000005e8] */ 0x0d9cfec0, 0xd00229e7, // sub.setf -, 15, r3
++/* [0x000005f0] */ 0x00000090, 0xf06809e7, // brr.anyn -, r:uv_b0_post_fin
++/* [0x000005f8] */ 0x8d80bef6, 0xd00208e7, // sub r3, 11, r3 ; mov -, unif
++/* [0x00000600] */ 0x95810ff6, 0xd0020827, // mov r0, i_shift16 ; mov -, unif
++/* [0x00000608] */ 0x00010000, 0xe0020867, // mov r1, 0x10000
++/* [0x00000610] */ 0x00000040, 0xf02809e7, // brr.anyz -, r:uv_b0_post12
++/* [0x00000618] */ 0x511c7c39, 0x1006c1c7, // shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++/* [0x00000620] */ 0x51186c39, 0x1006c186, // shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++/* [0x00000628] */ 0x51145c39, 0x1006c145, // shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++/* [0x00000630] */ 0x51104c39, 0x10024104, // shl ra4, ra4, r0 ; mul24 rb4, rb4, r1
++/* [0x00000638] */ 0x119de7c0, 0xd00229e7, // shl.setf -, r3, i_shift30
++/* [0x00000640] */ 0x95105dbf, 0x100d81c6, // mov.ifc ra7, ra4 ; mov.ifc rb6, rb5
++/* [0x00000648] */ 0x95187dbf, 0x100d8144, // mov.ifc ra5, ra6 ; mov.ifc rb4, rb7
++/* [0x00000650] */ 0x00000030, 0xf0f809e7, // brr -, r:uv_b0_post_fin
++/* [0x00000658] */ 0x95144dbf, 0x100901c6, // mov.ifn ra7, ra5 ; mov.ifn rb6, rb4
++/* [0x00000660] */ 0x95105dbf, 0x10090144, // mov.ifn ra5, ra4 ; mov.ifn rb4, rb5
++/* [0x00000668] */ 0x95187dbf, 0x10090105, // mov.ifn ra4, ra6 ; mov.ifn rb5, rb7
++// :uv_b0_post12
++/* [0x00000670] */ 0x95105dbf, 0x100248a3, // mov r2, ra4 ; mov r3, rb5
++/* [0x00000678] */ 0x511c6c39, 0x10024105, // shl ra4, ra7, r0 ; mul24 rb5, rb6, r1
++/* [0x00000680] */ 0x959e749b, 0x100241c6, // mov ra7, r2 ; mov rb6, r3
++/* [0x00000688] */ 0x95187dbf, 0x100248a3, // mov r2, ra6 ; mov r3, rb7
++/* [0x00000690] */ 0x51144c39, 0x10024187, // shl ra6, ra5, r0 ; mul24 rb7, rb4, r1
++/* [0x00000698] */ 0x959e749b, 0x10024144, // mov ra5, r2 ; mov rb4, r3
+// ::mc_filter_uv_b
-+/* [0x00000620] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000628] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
-+/* [0x00000630] */ 0x0c9a0f80, 0x10020827, // add r0, unif, elem_num
-+/* [0x00000638] */ 0x938001f6, 0xd002581c, // max r0, r0, 0 ; mov ra_y_next, unif
-+/* [0x00000640] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
-+/* [0x00000648] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8
-+/* [0x00000650] */ 0x8c8270f6, 0x10025801, // add r0, r0, r3 ; mov ra1, unif
-+/* [0x00000658] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
-+/* [0x00000660] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000668] */ 0x0c041dc0, 0xd2021467, // add rb17, ra1.16a, 1
-+/* [0x00000670] */ 0x0c043dc0, 0xd20214a7, // add rb18, ra1.16a, 3
-+/* [0x00000678] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
-+/* [0x00000680] */ 0x0c9d3e80, 0x100206a7, // add ra_frame_base_next, rb_x_next, r2
-+/* [0x00000688] */ 0x918151f6, 0xd00258c3, // shl r3, r0, i_shift21 ; mov ra3, unif
-+/* [0x00000690] */ 0x0e9c87c0, 0xd00208e7, // shr r3, r3, 8
-+/* [0x00000698] */ 0x0c9d57c0, 0x10020c67, // add vr_setup, r3, rb21
-+/* [0x000006a0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x000006a8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x000006b0] */ 0x0c9db1c0, 0x100216a7, // add rb26, r0, rb27
-+/* [0x000006b8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x000006c0] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
-+/* [0x000006c8] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
-+/* [0x000006d0] */ 0x800e7036, 0x1c0049ca, // nop ; mov rb10, ra3.8c
-+/* [0x000006d8] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
-+/* [0x000006e0] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
-+/* [0x000006e8] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
++// :uv_b0_post_fin
++/* [0x000006a0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000006a8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait
++/* [0x000006b0] */ 0x00000018, 0xf02809e7, // brr.anyz -, r:uv_filter_b_1
++/* [0x000006b8] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
++/* [0x000006c0] */ 0x954dcdbf, 0x10024471, // mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
++/* [0x000006c8] */ 0x0c027c00, 0x14020827, // add r0, ra0.16b, r0
++/* [0x000006d0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000006d8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x000006e0] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++// :uv_filter_b_1
++/* [0x000006e8] */ 0x930001f6, 0xd202581c, // max r0, r0, 0 ; mov ra_y_next, ra0.16a
++/* [0x000006f0] */ 0x928191f6, 0x10024823, // min r0, r0, rb_frame_width_minus_1 ; mov r3, unif
++/* [0x000006f8] */ 0x4d808cc7, 0xd0025893, // sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8
++/* [0x00000700] */ 0x8c8270f6, 0x10020827, // add r0, r0, r3 ; mov -, unif
++/* [0x00000708] */ 0x9481c1f6, 0xd00254c0, // and rb_x_next, r0, ~3 ; mov ra0, unif
++/* [0x00000710] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000718] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000720] */ 0x950e0ff6, 0x18024048, // mov ra1, unif ; mov rb8, ra3.8a
++/* [0x00000728] */ 0x950e0ff6, 0x1a064049, // mov.ifnz ra1, unif ; mov rb9, ra3.8b
++/* [0x00000730] */ 0x8c0d3eb6, 0x1c02468a, // add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c
++/* [0x00000738] */ 0x950c0ff6, 0xde0248cb, // mov r3, 0 ; mov rb11, ra3.8d
++/* [0x00000740] */ 0x1104ddc0, 0x14020867, // shl r1, ra1.16b, rb13
++/* [0x00000748] */ 0x0f9c13c0, 0xd0021327, // asr rb12, r1, 1
+// :uvloop_b
-+/* [0x000006f0] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
-+/* [0x000006f8] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
-+/* [0x00000700] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000708] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+/* [0x00000710] */ 0xee454987, 0x10024860, // shr r1, r4, ra_xshift ; v8subs r0, r0, rb20
-+/* [0x00000718] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000720] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000728] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000730] */ 0xec414c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8subs r1, r1, rb20
-+/* [0x00000738] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
-+/* [0x00000740] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000748] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
-+/* [0x00000750] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000758] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+/* [0x00000760] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000768] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+/* [0x00000770] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000778] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+/* [0x00000780] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000788] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
-+/* [0x00000790] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
-+/* [0x00000798] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x000007a0] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
-+/* [0x000007a8] */ 0x153e7d80, 0x100203a7, // mov ra14, ra15
-+/* [0x000007b0] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
-+/* [0x000007b8] */ 0x4d38a237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra14, rb10
-+/* [0x000007c0] */ 0x4c3cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra15, rb11
-+/* [0x000007c8] */ 0x4d13023e, 0x10024860, // sub r1, r1, r0 ; mul24 r0, vpm, ra4
-+/* [0x000007d0] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
-+/* [0x000007d8] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x000007e0] */ 0x4f0501ce, 0xd2024821, // asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
-+/* [0x000007e8] */ 0x409ce007, 0x100049e0, // nop ; mul24 r0, r0, rb14
-+/* [0x000007f0] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
-+/* [0x000007f8] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000800] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000808] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
-+/* [0x00000810] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000818] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000820] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000828] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000830] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000838] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
-+/* [0x00000840] */ 0x00000010, 0xe0020827, // mov r0, 16
-+/* [0x00000848] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000850] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000858] */ 0x0c9dae00, 0x10021c67, // add vw_setup, rb26, r0
-+/* [0x00000860] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000868] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000750] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
++/* [0x00000758] */ 0x8e4539bf, 0xb0029810, // shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
++/* [0x00000760] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000768] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x00000770] */ 0x8e456987, 0x10024860, // shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255
++/* [0x00000778] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000780] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000788] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00000790] */ 0x8c416c8f, 0x10024e21, // add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
++/* [0x00000798] */ 0x0c627c80, 0x10020f27, // add t1s, ra_frame_base, r2
++/* [0x000007a0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x000007a8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
++/* [0x000007b0] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x000007b8] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x000007c0] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x000007c8] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x000007d0] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000007d8] */ 0x4c03d4f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x000007e0] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000007e8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
++/* [0x000007f0] */ 0x8d3447f6, 0xd00279cc, // sub.setf -, r3, 4 ; mov ra12, ra13
++/* [0x000007f8] */ 0xffffff38, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000800] */ 0x55389db7, 0x10024361, // mov ra13, ra14 ; mul24 r1, ra14, rb9
++/* [0x00000808] */ 0x553cadb7, 0x100243a2, // mov ra14, ra15 ; mul24 r2, ra15, rb10
++/* [0x00000810] */ 0x55308037, 0x100243e0, // mov ra15, r0 ; mul24 r0, ra12, rb8
++/* [0x00000818] */ 0x8d1e7236, 0x10225848, // sub r1, r1, r0 ; mov ra8.16b, ra7
++/* [0x00000820] */ 0x4c3cb2b7, 0x10024860, // add r1, r1, r2 ; mul24 r0, ra15, rb11
++/* [0x00000828] */ 0x4d1ce237, 0x14024860, // sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14
++/* [0x00000830] */ 0x55586fce, 0x100241e1, // mov ra7, rb6 ; mul24 r1, r1, ra_k256
++/* [0x00000838] */ 0x8f14e3f6, 0xd0024846, // asr r1, r1, 14 ; mov rb6, ra5
++/* [0x00000840] */ 0x55044fce, 0x12024161, // mov ra5, rb4 ; mul24 r1, r1, ra1.16a
++/* [0x00000848] */ 0x8c127236, 0x10024844, // add r1, r1, r0 ; mov rb4, ra4
++/* [0x00000850] */ 0x55585fce, 0x10024121, // mov ra4, rb5 ; mul24 r1, r1, ra_k256
++/* [0x00000858] */ 0x8c18c3f6, 0x10024845, // add r1, r1, rb12 ; mov rb5, ra6
++/* [0x00000860] */ 0x8d7c77bf, 0x100279c6, // sub.setf -, r3, ra31 ; mov ra6, rb7
++/* [0x00000868] */ 0xfffffec8, 0xf06809e7, // brr.anyn -, r:uvloop_b
++/* [0x00000870] */ 0x0f9cd3c0, 0x10c200e7, // asr ra3.8as, r1, rb13
++/* [0x00000878] */ 0x95232ff6, 0x100049c7, // mov -, vw_wait ; mov rb7, ra8
++/* [0x00000880] */ 0x150e7d80, 0x18020c27, // mov vpm, ra3.8a
++/* [0x00000888] */ 0x959dafff, 0x10025c49, // mov vw_setup, rb26 ; mov ra9, rb26
++/* [0x00000890] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000898] */ 0x959ddfff, 0x10025c4a, // mov vw_setup, rb29 ; mov ra10, rb29
++/* [0x000008a0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x000008a8] */ 0x15827d80, 0x100202e7, // mov ra11, unif
++// ::mc_exit_c
++/* [0x000008b0] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait
++/* [0x000008b8] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit_c_1
++/* [0x000008c0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008c8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008d0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000008d8] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000008e0] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x000008e8] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++/* [0x000008f0] */ 0x009e7000, 0x100009e7, // nop
+// ::mc_exit
-+/* [0x00000870] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000878] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
-+/* [0x00000880] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000888] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00000890] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00000898] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008a0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000008a8] */ 0x009e7000, 0x100009e7, // nop ; nop
-+/* [0x000008b0] */ 0x009e7000, 0x100009e7, // nop ; nop
-+// ::mc_interrupt_exit8
-+/* [0x000008b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000008c0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008c8] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008d0] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x000008d8] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x000008e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008e8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008f0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000008f8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000900] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000908] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000910] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00000918] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000920] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++// :exit_c_1
++/* [0x000008f8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000900] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00000908] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00000910] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
++/* [0x00000918] */ 0x00000000, 0xe80009e7, // mov -,srel(0)
++/* [0x00000920] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
+/* [0x00000928] */ 0x009e7000, 0x100009e7, // nop ; nop
++/* [0x00000930] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_setup
-+/* [0x00000930] */ 0x00000010, 0xe00208e7, // mov r3, 16
-+/* [0x00000938] */ 0x15827d80, 0x10020227, // mov ra8, unif
++/* [0x00000938] */ 0x95801ff6, 0xd0025908, // mov tmurs, 1 ; mov ra8, unif
+/* [0x00000940] */ 0x15827d80, 0x10020267, // mov ra9, unif
+/* [0x00000948] */ 0x15827d80, 0x100202a7, // mov ra10, unif
+/* [0x00000950] */ 0x15827d80, 0x100202e7, // mov ra11, unif
-+/* [0x00000958] */ 0x15827d80, 0x10020867, // mov r1, unif
-+/* [0x00000960] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000968] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000970] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000978] */ 0x0d9c13c0, 0xd0021667, // sub rb_frame_width_minus_1,r1,1
-+/* [0x00000980] */ 0x0d9c11c0, 0xd00217a7, // sub rb_frame_height_minus_1,r0,1
-+/* [0x00000988] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000990] */ 0x15827d80, 0x10020827, // mov r0, unif
-+/* [0x00000998] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000009a0] */ 0x0c9e7200, 0x10021627, // add rb24, r1, r0
-+/* [0x000009a8] */ 0x15227d80, 0x10020867, // mov r1, ra8
-+/* [0x000009b0] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x000009b8] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x000009c0] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x000009c8] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000009d0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x000009d8] */ 0x922591f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9
-+/* [0x000009e0] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x000009e8] */ 0x0c9c13c0, 0xd0020767, // add ra_y, r1, 1
-+/* [0x000009f0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x000009f8] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a00] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a08] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000a18] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
-+/* [0x00000a20] */ 0x152a7d80, 0x10020867, // mov r1, ra10
-+/* [0x00000a28] */ 0x119e72c0, 0x10020827, // shl r0,r1,r3
-+/* [0x00000a30] */ 0x0f9e72c0, 0x10020867, // asr r1,r1,r3
-+/* [0x00000a38] */ 0x0f9e70c0, 0x10020827, // asr r0,r0,r3
-+/* [0x00000a40] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x00000a48] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000a50] */ 0x922d91f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11
-+/* [0x00000a58] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000a60] */ 0x0c9c13c0, 0xd0120567, // add ra_y2, r1, 1
-+/* [0x00000a68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
-+/* [0x00000a70] */ 0x0c9e7400, 0x100208a7, // add r2, r2, r0
-+/* [0x00000a78] */ 0x139c03c0, 0xd0020867, // max r1, r1, 0
-+/* [0x00000a80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000a88] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000a90] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
-+/* [0x00000a98] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
-+/* [0x00000aa0] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
-+/* [0x00000aa8] */ 0x00000040, 0xe00207a7, // mov ra30, 64
-+/* [0x00000ab0] */ 0xffffff00, 0xe0021527, // mov rb20, 0xffffff00
-+/* [0x00000ab8] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
-+/* [0x00000ac0] */ 0x00000018, 0xe00215e7, // mov rb23, 24
-+/* [0x00000ac8] */ 0x00000000, 0xe0020227, // mov ra8, 0
-+/* [0x00000ad0] */ 0x00000000, 0xe0020267, // mov ra9, 0
-+/* [0x00000ad8] */ 0x00000000, 0xe00202a7, // mov ra10, 0
-+/* [0x00000ae0] */ 0x00000000, 0xe00202e7, // mov ra11, 0
-+/* [0x00000ae8] */ 0x00000000, 0xe0020327, // mov ra12, 0
-+/* [0x00000af0] */ 0x00000000, 0xe0020367, // mov ra13, 0
-+/* [0x00000af8] */ 0x00000000, 0xe00203a7, // mov ra14, 0
-+/* [0x00000b00] */ 0x00000000, 0xe00203e7, // mov ra15, 0
-+/* [0x00000b08] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000b10] */ 0x159e7480, 0x10020867, // mov r1, r2
-+/* [0x00000b18] */ 0x0f9c23c0, 0xd0020867, // asr r1, r1, 2
-+/* [0x00000b20] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000b28] */ 0x159e7480, 0x10020827, // mov r0, r2
-+/* [0x00000b30] */ 0x149c31c0, 0xd0020827, // and r0, r0, 3
-+/* [0x00000b38] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000b40] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000b48] */ 0x0c9e7040, 0x10021727, // add rb28, r0, r1
-+/* [0x00000b50] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000b58] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000b60] */ 0x0c9e7040, 0x100216e7, // add rb27, r0, r1
-+/* [0x00000b68] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
-+/* [0x00000b70] */ 0x15827d80, 0x100009e7, // mov -, unif
-+/* [0x00000b78] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
-+/* [0x00000b80] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000b88] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
-+/* [0x00000b90] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000b98] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
-+/* [0x00000ba0] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
-+/* [0x00000ba8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
-+/* [0x00000bb0] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
-+/* [0x00000bb8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
-+/* [0x00000bc0] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
++/* [0x00000958] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000960] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000968] */ 0x0d0c1dc0, 0xd4021667, // sub rb_frame_width_minus_1, ra3.16b, 1
++/* [0x00000970] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_frame_height_minus_1, ra3.16a, 1
++/* [0x00000978] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000980] */ 0x15827380, 0x10021627, // or rb24, r1, unif
++/* [0x00000988] */ 0x159a7d80, 0x100208e7, // mov r3, elem_num
++/* [0x00000990] */ 0x0c227cc0, 0x12020827, // add r0, ra8.16a, r3
++/* [0x00000998] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000009a0] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x000009a8] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x000009b0] */ 0x0c201dc0, 0xd4020767, // add ra_y, ra8.16b, 1
++/* [0x000009b8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x000009c0] */ 0x0c267c00, 0x100208a7, // add r2, ra9, r0
++/* [0x000009c8] */ 0x13200dc0, 0xd4020867, // max r1, ra8.16b, 0
++/* [0x000009d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x000009d8] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x000009e0] */ 0x8c9e7452, 0x10025e18, // add t0s, r2, r1 ; mov ra_frame_base, r2
++/* [0x000009e8] */ 0x0c2a7cc0, 0x12020827, // add r0, ra10.16a, r3
++/* [0x000009f0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x000009f8] */ 0x129d91c0, 0x10020827, // min r0, r0, rb_frame_width_minus_1
++/* [0x00000a00] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000a08] */ 0x0c281dc0, 0xd4120567, // add ra_y2, ra10.16b, 1
++/* [0x00000a10] */ 0x149dc1c0, 0xd0020827, // and r0, r0, ~3
++/* [0x00000a18] */ 0x0c2e7c00, 0x100208a7, // add r2, ra11, r0
++/* [0x00000a20] */ 0x13280dc0, 0xd4020867, // max r1, ra10.16b, 0
++/* [0x00000a28] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000a30] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x00000a38] */ 0x8c9e7452, 0x10025f19, // add t1s, r2, r1 ; mov ra_frame_base2, r2
++/* [0x00000a40] */ 0x00000001, 0xe0020527, // mov ra_k1, 1
++/* [0x00000a48] */ 0x00000100, 0xe00205a7, // mov ra_k256, 256
++/* [0x00000a50] */ 0x000000ff, 0xe00215a7, // mov rb_k255, 255
++/* [0x00000a58] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
++/* [0x00000a60] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
++/* [0x00000a68] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
++/* [0x00000a70] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
++/* [0x00000a78] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000a80] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000a88] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000a90] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000a98] */ 0x159e7040, 0x10020827, // or r0, r0, r1
++/* [0x00000aa0] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000aa8] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000ab0] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000ab8] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000ac0] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000ac8] */ 0x0c809dc0, 0xd0021367, // add rb13, unif, 9
++/* [0x00000ad0] */ 0x13740dc0, 0xd0020867, // max r1, ra_y, 0
++/* [0x00000ad8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000ae0] */ 0x0c741dc0, 0xd0020767, // add ra_y, ra_y, 1
++/* [0x00000ae8] */ 0x55810d8f, 0x100049e1, // mov -, unif ; mul24 r1, r1, rb_pitch
++/* [0x00000af0] */ 0x0c627380, 0x10020e27, // add t0s, r1, ra_frame_base
++/* [0x00000af8] */ 0x13540dc0, 0xd2020867, // max r1, ra_y2, 0
++/* [0x00000b00] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_frame_height_minus_1
++/* [0x00000b08] */ 0x0c541dc0, 0xd2120567, // add ra_y2, ra_y2, 1
++/* [0x00000b10] */ 0x409d000f, 0x100049e1, // nop ; mul24 r1, r1, rb_pitch
++/* [0x00000b18] */ 0x0c667380, 0x10020f27, // add t1s, r1, ra_frame_base2
+// :per_block_setup
-+/* [0x00000bc8] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000bd0] */ 0x15827d80, 0x100207e7, // mov ra31, unif
-+/* [0x00000bd8] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num
-+/* [0x00000be0] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
-+/* [0x00000be8] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
-+/* [0x00000bf0] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000bf8] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c00] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c08] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
-+/* [0x00000c10] */ 0x95048ff6, 0xd40258dc, // mov r3, 8 ; mov ra_y_next, ra1.16b
-+/* [0x00000c18] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
-+/* [0x00000c20] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
-+/* [0x00000c28] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
-+/* [0x00000c30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000c38] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
-+/* [0x00000c40] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
-+/* [0x00000c48] */ 0x8c0676f6, 0x142258d5, // add r3, r3, r3 ; mov ra_y2_next, ra1.16b
-+/* [0x00000c50] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
-+/* [0x00000c58] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
-+/* [0x00000c60] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
-+/* [0x00000c68] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
-+/* [0x00000c70] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
-+/* [0x00000c78] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
-+/* [0x00000c80] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
-+/* [0x00000c88] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
-+/* [0x00000c90] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
-+/* [0x00000c98] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif
-+/* [0x00000ca0] */ 0x119d01c0, 0xd0040827, // shl.ifz r0, r0, i_shift16
-+/* [0x00000ca8] */ 0x119c31c0, 0xd0020227, // shl ra8, r0, 3
-+/* [0x00000cb0] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000cb8] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
-+/* [0x00000cc0] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
-+/* [0x00000cc8] */ 0x01040400, 0xe0020867, // mov r1,0x01040400
-+/* [0x00000cd0] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000cd8] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000ce0] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
-+/* [0x00000ce8] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
-+/* [0x00000cf0] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
-+/* [0x00000cf8] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
-+/* [0x00000d00] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
-+/* [0x00000d08] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
-+/* [0x00000d10] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000d18] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
-+/* [0x00000d20] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
-+/* [0x00000d28] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
-+/* [0x00000d30] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
-+/* [0x00000d38] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
-+/* [0x00000d40] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000d48] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000d50] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
-+/* [0x00000d58] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
-+/* [0x00000d60] */ 0x902203bf, 0x1e7240e0, // ror ra3.8d, r1, ra8.8d ; mov r0, unif
-+/* [0x00000d68] */ 0x9020d3bf, 0x1c724061, // ror ra1.8d, r1, ra8.8c ; mov r1, rb13
-+/* [0x00000d70] */ 0x910e0e76, 0x18024844, // shl r1, unif, r1 ; mov rb4, ra3.8a
-+/* [0x00000d78] */ 0x8f0e70f6, 0x1a024485, // asr ra18, r0, r3 ; mov rb5, ra3.8b
-+/* [0x00000d80] */ 0x00000000, 0xf0f7e9e7, // bra -, ra31
-+/* [0x00000d88] */ 0x910e70f6, 0x1c024806, // shl r0, r0, r3 ; mov rb6, ra3.8c
-+/* [0x00000d90] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d
-+/* [0x00000d98] */ 0x0f9c93c0, 0xd0021327, // asr rb12, r1, 9
++/* [0x00000b20] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000b28] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b30] */ 0x959a0ff6, 0x10024061, // mov ra1, unif ; mov r1, elem_num
++/* [0x00000b38] */ 0x154e7d80, 0x10020467, // mov ra_xshift, ra_xshift_next
++/* [0x00000b40] */ 0x159c1fc0, 0x10021027, // mov rx_xshift2, rx_xshift2_next
++/* [0x00000b48] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000b50] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000b58] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000b60] */ 0x119c31c0, 0xd00204e7, // shl ra_xshift_next, r0, 3
++/* [0x00000b68] */ 0x15067d80, 0x14020727, // mov ra_y_next, ra1.16b
++/* [0x00000b70] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
++/* [0x00000b78] */ 0x0c9e7400, 0x100206a7, // add ra_frame_base_next, r2, r0
++/* [0x00000b80] */ 0x0c067c40, 0x12020827, // add r0, ra1.16a, r1
++/* [0x00000b88] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000b90] */ 0x928191f6, 0x10024822, // min r0, r0, rb_frame_width_minus_1 ; mov r2, unif
++/* [0x00000b98] */ 0x119c31c0, 0xd0021067, // shl rx_xshift2_next, r0, 3
++/* [0x00000ba0] */ 0x15067d80, 0x14220567, // mov ra_y2_next, ra1.16b
++/* [0x00000ba8] */ 0x9481c1f6, 0xd0025801, // and r0, r0, ~3 ; mov ra1, unif
++/* [0x00000bb0] */ 0x0c9e7400, 0x100214e7, // add rx_frame_base2_next, r2, r0
++/* [0x00000bb8] */ 0x159dcfc0, 0x10021c67, // mov vw_setup, rb28
++/* [0x00000bc0] */ 0x0d058f80, 0x14021767, // sub rb29, rb24, ra1.16b
++/* [0x00000bc8] */ 0x0c045dc0, 0xd2021467, // add rb17, ra1.16a, 5
++/* [0x00000bd0] */ 0x0c047dc0, 0xd20214a7, // add rb18, ra1.16a, 7
++/* [0x00000bd8] */ 0x11047dc0, 0xd2020827, // shl r0, ra1.16a, 7
++/* [0x00000be0] */ 0x0c067180, 0x14020827, // add r0, r0, ra1.16b
++/* [0x00000be8] */ 0x119d01c0, 0xd0020827, // shl r0, r0, i_shift16
++/* [0x00000bf0] */ 0x8c81b1f6, 0x100256a0, // add rb26, r0, rb27 ; mov r0, unif
++/* [0x00000bf8] */ 0x918101f6, 0xd0045805, // shl.ifz r0, r0, i_shift16 ; mov ra5, unif
++/* [0x00000c00] */ 0x01040400, 0xe00208a7, // mov r2, 0x01040400
++/* [0x00000c08] */ 0x911431f6, 0xd202420e, // shl ra8, r0, 3 ; mov rb14, ra5.16a
++/* [0x00000c10] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000c18] */ 0x10227380, 0x1e4200a7, // ror ra2.8a, r1, ra8.8d
++/* [0x00000c20] */ 0x10227380, 0x1c420027, // ror ra0.8a, r1, ra8.8c
++/* [0x00000c28] */ 0x10227580, 0x1e5200a7, // ror ra2.8b, r2, ra8.8d
++/* [0x00000c30] */ 0x10227580, 0x1c520027, // ror ra0.8b, r2, ra8.8c
++/* [0x00000c38] */ 0x050b0a00, 0xe0020867, // mov r1,0x050b0a00
++/* [0x00000c40] */ 0x10227380, 0x1e6200a7, // ror ra2.8c, r1, ra8.8d
++/* [0x00000c48] */ 0x10227380, 0x1c620027, // ror ra0.8c, r1, ra8.8c
++/* [0x00000c50] */ 0x11283a40, 0xe0020867, // mov r1,0x11283a40
++/* [0x00000c58] */ 0x10227380, 0x1e7200a7, // ror ra2.8d, r1, ra8.8d
++/* [0x00000c60] */ 0x10227380, 0x1c720027, // ror ra0.8d, r1, ra8.8c
++/* [0x00000c68] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000c70] */ 0x10227380, 0x1e4200e7, // ror ra3.8a, r1, ra8.8d
++/* [0x00000c78] */ 0x10227380, 0x1c420067, // ror ra1.8a, r1, ra8.8c
++/* [0x00000c80] */ 0x0a0b0500, 0xe0020867, // mov r1,0x0a0b0500
++/* [0x00000c88] */ 0x10227380, 0x1e5200e7, // ror ra3.8b, r1, ra8.8d
++/* [0x00000c90] */ 0x10227380, 0x1c520067, // ror ra1.8b, r1, ra8.8c
++/* [0x00000c98] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000ca0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000ca8] */ 0x10227380, 0x1c620067, // ror ra1.8c, r1, ra8.8c
++/* [0x00000cb0] */ 0x01010000, 0xe0020867, // mov r1,0x01010000
++/* [0x00000cb8] */ 0x10227380, 0x1e7200e7, // ror ra3.8d, r1, ra8.8d
++/* [0x00000cc0] */ 0x10227380, 0x1c720067, // ror ra1.8d, r1, ra8.8c
++/* [0x00000cc8] */ 0x950e0dbf, 0x18025112, // mov rb4, ra3.8a ; mov ra18, unif
++/* [0x00000cd0] */ 0x150e7d80, 0x1a021167, // mov rb5, ra3.8b
++/* [0x00000cd8] */ 0x150e7d80, 0x1c0211a7, // mov rb6, ra3.8c
++/* [0x00000ce0] */ 0x154a7d80, 0x10060167, // mov.ifnz ra5, ra18
++/* [0x00000ce8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000cf0] */ 0x1114ddc0, 0x14020827, // shl r0, ra5.16b, rb13
++/* [0x00000cf8] */ 0x0f9c91c0, 0xd0021327, // asr rb12, r0, 9
++/* [0x00000d00] */ 0x950c0ff6, 0xde0248c7, // mov r3, 0 ; mov rb7, ra3.8d
+// ::mc_filter
-+/* [0x00000da0] */ 0x0f9cf1c0, 0xd00213a7, // asr rb14, r0, 15
++/* [0x00000d08] */ 0x11141dc0, 0xd20213a7, // shl rb14, ra5.16a, 1
+// :yloop
-+/* [0x00000da8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
-+/* [0x00000db0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
-+/* [0x00000db8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000dc0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+/* [0x00000dc8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000dd0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000dd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000de0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000de8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
-+/* [0x00000df0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000df8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000e00] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00000e08] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
-+/* [0x00000e10] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000e18] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
-+/* [0x00000e20] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00000e28] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+/* [0x00000e30] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00000e38] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+/* [0x00000e40] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00000e48] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+/* [0x00000e50] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00000e58] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
-+/* [0x00000e60] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00000e68] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
-+/* [0x00000e70] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00000e78] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
-+/* [0x00000e80] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00000e88] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
-+/* [0x00000e90] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00000e98] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
-+/* [0x00000ea0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
-+/* [0x00000ea8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00000eb0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000eb8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x00000ec0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x00000ec8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
-+/* [0x00000ed0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x00000ed8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x00000ee0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x00000ee8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x00000ef0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x00000ef8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x00000f00] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x00000f08] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x00000f10] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
-+/* [0x00000f18] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
-+/* [0x00000f20] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00000f28] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
-+/* [0x00000f30] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
-+/* [0x00000f38] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00000f40] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
-+/* [0x00000f48] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00000f50] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00000f58] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00000f60] */ 0xfffffc48, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00000f68] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00000f70] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00000f78] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000d10] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
++/* [0x00000d18] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
++/* [0x00000d20] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000d28] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x00000d30] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000d38] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000d40] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000d48] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00000d50] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255
++/* [0x00000d58] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000d60] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000d68] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00000d70] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255
++/* [0x00000d78] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000d80] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
++/* [0x00000d88] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00000d90] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00000d98] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00000da0] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000da8] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000db0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00000db8] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000dc0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000dc8] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000dd0] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00000dd8] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00000de0] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00000de8] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000df0] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00000df8] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00000e00] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
++/* [0x00000e08] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
++/* [0x00000e10] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
++/* [0x00000e18] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000e20] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
++/* [0x00000e28] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
++/* [0x00000e30] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
++/* [0x00000e38] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
++/* [0x00000e40] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
++/* [0x00000e48] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++/* [0x00000e50] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++/* [0x00000e58] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
++/* [0x00000e60] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
++/* [0x00000e68] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
++/* [0x00000e70] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
++/* [0x00000e78] */ 0x8d9f223f, 0x10020867, // sub r1, r1, r0 ; mov -, vw_wait
++/* [0x00000e80] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
++/* [0x00000e88] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00000e90] */ 0x409ce00f, 0x100049e1, // nop ; mul24 r1, r1, rb14
++/* [0x00000e98] */ 0x0c9cc3c0, 0x10020867, // add r1, r1, rb12
++/* [0x00000ea0] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00000ea8] */ 0xfffffe48, 0xf06809e7, // brr.anyn -, r:yloop
++/* [0x00000eb0] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00000eb8] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x00000ec0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x00000ec8] */ 0xfffffc38, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x00000ed0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x00000ed8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x00000ee0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
+// ::mc_filter_b
-+/* [0x00000f80] */ 0x0f9d01c0, 0xd00213a7, // asr rb14, r0, i_shift16
+// :yloopb
-+/* [0x00000f88] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
-+/* [0x00000f90] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
-+/* [0x00000f98] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+/* [0x00000fa0] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+/* [0x00000fa8] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
-+/* [0x00000fb0] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
-+/* [0x00000fb8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fc0] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00000fc8] */ 0xec614c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20
-+/* [0x00000fd0] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00000fd8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
-+/* [0x00000fe0] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00000fe8] */ 0xec654c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
-+/* [0x00000ff0] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+/* [0x00000ff8] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
-+/* [0x00001000] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+/* [0x00001008] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+/* [0x00001010] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+/* [0x00001018] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+/* [0x00001020] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+/* [0x00001028] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+/* [0x00001030] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+/* [0x00001038] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
-+/* [0x00001040] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+/* [0x00001048] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
-+/* [0x00001050] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+/* [0x00001058] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
-+/* [0x00001060] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+/* [0x00001068] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
-+/* [0x00001070] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
-+/* [0x00001078] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
-+/* [0x00001080] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
-+/* [0x00001088] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
-+/* [0x00001090] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001098] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
-+/* [0x000010a0] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
-+/* [0x000010a8] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
-+/* [0x000010b0] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
-+/* [0x000010b8] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
-+/* [0x000010c0] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
-+/* [0x000010c8] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
-+/* [0x000010d0] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
-+/* [0x000010d8] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
-+/* [0x000010e0] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
-+/* [0x000010e8] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
-+/* [0x000010f0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12
-+/* [0x000010f8] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
-+/* [0x00001100] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
-+/* [0x00001108] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14
-+/* [0x00001110] */ 0x4c4b808e, 0xd0024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
-+/* [0x00001118] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
-+/* [0x00001120] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
-+/* [0x00001128] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
-+/* [0x00001130] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
-+/* [0x00001138] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
-+/* [0x00001140] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
-+/* [0x00001148] */ 0xfffffa60, 0xf0f809e7, // brr -, r:per_block_setup
-+/* [0x00001150] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
-+/* [0x00001158] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
-+/* [0x00001160] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++/* [0x00000ee8] */ 0xcd5117de, 0xa00269e3, // sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0
++/* [0x00000ef0] */ 0x8e4539bf, 0xb0029819, // shr r0, r4, ra_xshift ; mov.ifz ra_frame_base2, rx_frame_base2_next ; ldtmu1
++/* [0x00000ef8] */ 0x956a7d9b, 0x1004461f, // mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++/* [0x00000f00] */ 0x95710dbf, 0x10044763, // mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++/* [0x00000f08] */ 0x8e5409f6, 0x14129855, // shr r1, r4, rx_xshift2 ; mov.ifz ra_y2, ra_y2_next
++/* [0x00000f10] */ 0x13740dc0, 0xd00208a7, // max r2, ra_y, 0
++/* [0x00000f18] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000f20] */ 0x4c741dd3, 0xd0024762, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++/* [0x00000f28] */ 0x8c616c87, 0x10024e20, // add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255
++/* [0x00000f30] */ 0x13540dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00000f38] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_frame_height_minus_1
++/* [0x00000f40] */ 0x4c541dd3, 0xd2124562, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
++/* [0x00000f48] */ 0x8c656c8f, 0x10024f21, // add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255
++/* [0x00000f50] */ 0x0000ff00, 0xe20229e7, // mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++/* [0x00000f58] */ 0x40027030, 0x180049e3, // nop ; mul24 r3, ra0.8a, r0
++/* [0x00000f60] */ 0x40038031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++/* [0x00000f68] */ 0x4003f030, 0xda0049e2, // nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++/* [0x00000f70] */ 0x40037031, 0xda00c9e2, // nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++/* [0x00000f78] */ 0x4d03e4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++/* [0x00000f80] */ 0x40036031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00000f88] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++/* [0x00000f90] */ 0x40035031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00000f98] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++/* [0x00000fa0] */ 0x40074031, 0xd800c9e3, // nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000fa8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++/* [0x00000fb0] */ 0x40073031, 0xda00c9e3, // nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00000fb8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++/* [0x00000fc0] */ 0x40072031, 0xdc00c9e3, // nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000fc8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++/* [0x00000fd0] */ 0x40071031, 0xde00c9e3, // nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00000fd8] */ 0x8d9df4ff, 0x10024823, // sub r0, r2, r3 ; mov r3, rb31
++/* [0x00000fe0] */ 0x8d2087f6, 0xd00269e1, // sub.setf -, r3, 8 ; mov r1, ra8
++/* [0x00000fe8] */ 0x95249dbf, 0x10024208, // mov ra8, ra9 ; mov rb8, rb9
++/* [0x00000ff0] */ 0xfffffed8, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00000ff8] */ 0x9528adbf, 0x10024249, // mov ra9, ra10 ; mov rb9, rb10
++/* [0x00001000] */ 0x952cbdbf, 0x1002428a, // mov ra10, ra11 ; mov rb10, rb11
++/* [0x00001008] */ 0x959e7009, 0x100242cb, // mov ra11, r0 ; mov rb11, r1
++/* [0x00001010] */ 0x4008803e, 0x180049e0, // nop ; mul24 r0, rb8, ra2.8a
++/* [0x00001018] */ 0x4008903e, 0x1a0049e1, // nop ; mul24 r1, rb9, ra2.8b
++/* [0x00001020] */ 0x4d08a23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
++/* [0x00001028] */ 0x4d08b23e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb11, ra2.8d
++/* [0x00001030] */ 0x4c204237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb4
++/* [0x00001038] */ 0x4c245237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra9, rb5
++/* [0x00001040] */ 0x4d286237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra10, rb6
++/* [0x00001048] */ 0x4c2c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb7
++/* [0x00001050] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb12
++/* [0x00001058] */ 0x4d5927ce, 0x100269e1, // sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
++/* [0x00001060] */ 0x0f9ce3c0, 0xd0020867, // asr r1, r1, 14
++/* [0x00001068] */ 0x409ce00f, 0x100049e0, // nop ; mul24 r0, r1, rb14
++/* [0x00001070] */ 0x4c4b808e, 0xd2024821, // add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0
++/* [0x00001078] */ 0x8c9f223f, 0x10020867, // add r1, r1, r0 ; mov -, vw_wait
++/* [0x00001080] */ 0x119c83c0, 0xd0020867, // shl r1, r1, 8
++/* [0x00001088] */ 0xfffffe40, 0xf06809e7, // brr.anyn -, r:yloopb
++/* [0x00001090] */ 0x0f9cd3c0, 0x10020867, // asr r1, r1, rb13
++/* [0x00001098] */ 0x129d63c0, 0x10020867, // min r1, r1, rb_k255
++/* [0x000010a0] */ 0x139c03c0, 0xd0020c27, // max vpm, r1, 0
++/* [0x000010a8] */ 0xfffffa58, 0xf0f809e7, // brr -, r:per_block_setup
++/* [0x000010b0] */ 0x159dafc0, 0x10021c67, // mov vw_setup, rb26
++/* [0x000010b8] */ 0x159ddfc0, 0x10021c67, // mov vw_setup, rb29
++/* [0x000010c0] */ 0x15827d80, 0x10021ca7, // mov vw_addr, unif
++// ::mc_interrupt_exit12c
++/* [0x000010c8] */ 0x95272dbf, 0x100229e7, // mov.setf -, ra9 ; mov -, vw_wait
++/* [0x000010d0] */ 0x00000020, 0xf02809e7, // brr.anyz -, r:exit12_c_1
++/* [0x000010d8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010e0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010e8] */ 0x009e7000, 0x100009e7, // nop
++/* [0x000010f0] */ 0x0d250dc0, 0xd0021c67, // sub vw_setup, ra9, -16
++/* [0x000010f8] */ 0x152a7d80, 0x10021c67, // mov vw_setup, ra10
++/* [0x00001100] */ 0x152e7d80, 0x10021ca7, // mov vw_addr, ra11
++/* [0x00001108] */ 0x00000000, 0xe0020267, // mov ra9, 0
+// ::mc_interrupt_exit12
-+/* [0x00001168] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001170] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001178] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001180] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001188] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001190] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x00001198] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011a8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011b8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011c8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011d8] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e0] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
-+/* [0x000011e8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000011f0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x000011f8] */ 0x009e7000, 0x100009e7, // nop ; nop
++// :exit12_c_1
++/* [0x00001110] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001118] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x00001120] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x00001128] */ 0x159f2fc0, 0xb00009e7, // mov -, vw_wait ; nop ; ldtmu1
++/* [0x00001130] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001138] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001140] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001148] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001150] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001158] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001160] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001168] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001170] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001178] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001180] */ 0x00000010, 0xe80009e7, // mov -,sacq(0)
++/* [0x00001188] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x00001190] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x00001198] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_exit1
-+/* [0x00001200] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001208] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001210] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001218] */ 0x009e7000, 0xa00009e7, // ldtmu0
-+/* [0x00001220] */ 0x009e7000, 0xb00009e7, // ldtmu1
-+/* [0x00001228] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00001230] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
-+/* [0x00001238] */ 0x009e7000, 0x100009e7, // nop ; nop
++/* [0x000011a0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000011a8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000011b0] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000011b8] */ 0x009e7000, 0xa00009e7, // ldtmu0
++/* [0x000011c0] */ 0x009e7000, 0xb00009e7, // ldtmu1
++/* [0x000011c8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
++/* [0x000011d0] */ 0x00000001, 0xe00209a7, // mov interrupt, 1; nop
++/* [0x000011d8] */ 0x009e7000, 0x100009e7, // nop ; nop
+// ::mc_end
+};
+#ifdef __HIGHC__
+#pragma Align_to(8, rpi_shader)
+#endif
-diff --git a/libavcodec/rpi_shader.h b/libavcodec/rpi_shader.h
+diff --git b/libavcodec/rpi_shader.h a/libavcodec/rpi_shader.h
new file mode 100644
-index 0000000..9772796
+index 0000000..3b1229e
--- /dev/null
-+++ b/libavcodec/rpi_shader.h
-@@ -0,0 +1,19 @@
++++ a/libavcodec/rpi_shader.h
+@@ -0,0 +1,20 @@
+#ifndef rpi_shader_H
+#define rpi_shader_H
+
+extern unsigned int rpi_shader[];
+
+#define mc_setup_uv (rpi_shader + 0)
-+#define mc_filter_uv (rpi_shader + 132)
-+#define mc_filter_uv_b0 (rpi_shader + 274)
-+#define mc_filter_uv_b (rpi_shader + 392)
-+#define mc_exit (rpi_shader + 540)
-+#define mc_interrupt_exit8 (rpi_shader + 558)
-+#define mc_setup (rpi_shader + 588)
-+#define mc_filter (rpi_shader + 872)
-+#define mc_filter_b (rpi_shader + 992)
-+#define mc_interrupt_exit12 (rpi_shader + 1114)
-+#define mc_exit1 (rpi_shader + 1152)
-+#define mc_end (rpi_shader + 1168)
++#define mc_filter_uv (rpi_shader + 112)
++#define mc_filter_uv_b0 (rpi_shader + 260)
++#define mc_filter_uv_b (rpi_shader + 424)
++#define mc_exit_c (rpi_shader + 556)
++#define mc_exit (rpi_shader + 574)
++#define mc_setup (rpi_shader + 590)
++#define mc_filter (rpi_shader + 834)
++#define mc_filter_b (rpi_shader + 954)
++#define mc_interrupt_exit12c (rpi_shader + 1074)
++#define mc_interrupt_exit12 (rpi_shader + 1092)
++#define mc_exit1 (rpi_shader + 1128)
++#define mc_end (rpi_shader + 1144)
+
+#endif
-diff --git a/libavcodec/rpi_shader.qasm b/libavcodec/rpi_shader.qasm
+diff --git b/libavcodec/rpi_shader.qasm a/libavcodec/rpi_shader.qasm
new file mode 100644
-index 0000000..aa9e1e7
+index 0000000..6fd6af5
--- /dev/null
-+++ b/libavcodec/rpi_shader.qasm
-@@ -0,0 +1,1098 @@
++++ a/libavcodec/rpi_shader.qasm
+@@ -0,0 +1,1150 @@
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4. As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
+# register allocation
+#
+# ra0...ra7 eight horizontal filter coefficients
@@ -13420,7 +13493,7 @@ index 0000000..aa9e1e7
+#
+# rb8...rb11 eight vertical filter coefficients
+
-+# ra4 y: Fiter, UV: 0x10000
++# ra4 y: Filter, UV: part of b0 -> b stash
+
+# rb12 offset to add before shift (round + weighting offsets)
+# rb13 shift: denom + 6 + 9
@@ -13442,10 +13515,10 @@ index 0000000..aa9e1e7
+# ra22 ra_k256 256
+# ra23 ra_y2_next ra_y2_next
+#
-+# rb20 0xffffff00
-+# rb21 vpm_setup for reading/writing 16bit results into VPM
++# rb20 -- free --
++# rb21 -- free --
+# rb22 rb_k255 255
-+# rb23 24
++# rb23 -- free --
+#
+# rb24 vdw_setup_1(dst_pitch)
+# rb25 frame width-1
@@ -13462,9 +13535,10 @@ index 0000000..aa9e1e7
+# ra27 next ra25
+# ra28 next y
+# ra29 y for next texture access
-+# ra30 64
+#
-+# ra31 next kernel address
++# Use an even numbered register as a link register to avoid corrupting flags
++# ra30 next kernel address
++# ra31 chroma-B height+3; free otherwise
+
+.set rb_frame_width_minus_1, rb25
+.set rb_frame_height_minus_1, rb30
@@ -13496,22 +13570,46 @@ index 0000000..aa9e1e7
+.set rb_k255, rb22
+.set ra_k256, ra22
+
++.set ra_link, ra30
++
+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
+.set i_shift16, -16
+.set i_shift21, -11
++.set i_shift30, -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++# m_calc_dma_regs: from qpu_num build the VPM write setup (r_vpm) and the VDW setup-0 base (r_dma); r0, r1, r2 are used as scratch
++.macro m_calc_dma_regs, r_vpm, r_dma
++ mov r2, qpu_num
++ asr r1, r2, 2 # r1 = qpu_num >> 2
++ shl r1, r1, 6 # r1 = (qpu_num >> 2) << 6
++ and r0, r2, 3 # r0 = qpu_num & 3
++ or r0, r0, r1 # r0 = per-QPU offset: ((qpu_num >> 2) << 6) | (qpu_num & 3)
++
++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++ add r_vpm, r0, r1 # VPM 8bit storage
++
++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++ shl r0, r0, 5 # per-QPU offset << 5 before merging into the vdw_setup_0 word (presumably lines it up with the VPM base field - confirm against the VDW setup layout)
++ add r_dma, r0, r1 # DMA out
++.endm
++
+
+################################################################################
+# mc_setup_uv(next_kernel, x, y, ref_u_base, ref_v_base, frame_width, frame_height, pitch, dst_pitch, offset, denom, vpm_id)
+::mc_setup_uv
-+
-+# Read starting kernel
-+mov ra31, unif
++ mov tmurs, 1 ; mov ra_link, unif # No swap TMUs ; Next fn
+
+# Load first request location
-+add ra_x, unif, elem_num # Store x
-+mov ra_y, unif # Store y
++mov ra0, unif
++mov r0, elem_num
++
++add ra_x, ra0.16b, r0 # Store x
++mov ra_y, ra0.16a # Store y
+mov ra_frame_base, unif # Store frame u base
-+nop
++mov r1, vdw_setup_1(0) # Merged with dst_stride shortly, delay slot for ra_frame_base
+sub ra_u2v_ref_offset, unif, ra_frame_base # Store offset to add to move from u to v in reference frame
+
+# Read image dimensions
@@ -13521,77 +13619,59 @@ index 0000000..aa9e1e7
+# get source pitch
+mov rb16, unif
+
-+# get destination pitch
-+mov r0, unif
-+mov r1, vdw_setup_1(0)
-+add rb24, r1, r0
++# get destination vdw setup
++add rb24, r1, unif # dst_stride
+
+# load constants
++ mov ra_k1, 1
++ mov ra_k256, 256
++ mov rb_k255, 255
+
-+mov ra4, 0x10000
-+mov ra_k1, 1
-+mov ra_k256, 256
-+mov ra30, 64
++# touch registers to keep simulator happy
+
-+mov rb20, 0xffffff00
-+mov rb_k255, 255
-+mov rb23, 24
++ # ra/b4..7: B0 -> B stash registers
++ mov ra4, 0 ; mov rb4, 0
++ mov ra5, 0 ; mov rb5, 0
++ mov ra6, 0 ; mov rb6, 0
++ mov ra7, 0 ; mov rb7, 0
+
-+# touch vertical context to keep simulator happy
++ # ra12..15: vertical scroll registers
++ mov ra12, 0
++ mov ra13, 0
++ mov ra14, 0
++ mov ra15, 0
+
-+mov ra8, 0
-+mov ra9, 0
-+mov ra10, 0
-+mov ra11, 0
-+mov ra12, 0
-+mov ra13, 0
-+mov ra14, 0
-+mov ra15, 0
++ # ra9 - delayed setup - must be 0 initially
++ mov ra9, 0
+
+# Compute base address for first and second access
+mov r0, ra_x # Load x
-+max r0, r0, 0; mov r1, ra_y # Load y
++max r0, r0, 0 ; mov r1, ra_y # Load y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, ra_frame_base # Load the frame base
-+shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
++shl ra_xshift_next, r0, 3 ; mov r2, ra_u2v_ref_offset
+add ra_y, r1, 1
+add r0, r0, r3
+and r0, r0, ~3
-+max r1, r1, 0 ; mov ra_x, r0 # y
++max r1, r1, 0 ; mov ra_x, r0 # y
+min r1, r1, rb_frame_height_minus_1
+# submit texture requests for first line
+add r2, r2, r0 ; mul24 r1, r1, rb_pitch
+add t0s, r0, r1 ; mov ra_frame_base, r2
+add t1s, r2, r1
+
-+mov r2, 9
-+add rb13, r2, unif # denominator
++add rb13, 9, unif # denominator
+mov -, unif # Unused
+
-+# Compute part of VPM to use for DMA output
-+mov r2, unif
-+shl r2, r2, 1 # Convert QPU numbers to be even (this means we can only use 8 QPUs, but is necessary as we need to save 16bit intermediate results)
-+and r2, r2, 15
-+mov r1, r2
-+asr r1, r1, 2
-+shl r1, r1, 6
-+mov r0, r2
-+and r0, r0, 3
-+add r0, r0, r1
++mov -, unif # Discard uniform: previously the QPU number used to pick the VPM slot; m_calc_dma_regs now reads the qpu_num register instead
+
-+mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+add rb28, r0, r1 # VPM 8bit storage
-+asr r2, r0, 1 # r0 = bc0000d
-+mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR which is Y[5:0],H[0] for 16 bit
-+add rb21, r2, r1 # VPM for 16bit intermediates
-+mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+shl r0, r0, 5
-+add rb27, r0, r1 # DMA out
++# Compute part of VPM to use for DMA output
++m_calc_dma_regs rb28, rb27
+
+# submit texture requests for second line
+max r1, ra_y, 0
+min r1, r1, rb_frame_height_minus_1
+add ra_y, ra_y, 1
-+bra -, ra31
++bra -, ra_link
+nop ; mul24 r1, r1, rb_pitch
+add t0s, r1, ra_x
+add t1s, r1, ra_frame_base
@@ -13605,20 +13685,24 @@ index 0000000..aa9e1e7
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv
-+mov ra31, unif
++mov ra_link, unif
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num # x
-+max r0, r0, 0 ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
++mov ra2, unif # x_y
++mov r0, elem_num ; mov r3, unif # frame_base
++
++add r0, ra2.16b, r0 # x
++max r0, r0, 0
++min r0, r0, rb_frame_width_minus_1
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+shl ra_xshift_next, r0, 3
+add r0, r0, r3 ; mov ra1, unif # ; width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
-+mov ra_y_next, r1 ; mov vw_setup, rb28
++mov ra_y_next, ra2.16a ; mov vw_setup, rb28
++
+add ra_frame_base_next, rb_x_next, r2
+
+# set up VPM write
@@ -13628,9 +13712,19 @@ index 0000000..aa9e1e7
+add rb17, ra1.16a, 1
+add rb18, ra1.16a, 3
+shl r0, ra1.16a, 7
++
++ mov.setf -, ra9 ; mov -, vw_wait
++ brr.anyz -, r:filter_uv_1
++
+add r0, r0, ra1.16b # Combine width and height of destination area
+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
+add rb26, r0, rb27 ; mov ra3, unif # ; V filter coeffs
++# >>> (skip V DMA if never requested)
++
++ sub vw_setup, ra9, -16
++ mov vw_setup, ra10
++ mov vw_addr, ra11
++:filter_uv_1
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
@@ -13662,12 +13756,12 @@ index 0000000..aa9e1e7
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
++add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
@@ -13677,13 +13771,13 @@ index 0000000..aa9e1e7
+
+# apply horizontal filter
+nop ; mul24 r3, ra0.8a, r0
-+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop
@@ -13707,24 +13801,15 @@ index 0000000..aa9e1e7
+asr r1, r1, rb13
+min r1, r1, rb_k255 # Delay 2
+max vpm, r1, 0 # Delay 3
++# >>>
+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
++# DMA out for U & stash for V
++ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0
++ bra -, ra_link
++ mov vw_setup, rb29 ; mov ra10, rb29 # Stride
++ mov vw_addr, unif # u_dst_addr
++ mov ra11, unif # v_dst_addr
++# >>>
+
+################################################################################
+
@@ -13733,19 +13818,23 @@ index 0000000..aa9e1e7
+# At this point we have already issued two pairs of texture requests for the current block
+# ra_x, ra_x16_base point to the current coordinates for this block
+::mc_filter_uv_b0
-+mov ra31, unif
++mov -, unif # Ignore chain address - always "b"
+
+# per-channel shifts were calculated on the *previous* invocation
+
+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num # x
-+max r0, r0, 0 ; mov r1, unif # y
-+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # frame_base
-+sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next # compute offset from frame base u to frame base v ;
++mov ra2, unif # x_y
++mov r0, elem_num ; mov r3, unif # frame_base
++
++add r0, ra2.16b, r0 # x
++max r0, r0, 0
++min r0, r0, rb_frame_width_minus_1
++# compute offset from frame base u to frame base v
++sub r2, unif, r3 ; mov ra_xshift, ra_xshift_next
+shl ra_xshift_next, r0, 3
-+add r0, r0, r3 ; mov ra1, unif # ; width_height
-+and rb_x_next, r0, ~3 ; mov ra0, unif # ; H filter coeffs
-+mov ra_y_next, r1 ; mov vw_setup, rb21
++add r0, r0, r3 ; mov ra1, unif # ; width_height
++and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
++mov ra_y_next, ra2.16a
+
+add ra_frame_base_next, rb_x_next, r2
+
@@ -13753,14 +13842,12 @@ index 0000000..aa9e1e7
+# chroma filter always goes -ve, +ve, +ve, -ve. This is fixed in the
+# filter code. Unpack into b regs for V
+
-+# set up VPM write, we need to save 16bit precision
-+
+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
++add ra31, ra1.16a, 3
+shl r0, ra1.16a, 7
-+add r0, r0, ra1.16b # Combine width and height of destination area
-+shl r0, r0, i_shift16 ; mov ra3, unif # ; V filter coeffs
++add r0, r0, ra1.16b ; mov ra3, unif # Combine width and height of destination area ; V filter coeffs
++shl r0, r0, i_shift16 ; mov rb14, unif # U weight L0
+add rb26, r0, rb27
+
+mov rb8, ra3.8a
@@ -13773,8 +13860,8 @@ index 0000000..aa9e1e7
+
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
-+mov rb14, unif # U weight L0
+mov.ifnz rb14, unif ; mov r3, 0 # V weight L0 ; Loop counter
++
+# rb14 unused in b0 but will hang around till the second pass
+
+# retrieve texture results and pick out bytes
@@ -13785,62 +13872,127 @@ index 0000000..aa9e1e7
+# retrieve texture results and pick out bytes
+# then submit two more texture requests
+
-+sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
-+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
-+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
-+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++ sub.setf -, r3, rb17 ; v8adds r3, r3, ra_k1 ; ldtmu0 # loop counter increment
++ shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
++ mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
++ mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
++ shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
+
-+max r2, ra_y, 0 # y
-+min r2, r2, rb_frame_height_minus_1
-+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
-+add t1s, ra_frame_base, r2
++ max r2, ra_y, 0 # y
++ min r2, r2, rb_frame_height_minus_1
++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
++ add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
++ add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
+
-+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
++ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
-+nop ; mul24 r3, ra0.8a, r0
-+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+sub r0, r2, r3 ; mov r3, rb31
-+sub.setf -, r3, 4 ; mov ra12, ra13
-+brr.anyn -, r:uvloop_b0
-+mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13
-+mov ra14, ra15
-+mov ra15, r0 ; mul24 r0, ra12, rb8
++ nop ; mul24 r3, ra0.8a, r0
++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ sub r0, r2, r3 ; mov r3, rb31
++ sub.setf -, r3, 4 ; mov ra12, ra13
++ brr.anyn -, r:uvloop_b0
++ mov ra13, ra14 ; mul24 r1, ra14, rb9 # ra14 is about to be ra13
++ mov ra14, ra15 ; mul24 r2, ra15, rb10 # ra15 is about to be ra14
++ mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b0
+
-+# apply vertical filter and write to VPM
++# apply vertical filter and write to B-FIFO
+
-+sub r1, r1, r0 ; mul24 r0, ra14, rb10
-+sub.setf -, r3, rb18
-+brr.anyn -, r:uvloop_b0
-+add r1, r1, r0 ; mul24 r0, ra15, rb11
-+sub r1, r1, r0 ; mov -, vw_wait
-+asr vpm, r1, 6
-+# >>> .anyn uvloop_b0
++ sub r1, r1, r0 ; mov ra8.16b, ra7 # start of B FIFO writes
++ add r1, r1, r2 ; mul24 r0, ra15, rb11 # N.B. ra15 write gap
++ sub r1, r1, r0 ; mov ra7, rb6
+
-+# in pass0 we don't really need to save any results, but need to discard the uniforms
-+# DMA out for U
++# FIFO goes:
++# b7a, a6a, b5a, a4a, b4a, a5a, b6a, a7a : b7b, a6b, b5b, a4b, b4b, a5b, b6b, a7b
++# This arrangement optimizes the inner loop FIFOs at the expense of making the
++# bulk shift between loops quite a bit nastier
++# a8 used as temp
+
-+bra -, ra31
-+mov -, unif # Delay 1
-+mov -, unif # Delay 2
-+nop # Delay 3
++ sub.setf -, r3, ra31
++ asr ra8.16a, r1, 6 ; mov rb6, ra5 # This discards the high bits that might be bad
++ brr.anyn -, r:uvloop_b0
++ mov ra5, rb4 ; mov rb4, ra4
++ mov ra4, rb5 ; mov rb5, ra6
++ mov ra6, rb7 ; mov rb7, ra8
++# >>>
+
++# 1st half done all results now in the a/b4..7 fifo
++
++# Need to bulk rotate FIFO for heights other than 16
++# plausible heights are 16, 12, 8, 6, 4, 3, 2 and that is all we deal with
++# we are allowed 3/4 cb_size w/h :-(
++
++# Destination uniforms discarded
++# At the end drop through to _b - we will always do b after b0
++
++ sub.setf -, 15, r3 # 12 + 3 of preroll
++ brr.anyn -, r:uv_b0_post_fin # h > 12 (n) => 16 (do nothing)
++ sub r3, 11, r3 ; mov -, unif # r3 = shifts wanted ; Discard u_dst_addr
++ mov r0, i_shift16 ; mov -, unif # ; Discard v_dst_addr
++ mov r1, 0x10000
++# >>>
++ brr.anyz -, r:uv_b0_post12 # h == 12 deal with specially
++# If h != 16 && h != 12 then h <= 8 so
++# shift 8 with discard (.16b = .16a on all regs)
++ shl.ifnz ra7, ra7, r0 ; mul24.ifnz rb7, rb7, r1
++ shl.ifnz ra6, ra6, r0 ; mul24.ifnz rb6, rb6, r1
++ shl.ifnz ra5, ra5, r0 ; mul24.ifnz rb5, rb5, r1
++# >>>
++ shl ra4, ra4, r0 ; mul24 rb4, rb4, r1
++
++ shl.setf -, r3, i_shift30 # b2 -> C, b1 -> N
++# Shift 4
++ mov.ifc ra7, ra4 ; mov.ifc rb6, rb5
++ mov.ifc ra5, ra6 ; mov.ifc rb4, rb7
++ # If we shifted by 4 here then the max length remaining is 4
++ # so that is it
++
++ brr -, r:uv_b0_post_fin
++# Shift 2
++ mov.ifn ra7, ra5 ; mov.ifn rb6, rb4
++ mov.ifn ra5, ra4 ; mov.ifn rb4, rb5
++ mov.ifn ra4, ra6 ; mov.ifn rb5, rb7
++ # 6 / 2 so need 6 outputs
++# >>>
++
++:uv_b0_post12
++# this one is annoying as we need to swap halves of things that don't
++# really want to be swapped
++
++# b7a, a6a, b5a, a4a
++# b4a, a5a, b6a, a7a
++# b7b, a6b, b5b, a4b
++# b4b, a5b, b6b, a7b
++
++ mov r2, ra4 ; mov r3, rb5
++ shl ra4, ra7, r0 ; mul24 rb5, rb6, r1
++ mov ra7, r2 ; mov rb6, r3
++
++ mov r2, ra6 ; mov r3, rb7
++ shl ra6, ra5, r0 ; mul24 rb7, rb4, r1
++ mov ra5, r2 ; mov rb4, r3
++
++:uv_b0_post_fin
++ # drop through
+
+################################################################################
+
+::mc_filter_uv_b
-+mov ra31, unif
++
++ mov ra_link, unif
++ mov.setf -, ra9 ; mov -, vw_wait # Delayed V DMA
++ brr.anyz -, r:uv_filter_b_1
++
++ mov ra0, unif ; mov r0, elem_num
+
+# per-channel shifts were calculated on the *previous* invocation
+
@@ -13848,30 +14000,23 @@ index 0000000..aa9e1e7
+mov ra_xshift, ra_xshift_next ; mov vw_setup, rb28
+
+# get base addresses and per-channel shifts for *next* invocation
-+add r0, unif, elem_num # x
-+max r0, r0, 0 ; mov ra_y_next, unif # y
++add r0, ra0.16b, r0 # x
++# >>>
++ sub vw_setup, ra9, -16
++ mov vw_setup, ra10
++ mov vw_addr, ra11
++:uv_filter_b_1
++
++max r0, r0, 0 ; mov ra_y_next, ra0.16a # y
+min r0, r0, rb_frame_width_minus_1 ; mov r3, unif # V frame_base
+# compute offset from frame base u to frame base v
+sub r2, unif, r3 ; mul24 ra_xshift_next, r0, 8 # U frame_base
-+add r0, r0, r3 ; mov ra1, unif # width_height
++add r0, r0, r3 ; mov -, unif # discard width_height
+and rb_x_next, r0, ~3 ; mov ra0, unif # H filter coeffs
+
-+sub rb29, rb24, ra1.16b # Compute vdw_setup1(dst_pitch-width)
-+add rb17, ra1.16a, 1
-+add rb18, ra1.16a, 3
-+shl r0, ra1.16a, 7
++# rb17, rb26, rb29, ra31 inherited from B0 as w/h must be the same
+
-+add ra_frame_base_next, rb_x_next, r2
-+
-+# r0 is currently height<<7
-+# For vr_setup we want height<<20 (so 20-7=13 additional bits)
-+shl r3, r0, i_shift21 ; mov ra3, unif # Shl 13 + Mask off top 8 bits ; V filter coeffs
-+shr r3, r3, 8
-+add vr_setup, r3, rb21
-+
-+add r0, r0, ra1.16b # Combine width and height of destination area
-+shl r0, r0, i_shift16 # Shift into bits 16 upwards of the vdw_setup0 register
-+add rb26, r0, rb27
++mov ra3, unif # V filter coeffs
+
+# get filter coefficients
+
@@ -13882,7 +14027,7 @@ index 0000000..aa9e1e7
+# The unif read occurs unconditionally, only the write is conditional
+mov ra1, unif ; mov rb8, ra3.8a # U offset/weight ;
+mov.ifnz ra1, unif ; mov rb9, ra3.8b # V offset/weight ;
-+nop ; mov rb10, ra3.8c
++add ra_frame_base_next, rb_x_next, r2 ; mov rb10, ra3.8c
+mov r3, 0 ; mov rb11, ra3.8d # Loop counter ;
+
+shl r1, ra1.16b, rb13
@@ -13902,12 +14047,12 @@ index 0000000..aa9e1e7
+shr r0, r4, ra_xshift ; mov.ifz ra_x, rb_x_next ; ldtmu1
+mov.ifz ra_frame_base, ra_frame_base_next ; mov rb31, r3
+mov.ifz ra_y, ra_y_next ; mov r3, rb_pitch
-+shr r1, r4, ra_xshift ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++shr r1, r4, ra_xshift ; v8min r0, r0, rb_k255 # v8min masks out all but bottom byte
+
+max r2, ra_y, 0 # y
+min r2, r2, rb_frame_height_minus_1
+add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+add t0s, ra_x, r2 ; v8subs r1, r1, rb20
++add t0s, ra_x, r2 ; v8min r1, r1, rb_k255
+add t1s, ra_frame_base, r2
+
+# generate seven shifted versions
@@ -13916,100 +14061,105 @@ index 0000000..aa9e1e7
+mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+
+nop ; mul24 r3, ra0.8a, r0
-+nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
++nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++add r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
+sub r0, r2, r3 ; mov r3, rb31
+sub.setf -, r3, 4 ; mov ra12, ra13
+brr.anyn -, r:uvloop_b
+mov ra13, ra14 ; mul24 r1, ra14, rb9
-+mov ra14, ra15
++mov ra14, ra15 ; mul24 r2, ra15, rb10
+mov ra15, r0 ; mul24 r0, ra12, rb8
+# >>> .anyn uvloop_b
+
+# apply vertical filter and write to VPM
+
-+sub r1, r1, r0 ; mul24 r0, ra14, rb10
-+add r1, r1, r0 ; mul24 r0, ra15, rb11
-+# Beware: vpm read gets unsigned 16-bit value, so we must sign extend it
-+sub r1, r1, r0 ; mul24 r0, vpm, ra4 # ra4 = 0x10000
-+sub.setf -, r3, rb18 ; mul24 r1, r1, ra_k256
-+asr r1, r1, 14 # shift2=6
++ sub r1, r1, r0 ; mov ra8.16b, ra7 # FIFO rotate (all ra/b4..7)
++ add r1, r1, r2 ; mul24 r0, ra15, rb11
++ sub r1, r1, r0 ; mul24 r0, ra7.16b, rb14
++ mov ra7, rb6 ; mul24 r1, r1, ra_k256
++ asr r1, r1, 14 ; mov rb6, ra5 # shift2=6
+
-+asr r0, r0, i_shift16 ; mul24 r1, r1, ra1.16a
-+nop ; mul24 r0, r0, rb14
++ mov ra5, rb4 ; mul24 r1, r1, ra1.16a
++ add r1, r1, r0 ; mov rb4, ra4
+
-+add r1, r1, r0 ; mov -, vw_wait
-+shl r1, r1, 8 # Lose bad top 8 bits & sign extend
++ mov ra4, rb5 ; mul24 r1, r1, ra_k256 # Lose bad top 8 bits & sign extend
++ add r1, r1, rb12 ; mov rb5, ra6 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
+
-+add r1, r1, rb12 # rb12 = (offsetL0 + offsetL1 + 1) << (rb13 - 1)
++ sub.setf -, r3, ra31 ; mov ra6, rb7
++ brr.anyn -, r:uvloop_b
++ asr ra3.8as, r1, rb13
++ mov -, vw_wait ; mov rb7, ra8 # vw_wait is B-reg (annoyingly) ; Final FIFO mov
++ mov vpm, ra3.8a
++# >>>
+
-+brr.anyn -, r:uvloop_b
-+asr r1, r1, rb13 # Delay 1
-+min r1, r1, rb_k255 # Delay 2
-+max vpm, r1, 0 # Delay 3
++# DMA out for U & stash for V
++
++ mov vw_setup, rb26 ; mov ra9, rb26 # VDW setup 0
++ bra -, ra_link
++ mov vw_setup, rb29 ; mov ra10, rb29 # Stride
++ mov vw_addr, unif # u_dst_addr
++ mov ra11, unif # v_dst_addr
+
+
-+# DMA out for U
-+
-+mov vw_setup, rb26 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
-+
-+# DMA out for V
-+# We need to wait for the U to complete first, but have nothing useful to compute while we wait.
-+# Could potentially push this write into the start of the next pipeline stage.
-+mov r0, 16
-+mov -, vw_wait
-+
-+bra -, ra31
-+add vw_setup, rb26, r0 # VDW setup 0
-+mov vw_setup, rb29 # Stride
-+mov vw_addr, unif # start the VDW
+
+################################################################################
+
+# mc_exit()
+
++::mc_exit_c
++ mov.setf -, ra9 ; mov -, vw_wait
++# Annoyingly it looks like condition codes don't work on writes to special
++# registers so we have to branch around the writes
++ brr.anyz -, r:exit_c_1
++ nop
++ nop
++ nop
++# >>>
++
++ sub vw_setup, ra9, -16
++ mov vw_setup, ra10
++ mov vw_addr, ra11
++ nop
++:exit_c_1
++
+::mc_exit
-+mov -, vw_wait # wait on the VDW
++ ldtmu0
++ ldtmu1
++ ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW
+
-+mov -,srel(0)
++ mov -,srel(0)
+
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
-+
-+nop ; nop ; thrend
-+nop ; nop # delay slot 1
-+nop ; nop # delay slot 2
++ nop ; nop ; thrend
++ nop ; nop # delay slot 1
++ nop ; nop # delay slot 2
+
+# mc_interrupt_exit8()
-+::mc_interrupt_exit8
-+mov -, vw_wait # wait on the VDW
-+
-+ldtmu0
-+ldtmu1
-+ldtmu0
-+ldtmu1
-+
-+mov -,sacq(0) # 1
-+mov -,sacq(0) # 2
-+mov -,sacq(0) # 3
-+mov -,sacq(0) # 4
-+mov -,sacq(0) # 5
-+mov -,sacq(0) # 6
-+mov -,sacq(0) # 7
-+
-+nop ; nop ; thrend
-+mov interrupt, 1; nop # delay slot 1
-+nop ; nop # delay slot 2
-+
++#::mc_interrupt_exit8
++#mov -, vw_wait # wait on the VDW
++#
++#ldtmu0
++#ldtmu1
++#ldtmu0
++#ldtmu1
++#
++#mov -,sacq(0) # 1
++#mov -,sacq(0) # 2
++#mov -,sacq(0) # 3
++#mov -,sacq(0) # 4
++#mov -,sacq(0) # 5
++#mov -,sacq(0) # 6
++#mov -,sacq(0) # 7
++#
++#nop ; nop ; thrend
++#mov interrupt, 1; nop # delay slot 1
++#nop ; nop # delay slot 2
++#
+
+
+
@@ -14022,115 +14172,79 @@ index 0000000..aa9e1e7
+################################################################################
+# mc_setup(y_x, ref_y_base, y2_x2, ref_y2_base, frame_width_height, pitch, dst_pitch, offset_shift, tbd, next_kernel)
+::mc_setup
-+ mov r3, 16
-+
+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+ mov ra8, unif # y_x
-+ mov ra9, unif # ref_y_base
-+ mov ra10, unif # y2_x2
-+ mov ra11, unif # ref_y2_base
++ mov tmurs, 1 ; mov ra8, unif # No TMU swap ; y_x
++ mov ra9, unif # ref_y_base
++ mov ra10, unif # y2_x2
++ mov ra11, unif # ref_y2_base
+
+# Read image dimensions
-+ mov r1, unif # width_height
-+ shl r0,r1,r3
-+ asr r1,r1,r3 # width
-+ asr r0,r0,r3 # height
-+ sub rb_frame_width_minus_1,r1,1
-+ sub rb_frame_height_minus_1,r0,1
-+
-+# get source pitch
-+ mov rb_pitch, unif # src_pitch
++ mov ra3, unif # width_height
++ mov rb_pitch, unif # src_pitch [ra3 delay]
++ sub rb_frame_width_minus_1, ra3.16b, 1
++ sub rb_frame_height_minus_1, ra3.16a, 1
+
+# get destination pitch
-+ mov r0, unif # dst_pitch
+ mov r1, vdw_setup_1(0)
-+ add rb24, r1, r0
++ or rb24, r1, unif # dst_pitch
+
+# Compute base address for first and second access
-+ mov r1, ra8 # y_x
-+ shl r0,r1,r3 # r0 is x<<16
-+ asr r1,r1,r3 # r1 is y
-+ asr r0,r0,r3 # r0 is x
-+ add r0, r0, elem_num # Load x
++ mov r3, elem_num
++ add r0, ra8.16a, r3 # Load x + elem_num
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra9 # Load the frame base
++ min r0, r0, rb_frame_width_minus_1
+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ add ra_y, r1, 1
-+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
-+ add r2, r2, r0 # r2 is address for frame0 (not including y offset)
-+ max r1, r1, 0
++ add ra_y, ra8.16b, 1
++ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
++ add r2, ra9, r0 # ra9 is address for frame0 (not including y offset)
++ max r1, ra8.16b, 0
+ min r1, r1, rb_frame_height_minus_1
-+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
-+ add t0s, r2, r1 ; mov ra_frame_base, r2
++ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
++ add t0s, r2, r1 ; mov ra_frame_base, r2
+
-+ mov r1, ra10 # y_x
-+ shl r0,r1,r3 # r0 is x<<16
-+ asr r1,r1,r3 # r1 is y
-+ asr r0,r0,r3 # r0 is x
-+ add r0, r0, elem_num # Load x
++ # r3 still contains elem_num
++ add r0, ra10.16a, r3 # Load x
+ max r0, r0, 0
-+ min r0, r0, rb_frame_width_minus_1 ; mov r2, ra11 # Load the frame base
++ min r0, r0, rb_frame_width_minus_1
+ shl rx_xshift2_next, r0, 3 # Compute shifts
-+ add ra_y2, r1, 1
-+ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
-+ add r2, r2, r0 # r2 is address for frame1 (not including y offset)
-+ max r1, r1, 0
++ add ra_y2, ra10.16b, 1
++ and r0, r0, ~3 # r0 gives the clipped and aligned x coordinate
++ add r2, ra11, r0 # r2 is address for frame1 (not including y offset)
++ max r1, ra10.16b, 0
+ min r1, r1, rb_frame_height_minus_1
-+ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame0
-+ add t1s, r2, r1 ; mov ra_frame_base2, r2
-+
++ nop ; mul24 r1, r1, rb_pitch # r2 contains the addresses (not including y offset) for frame1
++ add t1s, r2, r1 ; mov ra_frame_base2, r2
+
+# load constants
+
+ mov ra_k1, 1
+ mov ra_k256, 256
-+ mov ra30, 64
-+
-+ mov rb20, 0xffffff00
+ mov rb_k255, 255
-+ mov rb23, 24
+
+# touch vertical context to keep simulator happy
+
-+ mov ra8, 0
-+ mov ra9, 0
-+ mov ra10, 0
-+ mov ra11, 0
-+ mov ra12, 0
-+ mov ra13, 0
-+ mov ra14, 0
-+ mov ra15, 0
++ mov ra8, 0 ; mov rb8, 0
++ mov ra9, 0 ; mov rb9, 0
++ mov ra10, 0 ; mov rb10, 0
++ mov ra11, 0 ; mov rb11, 0
+
+# Compute part of VPM to use
-+ mov r2, qpu_num
-+ mov r1, r2
-+ asr r1, r1, 2
-+ shl r1, r1, 6
-+ mov r0, r2
-+ and r0, r0, 3
-+ add r0, r0, r1
-+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add rb28, r0, r1 # VPM for saving data
-+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+ shl r0, r0, 5
-+ add rb27, r0, r1 # Command for dma output
++ m_calc_dma_regs rb28, rb27
+
+# Weighted prediction denom
-+ add rb13, unif, 9 # unif = weight denom + 6
-+
-+ mov -, unif # Unused
++ add rb13, unif, 9 # unif = weight denom + 6
+
+# submit texture requests for second line
+ max r1, ra_y, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y, ra_y, 1
-+ nop ; mul24 r1, r1, rb_pitch
++ mov -, unif ; mul24 r1, r1, rb_pitch # unused ;
+ add t0s, r1, ra_frame_base
+
+ max r1, ra_y2, 0
+ min r1, r1, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1
-+ nop ; mul24 r1, r1, rb_pitch
++ nop ; mul24 r1, r1, rb_pitch
+ add t1s, r1, ra_frame_base2
+
+# FALL THROUGHT TO PER-BLOCK SETUP
@@ -14139,7 +14253,7 @@ index 0000000..aa9e1e7
+# P and B blocks share the same setup code to save on Icache space
+:per_block_setup
+ mov.setf -, [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-+ mov ra31, unif
++ mov ra_link, unif
+
+ mov ra1, unif ; mov r1, elem_num # y_x ; elem_num has implicit unpack??
+
@@ -14153,7 +14267,7 @@ index 0000000..aa9e1e7
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ mov r3, 8 ; mov ra_y_next, ra1.16b
++ mov ra_y_next, ra1.16b
+ and r0, r0, ~3 ; mov ra1, unif # y2_x2
+ add ra_frame_base_next, r2, r0
+
@@ -14161,7 +14275,7 @@ index 0000000..aa9e1e7
+ max r0, r0, 0
+ min r0, r0, rb_frame_width_minus_1 ; mov r2, unif # Load the frame base
+ shl rx_xshift2_next, r0, 3 # Compute shifts
-+ add r3, r3, r3 ; mov ra_y2_next, ra1.16b # r3 = 16 ;
++ mov ra_y2_next, ra1.16b
+ and r0, r0, ~3 ; mov ra1, unif # width_height ; r0 gives the clipped and aligned x coordinate
+ add rx_frame_base2_next, r2, r0 # r2 is address for frame1 (not including y offset)
+
@@ -14178,8 +14292,9 @@ index 0000000..aa9e1e7
+ add rb26, r0, rb27 ; mov r0, unif # Packed filter offsets
+
+# get filter coefficients and discard unused B frame values
-+ shl.ifz r0, r0, i_shift16 # Pick half to use
-+ shl ra8, r0, 3
++ shl.ifz r0, r0, i_shift16 ; mov ra5, unif # Pick half to use ; L0 offset/weight
++ mov r2, 0x01040400 # [ra5 delay]
++ shl ra8, r0, 3 ; mov rb14, ra5.16a
+
+# Pack the 1st 4 filter coefs for H & V tightly
+
@@ -14187,9 +14302,8 @@ index 0000000..aa9e1e7
+ ror ra2.8a, r1, ra8.8d
+ ror ra0.8a, r1, ra8.8c
+
-+ mov r1,0x01040400
-+ ror ra2.8b, r1, ra8.8d
-+ ror ra0.8b, r1, ra8.8c
++ ror ra2.8b, r2, ra8.8d
++ ror ra0.8b, r2, ra8.8c
+
+ mov r1,0x050b0a00 # -ve
+ ror ra2.8c, r1, ra8.8d
@@ -14215,27 +14329,31 @@ index 0000000..aa9e1e7
+ ror ra3.8c, r1, ra8.8d
+ ror ra1.8c, r1, ra8.8c
+
-+# Extract weighted prediction information in parallel
-+
+ mov r1,0x01010000 # -ve
-+ ror ra3.8d, r1, ra8.8d ; mov r0, unif # ; weight L1 weight L1 (hi16)/weight L0 (lo16)
-+ ror ra1.8d, r1, ra8.8c ; mov r1, rb13 # ; rb13 = weight denom + 6 + 9
++ ror ra3.8d, r1, ra8.8d
++ ror ra1.8d, r1, ra8.8c
+
-+# r3 = 16 from (long way) above
-+ shl r1, unif, r1 ; mov rb4, ra3.8a # combined offet = ((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) ;
-+ asr ra18, r0, r3 ; mov rb5, ra3.8b
-+ bra -, ra31
-+ shl r0, r0, r3 ; mov rb6, ra3.8c
-+ mov r3, 0 ; mov rb7, ra3.8d # loop count ;
-+ asr rb12, r1, 9
++# Extract weighted prediction information in parallel
++# We are annoyingly A src limited here
+
-+# >>> branch ra31
++ mov rb4, ra3.8a ; mov ra18, unif
++ mov rb5, ra3.8b
++ mov rb6, ra3.8c
++ mov.ifnz ra5, ra18
++
++ bra -, ra_link
++
++ shl r0, ra5.16b, rb13 # Offset calc
++ asr rb12, r0, 9 # For B L1 & L0 offsets should be identical so it doesn't matter which we use
++ mov r3, 0 ; mov rb7, ra3.8d
++# >>> branch ra_link
+#
+# r3 = 0
-+# ra18 = weight L1
-+# r0 = weight L0 << 16 (will be put into rb14 in filter preamble)
-+# rb13 = weight denom + 6 + 9
-+# rb12 = (((is P) ? offset L0 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# ra18.16a = weight L1
++# ra5.16a = weight L0/L1 depending on side (wanted for 2x mono-pred)
++# rb12 = (((is P) ? offset L0/L1 * 2 : offset L1 + offset L0) + 1) << (rb13 - 1)
++# rb13 = weight denom + 6 + 9
++# rb14 = weight L0
+
+
+################################################################################
@@ -14244,8 +14362,9 @@ index 0000000..aa9e1e7
+# At this point we have already issued two pairs of texture requests for the current block
+
+::mc_filter
-+# r0 = weight << 16; We want weight * 2 in rb14
-+ asr rb14, r0, 15
++# ra5.16a = weight; We want weight * 2 in rb14
++
++ shl rb14, ra5.16a, 1
+
+# r3 = 0
+
@@ -14269,12 +14388,12 @@ index 0000000..aa9e1e7
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8min with 255 masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
++ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
@@ -14283,21 +14402,21 @@ index 0000000..aa9e1e7
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
@@ -14359,7 +14478,7 @@ index 0000000..aa9e1e7
+
+::mc_filter_b
+ # r0 = weightL0 << 16, we want it in rb14
-+ asr rb14, r0, i_shift16
++# asr rb14, r0, i_shift16
+
+:yloopb
+# retrieve texture results and pick out bytes
@@ -14377,12 +14496,12 @@ index 0000000..aa9e1e7
+ max r2, ra_y, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_frame_base, r2 ; v8subs r0, r0, rb20 # v8subs masks out all but bottom byte
++ add t0s, ra_frame_base, r2 ; v8min r0, r0, rb_k255 # v8min with 255 masks out all but bottom byte
+
+ max r2, ra_y2, 0 # y
+ min r2, r2, rb_frame_height_minus_1
+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, ra_frame_base2, r2 ; v8subs r1, r1, rb20
++ add t1s, ra_frame_base2, r2 ; v8min r1, r1, rb_k255
+
+# generate seven shifted versions
+# interleave with scroll of vertical context
@@ -14391,21 +14510,21 @@ index 0000000..aa9e1e7
+
+# apply horizontal filter
+ nop ; mul24 r3, ra0.8a, r0
-+ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8
-+ nop ; mul24 r2, ra0.8b << 1, r0 << 1
-+ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9
-+ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2
-+ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3
-+ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4
-+ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5
-+ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6
-+ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7
-+ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15
++ nop ; mul24.ifnz r3, ra0.8a << 8, r1 << 8 @ "mul_used", 0
++ nop ; mul24 r2, ra0.8b << 1, r0 << 1 @ "mul_used", 0
++ nop ; mul24.ifnz r2, ra0.8b << 9, r1 << 9 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
++ nop ; mul24.ifnz r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
+ sub r0, r2, r3 ; mov r3, rb31
+
+ sub.setf -, r3, 8 ; mov r1, ra8
@@ -14417,7 +14536,6 @@ index 0000000..aa9e1e7
+ # >>> .anyn yloopb
+
+ # apply vertical filter and write to VPM
-+
+ nop ; mul24 r0, rb8, ra2.8a
+ nop ; mul24 r1, rb9, ra2.8b
+ sub r1, r1, r0 ; mul24 r0, rb10, ra2.8c
@@ -14433,7 +14551,7 @@ index 0000000..aa9e1e7
+
+ asr r1, r1, 14
+ nop ; mul24 r0, r1, rb14
-+ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18 << 8
++ add r0, r0, r2 ; mul24 r1, r1 << 8, ra18.16a << 8 @ "mul_used", 0
+
+ add r1, r1, r0 ; mov -, vw_wait
+ shl r1, r1, 8
@@ -14450,26 +14568,26 @@ index 0000000..aa9e1e7
+ mov vw_addr, unif # start the VDW Delay 3
+
+################################################################################
++::mc_interrupt_exit12c
++ mov.setf -, ra9 ; mov -, vw_wait
++ brr.anyz -, r:exit12_c_1
++ nop
++ nop
++ nop
++# >>>
++
++ sub vw_setup, ra9, -16
++ mov vw_setup, ra10
++ mov vw_addr, ra11
++ mov ra9, 0
++:exit12_c_1
+
+# mc_interrupt_exit12()
+::mc_interrupt_exit12
-+ mov -, vw_wait # wait on the VDW
-+
-+ # Dummy wait to test instructions
-+# mov r3,1000000
-+#:dummy_loop
-+# sub.setf r3, r3, 1
-+# nop
-+# nop
-+# brr.anynn -, r:dummy_loop
-+# nop
-+# nop
-+# nop
-+
-+ ldtmu0
+ ldtmu0
+ ldtmu1
-+ ldtmu1
++ ldtmu0
++ mov -, vw_wait ; nop ; ldtmu1 # wait on the VDW
+
+ mov -,sacq(0) # 1
+ mov -,sacq(0) # 2
@@ -14502,477 +14620,12 @@ index 0000000..aa9e1e7
+
+::mc_end
+# Do not add code here because mc_end must appear after all other code.
-diff --git a/libavcodec/rpi_user_vcsm.h b/libavcodec/rpi_user_vcsm.h
+diff --git b/libavcodec/rpi_zc.c a/libavcodec/rpi_zc.c
new file mode 100644
-index 0000000..db41a4d
+index 0000000..9ac22aa
--- /dev/null
-+++ b/libavcodec/rpi_user_vcsm.h
-@@ -0,0 +1,459 @@
-+/*****************************************************************************
-+* Copyright 2001 - 2011 Broadcom Corporation. All rights reserved.
-+*
-+* This program is the proprietary software of Broadcom Corporation and/or
-+* its licensors, and may only be used, duplicated, modified or distributed
-+* pursuant to the terms and conditions of a separate, written license
-+* agreement executed between you and Broadcom (an "Authorized License").
-+* Except as set forth in an Authorized License, Broadcom grants no license
-+* (express or implied), right to use, or waiver of any kind with respect to
-+* the Software, and Broadcom expressly reserves all rights in and to the
-+* Software and all intellectual property rights therein. IF YOU HAVE NO
-+* AUTHORIZED LICENSE, THEN YOU HAVE NO RIGHT TO USE THIS SOFTWARE IN ANY
-+* WAY, AND SHOULD IMMEDIATELY NOTIFY BROADCOM AND DISCONTINUE ALL USE OF
-+* THE SOFTWARE.
-+*
-+* Except as expressly set forth in the Authorized License,
-+* 1. This program, including its structure, sequence and organization,
-+* constitutes the valuable trade secrets of Broadcom, and you shall use
-+* all reasonable efforts to protect the confidentiality thereof, and to
-+* use this information only in connection with your use of Broadcom
-+* integrated circuit products.
-+* 2. TO THE MAXIMUM EXTENT PERMITTED BY LAW, THE SOFTWARE IS PROVIDED "AS IS"
-+* AND WITH ALL FAULTS AND BROADCOM MAKES NO PROMISES, REPRESENTATIONS OR
-+* WARRANTIES, EITHER EXPRESS, IMPLIED, STATUTORY, OR OTHERWISE, WITH
-+* RESPECT TO THE SOFTWARE. BROADCOM SPECIFICALLY DISCLAIMS ANY AND ALL
-+* IMPLIED WARRANTIES OF TITLE, MERCHANTABILITY, NONINFRINGEMENT, FITNESS
-+* FOR A PARTICULAR PURPOSE, LACK OF VIRUSES, ACCURACY OR COMPLETENESS,
-+* QUIET ENJOYMENT, QUIET POSSESSION OR CORRESPONDENCE TO DESCRIPTION. YOU
-+* ASSUME THE ENTIRE RISK ARISING OUT OF USE OR PERFORMANCE OF THE SOFTWARE.
-+* 3. TO THE MAXIMUM EXTENT PERMITTED BY LAW, IN NO EVENT SHALL BROADCOM OR ITS
-+* LICENSORS BE LIABLE FOR (i) CONSEQUENTIAL, INCIDENTAL, SPECIAL, INDIRECT,
-+* OR EXEMPLARY DAMAGES WHATSOEVER ARISING OUT OF OR IN ANY WAY RELATING TO
-+* YOUR USE OF OR INABILITY TO USE THE SOFTWARE EVEN IF BROADCOM HAS BEEN
-+* ADVISED OF THE POSSIBILITY OF SUCH DAMAGES; OR (ii) ANY AMOUNT IN EXCESS
-+* OF THE AMOUNT ACTUALLY PAID FOR THE SOFTWARE ITSELF OR U.S. $1, WHICHEVER
-+* IS GREATER. THESE LIMITATIONS SHALL APPLY NOTWITHSTANDING ANY FAILURE OF
-+* ESSENTIAL PURPOSE OF ANY LIMITED REMEDY.
-+*****************************************************************************/
-+
-+#ifndef __USER_VCSM__H__INCLUDED__
-+#define __USER_VCSM__H__INCLUDED__
-+
-+/* VideoCore Shared Memory - user interface library.
-+**
-+** This library provides all the necessary abstraction for any application to
-+** make use of the shared memory service which is distributed accross a kernel
-+** driver and a videocore service.
-+**
-+** It is an application design decision to choose or not to use this service.
-+**
-+** The logical flow of operations that a user application needs to follow when
-+** using this service is:
-+**
-+** 1) Initialize the service.
-+** 2) Allocate shared memory blocks.
-+** 3) Start using the allocated blocks.
-+** - In order to gain ownership on a block, lock the allocated block,
-+** locking a block returns a valid address that the user application
-+** can access.
-+** - When finished with using the block for the current execution cycle
-+** or function, and so when giving up the ownership, unlock the block.
-+** 4) A block can be locked/unlocked as many times required - within or outside
-+** of - a specific execution context.
-+** 5) To completely release an allocated block, free it.
-+** 6) If the service is no longer required, terminate it.
-+**
-+**
-+** Some generic considerations:
-+
-+** Allocating memory blocks.
-+**
-+** Memory blocks can be allocated in different manners depending on the cache
-+** behavior desired. A given block can either be:
-+
-+** - Allocated in a non cached fashion all the way through host and videocore.
-+** - Allocated in a cached fashion on host OR videocore.
-+** - Allocated in a cached fashion on host AND videocore.
-+**
-+** It is an application decision to determine how to allocate a block. Evidently
-+** if the application will be doing substantial read/write accesses to a given block,
-+** it is recommended to allocate the block at least in a 'host cached' fashion for
-+** better results.
-+**
-+**
-+** Locking memory blocks.
-+**
-+** When the memory block has been allocated in a host cached fashion, locking the
-+** memory block (and so taking ownership of it) will trigger a cache invalidation.
-+**
-+** For the above reason and when using host cached allocation, it is important that
-+** an application properly implements the lock/unlock mechanism to ensure cache will
-+** stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+** It is possible to dynamically change the host cache behavior (ie cached or non
-+** cached) of a given allocation without needing to free and re-allocate the block.
-+** This feature can be useful for such application which requires access to the block
-+** only at certain times and not otherwise. By changing the cache behavior dynamically
-+** the application can optimize performances for a given duration of use.
-+** Such dynamic cache behavior remapping only applies to host cache and not videocore
-+** cache. If one requires to change the videocore cache behavior, then a new block
-+** must be created to replace the old one.
-+**
-+** On successful locking, a valid pointer is returned that the application can use
-+** to access to data inside the block. There is no guarantee that the pointer will
-+** stay valid following the unlock action corresponding to this lock.
-+**
-+**
-+** Unocking memory blocks.
-+**
-+** When the memory block has been allocated in a host cached fashion, unlocking the
-+** memory block (and so forgiving its ownership) will trigger a cache flush unless
-+** explicitely asked not to flush the cache for performances reasons.
-+**
-+** For the above reason and when using host cached allocation, it is important that
-+** an application properly implements the lock/unlock mechanism to ensure cache will
-+** stay coherent, otherwise there is no guarantee it will at all be.
-+**
-+**
-+** A complete API is defined below.
-+*/
-+
-+#ifdef __cplusplus
-+extern "C"
-+{
-+#endif
-+
-+/* Different status that can be dumped.
-+*/
-+typedef enum
-+{
-+ VCSM_STATUS_VC_WALK_ALLOC = 0, // Walks *all* the allocation on videocore.
-+ // Result of the walk is seen in the videocore
-+ // log.
-+ VCSM_STATUS_HOST_WALK_MAP, // Walks the *full* mapping allocation on host
-+ // driver (ie for all processes). Result of
-+ // the walk is seen in the kernel log.
-+ VCSM_STATUS_HOST_WALK_PID_MAP, // Walks the per process mapping allocation on host
-+ // driver (for current process). Result of
-+ // the walk is seen in the kernel log.
-+ VCSM_STATUS_HOST_WALK_PID_ALLOC, // Walks the per process host allocation on host
-+ // driver (for current process). Result of
-+ // the walk is seen in the kernel log.
-+ VCSM_STATUS_VC_MAP_ALL, // Equivalent to both VCSM_STATUS_VC_WALK_ALLOC and
-+ // VCSM_STATUS_HOST_WALK_MAP.
-+ //
-+ VCSM_STATUS_NONE, // Must be last - invalid.
-+
-+} VCSM_STATUS_T;
-+
-+/* Different kind of cache behavior.
-+*/
-+typedef enum
-+{
-+ VCSM_CACHE_TYPE_NONE = 0, // No caching applies.
-+ VCSM_CACHE_TYPE_HOST, // Allocation is cached on host (user space).
-+ VCSM_CACHE_TYPE_VC, // Allocation is cached on videocore.
-+ VCSM_CACHE_TYPE_HOST_AND_VC, // Allocation is cached on both host and videocore.
-+
-+} VCSM_CACHE_TYPE_T;
-+
-+/* Initialize the vcsm processing.
-+**
-+** Must be called once before attempting to do anything else.
-+**
-+** Returns 0 on success, -1 on error.
-+*/
-+int vcsm_init( void );
-+
-+
-+/* Terminates the vcsm processing.
-+**
-+** Must be called vcsm services are no longer needed, it will
-+** take care of removing any allocation under the current process
-+** control if deemed necessary.
-+*/
-+void vcsm_exit( void );
-+
-+
-+/* Queries the status of the the vcsm.
-+**
-+** Triggers dump of various kind of information, see the
-+** different variants specified in VCSM_STATUS_T.
-+**
-+** Pid is optional.
-+*/
-+void vcsm_status( VCSM_STATUS_T status, int pid );
-+
-+
-+/* Allocates a non-cached block of memory of size 'size' via the vcsm memory
-+** allocator.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc( unsigned int size, char *name );
-+
-+
-+/* Allocates a cached block of memory of size 'size' via the vcsm memory
-+** allocator, the type of caching requested is passed as argument of the
-+** function call.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_cache( unsigned int size, VCSM_CACHE_TYPE_T cache, char *name );
-+
-+
-+/* Shares an allocated block of memory via the vcsm memory allocator.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+**
-+** On success, the user must invoke vcsm_lock with the returned opaque
-+** handle to gain access to the memory associated with the opaque handle.
-+** When finished using the memory, the user calls vcsm_unlock_xx (see those
-+** function definition for more details on the one that can be used).
-+**
-+** A well behaved application should make every attempt to lock/unlock
-+** only for the duration it needs to access the memory data associated with
-+** the opaque handle.
-+*/
-+unsigned int vcsm_malloc_share( unsigned int handle );
-+
-+
-+/* Resizes a block of memory allocated previously by vcsm_alloc.
-+**
-+** Returns: 0 on success
-+** -errno on error.
-+**
-+** The handle must be unlocked by user prior to attempting any
-+** resize action.
-+**
-+** On error, the original size allocated against the handle
-+** remains available the same way it would be following a
-+** successful vcsm_malloc.
-+*/
-+int vcsm_resize( unsigned int handle, unsigned int new_size );
-+
-+
-+/* Frees a block of memory that was successfully allocated by
-+** a prior call the vcms_alloc.
-+**
-+** The handle should be considered invalid upon return from this
-+** call.
-+**
-+** Whether any memory is actually freed up or not as the result of
-+** this call will depends on many factors, if all goes well it will
-+** be freed. If something goes wrong, the memory will likely end up
-+** being freed up as part of the vcsm_exit process. In the end the
-+** memory is guaranteed to be freed one way or another.
-+*/
-+void vcsm_free( unsigned int handle );
-+
-+
-+/* Retrieves a videocore opaque handle from a mapped user address
-+** pointer. The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+** opaque handle (allocated via vcsm_malloc) and it is only
-+** significant for such application which knows what to do
-+** with it, for the others it is just a number with little
-+** use since nothing can be done with it (in particular
-+** for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_ptr( void *usr_ptr );
-+
-+
-+/* Retrieves a videocore opaque handle from a opaque handle
-+** pointer. The videocore handle will correspond to the actual
-+** memory mapped in videocore.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+**
-+** Note: the videocore opaque handle is distinct from the user
-+** opaque handle (allocated via vcsm_malloc) and it is only
-+** significant for such application which knows what to do
-+** with it, for the others it is just a number with little
-+** use since nothing can be done with it (in particular
-+** for safety reason it cannot be used to map anything).
-+*/
-+unsigned int vcsm_vc_hdl_from_hdl( unsigned int handle );
-+
-+
-+/* Retrieves a user opaque handle from a mapped user address
-+** pointer.
-+**
-+** Returns: 0 on error
-+** a non-zero opaque handle on success.
-+*/
-+unsigned int vcsm_usr_handle( void *usr_ptr );
-+
-+
-+/* Retrieves a mapped user address from an opaque user
-+** handle.
-+**
-+** Returns: 0 on error
-+** a non-zero address on success.
-+**
-+** On success, the address corresponds to the pointer
-+** which can access the data allocated via the vcsm_malloc
-+** call.
-+*/
-+void *vcsm_usr_address( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle.
-+**
-+** Returns: NULL on error
-+** a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock( unsigned int handle );
-+
-+
-+/* Locks the memory associated with this opaque handle. The lock
-+** also gives a chance to update the *host* cache behavior of the
-+** allocated buffer if so desired. The *videocore* cache behavior
-+** of the allocated buffer cannot be changed by this call and such
-+** attempt will be ignored.
-+**
-+** The system will attempt to honour the cache_update mode request,
-+** the cache_result mode will provide the final answer on which cache
-+** mode is really in use. Failing to change the cache mode will not
-+** result in a failure to lock the buffer as it is an application
-+** decision to choose what to do if (cache_result != cache_update)
-+**
-+** The value returned in cache_result can only be considered valid if
-+** the returned pointer is non NULL. The cache_result pointer may be
-+** NULL if the application does not care about the actual outcome of
-+** its action with regards to the cache behavior change.
-+**
-+** Returns: NULL on error
-+** a valid pointer on success.
-+**
-+** A user MUST lock the handle received from vcsm_malloc
-+** in order to be able to use the memory associated with it.
-+**
-+** On success, the pointer returned is only valid within
-+** the lock content (ie until a corresponding vcsm_unlock_xx
-+** is invoked).
-+*/
-+void *vcsm_lock_cache( unsigned int handle,
-+ VCSM_CACHE_TYPE_T cache_update,
-+ VCSM_CACHE_TYPE_T *cache_result );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+**
-+** Returns: 0 on success
-+** -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr( void *usr_ptr );
-+
-+
-+/* Unlocks the memory associated with this user mapped address.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+** Do not flush cache as the result of the unlock (if cache
-+** flush was otherwise applicable in this case).
-+**
-+** Returns: 0 on success
-+** -errno on error.
-+**
-+** After unlocking a mapped address, the user should no longer
-+** attempt to reference it.
-+*/
-+int vcsm_unlock_ptr_sp( void *usr_ptr, int cache_no_flush );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+**
-+** Returns: 0 on success
-+** -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl( unsigned int handle );
-+
-+
-+/* Unlocks the memory associated with this user opaque handle.
-+** Apply special processing that would override the otherwise
-+** default behavior.
-+**
-+** If 'cache_no_flush' is specified:
-+** Do not flush cache as the result of the unlock (if cache
-+** flush was otherwise applicable in this case).
-+**
-+** Returns: 0 on success
-+** -errno on error.
-+**
-+** After unlocking an opaque handle, the user should no longer
-+** attempt to reference the mapped addressed once associated
-+** with it.
-+*/
-+int vcsm_unlock_hdl_sp( unsigned int handle, int cache_no_flush );
-+
-+/* Clean and/or invalidate the memory associated with this user opaque handle
-+**
-+** Returns: non-zero on error
-+**
-+** structure contains a list of flush/invalidate commands. Commands are:
-+** 0: nop
-+** 1: invalidate given virtual range in L1/L2
-+** 2: clean given virtual range in L1/L2
-+** 3: clean+invalidate given virtual range in L1/L2
-+** 4: flush all L1/L2
-+*/
-+struct vcsm_user_clean_invalid_s {
-+ struct {
-+ unsigned int cmd;
-+ unsigned int handle;
-+ unsigned int addr;
-+ unsigned int size;
-+ } s[8];
-+};
-+
-+int vcsm_clean_invalid( struct vcsm_user_clean_invalid_s *s );
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#endif /* __USER_VCSM__H__INCLUDED__ */
-diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
-new file mode 100644
-index 0000000..9580165
---- /dev/null
-+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,406 @@
++++ a/libavcodec/rpi_zc.c
+@@ -0,0 +1,453 @@
+#include "config.h"
+#ifdef RPI
+#include "rpi_qpu.h"
@@ -14985,6 +14638,7 @@ index 0000000..9580165
+typedef struct ZcPool
+{
+ int numbytes;
++ unsigned int n;
+ struct ZcPoolEnt * head;
+ pthread_mutex_t lock;
+} ZcPool;
@@ -14993,27 +14647,48 @@ index 0000000..9580165
+{
+ // It is important that we start with gmem as other bits of code will expect to see that
+ GPU_MEM_PTR_T gmem;
++ unsigned int n;
+ struct ZcPoolEnt * next;
+ struct ZcPool * pool;
+} ZcPoolEnt;
+
-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const int size)
++#if 1
++//#define ALLOC_PAD 0x1000
++#define ALLOC_PAD 0
++#define ALLOC_ROUND 0x1000
++//#define ALLOC_N_OFFSET 0x100
++#define ALLOC_N_OFFSET 0
++#define STRIDE_ROUND 0x80
++#define STRIDE_OR 0x80
++#else
++#define ALLOC_PAD 0
++#define ALLOC_ROUND 0x1000
++#define ALLOC_N_OFFSET 0
++#define STRIDE_ROUND 32
++#define STRIDE_OR 0
++#endif
++
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
+{
+ ZcPoolEnt * const zp = av_malloc(sizeof(ZcPoolEnt));
+
++ // Round up to 4k & add 4k
++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
+ goto fail0;
+ }
+
-+ if (gpu_malloc_cached(size, &zp->gmem) != 0)
++ if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
+ {
-+ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", size);
++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size);
+ goto fail1;
+ }
+
+ zp->next = NULL;
+ zp->pool = pool;
++ zp->n = pool->n++;
+ return zp;
+
+fail1:
@@ -15062,6 +14737,10 @@ index 0000000..9580165
+ }
+
+ pthread_mutex_unlock(&pool->lock);
++
++ // Start with our buffer empty of preconceptions
++// rpi_cache_flush_one_gm_ptr(&zp->gmem, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++
+ return zp;
+}
+
@@ -15127,7 +14806,8 @@ index 0000000..9580165
+ const unsigned int video_width, const unsigned int video_height)
+{
+ AVRpiZcFrameGeometry geo;
-+ geo.stride_y = (video_width + 32 + 31) & ~31;
++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++// geo.stride_y = ((video_width + 32 + 31) & ~31);
+ geo.stride_c = geo.stride_y / 2;
+// geo.height_y = (video_height + 15) & ~15;
+ geo.height_y = (video_height + 32 + 31) & ~31;
@@ -15139,13 +14819,21 @@ index 0000000..9580165
+{
+ ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
+ AVBufferRef * buf;
++ intptr_t idata = (intptr_t)zp->gmem.arm;
++#if ALLOC_N_OFFSET != 0
++ intptr_t noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
++#endif
+
+ if (zp == NULL) {
+ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
+ goto fail0;
+ }
+
-+ if ((buf = av_buffer_create(zp->gmem.arm, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
++#if ALLOC_N_OFFSET != 0
++ idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
++#endif
++
++ if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
+ {
+ av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
+ goto fail2;
@@ -15317,6 +15005,18 @@ index 0000000..9580165
+ return p == NULL ? -1 : p->vc_handle;
+}
+
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++ return p == NULL ? 0 : fr_ref->data - p->arm;
++}
++
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++ return fr_ref == NULL ? 0 : fr_ref->size;
++}
++
++
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
+{
+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
@@ -15379,12 +15079,12 @@ index 0000000..9580165
+
+#endif // RPI
+
-diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+diff --git b/libavcodec/rpi_zc.h a/libavcodec/rpi_zc.h
new file mode 100644
-index 0000000..f0109f4
+index 0000000..4dd7a8b
--- /dev/null
-+++ b/libavcodec/rpi_zc.h
-@@ -0,0 +1,83 @@
++++ a/libavcodec/rpi_zc.h
+@@ -0,0 +1,88 @@
+#ifndef LIBAVCODEC_RPI_ZC_H
+#define LIBAVCODEC_RPI_ZC_H
+
@@ -15439,6 +15139,11 @@ index 0000000..f0109f4
+// Get the vc_handle from the frame ref
+// Returns -1 if ref doesn't look valid
+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
+// Get the number of bytes allocated from the frame ref
+// Returns 0 if ref doesn't look valid
+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
@@ -15468,10 +15173,10 @@ index 0000000..f0109f4
+
+#endif
+
-diff --git a/libavcodec/utils.c b/libavcodec/utils.c
-index f7adb52..708526e 100644
---- a/libavcodec/utils.c
-+++ b/libavcodec/utils.c
+diff --git b/libavcodec/utils.c a/libavcodec/utils.c
+index 3e8677d..f1efc0d 100644
+--- b/libavcodec/utils.c
++++ a/libavcodec/utils.c
@@ -26,6 +26,12 @@
*/
@@ -15496,7 +15201,7 @@ index f7adb52..708526e 100644
#if HAVE_PTHREADS || HAVE_W32THREADS || HAVE_OS2THREADS
static int default_lockmgr_cb(void **arg, enum AVLockOp op)
{
-@@ -503,6 +513,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
+@@ -508,6 +518,47 @@ int avcodec_fill_audio_frame(AVFrame *frame, int nb_channels,
return ret;
}
@@ -15544,7 +15249,7 @@ index f7adb52..708526e 100644
static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
{
FramePool *pool = avctx->internal->pool;
-@@ -550,6 +601,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
+@@ -555,6 +606,14 @@ static int update_frame_pool(AVCodecContext *avctx, AVFrame *frame)
av_buffer_pool_uninit(&pool->pools[i]);
pool->linesize[i] = linesize[i];
if (size[i]) {
@@ -15559,10 +15264,48 @@ index f7adb52..708526e 100644
pool->pools[i] = av_buffer_pool_init(size[i] + 16 + STRIDE_ALIGN - 1,
CONFIG_MEMORY_POISONING ?
NULL :
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index b31d233..2767306 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
+diff --git b/libavformat/matroskaenc.c a/libavformat/matroskaenc.c
+index 9c7a213..af941ce 100644
+--- b/libavformat/matroskaenc.c
++++ a/libavformat/matroskaenc.c
+@@ -2223,7 +2223,7 @@ static int mkv_check_new_extra_data(AVFormatContext *s, AVPacket *pkt)
+
+ switch (par->codec_id) {
+ case AV_CODEC_ID_FLAC:
+- if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL) && !mkv->is_live) {
++ if (side_data_size && (s->pb->seekable & AVIO_SEEKABLE_NORMAL)) {
+ AVCodecParameters *codecpriv_par;
+ int64_t curpos;
+ if (side_data_size != par->extradata_size) {
+diff --git b/libavformat/mov.c a/libavformat/mov.c
+index f2296f8..4550cf0 100644
+--- b/libavformat/mov.c
++++ a/libavformat/mov.c
+@@ -1186,12 +1186,6 @@ static void mov_metadata_creation_time(AVDictionary **metadata, int64_t time)
+ if (time) {
+ if(time >= 2082844800)
+ time -= 2082844800; /* seconds between 1904-01-01 and Epoch */
+-
+- if ((int64_t)(time * 1000000ULL) / 1000000 != time) {
+- av_log(NULL, AV_LOG_DEBUG, "creation_time is not representable\n");
+- return;
+- }
+-
+ avpriv_dict_set_timestamp(metadata, "creation_time", time * 1000000);
+ }
+ }
+@@ -5794,7 +5788,6 @@ static int mov_read_close(AVFormatContext *s)
+ av_freep(&mov->fragment_index_data);
+
+ av_freep(&mov->aes_decrypt);
+- av_freep(&mov->chapter_tracks);
+
+ return 0;
+ }
+diff --git b/libavformat/mpegts.c a/libavformat/mpegts.c
+index 3eff152..30dfb14 100644
+--- b/libavformat/mpegts.c
++++ a/libavformat/mpegts.c
@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
#endif
{ 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
@@ -15572,11 +15315,11 @@ index b31d233..2767306 100644
{ 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
{ 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
{ 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
-diff --git a/libavformat/utils.c b/libavformat/utils.c
-index 6f343f2..83f26d5 100644
---- a/libavformat/utils.c
-+++ b/libavformat/utils.c
-@@ -691,7 +691,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
+diff --git b/libavformat/utils.c a/libavformat/utils.c
+index a059046..ef70074 100644
+--- b/libavformat/utils.c
++++ a/libavformat/utils.c
+@@ -748,7 +748,7 @@ static int update_wrap_reference(AVFormatContext *s, AVStream *st, int stream_in
int default_stream_index = av_find_default_stream_index(s);
if (s->streams[default_stream_index]->pts_wrap_reference == AV_NOPTS_VALUE) {
for (i = 0; i < s->nb_streams; i++) {
@@ -15585,11 +15328,11 @@ index 6f343f2..83f26d5 100644
continue;
s->streams[i]->pts_wrap_reference = pts_wrap_reference;
s->streams[i]->pts_wrap_behavior = pts_wrap_behavior;
-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-index 694e116..203ca7b 100644
---- a/libavutil/buffer.c
-+++ b/libavutil/buffer.c
-@@ -425,3 +425,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
+diff --git b/libavutil/buffer.c a/libavutil/buffer.c
+index 8d1aa5f..649876d 100644
+--- b/libavutil/buffer.c
++++ a/libavutil/buffer.c
+@@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
return ret;
}
@@ -15599,11 +15342,11 @@ index 694e116..203ca7b 100644
+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
+ return buf->opaque;
+}
-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-index 0c0ce12..82e0bc3 100644
---- a/libavutil/buffer.h
-+++ b/libavutil/buffer.h
-@@ -283,6 +283,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
+diff --git b/libavutil/buffer.h a/libavutil/buffer.h
+index 73b6bd0..d907de3 100644
+--- b/libavutil/buffer.h
++++ a/libavutil/buffer.h
+@@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
*/
AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
@@ -15613,11 +15356,11 @@ index 0c0ce12..82e0bc3 100644
/**
* @}
*/
-diff --git a/pi-util/conf.sh b/pi-util/conf.sh
+diff --git b/pi-util/conf.sh a/pi-util/conf.sh
new file mode 100755
index 0000000..8b596a2
--- /dev/null
-+++ b/pi-util/conf.sh
++++ a/pi-util/conf.sh
@@ -0,0 +1,33 @@
+echo "Configure for Pi2/3"
+
@@ -15652,11 +15395,11 @@ index 0000000..8b596a2
+
+# gcc option for getting asm listing
+# -Wa,-ahls
-diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+diff --git b/pi-util/conf_h265.csv a/pi-util/conf_h265.csv
new file mode 100644
-index 0000000..61d1399
+index 0000000..d3db338
--- /dev/null
-+++ b/pi-util/conf_h265.csv
++++ a/pi-util/conf_h265.csv
@@ -0,0 +1,144 @@
+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
+2,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
@@ -15783,7 +15526,7 @@ index 0000000..61d1399
+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+2,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
+2,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
@@ -15802,12 +15545,12 @@ index 0000000..61d1399
+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
-diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+diff --git b/pi-util/ffconf.py a/pi-util/ffconf.py
new file mode 100644
-index 0000000..38f942f
+index 0000000..c896bc6
--- /dev/null
-+++ b/pi-util/ffconf.py
-@@ -0,0 +1,146 @@
++++ a/pi-util/ffconf.py
+@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+import os
@@ -15851,16 +15594,18 @@ index 0000000..38f942f
+ except:
+ pass
+
-+ rv = False
+ if m1 and m2 and m1.group() == m2.group():
+ print >> flog, "Match: " + m1.group()
-+ rv = True
++ rv = 0
+ elif not m1:
+ print >> flog, "****** Cannot find m1"
++ rv = 3
+ elif not m2:
+ print >> flog, "****** Cannot find m2"
++ rv = 2
+ else:
+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
++ rv = 1
+ flog.close()
+ return rv
+
@@ -15906,19 +15651,25 @@ index 0000000..38f942f
+ print "==== ", name,
+ sys.stdout.flush()
+
-+ if (not testone(os.path.join(conf_root, name), name, a[2], a[3])) :
-+ if exp_test == 1:
-+ failures.append(name)
-+ print ": * FAIL *"
-+ else:
-+ print ": fail"
-+ else:
++ rv = testone(os.path.join(conf_root, name), name, a[2], a[3])
++ if (rv == 0):
+ if exp_test == 2:
+ print ": * OK *"
+ unx_success.append(name)
+ else:
+ print ": ok"
-+
++ elif exp_test > 1 and rv == 1:
++ print ": fail"
++ else:
++ failures.append(name)
++ if rv == 1:
++ print ": * FAIL *"
++ elif (rv == 2) :
++ print ": * CRASH *"
++ elif (rv == 3) :
++ print ": * MD5 MISSING *"
++ else :
++ print ": * BANG *"
+
+ if failures or unx_success:
+ print "Unexpected Failures:", failures
@@ -15954,11 +15705,11 @@ index 0000000..38f942f
+
+ doconf(csva, args.tests)
+
-diff --git a/pi-util/qasm.py b/pi-util/qasm.py
+diff --git b/pi-util/qasm.py a/pi-util/qasm.py
new file mode 100644
index 0000000..1eacc04
--- /dev/null
-+++ b/pi-util/qasm.py
++++ a/pi-util/qasm.py
@@ -0,0 +1,2502 @@
+#!/usr/bin/env python
+
@@ -18462,11 +18213,25 @@ index 0000000..1eacc04
+
+if __name__ == '__main__':
+ main()
-diff --git a/pi-util/rebase_liblinks.py b/pi-util/rebase_liblinks.py
+diff --git b/pi-util/qem.sh a/pi-util/qem.sh
+new file mode 100644
+index 0000000..20ce7ee
+--- /dev/null
++++ a/pi-util/qem.sh
+@@ -0,0 +1,8 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ pi-util/qasm.py
++SRC_FILE=libavcodec/rpi_shader.qasm
++DST_BASE=shader
++
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+diff --git b/pi-util/rebase_liblinks.py a/pi-util/rebase_liblinks.py
new file mode 100755
index 0000000..6a9a33f
--- /dev/null
-+++ b/pi-util/rebase_liblinks.py
++++ a/pi-util/rebase_liblinks.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+
@@ -18505,11 +18270,11 @@ index 0000000..6a9a33f
+
+
+
-diff --git a/pi-util/syncroot.sh b/pi-util/syncroot.sh
+diff --git b/pi-util/syncroot.sh a/pi-util/syncroot.sh
new file mode 100755
index 0000000..d8bdd91
--- /dev/null
-+++ b/pi-util/syncroot.sh
++++ a/pi-util/syncroot.sh
@@ -0,0 +1,43 @@
+set -e
+
@@ -18554,4 +18319,84 @@ index 0000000..d8bdd91
+pi-util/rebase_liblinks.py $DST
+
+
-
+diff --git b/pi-util/v3dusage.py a/pi-util/v3dusage.py
+new file mode 100644
+index 0000000..7e336a9
+--- /dev/null
++++ a/pi-util/v3dusage.py
+@@ -0,0 +1,75 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def main():
++ argp = argparse.ArgumentParser(description="QPU/VPU perf summary")
++ argp.add_argument("logfile")
++ args = argp.parse_args()
++
++
++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++
++ ttotal = {'idle':0.0}
++ tstart = {}
++ time0 = None
++ idle_start = None
++ qpu_op_no = 0
++ op_count = 0
++
++ with open(args.logfile, "rt") as infile:
++ for line in infile:
++ match = rmatch.match(line)
++ if match:
++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++ time = float(match.group(1))
++ unit = match.group(3)
++ opstart = not match.group(2)
++ optype = match.group(7)
++ hascb = match.group(8) != "0"
++
++ if unit == 'qpu1':
++ unit = unit + "." + str(qpu_op_no)
++ if not opstart:
++ if hascb or optype == 'EXECUTE_SYNC':
++ qpu_op_no = 0
++ else:
++ qpu_op_no += 1
++
++ # Ignore sync type
++ if optype == 'EXECUTE_SYNC':
++ continue
++
++ if not time0:
++ time0 = time
++
++ if opstart:
++ tstart[unit] = time;
++ elif unit in tstart:
++ op_count += 1
++ if not unit in ttotal:
++ ttotal[unit] = 0.0
++ ttotal[unit] += time - tstart[unit]
++ del tstart[unit]
++
++ if not idle_start and not tstart:
++ idle_start = time
++ elif idle_start and tstart:
++ ttotal['idle'] += time - idle_start
++ idle_start = None
++
++ if not time0:
++ print "No v3d profile records found"
++ else:
++ tlogged = time - time0
++
++ print "Logged time:", tlogged, " Op count:", op_count
++ for unit in sorted(ttotal):
++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++
++
++if __name__ == '__main__':
++ main()
++
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
index 721a065449..5240cf58ce 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1002-73fde6f9f3d01f7fc0f3ae4b66f6c725f9fb1105.patch
@@ -22,4 +22,3 @@ index 2fd3f2b..7165652 100644
if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
*poutbuf = NULL;
*poutbuf_size = 0;
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
index 15d449d284..37b53e8fb6 100644
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
+++ b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1003-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
@@ -53,4 +53,3 @@ index aca8382..f473f6c 100644
--
2.7.4
-
diff --git a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch b/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch
deleted file mode 100644
index 848158d727..0000000000
--- a/packages/multimedia/ffmpeg/patches/ffmpeg-99.1010-tls-1.2.patch
+++ /dev/null
@@ -1,17 +0,0 @@
---- a/libavformat/tls_openssl.c
-+++ b/libavformat/tls_openssl.c
-@@ -233,12 +233,13 @@ static int tls_open(URLContext *h, const char *uri, int flags, AVDictionary **op
- if ((ret = ff_tls_open_underlying(c, h, uri, options)) < 0)
- goto fail;
-
-- p->ctx = SSL_CTX_new(c->listen ? TLSv1_server_method() : TLSv1_client_method());
-+ p->ctx = SSL_CTX_new(c->listen ? SSLv23_server_method() : SSLv23_client_method());
- if (!p->ctx) {
- av_log(h, AV_LOG_ERROR, "%s\n", ERR_error_string(ERR_get_error(), NULL));
- ret = AVERROR(EIO);
- goto fail;
- }
-+ SSL_CTX_set_options(p->ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
- if (c->ca_file) {
- if (!SSL_CTX_load_verify_locations(p->ctx, c->ca_file, NULL))
- av_log(h, AV_LOG_ERROR, "SSL_CTX_load_verify_locations %s\n", ERR_error_string(ERR_get_error(), NULL));